## Imports

In [2]:
from datetime import datetime
import numpy as np
import pickle
import json
import pandas as pd
from tqdm import tqdm
pd.set_option('display.max_columns', None)

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
!cd gdrive/MyDrive/SPARSe

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Load project and donation data

In [3]:
# donations = pd.read_csv("./gdrive/MyDrive/SPARSe/data/donations.csv")
# projects = pd.read_csv("./gdrive/MyDrive/SPARSe/data/projects.csv")
essays = pd.read_csv("./gdrive/MyDrive/SPARSe/data/essays.csv")

In [4]:
#projects_donations_df - merge between projects and donations
projects_donations_df = projects.merge(donations, on='projectid', how='left')
projects_donations_df['date_posted'] = projects_donations_df.date_posted.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
projects_donations_df['deadline'] = projects_donations_df['date_posted'] + pd.DateOffset(months=4)

In [5]:
# projects_donations_filtered_df - Filter to donations before deadline
projects_donations_filtered_df = projects_donations_df[projects_donations_df.donation_timestamp <= projects_donations_df.deadline]

# find evaluation date
projects_donations_filtered_df['date_posted_plus_4_weeks'] = projects_donations_filtered_df['date_posted'] + pd.DateOffset(weeks=4)
def next_weekday(d, weekday):
    days_ahead = weekday - d.weekday()
    if days_ahead <= 0: # Target day already happened this week
        days_ahead += 7
    return d +  pd.DateOffset(days=days_ahead)

projects_donations_filtered_df['evaluation_date'] = projects_donations_filtered_df.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projects_donations_filtered_df['date_posted_plus_4_weeks'] = projects_donations_filtered_df['date_posted'] + pd.DateOffset(weeks=4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projects_donations_filtered_df['evaluation_date'] = projects_donations_filtered_df.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))


In [6]:
# Dataframe of all donations associated with a projectid - filtered by donations within 4-5 weeks of posting
valid_projects_donations_filtered_df = projects_donations_filtered_df[projects_donations_filtered_df.donation_timestamp < projects_donations_filtered_df.evaluation_date]


## Create Donation Level Features

In [7]:
# dataframe with donation features at project-level
donation_features_df = projects[["projectid", "total_price_excluding_optional_support"]]

donation_counts = valid_projects_donations_filtered_df.groupby("projectid").projectid.count().reset_index(name="total_number_of_donations")
donation_values = valid_projects_donations_filtered_df.groupby('projectid')['donation_to_project'].sum().reset_index(name='total_value_of_donations')

In [8]:
num_promo_matched = valid_projects_donations_filtered_df[['projectid', 'payment_was_promo_matched']]
num_promo_matched.payment_was_promo_matched.replace({'f':0, 't':1}, inplace=True)
num_promo_matched = num_promo_matched.groupby("projectid")['payment_was_promo_matched'].sum().reset_index(name='num_promo_matched')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [9]:
# prop_of_donations_from_same_city
prop_of_donations_from_same_city = valid_projects_donations_filtered_df[['projectid', 'donor_city', 'school_city', 'donation_to_project', 'donor_state', 'school_state']]
prop_of_donations_from_same_city.donor_city = prop_of_donations_from_same_city.donor_city.fillna("NA")
prop_of_donations_from_same_city.donor_state = prop_of_donations_from_same_city.donor_state.fillna("NA")
prop_of_donations_from_same_city['donation_within_city'] = (prop_of_donations_from_same_city.school_city.str.lower() == prop_of_donations_from_same_city.donor_city.str.lower()) & (prop_of_donations_from_same_city.school_state.str.lower() == prop_of_donations_from_same_city.donor_state.str.lower())
prop_of_donations_from_same_city['donation_within_city'] += 0
prop_of_donations_from_same_city = prop_of_donations_from_same_city.groupby(["projectid", "donation_within_city"])['donation_to_project'].sum().reset_index(name='donation_value_within_city')
prop_of_donations_from_same_city = donation_values.merge(
    prop_of_donations_from_same_city[prop_of_donations_from_same_city.donation_within_city == 1], 
    how='left',
    on='projectid'
).fillna(0.0)
prop_of_donations_from_same_city['prop_of_donations_from_same_city'] = prop_of_donations_from_same_city.donation_value_within_city / prop_of_donations_from_same_city.total_value_of_donations
prop_of_donations_from_same_city.prop_of_donations_from_same_city = prop_of_donations_from_same_city.prop_of_donations_from_same_city.fillna(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_of_donations_from_same_city['donation_within_city'] = (prop_of_donations_from_same_city.school_city.str.lower() == prop_of_donations_from_same_city.donor_city.str.lower()) & (prop_of_donations_from_same_city.school_state.str.lower() == prop_of_donations_from_same_city.donor_state.str.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

In [10]:
# prop_of_donations_from_same_state
prop_of_donations_from_same_state = valid_projects_donations_filtered_df[['projectid', 'donor_state', 'school_state', 'donation_to_project']]
prop_of_donations_from_same_state.donor_state = prop_of_donations_from_same_state.donor_state.fillna("NA").replace({'  ': "NA"})
prop_of_donations_from_same_state['donation_within_state'] = prop_of_donations_from_same_state.school_state.str.lower() == prop_of_donations_from_same_state.donor_state.str.lower()
prop_of_donations_from_same_state['donation_within_state'] += 0
prop_of_donations_from_same_state = prop_of_donations_from_same_state.groupby(["projectid", "donation_within_state"])['donation_to_project'].sum().reset_index(name='donation_value_within_state')
prop_of_donations_from_same_state = donation_values.merge(
    prop_of_donations_from_same_state[prop_of_donations_from_same_state.donation_within_state == 1], 
    how='left',
    on='projectid'
).fillna(0.0)
prop_of_donations_from_same_state['prop_of_donations_from_same_state'] = prop_of_donations_from_same_state.donation_value_within_state / prop_of_donations_from_same_state.total_value_of_donations
prop_of_donations_from_same_state.prop_of_donations_from_same_state = prop_of_donations_from_same_state.prop_of_donations_from_same_state.fillna(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_of_donations_from_same_state['donation_within_state'] = prop_of_donations_from_same_state.school_state.str.lower() == prop_of_donations_from_same_state.donor_state.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_of_donations_from_same_state['donation_within_state'] += 0


In [11]:
# prop_of_teacher_donors
prop_of_teacher_donors = valid_projects_donations_filtered_df[['projectid', 'is_teacher_acct', 'donation_to_project']]
prop_of_teacher_donors.is_teacher_acct.replace({'f':0, 't':1}, inplace=True)
prop_of_teacher_donors = prop_of_teacher_donors.groupby(["projectid", "is_teacher_acct"])['donation_to_project'].sum().reset_index(name='donation_value_from_teacher')
prop_of_teacher_donors = donation_values.merge(
    prop_of_teacher_donors[prop_of_teacher_donors.is_teacher_acct == 1], 
    how='left',
    on='projectid'
).fillna(0.0)
prop_of_teacher_donors['prop_of_teacher_donors'] = prop_of_teacher_donors.donation_value_from_teacher / prop_of_teacher_donors.total_value_of_donations
prop_of_teacher_donors.prop_of_teacher_donors = prop_of_teacher_donors.prop_of_teacher_donors.fillna(0.0)

In [12]:
# prop_via_giving_page
prop_via_giving_page = valid_projects_donations_filtered_df[['projectid', 'via_giving_page', 'donation_to_project']]
prop_via_giving_page.via_giving_page.replace({'f':0, 't':1}, inplace=True)
prop_via_giving_page = prop_via_giving_page.groupby(["projectid", "via_giving_page"])['donation_to_project'].sum().reset_index(name='donation_value_via_giving_page')
prop_via_giving_page = donation_values.merge(
    prop_via_giving_page[prop_via_giving_page.via_giving_page == 1], 
    how='left',
    on='projectid'
).fillna(0.0)
prop_via_giving_page['prop_via_giving_page'] = prop_via_giving_page.donation_value_via_giving_page / prop_via_giving_page.total_value_of_donations
prop_via_giving_page.prop_via_giving_page = prop_via_giving_page.prop_via_giving_page.fillna(0.0)

Merge


In [13]:
# Merge total number of donations
donation_features_df = donation_features_df.merge(donation_counts, on="projectid", how="left").fillna(0)
donation_features_df.total_number_of_donations = donation_features_df.total_number_of_donations.astype(int)

# Merge total value of donations
donation_features_df = donation_features_df.merge(donation_values, on="projectid", how="left").fillna(0)

# Merge num promo matched
donation_features_df = donation_features_df.merge(num_promo_matched, on="projectid", how="left").fillna(0)

# Merge prop_of_donations_from_same_city
donation_features_df = donation_features_df.merge(prop_of_donations_from_same_city[['projectid', 'prop_of_donations_from_same_city']], on="projectid", how="left").fillna(0)

# Merge prop_of_donations_from_same_state
donation_features_df = donation_features_df.merge(prop_of_donations_from_same_state[['projectid', 'prop_of_donations_from_same_state']], on="projectid", how="left").fillna(0)

# Merge prop_of_teacher_donors
donation_features_df = donation_features_df.merge(prop_of_teacher_donors[['projectid', 'prop_of_teacher_donors']], on="projectid", how="left").fillna(0)

# Merge prop_via_giving_page
donation_features_df = donation_features_df.merge(prop_via_giving_page[['projectid', 'prop_via_giving_page']], on="projectid", how="left").fillna(0)

Proportions



In [14]:
donation_features_df["avg_value_per_donation"] = (donation_features_df["total_value_of_donations"] / donation_features_df["total_number_of_donations"]).fillna(0)

donation_features_df["total_prop_funded"] = (donation_features_df["total_value_of_donations"] / donation_features_df["total_price_excluding_optional_support"]).fillna(1).replace([np.inf, -np.inf], 1.0)

donation_features_df["prop_promo_matched"] = (donation_features_df["num_promo_matched"] / donation_features_df["total_number_of_donations"]).fillna(0)

## Export Donations Features

In [15]:
donation_features_df.to_csv('Model_Donations_Features_v1.csv')

## Create Project Level Features

In [16]:
project_features_df = projects[['projectid', 'school_state', 'school_metro', 'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'primary_focus_area', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'total_price_excluding_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match']]

## Imputing project level features

In [17]:
missing_str = "missing"

# Replace La school state code with LA
project_features_df.school_state.replace({'La': 'LA'}, inplace=True)

# Replace school metro nan with missing
project_features_df.school_metro.fillna(missing_str, inplace=True)

# Replace teacher prefix nan with missing
project_features_df.teacher_prefix.fillna(missing_str, inplace=True)

# Replace primary focus area nan with missing
project_features_df.primary_focus_area.fillna(missing_str, inplace=True)

# Replace secondary focus area nan with missing
project_features_df.secondary_focus_area.fillna(missing_str, inplace=True)

# Replace resource type nan with missing
project_features_df.resource_type.fillna(missing_str, inplace=True)

# Replace grade level nan with missing
project_features_df.grade_level.fillna(missing_str, inplace=True)

# Replace students reached with 0
project_features_df.students_reached.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


## Convert to dummy variables

In [18]:
dummy_columns = ['school_state', 'school_metro', 'teacher_prefix', 'primary_focus_area', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level']
project_features_dummy_df = pd.get_dummies(project_features_df, columns=dummy_columns)

# Replace 'f' with 0 and 't' with 1
project_features_dummy_df.replace({'f': 0, 't': 1}, inplace=True)

In [19]:
# add basic columns to project df
# projects['date_posted'] = projects.date_posted.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
# projects['date_posted_plus_4_weeks'] = projects['date_posted'] + pd.DateOffset(weeks=4)
# projects['evaluation_date'] = projects.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))

## Export Projects Features

In [20]:
project_features_dummy_df.to_csv('Model_Projects_Features_v1.csv')

## Create Essay Level Features

In [6]:
essay_features_df = essays[['projectid', 'title', 'short_description', 'essay']]

# # num_words_in_title
essay_features_df.title = essay_features_df.title.fillna("")
essay_features_df['num_words_in_title'] = essay_features_df.title.str.split().apply(len)

# # num_words_in_short_description
essay_features_df.short_description = essay_features_df.short_description.fillna("")
essay_features_df['num_words_in_short_description'] = essay_features_df.short_description.str.split().apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essay_features_df['num_words_in_title'] = essay_features_df.title.str.split().apply(len)


In [7]:
# num_words_in_essay
essay_features_df.essay = essay_features_df.essay.fillna("")
# essay_features_df['num_words_in_essay'] = essay_features_df.essay.str.split().apply(len)

# essay_features_df = essay_features_df[['projectid', 'num_words_in_title', 'num_words_in_short_description', 'num_words_in_essay']]

In [23]:
essay_features_df['num_words_in_essay'] = 0
for i in tqdm(range(len(essay_features_df)//1000 + 1)):
  essay_features_df.iloc[i * 1000 : (i+1) * 1000]['num_words_in_essay'] = essay_features_df.iloc[i * 1000 : (i+1) * 1000]['essay'].str.split().apply(len)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essay_features_df.iloc[i * 1000 : (i+1) * 1000]['num_words_in_essay'] = essay_features_df.iloc[i * 1000 : (i+1) * 1000]['essay'].str.split().apply(len)
100%|██████████| 665/665 [00:20<00:00, 31.85it/s]


In [27]:
essay_features_df = essay_features_df[['projectid', 'num_words_in_title', 'num_words_in_short_description', 'num_words_in_essay']]

## Export Essay Features

In [29]:
essay_features_df.to_csv('Model_Essay_Features_v1.csv')

## Merge Features

In [24]:
project_features_dummy_df = pd.read_csv('./gdrive/MyDrive/SPARSe/features_labels/Model_Projects_Features_v1.csv').drop(['Unnamed: 0'], axis=1)

In [25]:
donation_features_df = pd.read_csv('./gdrive/MyDrive/SPARSe/features_labels/Model_Donations_Features_v1.csv').drop(['Unnamed: 0'], axis=1)

In [28]:
essay_features_df = pd.read_csv('./gdrive/MyDrive/SPARSe/features_labels/Model_Essay_Features_v1.csv').drop(['Unnamed: 0'], axis=1)

In [30]:
features_df = project_features_dummy_df.merge(donation_features_df, on=['projectid', 'total_price_excluding_optional_support'], how='inner')

In [31]:
features_df = features_df.merge(essay_features_df, on='projectid', how='left')

In [33]:
features_df.to_csv('Model_Features_v1.csv')

## Create Labels

In [23]:
funded_amt_df = projects_donations_filtered_df.groupby('projectid')['donation_to_project'].sum().reset_index(name='funded_amt')
funding_frac_df = projects.merge(funded_amt_df, on='projectid', how='left')[['projectid', 'funded_amt', 'total_price_excluding_optional_support']].fillna(0.0)
funding_frac_df['funding_frac'] = funding_frac_df.funded_amt / funding_frac_df.total_price_excluding_optional_support
funding_frac_df = funding_frac_df.fillna(1.0)
funding_frac_df.replace([np.inf, -np.inf], 1.0, inplace=True)
funding_frac_df['is_fully_funded'] = funding_frac_df.funding_frac >= 1.0

In [24]:
labels_binary_df = funding_frac_df[['projectid', 'is_fully_funded']].replace({False: 0, True: 1}, inplace=False)
labels_binary_df.to_csv('Model_Labels_Binary_v0.csv')

In [25]:
labels_continuous_df = funding_frac_df[['projectid', 'funding_frac']]
labels_continuous_df.to_csv('Model_Labels_Continuous_v0.csv')

## Create Train/Test Splits

In [None]:
train_test_val_project_ids = json.load(open('update2_baseline_3trainvalsplit-enddate_2013_11_24-10wk.json', 'r'))

In [None]:
features_val3df = features_df[features_df.projectid.isin(train_test_val_project_ids['valid3'])]
features_train3df = features_df[features_df.projectid.isin(train_test_val_project_ids['train3'])]

labels_val3df = labels_df[labels_df.projectid.isin(train_test_val_project_ids['valid3'])]
labels_train3df = labels_df[labels_df.projectid.isin(train_test_val_project_ids['train3'])]

In [None]:
features_val3df.to_csv('Features_val3.csv')
features_train3df.to_csv('Features_train3.csv')
labels_val3df.to_csv('Labels_val3.csv')
labels_train3df.to_csv('Labels_train3.csv')