## Imports

In [1]:
from datetime import datetime
import numpy as np
import pickle
import json
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')
!cd gdrive/MyDrive/SPARSe

Mounted at /content/gdrive


## Load project and donation data

In [5]:
donations = pd.read_csv("./data/donations.csv")
projects = pd.read_csv("./data/projects.csv")
essays = pd.read_csv("./data/essays.csv")

In [6]:
#projects_donations_df - merge between projects and donations
projects_donations_df = projects.merge(donations, on='projectid', how='left')
projects_donations_df['date_posted'] = projects_donations_df.date_posted.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
projects_donations_df['deadline'] = projects_donations_df['date_posted'] + pd.DateOffset(months=4)

In [10]:
# projects_donations_filtered_df - Filter to donations before deadline
projects_donations_filtered_df = projects_donations_df[projects_donations_df.donation_timestamp <= projects_donations_df.deadline]

# find evaluation date
projects_donations_filtered_df['date_posted_plus_4_weeks'] = projects_donations_filtered_df['date_posted'] + pd.DateOffset(weeks=4)
def next_weekday(d, weekday):
    days_ahead = weekday - d.weekday()
    if days_ahead <= 0: # Target day already happened this week
        days_ahead += 7
    return d +  pd.DateOffset(days=days_ahead)

projects_donations_filtered_df['evaluation_date'] = projects_donations_filtered_df.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projects_donations_filtered_df['date_posted_plus_4_weeks'] = projects_donations_filtered_df['date_posted'] + pd.DateOffset(weeks=4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  projects_donations_filtered_df['evaluation_date'] = projects_donations_filtered_df.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))


In [28]:
# Dataframe of all donations associated with a projectid - filtered by donations within 4-5 weeks of posting
valid_projects_donations_filtered_df = projects_donations_filtered_df[projects_donations_filtered_df.donation_timestamp < projects_donations_filtered_df.evaluation_date]


## Create Donation Level Features

In [133]:
# dataframe with donation features at project-level
donation_features_df = projects[["projectid", "total_price_excluding_optional_support"]]

donation_counts = valid_projects_donations_filtered_df.groupby("projectid").projectid.count().reset_index(name="total_number_of_donations")
donation_values = valid_projects_donations_filtered_df.groupby('projectid')['donation_to_project'].sum().reset_index(name='total_value_of_donations')

In [151]:
num_promo_matched = valid_projects_donations_filtered_df[['projectid', 'payment_was_promo_matched']]
num_promo_matched.payment_was_promo_matched.replace({'f':0, 't':1}, inplace=True)
num_promo_matched = num_promo_matched.groupby("projectid")['payment_was_promo_matched'].sum().reset_index(name='num_promo_matched')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


Merge


In [155]:

# Merge total number of donations
donation_features_df = donation_features_df.merge(donation_counts, on="projectid", how="left").fillna(0)
donation_features_df.total_number_of_donations = donation_features_df.total_number_of_donations.astype(int)

# Merge total value of donations
donation_features_df = donation_features_df.merge(donation_values, on="projectid", how="left").fillna(0)

# Merge num promo matched
donation_features_df = donation_features_df.merge(num_promo_matched, on="projectid", how="left").fillna(0)

Proportions



In [156]:
donation_features_df["avg_value_per_donation"] = (donation_features_df["total_value_of_donations"] / donation_features_df["total_number_of_donations"]).fillna(0)

donation_features_df["total_prop_funded"] = (donation_features_df["total_value_of_donations"] / donation_features_df["total_price_excluding_optional_support"]).fillna(1).replace([np.inf, -np.inf], 1.0)

donation_features_df["prop_promo_matched"] = (donation_features_df["num_promo_matched"] / donation_features_df["total_number_of_donations"]).fillna(0)

## Create Project Level Features

In [265]:
project_features_df = projects[['projectid', 'school_state', 'school_metro', 'school_charter', 'school_magnet', 'school_year_round', 'school_nlns',
       'school_kipp', 'school_charter_ready_promise', 'teacher_prefix',
       'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'primary_focus_area', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level', 'total_price_excluding_optional_support', 'students_reached',
       'eligible_double_your_impact_match', 'eligible_almost_home_match']]

## Imputing project level features

In [266]:
missing_str = "missing"

# Replace La school state code with LA
project_features_df.school_state.replace({'La': 'LA'}, inplace=True)

# Replace school metro nan with missing
project_features_df.school_metro.fillna(missing_str, inplace=True)

# Replace teacher prefix nan with missing
project_features_df.teacher_prefix.fillna(missing_str, inplace=True)

# Replace primary focus area nan with missing
project_features_df.primary_focus_area.fillna(missing_str, inplace=True)

# Replace secondary focus area nan with missing
project_features_df.secondary_focus_area.fillna(missing_str, inplace=True)

# Replace resource type nan with missing
project_features_df.resource_type.fillna(missing_str, inplace=True)

# Replace grade level nan with missing
project_features_df.grade_level.fillna(missing_str, inplace=True)

# Replace students reached with 0
project_features_df.students_reached.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


## Convert to dummy variables

In [267]:
dummy_columns = ['school_state', 'school_metro', 'teacher_prefix', 'primary_focus_area', 'secondary_focus_area', 'resource_type',
       'poverty_level', 'grade_level']
project_features_dummy_df = pd.get_dummies(project_features_df, columns=dummy_columns)

# Replace 'f' with 0 and 't' with 1
project_features_dummy_df.replace({'f': 0, 't': 1}, inplace=True)

In [268]:
# add basic columns to project df
# projects['date_posted'] = projects.date_posted.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
# projects['date_posted_plus_4_weeks'] = projects['date_posted'] + pd.DateOffset(weeks=4)
# projects['evaluation_date'] = projects.date_posted_plus_4_weeks.apply(lambda x: next_weekday(x, 0))

## Merge donation features with projects

In [269]:
features_df = project_features_dummy_df.merge(donation_features_df, on=['projectid', 'total_price_excluding_optional_support'], how='inner')

In [272]:
features_df.to_csv('Model_Features_v0.csv')

## Create Labels

In [273]:
funded_amt_df = projects_donations_filtered_df.groupby('projectid')['donation_to_project'].sum().reset_index(name='funded_amt')
funding_frac_df = projects.merge(funded_amt_df, on='projectid', how='left')[['projectid', 'funded_amt', 'total_price_excluding_optional_support']].fillna(0.0)
funding_frac_df['funding_frac'] = funding_frac_df.funded_amt / funding_frac_df.total_price_excluding_optional_support
funding_frac_df = funding_frac_df.fillna(1.0)
funding_frac_df.replace([np.inf, -np.inf], 1.0, inplace=True)
funding_frac_df['is_fully_funded'] = funding_frac_df.funding_frac >= 1.0

In [274]:
labels_df = funding_frac_df[['projectid', 'is_fully_funded']].replace({False: 0, True: 1}, inplace=False)
labels_df.to_csv('Model_Labels_v0.csv')

## Create Train/Test Splits

In [275]:
train_test_val_project_ids = json.load(open('update2_baseline_3trainvalsplit-enddate_2013_11_24-10wk.json', 'r'))

In [276]:
features_val3df = features_df[features_df.projectid.isin(train_test_val_project_ids['valid3'])]
features_train3df = features_df[features_df.projectid.isin(train_test_val_project_ids['train3'])]

labels_val3df = labels_df[labels_df.projectid.isin(train_test_val_project_ids['valid3'])]
labels_train3df = labels_df[labels_df.projectid.isin(train_test_val_project_ids['train3'])]

In [277]:
features_val3df.to_csv('Features_val3.csv')
features_train3df.to_csv('Features_train3.csv')
labels_val3df.to_csv('Labels_val3.csv')
labels_train3df.to_csv('Labels_train3.csv')