# Recommendation System - Model 1

## Import packages and dataframes

In [1]:
import numpy as np
import pandas as pd
import random as rd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

rd.seed(123)

In [2]:
# import all the files

donationsDF = pd.read_csv('Donations.csv')
donorsDF = pd.read_csv('Donors.csv')
projectsDF = pd.read_csv('Projects.csv')
schoolsDF = pd.read_csv('Schools.csv')
resourcesDF = pd.read_csv('Resources.csv')

  interactivity=interactivity, compiler=compiler, result=result)


## Data Cleaning: Remove one-time donors

In [7]:
donorsDF_merged = donorsDF.merge(donationsDF, on = 'Donor ID', how = 'inner') # donation ID is the primary key here

In [9]:
# Get the count of donations a donor has made
df_temp1 = donorsDF_merged.groupby(['Donor ID']).size().reset_index(name='Num Donations')
df_temp2 = df_temp1[df_temp1['Num Donations'] > 1] # removing all the single-donors

print("We got rid of %d single-donors." % (df_temp1.shape[0] - df_temp2.shape[0]))
print("The shape of the remaining dataframe:", df_temp2.shape)

Unnamed: 0,Donor ID,Num Donations
0,00000ce845c00cbf0686c992fc369df4,1
1,00002783bc5d108510f3f9666c8b1edd,1


## Recurring donors for modeling

In [11]:
uniqDonorIDList = list(df_temp2['Donor ID']) # list of recurring donors
df_final = donorsDF_merged[donorsDF_merged['Donor ID'].isin(uniqDonorIDList)] # recurring donoros

del df_temp1, df_temp2, donorsDF_merged
df_final.shape

(3215610, 11)

In [12]:
def removeDuplicates(df, col):
    df = df.drop_duplicates(subset = [col]) # removes duplicates from the column `col`
    return df

In [13]:
df_final = removeDuplicates(df_final, 'Donation ID') # removes duplicate Donation IDs

# donation ID is the primary key here
df_final = df_final.merge(projectsDF, on = "Project ID", how = "inner") # adding projects to the final dataframe

df_final = df_final.merge(schoolsDF, on = "School ID", how = "inner") # adding schools to the final dataframe

(3215570, 11)

In [79]:
# discretizing (binning) the project cost

projCost = pd.DataFrame((df_final['Project Cost'] // 500 + 1) * 500).astype('int')

projCost[projCost['Project Cost'] > 3000] = 3000

projCost['Project Cost'] = projCost['Project Cost'].astype('str')
projCost[projCost['Project Cost'] == '3000'] = "3000+"

df_final['Project Cost'] = projCost
df_final = df_final.rename(columns = {"Project Cost": "Project Cost (Up to)"})

print(df_final.shape)

(3156862, 36)


Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
666004,b2555e38cb649277fdbddc4c1acd290c,Boston,Massachusetts,Yes,21,21f9f1a452b90b08572c051ec4dcd29d,1147c2d6a73b69ac7ebc16dbf029b315,Yes,45.0,7,...,Fully Funded,2017-02-09,Curtis Guild Elementary School,urban,81.0,Massachusetts,2128,East Boston,Suffolk,Boston Public School District
178482,770ab5fafe8342a82c8c9110692bc444,Charleston,South Carolina,Yes,294,bd5d1c0e5b3b58265726af22def13b81,570b4ed4da4d41ea1a8a0c08b2bb9f54,Yes,50.0,9,...,Fully Funded,2018-03-09,Burke High School,urban,85.0,South Carolina,29403,Charleston,Charleston,Charleston Co School District
2073681,700ddab4673f35076f7e95631dccd55d,Grand Forks,North Dakota,No,582,1a67dfb38989e49ad744aedc856184cd,8bcd99179c69a2693d8a3f47707fd75c,Yes,100.0,4,...,Fully Funded,2016-08-26,Emerado Public School,rural,70.0,North Dakota,58228,Emerado,Grand Forks,Emerado School District 127
749250,de8b8ed34d1800ca93f7ef480857d872,Cleveland,Ohio,No,441,599253729f90d605c122b2d64863a90e,cc19bc57da9d55329707291f797bcba9,No,1.0,1004,...,Fully Funded,2018-02-21,STEAM Academy,unknown,87.0,Missouri,63042,Hazelwood,St Louis,Ferguson-Florissant Sd R2
2152102,383df43a131fc39c8d8a59b2efcf1ebd,West Palm Beach,Florida,No,334,f25019f00401d05477abec0f427cae79,278d99217a3ffcc1ab5529110ebc6d99,Yes,50.0,44,...,Fully Funded,2018-03-27,Cholee Lake Elementary School,suburban,91.0,Florida,33413,Greenacres,Palm Beach,Palm Beach Co School District


In [82]:
#we are only considering all the donations that came to projects in California

df_final = removeDuplicates(df_final, 'Donation ID') # removes duplicate Donation IDs
print("The original dataframe size: ", df_final.shape)

df_final_cal = df_final[df_final['School State'] == 'California']
print("The updated dataframe size including only California: ", df_final_cal.shape)

df_final_cal.sample(2)

The original dataframe size:  (3156862, 36)
The updated dataframe size including only California:  (479056, 36)


Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
6964,8f4befc408d5fb279be9e32611e0e0b7,Palo Alto,California,No,943,233a24939a563317cf242eb271f0d157,7b8c28289fd207c74928b72d1ee03979,No,99.0,9,...,Fully Funded,2015-08-23,Costano School-49Ers Academy,suburban,92.0,California,94303,E Palo Alto,San Mateo,Ravenswood City Elem Sch Dist
2394914,8e802e1abaa9d6f0cb58c26bc16c2a54,Mission Viejo,California,No,926,c993ac9f7fd87570f3da7a695a47e4b7,abf819a6b7010d55ffcf5f37795decd4,Yes,25.0,2,...,Fully Funded,2016-09-29,La Paz Intermediate School,suburban,21.0,California,92691,Mission Viejo,Orange,Saddleback Valley Unified Sd


### Training and Test Split

In [83]:
columnsList = ['Donor ID', 'Donation Received Date', 'Donor City', 'Donor State',
       'Donor Is Teacher', 'Donor Zip', 'Project ID', 'Donation ID',
       'Donation Included Optional Donation', 'Donation Amount',
       'Donor Cart Sequence', 'School ID', 'Teacher ID',
       'Teacher Project Posted Sequence', 'Project Type', 'Project Title',
       'Project Essay', 'Project Short Description', 'Project Need Statement',
       'Project Subject Category Tree', 'Project Subject Subcategory Tree',
       'Project Grade Level Category', 'Project Resource Category',
       'Project Cost (Up to)', 'Project Posted Date', 'Project Expiration Date',
       'Project Current Status', 'Project Fully Funded Date', 'School Name',
       'School Metro Type', 'School Percentage Free Lunch', 'School State',
       'School Zip', 'School City', 'School County', 'School District']

df_final_cal = df_final_cal[columnsList]

df_final_cal = df_final_cal.sort_values(by = ['Donor ID', 'Donation Received Date']) # sorts dataframe by donors and their
                                                                            # first donation date


In [94]:
donorsKeep = df_final_cal.drop_duplicates(subset=['Donor ID'], keep='first') # keep just the first donation for each donor
donorsKeep = donorsKeep.sort_values(['Donation Received Date']) # sort the donation by donation date

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
1267688,cca23670ed0d03bddfcd9c16fa9dacb7,2012-12-17 11:22:56,Las Vegas,Nevada,No,891,05d7c3d0703441e6b16ba5dfdd878c47,e353b95ffda5f4650224b2f638721c26,Yes,100.0,...,Fully Funded,2013-02-28,Oliveira Elementary School,suburban,20.0,California,94536,Fremont,Alameda,Fremont Unif School District
2758782,8d345d4ded01a6c7f3b79e9686dfe741,2012-12-17 19:56:44,Oakland,California,Yes,946,4f921539c0a89df04b8a946fc31fb9b2,d2acc567eb8e09ccf87f85e8505d149f,Yes,20.0,...,Fully Funded,2013-08-29,Oakland Charter High School,urban,79.0,California,94601,Oakland,Alameda,Oakland Unified School Dist
2758777,45c45882b12febaf50ea7d25c464c33c,2012-12-19 18:23:19,San Bruno,California,No,940,4f921539c0a89df04b8a946fc31fb9b2,9cc5978ca69e102cea481a5db08a7f87,Yes,25.0,...,Fully Funded,2013-08-29,Oakland Charter High School,urban,79.0,California,94601,Oakland,Alameda,Oakland Unified School Dist
2758776,2ec1193b14119b158c1d3bfd67b2940b,2012-12-22 02:03:39,Oakland,California,No,946,4f921539c0a89df04b8a946fc31fb9b2,9d84a921ab8616dfad88ef15206bb2de,Yes,25.0,...,Fully Funded,2013-08-29,Oakland Charter High School,urban,79.0,California,94601,Oakland,Alameda,Oakland Unified School Dist
1704614,cc9cf018f0b68657c0ab7891df16c0fe,2013-01-01 14:35:18,San Francisco,California,Yes,941,7045b057d157ea53de41f5207ded655d,2e4ad8076f94da8b945e44c8fa8b9bb8,No,25.0,...,Fully Funded,2013-02-18,Lafayette Elementary School,urban,33.0,California,94121,San Francisco,San Francisco,San Francisco Unified Sch Dist


In [87]:
donationsTrainNum = int(round(0.7 * len(donorsKeep), 0)) # the top ~70% is the training data

donationsTrain = donorsKeep.iloc[:donationsTrainNum,:] # training data
donationsValid = donorsKeep.iloc[donationsTrainNum:,] # the bottom ~30% is the validation data

del donorsKeep

In [89]:
donorsTrainList = list(donationsTrain['Donor ID'])
donorsValidList = list(donationsValid['Donor ID'])

In [90]:
donationsTrainFinal = df_final_cal[df_final_cal['Donor ID'].isin(donorsTrainList)] # expanding the training data
donationsValidFinal = df_final_cal[df_final_cal['Donor ID'].isin(donorsValidList)] # expanding the validation data

donationsTrainFinal = donationsTrainFinal.sort_values(by = ['Donor ID', 'Donation Received Date']) 
donationsValidFinal = donationsValidFinal.sort_values(by = ['Donor ID', 'Donation Received Date'])

In [91]:
# remove duplicates
donationsTrainFinal = removeDuplicates(donationsTrainFinal, 'Donation ID') # donation ID is the primary key here
donationsValidFinal = removeDuplicates(donationsValidFinal, 'Donation ID')

In [92]:
print("Training data shape: \n", donationsTrainFinal.shape)
print("Validation data shape: \n", donationsValidFinal.shape)

print("Percentage of training data: ", donationsTrainFinal.shape[0] / len(df_final_cal))

Training data shape: 
 (379181, 36)
Validation data shape: 
 (99875, 36)
Percentage of training data:  0.7915170668982332


In [51]:
del donationsDF, donorsDF, projectsDF, schoolsDF, resourcesDF

## Building the recommendation system



In [101]:
# because we are doing content-based filtering, here are the features of the projects we are interested in:

ohFeatures = ['Project ID', 'Project Expiration Date', 'Project Cost (Up to)', 'Project Type','Project Subject Category Tree',
       'Project Grade Level Category', 'Project Resource Category',
       'Project Current Status',
       'School Metro Type', 'School State']

# features we will do one hot encoding on
dropCol = ['Project Cost (Up to)', 'Project Type', 'Project Subject Category Tree', 'Project Grade Level Category',
           'Project Resource Category', 'Project Current Status',
           'School Metro Type', 'School State']

def oheDataFrame(dataframe): # the dataframe could either be training or validation set
    projFeatures = dataframe[ohFeatures] # only considering the selected features (ohFeatures)
    projFeatures = removeDuplicates(projFeatures, 'Project ID')
    projFeatures = projFeatures.dropna() # drop the null values as well
    projFeatures = projFeatures.sort_values(by = ['Project ID'])

    featuresList = list(projFeatures.columns)
    
    enc = OHE(handle_unknown = 'ignore')
    enc.fit(projFeatures.iloc[:,2:]) # fit the one hot vector on our dataframe
    
    # convert the one-hot matrix into a dataframe
    oneHotMatrix = pd.DataFrame(enc.transform(projFeatures.iloc[:,2:]).toarray())
    projOHM = pd.concat([projFeatures.reset_index(drop = True), oneHotMatrix.reset_index(drop = True)], axis = 1)
    projOHM = projOHM.drop(dropCol, axis = 1) # dropping the categorical columns because they have already been vectorized

    # data dictionary to track the one hot matrix

    dataDict = list(enc.get_feature_names(dropCol)) # please note that any new input should strictly should the indexes
                                                    # mentioned in data dictionary

    
    # scale and normalize the dataset

    ohmDF = pd.DataFrame(StandardScaler().fit_transform(projOHM.iloc[:,2:]))
    ohmDFwProj = pd.concat([projOHM[['Project ID', 'Project Expiration Date']], ohmDF], axis = 1)
    ohmDFwProj['Project Expiration Date'] =  pd.to_datetime(ohmDFwProj['Project Expiration Date'])
    
    return ohmDFwProj

In [42]:
# this functions returns all the project that the donor has donated to
# input is a donor ID (a string)
# output is an array of all the donated projects

def donatedProjects(main_dataframe, donorID):
    df = main_dataframe[main_dataframe['Donor ID'] == donorID]
    df = df.sort_values(['Donation Received Date']) # sort the donated projects by date (in ascending order)
    df = df.drop_duplicates(subset = ['Project ID']) # remove repetitive donations on the same project
    donatedProjsList = np.array(df['Project ID']) # we only need the donated projects
    
    return donatedProjsList

# test the function here
# donatedProjects(donationsTrainFinal, 'd0dc7dcaae4b97cd465ec7b53c86cd6a')

In [44]:
# this function returns the one-hot encoded project features for a given project ID
# input is a project ID
# output is an array of the project features

def projectFeatures(projDF, projectID):
    feat = np.array(projDF[projDF['Project ID'] == projectID]) #projDF is the database of the project featuers
    
    return feat

In [45]:
def cosineSimilarity(projDF, obv, numRec): #only works for one observation (project) at a time

    project = obv[0][2:] # project features
    project = project.reshape(1,len(project)) # convert 1D to 2D array
    
    projectID = obv[0][0]
    projectDate = obv[0][1]
    
    # getting the cosine similarity between our feature matrix and test sample
    cosSim = cosine_similarity(projDF.iloc[:,2:], project) # one-hot encoded dataframe is the dataframe here
    recProjects = pd.concat([projDF.iloc[:,:2], pd.DataFrame(cosSim)], axis = 1)

    recProjects = recProjects[recProjects['Project Expiration Date'] > projectDate] # only recommend projects that expire after the donation date
    
    recProjects = recProjects.sort_values(by = 0, ascending = False) # sorting the similarity in descending order
    top10Rec = list(recProjects.head(numRec + 1).iloc[1:,0]) # the first column is the project ID
    
    return top10Rec
    

In [46]:
def randomRecommendation(dataframe, first_proj, numRec):
    feature = 'Project ID'
    projects = removeDuplicates(dataframe, feature)
    projects = list(projects['Project ID'])
    
    random = True
    while random:
        recProjs = rd.sample(projects, numRec)
        if first_proj not in recProjs:
            random = False
    
    return recProjs

In [47]:
def precisionRandomRec(main_dataframe, donorID, numRec):
    donated_proj = donatedProjects(main_dataframe, donorID) # all the projects the donor has donated to
                                                            # main dataframe here is either the final training or validation dataframe 
    
    first_donation = donated_proj[0] # first donation of the donor
    recProjs = randomRecommendation(main_dataframe, first_donation, numRec)
    hits = 0

    for i in donated_proj[1:]:
        if i in recProjs:
            hits += 1

    return hits

In [48]:
def getTopPrecision(main_dataframe, projectsDF, donorID, numRec):
    donated_proj = donatedProjects(main_dataframe, donorID) # all the projects the donor has donated to
                                                            # main dataframe here is either the final training or validation dataframe 
    
    first_donation = donated_proj[0] # first donation of the donor
    
    
    proj_features = projectFeatures(projectsDF, first_donation)
    hits = 0
    rec_projs = cosineSimilarity(projectsDF, proj_features, numRec) # recommendations based on the first donation

    for i in donated_proj[1:]:
        if i in rec_projs:
            hits += 1

    return hits

In [167]:
# this function retrieves the precision of training and validation data as well as random recommendation system

def getPrecision(donor_list, precision_type, main_dataframe, ohe_dataframe):
    # donor_list is the list of donors in either training or validation dataset
    # precision_type is the type for which the recommendation system is called
    # main_dataframe is the original training or validation df
    # ohe_dataframe is the OHE main_dataframe
    
    rd.seed(123)
    
    num_recs = [5, 10, 25, 50, 100] # number of recommendations
    training_datapoints = 1000
    hits = 0
    precision_results = {}
    
    for i in num_recs:
        for j in rd.sample(donor_list, training_datapoints):
            if precision_type == 'random':
                hits += precisionRandomRec(main_dataframe, j, i)
            elif precision_type == 'training' or precision_type == 'validation':
                hits += getTopPrecision(main_dataframe, ohe_dataframe, j, i)
    
        precision = hits / (i * training_datapoints)
        precision_results[i] = precision
    
    return precision_results

## Recommendation System Execution

In [170]:
# calling all the functions above

ohe_training_df = oheDataFrame(donationsTrainFinal) # projects in the training dataset
ohe_validation_df = oheDataFrame(donationsValidFinal) # projects in the validation dataset

# random recommendation system for the validation dataset
randomrec_precision_results = getPrecision(donorsValidList, 'random', donationsValidFinal, ohe_validation_df)

# precision results for training dataset
train_rec_precision_results = getPrecision(donorsTrainList, 'training', donationsTrainFinal, ohe_training_df)
valid_rec_precision_results = getPrecision(donorsValidList, 'validation', donationsValidFinal, ohe_validation_df)

{5: 0.0}

In [172]:
print(randomrec_precision_results)
print(train_rec_precision_results)
print(valid_rec_precision_results)

{5: 0, 10: 0, 25: 0, 50: 0, 100: 0}
{5: 0.0008, 10: 0.0011, 25: 0.00116, 50: 0.00092, 100: 0.00068}
{5: 0.002, 10: 0.0014, 25: 0.00088, 50: 0.00082, 100: 0.00061}


In [217]:
randomrec_results = pd.DataFrame.from_dict(randomrec_precision_results, orient = 'index').reset_index()
randomrec_results.columns = ['# of Recommendations', 'Random Recommendation Precision']

In [224]:
print("Training Set Results")
print("----" * 5)

rec_train_results = pd.DataFrame.from_dict(train_rec_precision_results, orient = 'index').reset_index()
rec_train_results.columns = ['# of Recommendations', 'Training Precision']
rec_train_results['Training Precision'] = rec_train_results['Training Precision'] * 100

rec_train_results['Training Lower Bound'] = rec_train_results['Training Precision'] - 1.96 * np.sqrt(rec_train_results['Training Precision'] * (1 - rec_train_results['Training Precision']) / (rec_train_results['# of Recommendations'] * 1000))
rec_train_results['Training Upper Bound'] = rec_train_results['Training Precision'] + 1.96 * np.sqrt(rec_train_results['Training Precision'] * (1 - rec_train_results['Training Precision']) / (rec_train_results['# of Recommendations'] * 1000))

rec_train_results

Training Set Results
--------------------


Unnamed: 0,# of Recommendations,Training Precision,Training Lower Bound,Training Upper Bound
0,5,0.08,0.07248,0.08752
1,10,0.11,0.103867,0.116133
2,25,0.116,0.11203,0.11997
3,50,0.092,0.089467,0.094533
4,100,0.068,0.06644,0.06956


In [225]:
print("Validation Set Results")
print("----" * 5)

rec_val_results = pd.DataFrame.from_dict(valid_rec_precision_results, orient = 'index').reset_index()
rec_val_results.columns = ['# of Recommendations', 'Validation Precision']
rec_val_results['Validation Precision'] = rec_val_results['Validation Precision'] * 100

rec_val_results['Validation Lower Bound'] = rec_val_results['Validation Precision'] - 1.96 * np.sqrt(rec_val_results['Validation Precision'] * (1 - rec_val_results['Validation Precision']) / (rec_val_results['# of Recommendations'] * 1000))
rec_val_results['Validation Upper Bound'] = rec_val_results['Validation Precision'] + 1.96 * np.sqrt(rec_val_results['Validation Precision'] * (1 - rec_val_results['Validation Precision']) / (rec_val_results['# of Recommendations'] * 1000))

rec_val_results

Validation Set Results
--------------------


Unnamed: 0,# of Recommendations,Validation Precision,Validation Lower Bound,Validation Upper Bound
0,5,0.2,0.188913,0.211087
1,10,0.14,0.133199,0.146801
2,25,0.088,0.084488,0.091512
3,50,0.082,0.079595,0.084405
4,100,0.061,0.059517,0.062483


In [232]:
results_final = randomrec_results.merge(rec_train_results, on = '# of Recommendations').merge(rec_val_results, on = '# of Recommendations')
results_final

Unnamed: 0,# of Recommendations,Random Recommendation Precision,Training Precision,Training Lower Bound,Training Upper Bound,Validation Precision,Validation Lower Bound,Validation Upper Bound
0,5,0,0.08,0.07248,0.08752,0.2,0.188913,0.211087
1,10,0,0.11,0.103867,0.116133,0.14,0.133199,0.146801
2,25,0,0.116,0.11203,0.11997,0.088,0.084488,0.091512
3,50,0,0.092,0.089467,0.094533,0.082,0.079595,0.084405
4,100,0,0.068,0.06644,0.06956,0.061,0.059517,0.062483
