# Recommendation System - Model 1

## Import packages and dataframes

In [None]:
import numpy as np
import pandas as pd
import random as rd
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Seed Python's built-in `random` module (imported as `rd`) for reproducibility.
# NOTE(review): no cell visible in this notebook draws from `rd`; confirm the seed is still needed.
rd.seed(123)

In [None]:
# Load every raw CSV export into its own dataframe.
# The file names below are listed in the same order as the target names.
(donationsDF,
 donorsDF,
 projectsDF,
 schoolsDF,
 resourcesDF) = (
    pd.read_csv(csv_name)
    for csv_name in ('Donations.csv', 'Donors.csv', 'Projects.csv',
                     'Schools.csv', 'Resources.csv')
)

## Snapshot of the dataframes

In [3]:
# Preview the first two donation records.
donationsDF.head(2)

Unnamed: 0,Project ID,Donation ID,Donor ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,Donation Received Date
0,000009891526c0ade7180f8423792063,688729120858666221208529ee3fc18e,1f4b5b6e68445c6c4a0509b3aca93f38,No,178.37,11,2016-08-23 13:15:57
1,000009891526c0ade7180f8423792063,dcf1071da3aa3561f91ac689d1f73dee,4aaab6d244bf3599682239ed5591af8a,Yes,25.0,2,2016-06-06 20:05:23


In [4]:
# Preview the first two donor records.
donorsDF.head(2)

Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip
0,00000ce845c00cbf0686c992fc369df4,Evanston,Illinois,No,602
1,00002783bc5d108510f3f9666c8b1edd,Appomattox,other,No,245


In [5]:
# Preview the first two project records.
projectsDF.head(2)

Unnamed: 0,Project ID,School ID,Teacher ID,Teacher Project Posted Sequence,Project Type,Project Title,Project Essay,Project Short Description,Project Need Statement,Project Subject Category Tree,Project Subject Subcategory Tree,Project Grade Level Category,Project Resource Category,Project Cost,Project Posted Date,Project Expiration Date,Project Current Status,Project Fully Funded Date
0,7685f0265a19d7b52a470ee4bac883ba,e180c7424cb9c68cb49f141b092a988f,4ee5200e89d9e2998ec8baad8a3c5968,25,Teacher-Led,Stand Up to Bullying: Together We Can!,Did you know that 1-7 students in grades K-12 ...,Did you know that 1-7 students in grades K-12 ...,"My students need 25 copies of ""Bullying in Sch...",Applied Learning,"Character Education, Early Development",Grades PreK-2,Technology,361.8,2013-01-01,2013-05-30,Fully Funded,2013-01-11
1,f9f4af7099061fb4bf44642a03e5c331,08b20f1e2125103ed7aa17e8d76c71d4,cca2d1d277fb4adb50147b49cdc3b156,3,Teacher-Led,Learning in Color!,"Help us have a fun, interactive listening cent...","Help us have a fun, interactive listening cent...","My students need a listening center, read alon...","Applied Learning, Literacy & Language","Early Development, Literacy",Grades PreK-2,Technology,512.85,2013-01-01,2013-05-31,Expired,


## Remove one-time donors

In [183]:
# Attach each donation to its donor's attributes (inner join on 'Donor ID').
# 'Donation ID' is intended to be the primary key of the merged frame.
donorsDF_merged = pd.merge(donorsDF, donationsDF, how='inner', on='Donor ID')

In [184]:
# Preview one merged donor/donation row.
donorsDF_merged.head(1)

Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,Donation Received Date
0,00000ce845c00cbf0686c992fc369df4,Evanston,Illinois,No,602,5bab6101eed588c396a59f6bd64274b6,598691d82438952e2e4f3ed50531fd2a,Yes,50.0,1,2013-12-17 21:47:14


In [185]:
# Number of donation rows per donor, as a two-column frame
# ('Donor ID', 'Num Donations').
df_temp1 = (
    donorsDF_merged
    .groupby(by='Donor ID')
    .size()
    .reset_index(name='Num Donations')
)

df_temp1.head(2)

Unnamed: 0,Donor ID,Num Donations
0,00000ce845c00cbf0686c992fc369df4,1
1,00002783bc5d108510f3f9666c8b1edd,1


In [186]:
# Keep only donors with more than one donation, i.e. drop the one-time donors.
df_temp2 = df_temp1.loc[df_temp1['Num Donations'].gt(1)]

print("We got rid of %d single-donors." % (len(df_temp1) - len(df_temp2)))

We got rid of 1466290 single-donors.


## Building dataframe for modeling

In [204]:
# IDs of the recurring donors (those left after removing one-time donors).
uniqDonorIDList = df_temp2['Donor ID'].tolist()

# Restrict the merged donor/donation frame to donations made by recurring donors.
df_final = donorsDF_merged.loc[donorsDF_merged['Donor ID'].isin(uniqDonorIDList)]

df_final.shape

(3215610, 11)

In [205]:
# Drop repeated 'Donation ID' rows, keeping the first occurrence of each.
df_final = df_final.drop_duplicates(subset='Donation ID', keep='first')

df_final.shape

(3215570, 11)

In [208]:
# Add the project attributes to every donation (inner join on 'Project ID').
df_final = pd.merge(df_final, projectsDF, how='inner', on='Project ID')

df_final.head(2)

Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,...,Project Need Statement,Project Subject Category Tree,Project Subject Subcategory Tree,Project Grade Level Category,Project Resource Category,Project Cost,Project Posted Date,Project Expiration Date,Project Current Status,Project Fully Funded Date
0,00002d44003ed46b066607c5455a999a,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,7831f66a25935db8b7424170355edabf,No,26.0,10,...,My students need books to inspire them to be a...,"History & Civics, Literacy & Language","Civics & Government, Literacy",Grades 6-8,Books,373.53,2017-04-01,2017-08-01,Fully Funded,2017-05-12
1,00002d44003ed46b066607c5455a999a,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,ee15bb7f179142fdff3f24cff8e88483,No,100.0,9,...,My students need books to inspire them to be a...,"History & Civics, Literacy & Language","Civics & Government, Literacy",Grades 6-8,Books,373.53,2017-04-01,2017-08-01,Fully Funded,2017-05-12


In [210]:
df_final = df_final.merge(schoolsDF, on = "School ID", how = "inner") # adding schools to the final dataframe

df_final.head(2)

Unnamed: 0,Donor ID,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
0,00002d44003ed46b066607c5455a999a,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,7831f66a25935db8b7424170355edabf,No,26.0,10,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
1,00002d44003ed46b066607c5455a999a,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,ee15bb7f179142fdff3f24cff8e88483,No,100.0,9,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District


### Training and validation split

In [213]:
# Column order for the modelling frame: donor ID and donation timestamp first,
# followed by the remaining donor, donation, project and school attributes.
columnsList = [
    'Donor ID', 'Donation Received Date',
    'Donor City', 'Donor State', 'Donor Is Teacher', 'Donor Zip',
    'Project ID', 'Donation ID', 'Donation Included Optional Donation',
    'Donation Amount', 'Donor Cart Sequence',
    'School ID', 'Teacher ID', 'Teacher Project Posted Sequence',
    'Project Type', 'Project Title', 'Project Essay',
    'Project Short Description', 'Project Need Statement',
    'Project Subject Category Tree', 'Project Subject Subcategory Tree',
    'Project Grade Level Category', 'Project Resource Category',
    'Project Cost', 'Project Posted Date', 'Project Expiration Date',
    'Project Current Status', 'Project Fully Funded Date',
    'School Name', 'School Metro Type', 'School Percentage Free Lunch',
    'School State', 'School Zip', 'School City', 'School County',
    'School District',
]

# Reorder the columns, then sort so each donor's donations are contiguous and
# ordered by received date (the donor's first donation comes first).
# NOTE(review): the CSVs are read without date parsing, so this ordering is
# lexicographic; it matches chronological order for the zero-padded
# 'YYYY-MM-DD HH:MM:SS' values shown in the previews -- confirm all rows follow it.
df_final = (
    df_final
    .loc[:, columnsList]
    .sort_values(by=['Donor ID', 'Donation Received Date'])
)


In [214]:
# One row per donor: df_final is sorted by donor and date, so keeping the first
# row per 'Donor ID' keeps that donor's first donation. The donors are then
# ordered by the date of that first donation.
donorsKeep = (
    df_final
    .drop_duplicates(subset='Donor ID', keep='first')
    .sort_values(by='Donation Received Date')
)

donorsKeep.head(5)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
2233258,fcf96e729bf1b03dfbd8f07a3b4ee173,2012-10-08 13:53:00,Staten Island,New York,Yes,103,3cef9c6b38eb98b50ab9d82616a95fef,fdb6b2832434cba2fc52452829ca28d1,Yes,25.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2233253,ea1db63aa514055a1faa005e4f37df5f,2012-10-08 18:44:11,Staten Island,New York,No,103,3cef9c6b38eb98b50ab9d82616a95fef,894e5893a7f544e283284e556b3524fc,No,50.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2233255,f5c13e4c10889c2d171d955e9908859e,2012-10-08 19:34:08,Staten Island,New York,No,103,3cef9c6b38eb98b50ab9d82616a95fef,5ccff2917509fe2f4e6b163f35952f59,No,10.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2757295,2f46f2a396311a0b73d7550a8c945211,2012-10-09 16:54:45,,New York,No,100,2a84cb3c1439994b368f4b51dc099678,60cf97b0d67b92e22b00f8f9914fb1f0,No,50.0,...,Fully Funded,2013-02-18,Confluence Prep Academy,urban,90.0,Missouri,63103,Saint Louis,St Louis City,Missouri Dept Of Education
2757319,f9f9312786079a621fec74f35ea75606,2012-10-10 22:24:21,Saint Louis,Missouri,Yes,631,2a84cb3c1439994b368f4b51dc099678,0e792579fbe9b81f04e2fab8eda822d1,Yes,20.0,...,Fully Funded,2013-02-18,Confluence Prep Academy,urban,90.0,Missouri,63103,Saint Louis,St Louis City,Missouri Dept Of Education


In [215]:
# Size of the training split: the top ~80% of donors in donorsKeep.
donationsTrainNum = int(round(len(donorsKeep) * 0.8))

In [216]:
# Positional split of donorsKeep (ordered by first donation date):
# the top ~80% of donors go to training, the bottom ~20% to validation.
donationsTrain = donorsKeep.iloc[:donationsTrainNum]
donationsValid = donorsKeep.iloc[donationsTrainNum:]

In [356]:
# Preview the earliest first-donations in the training split.
donationsTrain.head(5)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
2233258,fcf96e729bf1b03dfbd8f07a3b4ee173,2012-10-08 13:53:00,Staten Island,New York,Yes,103,3cef9c6b38eb98b50ab9d82616a95fef,fdb6b2832434cba2fc52452829ca28d1,Yes,25.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2233253,ea1db63aa514055a1faa005e4f37df5f,2012-10-08 18:44:11,Staten Island,New York,No,103,3cef9c6b38eb98b50ab9d82616a95fef,894e5893a7f544e283284e556b3524fc,No,50.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2233255,f5c13e4c10889c2d171d955e9908859e,2012-10-08 19:34:08,Staten Island,New York,No,103,3cef9c6b38eb98b50ab9d82616a95fef,5ccff2917509fe2f4e6b163f35952f59,No,10.0,...,Fully Funded,2013-02-11,IS 51 Edwin Markham,urban,77.0,New York,10302,New York City,Richmond (Staten Island),New York City Dept Of Ed
2757295,2f46f2a396311a0b73d7550a8c945211,2012-10-09 16:54:45,,New York,No,100,2a84cb3c1439994b368f4b51dc099678,60cf97b0d67b92e22b00f8f9914fb1f0,No,50.0,...,Fully Funded,2013-02-18,Confluence Prep Academy,urban,90.0,Missouri,63103,Saint Louis,St Louis City,Missouri Dept Of Education
2757319,f9f9312786079a621fec74f35ea75606,2012-10-10 22:24:21,Saint Louis,Missouri,Yes,631,2a84cb3c1439994b368f4b51dc099678,0e792579fbe9b81f04e2fab8eda822d1,Yes,20.0,...,Fully Funded,2013-02-18,Confluence Prep Academy,urban,90.0,Missouri,63103,Saint Louis,St Louis City,Missouri Dept Of Education


In [217]:
# IDs of the donors assigned to the training split.
donorsTrainList = donationsTrain['Donor ID'].tolist()

In [218]:
# Expand the training split from one row per donor to every donation made by
# those donors.
donationsTrainFinal = df_final.loc[df_final['Donor ID'].isin(donorsTrainList)]

In [355]:
# Order the training donations by donor, then by received date within each donor.
donationsTrainFinal = donationsTrainFinal.sort_values(
    by=['Donor ID', 'Donation Received Date']
)

donationsTrainFinal.head(10)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
9,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,Winton,California,Yes,953,e2beb818569f66adaa4ced21ca299ac6,08ed72ce14c548e8131a7dd7b8561988,Yes,10.0,...,Fully Funded,2016-12-26,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
500,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,Winton,California,Yes,953,eb6d91cbeab5037ca2f45fc3f6a4de8c,158f00637416b7c41fcde429b10c3c41,Yes,15.51,...,Fully Funded,2017-01-16,Clara B Ford Academy,suburban,95.0,Michigan,48127,Dearborn Hts,Wayne,Michigan Dept Of Education
164,00002d44003ed46b066607c5455a999a,2017-01-16 14:20:10,Winton,California,Yes,953,64f54f1efcbeb986114a7a13e6b27257,e3162fdec64ed91de1aa096cf75ba032,Yes,100.0,...,Expired,,Donn B Chenoweth Elementary School,urban,57.0,California,95340,Merced,Merced,Merced City School District
175,00002d44003ed46b066607c5455a999a,2017-01-16 14:26:19,Winton,California,Yes,953,dfdaf35bb33f9c105530c82984960ff3,45ffd222223dfb2ed7728014eafe328c,Yes,9.69,...,Fully Funded,2017-01-16,Shoally Creek Elementary,unknown,61.0,South Carolina,29316,Boiling Spgs,Spartanburg,Spartanburg School District 2
25,00002d44003ed46b066607c5455a999a,2017-01-16 15:46:57,Winton,California,Yes,953,2f7996f08052785e9b146f72c0c4990d,c73207489356a7bcf3ef0b8660c61b04,Yes,13.75,...,Fully Funded,2017-01-16,Portland Elementary School,urban,92.0,Kentucky,40212,Louisville,Jefferson,Jefferson Co School District
3,00002d44003ed46b066607c5455a999a,2017-02-01 18:53:25,Winton,California,Yes,953,e09933470f4256cc2643341c1d299e55,93c0af8b821e432857e5a63687524dd9,No,150.0,...,Fully Funded,2017-02-19,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
4,00002d44003ed46b066607c5455a999a,2017-02-03 14:20:57,Winton,California,Yes,953,e09933470f4256cc2643341c1d299e55,23d388c20fb9ba1a714632d51aa8ad32,No,10.0,...,Fully Funded,2017-02-19,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
1,00002d44003ed46b066607c5455a999a,2017-04-01 01:08:50,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,ee15bb7f179142fdff3f24cff8e88483,No,100.0,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
0,00002d44003ed46b066607c5455a999a,2017-05-02 12:34:09,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,7831f66a25935db8b7424170355edabf,No,26.0,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
171,00002d44003ed46b066607c5455a999a,2017-10-18 14:34:11,Winton,California,Yes,953,c5821d32012efd7df4f6fa12e230e991,615d39f9c40cd3137a9854c29ad0dbe2,No,100.0,...,Fully Funded,2017-12-10,ADA Givens Elementary School,urban,81.0,California,95340,Merced,Merced,Merced City School District


In [220]:
# IDs of the donors assigned to the validation split.
donorsValidList = donationsValid['Donor ID'].tolist()

# Expand the validation split to every donation by those donors, ordered by
# donor and then by received date.
donationsValidFinal = (
    df_final
    .loc[df_final['Donor ID'].isin(donorsValidList)]
    .sort_values(by=['Donor ID', 'Donation Received Date'])
)

donationsValidFinal.head(10)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
1523,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:19,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,0e7de979325e84ad03163e26d59c075a,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1515,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,4768c31e21e89f79ada2a05e7b1336df,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1518,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,7e436d1b26e60246a9bd81ac059c6e9b,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1519,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,698df43826119cf99d9f24488039dd64,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1521,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,6a618527b395daf5830e41ea4417341d,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1522,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,aca915b2420c0b1851da1e42642e9e27,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1516,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:21,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,7001340658440b07f622d175c6991179,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1517,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:57:41,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,dc010aa5685ccfb8d6db4098730bae57,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1525,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:58:43,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,bfc1f1fc38ff8b57894f283afb8fc29d,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1520,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:59:32,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,f0ae22e51a49d1f54b5d50a0b0a11910,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd


In [221]:
# Report the (rows, columns) size of each expanded split.
print("Training data shape: \n %s" % (donationsTrainFinal.shape,))
print("Validation data shape: \n %s" % (donationsValidFinal.shape,))

Training data shape: 
 (2711805, 36)
Validation data shape: 
 (445059, 36)


## One-Hot Vectorization of Features

In [227]:
# Find the training columns that contain at least one null, then count the
# nulls in each of them.
trainNullCol = donationsTrainFinal.columns[donationsTrainFinal.isna().any()]
donationsTrainFinal.loc[:, trainNullCol].isna().sum()

Donor City                          119127
Donor Zip                            69242
Project Title                           18
Project Essay                            6
Project Short Description                7
Project Subject Category Tree           76
Project Subject Subcategory Tree        76
Project Resource Category               89
Project Expiration Date                  1
Project Fully Funded Date           254865
School Percentage Free Lunch          8137
School City                          16736
School County                           17
dtype: int64

In [251]:
# Content-based filtering works on item attributes, so these are the project
# (and school) features used to describe each project.
# Order matters: later cells slice positionally and rely on 'Project ID' and
# 'Project Cost' being the first two entries.
ohFeatures = [
    'Project ID',
    'Project Cost',
    'Project Type',
    'Project Subject Category Tree',
    'Project Grade Level Category',
    'Project Resource Category',
    'Project Current Status',
    'School Metro Type',
    'School State',
]

In [324]:
# One row per project ('Project ID' must act as the primary key), with any
# project that has a null in the selected features removed.
projFeatures = (
    donationsTrainFinal[ohFeatures]
    .drop_duplicates(subset='Project ID')
    .dropna()
)

print(projFeatures.shape)

(748856, 9)
(748835, 9)


In [326]:
# Sanity check: after dropna() no column of projFeatures should contain nulls,
# so the per-column null counts below are expected to be empty.
trainNullCol = projFeatures.columns[projFeatures.isna().any()]
projFeatures.loc[:, trainNullCol].isna().sum()

Series([], dtype: float64)

In [329]:
# One-hot encoder for the categorical project features; handle_unknown='ignore'
# is set so categories not seen during fit do not raise at transform time.
enc = OHE(handle_unknown='ignore')
# Columns 0 and 1 of projFeatures are 'Project ID' and 'Project Cost' (see the
# order of ohFeatures), so the slice [:, 2:] selects only the categorical columns.
enc.fit(projFeatures.iloc[:,2:]) # fit the one hot vector on our dataframe

OneHotEncoder(handle_unknown='ignore')

In [330]:
# One array of learned categories per encoded input column, in fit order.
enc.categories_ # data dictionary of the OHM

[array(['Professional Development', 'Student-Led', 'Teacher-Led'],
       dtype=object),
 array(['Applied Learning', 'Applied Learning, Health & Sports',
        'Applied Learning, History & Civics',
        'Applied Learning, Literacy & Language',
        'Applied Learning, Math & Science',
        'Applied Learning, Music & The Arts',
        'Applied Learning, Special Needs',
        'Applied Learning, Warmth, Care & Hunger', 'Health & Sports',
        'Health & Sports, Applied Learning',
        'Health & Sports, History & Civics',
        'Health & Sports, Literacy & Language',
        'Health & Sports, Math & Science',
        'Health & Sports, Music & The Arts',
        'Health & Sports, Special Needs',
        'Health & Sports, Warmth, Care & Hunger', 'History & Civics',
        'History & Civics, Applied Learning',
        'History & Civics, Health & Sports',
        'History & Civics, Literacy & Language',
        'History & Civics, Math & Science',
        'History & Civics,

In [331]:
# Encode the categorical columns of projFeatures and materialise the sparse
# result as a dense dataframe (one column per category, default integer labels).
oneHotMatrix = pd.DataFrame(
    data=enc.transform(projFeatures.iloc[:, 2:]).toarray()
)

In [332]:
# this is what the OHM (one-hot matrix) looks like:
# one row per project in projFeatures, one 0/1 column per encoded category

oneHotMatrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,125,126,127,128,129,130,131,132,133,134
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
748830,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
748831,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
748832,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
748833,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Merging one-hot matrix with the dataframe

In [334]:
# Place the raw project features and their one-hot columns side by side.
# Both indexes are reset first so rows are aligned by position, not by the
# original (non-contiguous) projFeatures index.
projOHM = pd.concat(
    objs=[
        projFeatures.reset_index(drop=True),
        oneHotMatrix.reset_index(drop=True),
    ],
    axis=1,
)

projOHM.head(1)


Unnamed: 0,Project ID,Project Cost,Project Type,Project Subject Category Tree,Project Grade Level Category,Project Resource Category,Project Current Status,School Metro Type,School State,0,...,125,126,127,128,129,130,131,132,133,134
0,e2beb818569f66adaa4ced21ca299ac6,555.28,Teacher-Led,"History & Civics, Literacy & Language",Grades 6-8,Books,Fully Funded,urban,California,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [335]:
# The categorical source columns that were one-hot encoded; their raw versions
# are no longer needed once the encoded columns are in place.
dropCol = [
    'Project Type',
    'Project Subject Category Tree',
    'Project Grade Level Category',
    'Project Resource Category',
    'Project Current Status',
    'School Metro Type',
    'School State',
]

In [336]:
# 'Project Cost' is kept as a numeric feature: only the categorical columns
# were vectorized, and the cost column sits side by side with the one-hot
# columns. Remove the raw categorical columns now that they are encoded.
projOHM = projOHM.drop(columns=dropCol)

projOHM.head(1)

Unnamed: 0,Project ID,Project Cost,0,1,2,3,4,5,6,7,...,125,126,127,128,129,130,131,132,133,134
0,e2beb818569f66adaa4ced21ca299ac6,555.28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [337]:
# Data dictionary for the one-hot matrix: entry i names one-hot column i as
# "<source column>_<category>".
# Any new input must strictly follow the column order given in this data dictionary.
#
# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `get_feature_names_out`. Prefer the new method and
# fall back to the old one so the cell also runs on older installations.
if hasattr(enc, 'get_feature_names_out'):
    dataDict = enc.get_feature_names_out(dropCol)
else:
    dataDict = enc.get_feature_names(dropCol)

## Building the recommendation system

In [362]:
# Standardize every feature column (zero mean, unit variance): column 0 of the
# result is the scaled 'Project Cost', the remaining columns are the scaled
# one-hot features.
#
# The feature slice of projOHM has mixed column labels (the string
# 'Project Cost' plus integer one-hot labels). Recent scikit-learn versions
# reject DataFrame input whose column names are not all strings, so the values
# are handed over as a plain NumPy array; the numbers are unchanged.
ohmDF = pd.DataFrame(StandardScaler().fit_transform(projOHM.iloc[:, 1:].to_numpy()))

# Re-attach the project IDs; projOHM and ohmDF share the same 0..n-1 row index.
ohmDFwProj = pd.concat([projOHM['Project ID'], ohmDF], axis=1)

ohmDFwProj.head(5)  # each numeric column here is one scaled feature

Unnamed: 0,Project ID,0,1,2,3,4,5,6,7,8,...,126,127,128,129,130,131,132,133,134,135
0,e2beb818569f66adaa4ced21ca299ac6,-0.119033,-0.096627,-0.08812,0.131332,-0.199033,-0.077693,-0.054146,-0.169561,-0.108255,...,-0.05082,-0.130419,-0.257288,-0.11009,-0.035414,-0.147628,-0.151159,-0.0696,-0.124467,-0.030785
1,eb6d91cbeab5037ca2f45fc3f6a4de8c,-0.362253,-0.096627,-0.08812,0.131332,-0.199033,-0.077693,-0.054146,-0.169561,-0.108255,...,-0.05082,-0.130419,-0.257288,-0.11009,-0.035414,-0.147628,-0.151159,-0.0696,-0.124467,-0.030785
2,64f54f1efcbeb986114a7a13e6b27257,-0.078427,-0.096627,-0.08812,0.131332,-0.199033,-0.077693,-0.054146,-0.169561,-0.108255,...,-0.05082,-0.130419,-0.257288,-0.11009,-0.035414,-0.147628,-0.151159,-0.0696,-0.124467,-0.030785
3,dfdaf35bb33f9c105530c82984960ff3,-0.15999,-0.096627,-0.08812,0.131332,-0.199033,-0.077693,-0.054146,-0.169561,-0.108255,...,-0.05082,-0.130419,-0.257288,-0.11009,-0.035414,-0.147628,-0.151159,-0.0696,-0.124467,-0.030785
4,2f7996f08052785e9b146f72c0c4990d,-0.299805,-0.096627,-0.08812,0.131332,-0.199033,-0.077693,-0.054146,-0.169561,-0.108255,...,-0.05082,-0.130419,-0.257288,-0.11009,-0.035414,-0.147628,-0.151159,-0.0696,-0.124467,-0.030785


## Testing the recommendation system with a sample row

In [408]:
# Sample query for the recommender: the scaled feature vector of one specific
# project, selected by its 'Project ID' (the ID column itself is dropped).
testRow = np.array(
    ohmDFwProj
    .loc[ohmDFwProj['Project ID'].eq('3cef9c6b38eb98b50ab9d82616a95fef')]
    .iloc[:, 1:]
)

testRow

array([[ 2.63045891e+00, -9.66267932e-02, -8.81200031e-02,
         1.31332116e-01, -1.99032647e-01, -7.76930099e-02,
        -5.41458916e-02, -1.69560979e-01, -1.08254662e-01,
        -9.97530861e-02, -1.37792314e-01, -5.89252110e-03,
        -2.05421870e-01, -3.86680503e-02, -1.70254945e-02,
        -6.70452556e-02, -3.75968233e-02, -3.63657089e-02,
        -9.62600549e-02, -8.33342793e-03,  7.31629304e+00,
        -2.34054926e-02, -1.24471398e-02, -1.35730204e-01,
        -6.85559550e-02, -6.63686365e-02, -4.68638110e-02,
        -2.31120298e-03, -5.48744187e-01, -8.32001624e-02,
        -2.66132927e-02, -1.09153905e-01, -3.93863256e-01,
        -1.45679975e-01, -1.97880439e-01, -5.89252110e-03,
        -4.18610681e-01, -1.18002386e-01, -5.86381993e-02,
        -8.54647794e-02, -1.63091207e-01, -1.28923007e-01,
        -1.29403910e-01, -5.03720056e-03, -2.42202285e-01,
        -1.31262055e-02, -1.38202666e-02, -1.71037959e-02,
        -4.03796508e-02, -2.83063777e-03, -1.78663935e-0

In [409]:
# Cosine similarity between every project's scaled feature vector and the
# sample query; the result has one row per project and a single column.
test2 = cosine_similarity(X=ohmDF, Y=testRow)

test2

array([[ 0.05133926],
       [ 0.00629368],
       [-0.04958605],
       ...,
       [-0.02853242],
       [-0.05765863],
       [-0.03658352]])

In [394]:
# Pair each similarity score with its project ID (rows of projOHM and test2 are
# in the same order) and rank the projects from most to least similar.
test3 = pd.concat(
    [
        projOHM.iloc[:, :1],
        pd.DataFrame(test2, columns=['Cosine Similarity']),
    ],
    axis=1,
).sort_values(by='Cosine Similarity', ascending=False)

# Rows 2..11 of the ranking are taken as the top-10 recommendations.
# NOTE(review): this assumes the query project itself is ranked first; if another
# project ties at similarity 1.0 the query could land inside the top 10 -- confirm.
top10Rec = test3['Project ID'].iloc[1:11].tolist()

test3.head(11)

Unnamed: 0,Project ID,Cosine Similarity
170271,3cef9c6b38eb98b50ab9d82616a95fef,1.0
114313,431edad02a4797f303dfbcd18493e2a7,0.998557
244906,a24d6991c07ecb939b30f2855438da0a,0.99828
67210,7226568095dacc94be06cf76753a2391,0.994437
80281,c83f962d03001868d5420c8b0ba36065,0.990401
16870,c6884836e7923417f063e31c0f416baa,0.989306
696315,4f993a5ac32fcab50223127568d6e3d8,0.986821
616590,8b637975035d004fd64810c887542c75,0.985751
156567,b24c99ab36c3736acb2526de682b1561,0.979065
156566,85246dcb4b016a3ea68607fcf4c11ceb,0.979065


In [366]:
# Spot check, part 1: raw features of the query project, for comparison against
# a recommended project (which should share the same features apart from cost).
projFeatures.loc[projFeatures['Project ID'].eq('3cef9c6b38eb98b50ab9d82616a95fef')]

Unnamed: 0,Project ID,Project Cost,Project Type,Project Subject Category Tree,Project Grade Level Category,Project Resource Category,Project Current Status,School Metro Type,School State
2233245,3cef9c6b38eb98b50ab9d82616a95fef,3378.82,Teacher-Led,History & Civics,Grades 6-8,Trips,Fully Funded,urban,New York


In [407]:
# Spot check, part 2: raw features of one of the recommended projects.
projFeatures.loc[projFeatures['Project ID'].eq('a24d6991c07ecb939b30f2855438da0a')]

Unnamed: 0,Project ID,Project Cost,Project Type,Project Subject Category Tree,Project Grade Level Category,Project Resource Category,Project Current Status,School Metro Type,School State
2402926,a24d6991c07ecb939b30f2855438da0a,2662.35,Teacher-Led,History & Civics,Grades 6-8,Trips,Fully Funded,urban,New York


In [377]:
# Every project this donor donated to, one entry per donation (repeats included).
donor_donated_projs = df_final.loc[
    df_final['Donor ID'].eq('fcf96e729bf1b03dfbd8f07a3b4ee173'), 'Project ID'
].tolist()

In [406]:
# Count how many of the donor's distinct projects appear among the top-10
# recommendations.
#
# donationCount holds each donated project once, in first-seen order, so
# repeated donations to the same project are not counted twice.
donationCount = list(dict.fromkeys(donor_donated_projs))

# A set intersection avoids the quadratic cost of repeated list membership tests.
hits = len(set(donationCount) & set(top10Rec))

# The hit count is de-duplicated, so report it against the number of distinct
# projects rather than the raw number of donations.
print("Our model got %d hits out of %d projects that the donor has donated." % (hits, len(donationCount)))

Our model got 1 hits out of 6 projects that the donor has donated
