# Collaborative Recommendation System

## Import packages and dataframes

In [25]:
import numpy as np
import pandas as pd
import random as rd
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)

rd.seed(123)

In [2]:
# import all the files

donationsDF = pd.read_csv('Donations.csv')
donorsDF = pd.read_csv('Donors.csv')
projectsDF = pd.read_csv('Projects.csv')
schoolsDF = pd.read_csv('Schools.csv')
resourcesDF = pd.read_csv('Resources.csv')

In [3]:
acs_df = pd.read_csv('zip_wise_acs_2015.csv')

## Snapshot of the dataframes

In [None]:
donationsDF.head(2)

In [None]:
donorsDF['Donor Zip'].sort_values

In [None]:
projectsDF.head(2)

## Remove one-time donors

In [3]:
donorsDF_merged = donorsDF.merge(donationsDF, on = 'Donor ID', how = 'inner') # donation ID is the primary key here
# Get the count of donations a donor has made
df_temp1 = donorsDF_merged.groupby( ['Donor ID']).size().reset_index(name='Num Donations')
df_temp2 = df_temp1[df_temp1['Num Donations'] > 1] # removing all the single-donors

print("We got rid of %d single-donors." % (df_temp1.shape[0] - df_temp2.shape[0]))

We got rid of 1466290 single-donors.


## Building dataframe for modeling

In [4]:
uniqDonorIDList = list(df_temp2['Donor ID']) # list of recurring donors
df_final = donorsDF_merged[donorsDF_merged['Donor ID'].isin(uniqDonorIDList)]
df_final = df_final.drop_duplicates(subset = ['Donation ID']) # removes duplicate Donation IDs
df_final = df_final.merge(projectsDF, on = "Project ID", how = "inner") # adding projects to the final dataframe
df_final = df_final.merge(schoolsDF, on = "School ID", how = "inner") # adding projects to the final dataframe
df_final.shape

(3156864, 36)

### Training and test split

In [6]:
columnsList = ['Donor ID', 'Donation Received Date', 'Donor City', 'Donor State',
       'Donor Is Teacher', 'Donor Zip', 'Project ID', 'Donation ID',
       'Donation Included Optional Donation', 'Donation Amount',
       'Donor Cart Sequence', 'School ID', 'Teacher ID',
       'Teacher Project Posted Sequence', 'Project Type', 'Project Title',
       'Project Essay', 'Project Short Description', 'Project Need Statement',
       'Project Subject Category Tree', 'Project Subject Subcategory Tree',
       'Project Grade Level Category', 'Project Resource Category',
       'Project Cost', 'Project Posted Date', 'Project Expiration Date',
       'Project Current Status', 'Project Fully Funded Date', 'School Name',
       'School Metro Type', 'School Percentage Free Lunch', 'School State',
       'School Zip', 'School City', 'School County', 'School District']

df_final = df_final[columnsList]

df_final = df_final.sort_values(by = ['Donor ID', 'Donation Received Date']) # sorts dataframe by donors and their
                                                                            # first donation date
donorsKeep = df_final.drop_duplicates(subset=['Donor ID'], keep='first') # keep just the first donation for each donor

donorsKeep = donorsKeep.sort_values(['Donation Received Date']) # sort the donation by donation date

In [7]:
# Adding categories for all data so that test-train level mismatch is avoided
categorical_columns = ['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
        
for col in categorical_columns:
        cat_val = [str(cat) for cat in list(donorsKeep[col].unique()) if cat is not None]

        donorsKeep[col] = donorsKeep[col].astype(pd.CategoricalDtype(categories=cat_val))

In [8]:
donationsTrainNum = int(round(0.8 * len(donorsKeep), 0)) # the top ~80% is the training data
donationsTrain = donorsKeep.iloc[:donationsTrainNum,:]

donationsValid = donorsKeep.iloc[donationsTrainNum:,] # the bottom ~20% is the validation data

donorsTrainList = list(donationsTrain['Donor ID'])

# donorsTrainList
donationsTrainFinal = df_final[df_final['Donor ID'].isin(donorsTrainList)] # expanding the training data

donationsTrainFinal = donationsTrainFinal.sort_values(by = ['Donor ID', 'Donation Received Date']) 

donationsTrainFinal.head(10)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
9,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,Winton,California,Yes,953,e2beb818569f66adaa4ced21ca299ac6,08ed72ce14c548e8131a7dd7b8561988,Yes,10.0,...,Fully Funded,2016-12-26,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
500,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,Winton,California,Yes,953,eb6d91cbeab5037ca2f45fc3f6a4de8c,158f00637416b7c41fcde429b10c3c41,Yes,15.51,...,Fully Funded,2017-01-16,Clara B Ford Academy,suburban,95.0,Michigan,48127,Dearborn Hts,Wayne,Michigan Dept Of Education
164,00002d44003ed46b066607c5455a999a,2017-01-16 14:20:10,Winton,California,Yes,953,64f54f1efcbeb986114a7a13e6b27257,e3162fdec64ed91de1aa096cf75ba032,Yes,100.0,...,Expired,,Donn B Chenoweth Elementary School,urban,57.0,California,95340,Merced,Merced,Merced City School District
175,00002d44003ed46b066607c5455a999a,2017-01-16 14:26:19,Winton,California,Yes,953,dfdaf35bb33f9c105530c82984960ff3,45ffd222223dfb2ed7728014eafe328c,Yes,9.69,...,Fully Funded,2017-01-16,Shoally Creek Elementary,unknown,61.0,South Carolina,29316,Boiling Spgs,Spartanburg,Spartanburg School District 2
25,00002d44003ed46b066607c5455a999a,2017-01-16 15:46:57,Winton,California,Yes,953,2f7996f08052785e9b146f72c0c4990d,c73207489356a7bcf3ef0b8660c61b04,Yes,13.75,...,Fully Funded,2017-01-16,Portland Elementary School,urban,92.0,Kentucky,40212,Louisville,Jefferson,Jefferson Co School District
3,00002d44003ed46b066607c5455a999a,2017-02-01 18:53:25,Winton,California,Yes,953,e09933470f4256cc2643341c1d299e55,93c0af8b821e432857e5a63687524dd9,No,150.0,...,Fully Funded,2017-02-19,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
4,00002d44003ed46b066607c5455a999a,2017-02-03 14:20:57,Winton,California,Yes,953,e09933470f4256cc2643341c1d299e55,23d388c20fb9ba1a714632d51aa8ad32,No,10.0,...,Fully Funded,2017-02-19,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
1,00002d44003ed46b066607c5455a999a,2017-04-01 01:08:50,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,ee15bb7f179142fdff3f24cff8e88483,No,100.0,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
0,00002d44003ed46b066607c5455a999a,2017-05-02 12:34:09,Winton,California,Yes,953,2f53e5f31890e647048ac217cda3b83f,7831f66a25935db8b7424170355edabf,No,26.0,...,Fully Funded,2017-05-12,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District
171,00002d44003ed46b066607c5455a999a,2017-10-18 14:34:11,Winton,California,Yes,953,c5821d32012efd7df4f6fa12e230e991,615d39f9c40cd3137a9854c29ad0dbe2,No,100.0,...,Fully Funded,2017-12-10,ADA Givens Elementary School,urban,81.0,California,95340,Merced,Merced,Merced City School District


In [9]:
donorsValidList = list(donationsValid['Donor ID'])

donationsValidFinal = df_final[df_final['Donor ID'].isin(donorsValidList)] # expanding the validation data as well

donationsValidFinal = donationsValidFinal.sort_values(by = ['Donor ID', 'Donation Received Date'])

donationsValidFinal.head(10)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District
1523,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:19,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,0e7de979325e84ad03163e26d59c075a,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1515,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,4768c31e21e89f79ada2a05e7b1336df,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1518,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,7e436d1b26e60246a9bd81ac059c6e9b,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1519,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,698df43826119cf99d9f24488039dd64,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1521,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,6a618527b395daf5830e41ea4417341d,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1522,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:20,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,aca915b2420c0b1851da1e42642e9e27,Yes,25.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1516,00006084c3d92d904a22e0a70f5c119a,2017-04-07 17:50:21,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,7001340658440b07f622d175c6991179,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1517,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:57:41,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,dc010aa5685ccfb8d6db4098730bae57,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1525,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:58:43,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,bfc1f1fc38ff8b57894f283afb8fc29d,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd
1520,00006084c3d92d904a22e0a70f5c119a,2017-04-09 17:59:32,Brick,New Jersey,Yes,87,7771a382bf28d33e0046d74f495afda7,f0ae22e51a49d1f54b5d50a0b0a11910,Yes,50.0,...,Fully Funded,2017-04-10,Red Bank Middle School,suburban,89.0,New Jersey,7701,Red Bank,Monmouth,Red Bank Borough Public Sd


In [10]:
# Subset california data
donations_cal_train = donationsTrainFinal[donationsTrainFinal["Donor State"] == 'California']
donations_cal_valid = donationsValidFinal[donationsValidFinal["Donor State"] == 'California'].sort_values(by=['Donation Received Date'])

In [11]:
print("Training data shape: \n", donations_cal_train.shape)
print("Validation data shape: \n", donations_cal_valid.shape)

Training data shape: 
 (447507, 36)
Validation data shape: 
 (47695, 36)


In [12]:
from sklearn import preprocessing
#x = np.reshape(donations_cal_train["Project Cost"].values,(-1,447507))
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# Proj_cost_Scaled = x_scaled.tolist

donations_cal_train['Project_cost_scaled'] = preprocessing.MinMaxScaler().fit_transform(donations_cal_train['Project Cost'].values.reshape(-1,1))

In [13]:
donations_cal_train.head(2)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District,Project_cost_scaled
9,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,Winton,California,Yes,953,e2beb818569f66adaa4ced21ca299ac6,08ed72ce14c548e8131a7dd7b8561988,Yes,10.0,...,2016-12-26,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District,0.001848
500,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,Winton,California,Yes,953,eb6d91cbeab5037ca2f45fc3f6a4de8c,158f00637416b7c41fcde429b10c3c41,Yes,15.51,...,2017-01-16,Clara B Ford Academy,suburban,95.0,Michigan,48127,Dearborn Hts,Wayne,Michigan Dept Of Education,0.000871


In [14]:
donations_cal_train.columns

Index(['Donor ID', 'Donation Received Date', 'Donor City', 'Donor State',
       'Donor Is Teacher', 'Donor Zip', 'Project ID', 'Donation ID',
       'Donation Included Optional Donation', 'Donation Amount',
       'Donor Cart Sequence', 'School ID', 'Teacher ID',
       'Teacher Project Posted Sequence', 'Project Type', 'Project Title',
       'Project Essay', 'Project Short Description', 'Project Need Statement',
       'Project Subject Category Tree', 'Project Subject Subcategory Tree',
       'Project Grade Level Category', 'Project Resource Category',
       'Project Cost', 'Project Posted Date', 'Project Expiration Date',
       'Project Current Status', 'Project Fully Funded Date', 'School Name',
       'School Metro Type', 'School Percentage Free Lunch', 'School State',
       'School Zip', 'School City', 'School County', 'School District',
       'Project_cost_scaled'],
      dtype='object')

In [30]:
donations_cal_train.head(2)

Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,...,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District,Project_cost_scaled
9,00002d44003ed46b066607c5455a999a,2016-10-25 20:15:11,Winton,California,Yes,953,e2beb818569f66adaa4ced21ca299ac6,08ed72ce14c548e8131a7dd7b8561988,Yes,10.0,...,2016-12-26,Herbert Hoover Middle School,urban,82.0,California,95340,Merced,Merced,Merced City School District,<built-in method tolist of numpy.ndarray objec...
500,00002d44003ed46b066607c5455a999a,2017-01-16 01:11:20,Winton,California,Yes,953,eb6d91cbeab5037ca2f45fc3f6a4de8c,158f00637416b7c41fcde429b10c3c41,Yes,15.51,...,2017-01-16,Clara B Ford Academy,suburban,95.0,Michigan,48127,Dearborn Hts,Wayne,Michigan Dept Of Education,<built-in method tolist of numpy.ndarray objec...


In [15]:
from scipy import sparse

donations_cal_train_gp = donations_cal_train.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()

donations_cal_train_gp["Donated"] = 1

rows=donations_cal_train_gp["Donor ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'row_num'})
cols=donations_cal_train_gp["Project ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'col_num'})

donations_cal_train_gp_new =donations_cal_train_gp.merge(rows,how='left',on="Donor ID").merge(cols,how='left',on="Project ID")

data = list(donations_cal_train_gp_new.Donated)
row = list(donations_cal_train_gp_new.row_num)
col = list(donations_cal_train_gp_new.col_num)

# Create sparse matrix of donor project pair
S = sparse.coo_matrix((data,(row,col)))
R = S.tocsr()

#Profile_profile = sparse.csr_matrix(df_features,dtype='int64')
#donations_cal_train_pvt = donations_cal_train_gp.pivot(index='Donor ID', columns='Project ID', values='Donated')
#donations_cal_train_pvt.head()
# donations_cal_train_new = donations_cal_train_gp.set_index(['Donor ID', 'Project ID'])
# mat = sparse.coo_matrix((donations_cal_train_new['Donated'],(donations_cal_train_new.index.labels[0], donations_cal_train_new.index.labels[1])))
# R = mat.tocsr()

# Create sparse matrix of project profile
ohFeatures = ['Project ID', 'Project_cost_scaled', 'Project Type','Project Subject Category Tree',\
              'Project Grade Level Category', 'Project Resource Category','Project Current Status',\
              'School Metro Type', 'School State']
projFeatures = donations_cal_train[ohFeatures].drop_duplicates(subset = ['Project ID'])

cat_col = ['Project Type','Project Subject Category Tree','Project Grade Level Category', \
           'Project Resource Category','Project Current Status','School Metro Type', 'School State']
for col in cat_col:
    cat_val = [str(cat) for cat in list(donations_cal_train[col].unique()) if cat is not None]
    projFeatures[col] = projFeatures[col].astype(pd.CategoricalDtype(categories=cat_val))
    
traindonor_cal_ohm = pd.get_dummies(projFeatures, columns=['Project Type',\
                                                           'Project Subject Category Tree',\
                                                           'Project Grade Level Category', \
                                                           'Project Resource Category','Project Current Status',\
                                                           'School Metro Type', 'School State'])
#traindonor_cal_ohm.drop(['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'], axis=1, inplace=True)
traindonor_cal_ohm= traindonor_cal_ohm.fillna(0)
traindonor_cal_ohm.set_index('Project ID', inplace=True)
Proj_Profile = sparse.csr_matrix(traindonor_cal_ohm,dtype='int64')

# Dot product to create Donor profile
Donor_profile = R.dot(Proj_Profile)

# Normalize donor profile based on number of donations made
donations_made = pd.DataFrame(donations_cal_train_gp_new.groupby('Donor ID').Donated.count())
donations_made.rename(index=str, columns={'Donor ID':'Donor ID', 'Donated':'Num_donations'}, inplace= True)
donations = [x for x in donations_made.Num_donations]
donations = np.asarray(donations)
donations = np.reshape(donations, (-Donor_profile.shape[0], 1))
Donor_profile = Donor_profile.multiply(1/donations)

In [16]:
# donations_cal_train_gp_new.head()
# donations_cal_train_gp[donations_cal_train_gp['Donor ID']=='fffb3518ef82b296d28fe0dc6b50f270']
# print(donations_cal_train_gp_new[donations_cal_train_gp_new["row_num"]==70583]["Donor ID"].values[0])
# print(donations_cal_train_gp_new[donations_cal_train_gp_new["col_num"]==58804]["Project ID"].values[0])
# print(R)
print(Donor_profile)

  (0, 72)	0.125
  (0, 23)	0.125
  (0, 79)	0.125
  (0, 16)	0.125
  (0, 124)	0.25
  (0, 121)	0.125
  (0, 61)	0.25
  (0, 13)	0.125
  (0, 60)	0.25
  (0, 11)	0.25
  (0, 56)	0.125
  (0, 12)	0.125
  (0, 86)	0.5
  (0, 81)	0.75
  (0, 62)	0.25
  (0, 55)	0.5
  (0, 28)	0.125
  (0, 102)	0.125
  (0, 82)	0.25
  (0, 78)	0.875
  (0, 64)	0.125
  (0, 57)	0.375
  (0, 7)	0.125
  (0, 1)	1.0
  (1, 86)	0.5
  :	:
  (70587, 4)	0.25
  (70587, 86)	0.75
  (70587, 62)	0.5
  (70587, 57)	0.25
  (70587, 9)	0.25
  (70587, 91)	0.25
  (70587, 81)	0.5
  (70587, 78)	1.0
  (70587, 61)	0.25
  (70587, 58)	0.5
  (70587, 12)	0.5
  (70587, 1)	1.0
  (70588, 86)	0.5
  (70588, 82)	0.5
  (70588, 79)	0.5
  (70588, 62)	0.5
  (70588, 55)	0.5
  (70588, 18)	0.5
  (70588, 95)	0.5
  (70588, 81)	0.5
  (70588, 78)	0.5
  (70588, 60)	0.5
  (70588, 57)	0.5
  (70588, 11)	0.5
  (70588, 1)	1.0


In [32]:
donors_val = donations_cal_valid.groupby(["Donor ID"])["Project ID"].count().reset_index(name="count")
donors_val_list = donors_val[donors_val["count"]>1]["Donor ID"].tolist()
print(donors_val_list[1])
donations_cal_valid['Project_cost_scaled'] = preprocessing.MinMaxScaler().fit_transform(donations_cal_valid['Project Cost'].values.reshape(-1,1))
donations_cal_valid[donations_cal_valid["Donor ID"] =="0008ab1b251bf22f7d11fb6540327825"]
donor_1 = donations_cal_valid[donations_cal_valid["Donor ID"] =="0008ab1b251bf22f7d11fb6540327825"].head(1)


0008ab1b251bf22f7d11fb6540327825


In [33]:
# Single donor profile
donor_1_gp = donor_1.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()

donor_1_gp["Donated"] = 1

rows=donor_1_gp["Donor ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'row_num'})
cols=donor_1_gp["Project ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'col_num'})

donor_1_gp_new =donor_1_gp.merge(rows,how='left',on="Donor ID").merge(cols,how='left',on="Project ID")

data = list(donor_1_gp_new.Donated)
row = list(donor_1_gp_new.row_num)
col = list(donor_1_gp_new.col_num)

# Create sparse matrix of donor project pair
S = sparse.coo_matrix((data,(row,col)))
R = S.tocsr()

#Profile_profile = sparse.csr_matrix(df_features,dtype='int64')
#donations_cal_train_pvt = donations_cal_train_gp.pivot(index='Donor ID', columns='Project ID', values='Donated')
#donations_cal_train_pvt.head()
# donations_cal_train_new = donations_cal_train_gp.set_index(['Donor ID', 'Project ID'])
# mat = sparse.coo_matrix((donations_cal_train_new['Donated'],(donations_cal_train_new.index.labels[0], donations_cal_train_new.index.labels[1])))
# R = mat.tocsr()
# Get categories


# Create sparse matrix of project profile
ohFeatures = ['Project ID', 'Project_cost_scaled', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
projFeatures = donor_1[ohFeatures].drop_duplicates(subset = ['Project ID'])
cat_col = ['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
for col in cat_col:
    cat_val = [str(cat) for cat in list(donations_cal_train[col].unique()) if cat is not None]
    projFeatures[col] = projFeatures[col].astype(pd.CategoricalDtype(categories=cat_val))

donor_1_ohm = pd.get_dummies(projFeatures, columns=['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'])
#traindonor_cal_ohm.drop(['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'], axis=1, inplace=True)

donor_1_ohm= donor_1_ohm.fillna(0)
donor_1_ohm.set_index('Project ID', inplace=True)
#print(donor_1_ohm.columns)
Proj_Profile = sparse.csr_matrix(donor_1_ohm,dtype='int64')

# Dot product to create Donor profile
Donor_1_profile = R.dot(Proj_Profile)

In [34]:

cos_sim = cosine_similarity(Donor_profile,Donor_1_profile)
print(cos_sim.argmax())
#cos_sim.argsort()[-5:][::-1]
#idx = (-cos_sim).argsort()[:5]
#print(type(cos_sim))
#cos_df = pd.DataFrame(cos_sim)
#print(cos_df)

11356


In [22]:
type(cos_sim)
numpy.sort

numpy.ndarray

In [35]:

euc_dis = euclidean_distances(Donor_profile,Donor_1_profile)
euc_dis.argmin()


11356

In [36]:
# Selected donor
print(donors_val_list[1])
donations_cal_valid[donations_cal_valid["Donor ID"] =="0008ab1b251bf22f7d11fb6540327825"].head(1)
#[['Project ID', 'Project Cost', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']]


0008ab1b251bf22f7d11fb6540327825


Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,School ID,Teacher ID,Teacher Project Posted Sequence,Project Type,Project Title,Project Essay,Project Short Description,Project Need Statement,Project Subject Category Tree,Project Subject Subcategory Tree,Project Grade Level Category,Project Resource Category,Project Cost,Project Posted Date,Project Expiration Date,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District,Project_cost_scaled
41460,0008ab1b251bf22f7d11fb6540327825,2017-09-05 10:01:46,Sacramento,California,Yes,958,136a79ea37c63eeb4c170a5c1a4f90c7,fb468ffea57c853bea1648076b54df47,Yes,25.0,1,adeff7eebec13b6b8872f63735b7dad3,0008ab1b251bf22f7d11fb6540327825,3,Teacher-Led,Newcomer Students Picking Up STEAM.,My students are third and fourth grade newcome...,My students are third and fourth grade newcome...,My students need STEM building kits and block ...,"Math & Science, Music & The Arts","Applied Sciences, Visual Arts",Grades 3-5,Supplies,495.21,2017-09-05,2018-01-04,Fully Funded,2017-09-06,Woodridge Elementary School,suburban,95.0,California,95842,Sacramento,Sacramento,Twin Rivers Unified Sch Dist,0.018279


In [37]:
#11356 Cosine
print(donations_cal_train_gp_new[donations_cal_train_gp_new["row_num"]==11356]["Donor ID"].values[0])
donations_cal_train[donations_cal_train["Donor ID"] =="29230db7d19074f071979f6beb5d911c"]
#[['Project ID', 'Project Cost', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']]


29230db7d19074f071979f6beb5d911c


Unnamed: 0,Donor ID,Donation Received Date,Donor City,Donor State,Donor Is Teacher,Donor Zip,Project ID,Donation ID,Donation Included Optional Donation,Donation Amount,Donor Cart Sequence,School ID,Teacher ID,Teacher Project Posted Sequence,Project Type,Project Title,Project Essay,Project Short Description,Project Need Statement,Project Subject Category Tree,Project Subject Subcategory Tree,Project Grade Level Category,Project Resource Category,Project Cost,Project Posted Date,Project Expiration Date,Project Current Status,Project Fully Funded Date,School Name,School Metro Type,School Percentage Free Lunch,School State,School Zip,School City,School County,School District,Project_cost_scaled
3023513,29230db7d19074f071979f6beb5d911c,2015-12-21 13:57:57,Valencia,California,No,913,09c7aaf4908ab2cbe8b4931b2190f665,1a01fb88b81832d8919b5d39ee4754c3,No,50.0,1,1de3deaed03787f560ac5fb631202043,ae9f436a0cef55c3030292f10498a603,2,Teacher-Led,Making Makers: Bits for Innovation,I teach Astronomy and AP Physics to high schoo...,I teach Astronomy and AP Physics to high schoo...,My students need 4 Little Bits Electronics kit...,"Math & Science, Music & The Arts","Applied Sciences, Visual Arts",Grades 9-12,Technology,1669.61,2015-12-13,2016-04-11,Fully Funded,2016-03-11,Lake Brantley High School,suburban,35.0,Florida,32714,Altamonte Spg,Seminole,Seminole Co Public School Dist,0.006206
3023512,29230db7d19074f071979f6beb5d911c,2015-12-23 23:56:45,Valencia,California,No,913,09c7aaf4908ab2cbe8b4931b2190f665,88cb94dd1bf48df0d924468ce9fee892,No,50.0,2,1de3deaed03787f560ac5fb631202043,ae9f436a0cef55c3030292f10498a603,2,Teacher-Led,Making Makers: Bits for Innovation,I teach Astronomy and AP Physics to high schoo...,I teach Astronomy and AP Physics to high schoo...,My students need 4 Little Bits Electronics kit...,"Math & Science, Music & The Arts","Applied Sciences, Visual Arts",Grades 9-12,Technology,1669.61,2015-12-13,2016-04-11,Fully Funded,2016-03-11,Lake Brantley High School,suburban,35.0,Florida,32714,Altamonte Spg,Seminole,Seminole Co Public School Dist,0.006206


In [38]:
print(Donor_1_profile)

  (0, 1)	1
  (0, 28)	1
  (0, 57)	1
  (0, 61)	1
  (0, 78)	1
  (0, 82)	1
  (0, 86)	1


61920


40813

In [40]:
# Donor profile of cos similar donor
# Single donor profile
donor_cos = donations_cal_train[donations_cal_train["Donor ID"]=="29230db7d19074f071979f6beb5d911c"]
donor_cos_gp = donor_cos.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()

donor_cos_gp["Donated"] = 1

rows=donor_cos_gp["Donor ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'row_num'})
cols=donor_cos_gp["Project ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'col_num'})

donor_cos_gp_new =donor_cos_gp.merge(rows,how='left',on="Donor ID").merge(cols,how='left',on="Project ID")

data = list(donor_cos_gp_new.Donated)
row = list(donor_cos_gp_new.row_num)
col = list(donor_cos_gp_new.col_num)

# Create sparse matrix of donor project pair
S = sparse.coo_matrix((data,(row,col)))
R = S.tocsr()


# Create sparse matrix of project profile
ohFeatures = ['Project ID', 'Project_cost_scaled', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
projFeatures = donor_cos[ohFeatures].drop_duplicates(subset = ['Project ID'])
cat_col = ['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
for col in cat_col:
    cat_val = [str(cat) for cat in list(donations_cal_train[col].unique()) if cat is not None]
    projFeatures[col] = projFeatures[col].astype(pd.CategoricalDtype(categories=cat_val))

donor_cos_ohm = pd.get_dummies(projFeatures, columns=['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'])
#traindonor_cal_ohm.drop(['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'], axis=1, inplace=True)

donor_cos_ohm= donor_cos_ohm.fillna(0)
donor_cos_ohm.set_index('Project ID', inplace=True)
#print(donor_1_ohm.columns)
Proj_Profile = sparse.csr_matrix(donor_cos_ohm,dtype='int64')

# Dot product to create Donor profile
Donor_cos_profile = R.dot(Proj_Profile)

In [41]:
# Donor profile of euc donor

# Single donor profile
donor_euc = donations_cal_train[donations_cal_train["Donor ID"]=="29230db7d19074f071979f6beb5d911c"]
donor_euc_gp = donor_euc.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()

donor_euc_gp["Donated"] = 1

rows=donor_euc_gp["Donor ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'row_num'})
cols=donor_euc_gp["Project ID"].drop_duplicates().sort_values().reset_index().drop('index',axis=1).\
        reset_index().rename(columns={'index':'col_num'})

donor_euc_gp_new =donor_euc_gp.merge(rows,how='left',on="Donor ID").merge(cols,how='left',on="Project ID")

data = list(donor_euc_gp_new.Donated)
row = list(donor_euc_gp_new.row_num)
col = list(donor_euc_gp_new.col_num)

# Create sparse matrix of donor project pair
S = sparse.coo_matrix((data,(row,col)))
R = S.tocsr()


# Create sparse matrix of project profile
ohFeatures = ['Project ID', 'Project_cost_scaled', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
projFeatures = donor_euc[ohFeatures].drop_duplicates(subset = ['Project ID'])
cat_col = ['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
for col in cat_col:
    cat_val = [str(cat) for cat in list(donations_cal_train[col].unique()) if cat is not None]
    projFeatures[col] = projFeatures[col].astype(pd.CategoricalDtype(categories=cat_val))

donor_euc_ohm = pd.get_dummies(projFeatures, columns=['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'])
#traindonor_cal_ohm.drop(['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'], axis=1, inplace=True)

donor_euc_ohm= donor_euc_ohm.fillna(0)
donor_euc_ohm.set_index('Project ID', inplace=True)
#print(donor_1_ohm.columns)
Proj_Profile = sparse.csr_matrix(donor_euc_ohm,dtype='int64')

# Dot product to create Donor profile
Donor_euc_profile = R.dot(Proj_Profile)

In [47]:
# Create sparse matrix of project profile
ohFeatures = ['Project ID', 'Project_cost_scaled', 'Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
projFeatures = donations_cal_valid[ohFeatures].drop_duplicates(subset = ['Project ID'])
cat_col = ['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State']
for col in cat_col:
    cat_val = [str(cat) for cat in list(donations_cal_train[col].unique()) if cat is not None]
    projFeatures[col] = projFeatures[col].astype(pd.CategoricalDtype(categories=cat_val))

donor_cos_ohm = pd.get_dummies(projFeatures, columns=['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'])
#traindonor_cal_ohm.drop(['Project Type','Project Subject Category Tree','Project Grade Level Category', 'Project Resource Category','Project Current Status','School Metro Type', 'School State'], axis=1, inplace=True)

donor_cos_ohm= donor_cos_ohm.fillna(0)
donor_cos_ohm.set_index('Project ID', inplace=True)
#print(donor_1_ohm.columns)
Proj_Profile = sparse.csr_matrix(donor_cos_ohm,dtype='int64')

In [51]:
cos_sim = cosine_similarity(Proj_Profile,Donor_euc_profile)
#print(cos_sim)
top_10_rec = (-cos_sim).argsort()[:10]

In [63]:
print(type(cos_sim))

<class 'numpy.ndarray'>


In [65]:
cos_sim.reshape(-1)[0]

0.0006793840589075194

In [66]:
top_10_rec

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [None]:
#donations_cal_train_gp.head(2)
donations_cal_train = donationsTrainFinal[donationsTrainFinal["Donor State"] == 'California']
test = donations_cal_train.groupby(['Donor ID', 'Project ID'])['Donation Amount'].sum().reset_index()
test['Project ID'].nunique()
#donations_cal_train_gp['Project ID'].unique.count

In [None]:
projFeatures['Project ID'].nunique()
#print(list(traindonor_cal_ohm.columns))
#traindonor_cal_ohm.head(1)

In [None]:
# Train donor OHE
donorstrain =donationsTrainFinal[["Donor ID","Donor City","Donor State","Donor Is Teacher","Donor Zip"]]
donorstrain.head(1)
print(len(donorstrain))
donorstrainuniq = donorstrain.drop_duplicates(subset = ['Donor ID'])
print(len(donorstrainuniq))

In [None]:
# Val donor OHE
donorsval = donationsValidFinal[["Donor ID","Donor City","Donor State","Donor Is Teacher","Donor Zip"]]
donorsval.head(1)
print(len(donorsval))
donorsvaluniq = donorsval.drop_duplicates(subset = ['Donor ID'])
print(len(donorsvaluniq))


In [None]:
# Train donor missing values
donornullcol = donorstrainuniq.columns[donorstrainuniq.isnull().any()]
donorstrainuniq[donornullcol].isnull().sum()

In [None]:
# Val donor missing values
donornullcol = donorsvaluniq.columns[donorsvaluniq.isnull().any()]
donorsvaluniq[donornullcol].isnull().sum()

In [None]:
# Train donors replace missing value
donorstrainuniq.dropna(inplace = True)

In [None]:
# Val donors replace missing value
donorsvaluniq.dropna(inplace = True)

In [None]:
# Train donors check missing values
donorstrainuniq[donornullcol].isnull().sum()

In [None]:
# Val donors check missing values
donorsvaluniq[donornullcol].isnull().sum()

In [None]:
# columns of train and val donors
print(donorstrainuniq.columns)
print(donorsvaluniq.columns)

In [None]:
#acs_df.head()
# acs_df=acs_df.rename(columns = {'ZIP':'Donor Zip'})

In [None]:
# acs_df['Donor Zip'] = acs_df['Donor Zip'].astype(int)

In [None]:
# acs_df.info()

In [None]:
#donorstrainuniq['Donor Zip'] = donorstrainuniq['Donor Zip'].astype(str).astype(int)
# donorstrainuniq['Donor Zip'] = donorstrainuniq['Donor Zip'].astype(str)
# donorsvaluniq['Donor Zip'] = donorsvaluniq['Donor Zip'].astype(str)

In [None]:
# len(donorstrainuniq["Donor Zip"].unique())

In [None]:
# donors_train = donorstrainuniq.merge(acs_df, on = "Donor Zip", how = "left")
#donors_val = donorsvaluniq.merge(acs_df, on = "Donor Zip", how = "inner")

#donors_train = pd.merge(donorstrainuniq, acs_df,how='left', on='Donor Zip')

In [None]:
# donors_train.isnull().sum()

In [None]:
# acs_df.sort_values(by='Donor Zip', ascending=True)

In [None]:
# donors_train[donors_train['Donor State'] != 'other'].sort_values(by='Donor Zip', ascending = True)

In [None]:
# donors_train['Donor Zip'].sort_values(by='Donor Zip', ascending = True)

In [None]:
# donors_train['Percent_families_BPL'].fillna((donors_train['Percent_families_BPL'].mean()), inplace=True)
# donors_train['Percent_over_5yrs_not_speak_english_well'].fillna((donors_train['Percent_over_5yrs_not_speak_english_well'].mean()), inplace=True)
# donors_train['Percent_HS_grad_25yrs_above'].fillna((donors_train['Percent_HS_grad_25yrs_above'].mean()), inplace=True)
# donors_train['Percent_with_health_insurance'].fillna((donors_train['Percent_with_health_insurance'].mean()), inplace=True)

# donors_val['Percent_families_BPL'].fillna((donors_val['Percent_families_BPL'].mean()), inplace=True)
# donors_val['Percent_over_5yrs_not_speak_english_well'].fillna((donors_val['Percent_over_5yrs_not_speak_english_well'].mean()), inplace=True)
# donors_val['Percent_HS_grad_25yrs_above'].fillna((donors_val['Percent_HS_grad_25yrs_above'].mean()), inplace=True)
# donors_val['Percent_with_health_insurance'].fillna((donors_val['Percent_with_health_insurance'].mean()), inplace=True)

#donors_train.dropna(inplace=True)
#donors_val.dropna(inplace=True)

In [None]:
traindonor_ohm = pd.get_dummies(donorstrainuniq, columns=['Donor State', 'Donor Is Teacher'])

traindonor_ohm.head()

In [None]:
traindonor_ohm.drop(['Donor City','Donor Zip'], axis=1, inplace=True)
#traindonor_ohm.drop('Donor Zip', axis=1, inplace=True)

In [None]:
valdonor_ohm = pd.get_dummies(donorsvaluniq, columns=['Donor State', 'Donor Is Teacher'])

valdonor_ohm.head()

In [None]:
valdonor_ohm.drop(['Donor City','Donor Zip'], axis=1, inplace=True)


In [None]:
#import gc
#del valdonor_ohm
#del traindonor_ohm
#gc.collect()

In [None]:
#traindonor_ohm.set_index('Donor ID', inplace=True)

In [None]:
traindonor_ohm.head()

In [None]:
#valdonor_ohm.set_index('Donor ID', inplace=True)

In [None]:
valdonor_ohm.head()

In [None]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# pca=PCA()
# X_trans = pca.fit_transform(traindonor_ohm)

# plt.figure(figsize=(12,8))
# plt.plot(np.cumsum(pca.explained_variance_ratio_));
# plt.title("Cumulative variance retained for top N components");
# plt.xlabel("Number of components");
# plt.ylabel("Cumulative Variance");

In [None]:
train_scaled_DF =pd.DataFrame(StandardScaler().fit_transform(traindonor_ohm.iloc[:,1:]))

In [None]:
val_scaled_DF = pd.DataFrame(StandardScaler().fit_transform(traindonor_ohm.iloc[:,1:]))

In [None]:
train_scaled_DF.head()

In [None]:
cos_val_list_main =[]
for index1, row1 in train_scaled_DF.iterrows():
    cos_val_list=[]
    for index2,row2 in train_scaled_DF.iterrows():
# #         row1 = np.reshape(row1,(-1,2))
# #         row2 = np.reshape(row2,(-1,2))
#         print(np.ararow1)
#         print(type(row1))
#         print(row2)
#         print(type(row2))
        cosval = cosine_similarity(np.array(row1).reshape(1,-1), np.array(row2).reshape(1,-1))
        cos_val_list.append(cosval)
    cos_val_list_main.append(cos_val_list)
    
cos_val_df =pd.DataFrame(cos_val_list_main, columns = range(len(train_scaled_DF)))
    

In [None]:
cos_val_df.head()

## One-Hot Vectorization of Features

In [None]:
# checking for null values in the dataframe

trainNullCol = donationsTrainFinal.columns[donationsTrainFinal.isnull().any()]
donationsTrainFinal[trainNullCol].isnull().sum()

In [None]:
# because we are doing content-based filtering, here are the features of the projects we are interested in:

ohFeatures = ['Project ID', 'Project Cost', 'Project Type','Project Subject Category Tree',
       'Project Grade Level Category', 'Project Resource Category',
       'Project Current Status',
       'School Metro Type', 'School State']

In [None]:
# drop duplicates of projects because 'Project ID' has to be the primary key

projFeatures = donationsTrainFinal[ohFeatures].drop_duplicates(subset = ['Project ID'])

projFeatures = projFeatures.dropna() # drop the null values as well

print(projFeatures.shape)

In [None]:
# no null values or duplicates

trainNullCol = projFeatures.columns[projFeatures.isnull().any()]
projFeatures[trainNullCol].isnull().sum()

In [None]:
enc = OHE(handle_unknown='ignore')
enc.fit(projFeatures.iloc[:,2:]) # fit the one hot vector on our dataframe

In [None]:
enc.categories_ # data dictionary of the OHM

In [None]:
oneHotMatrix = pd.DataFrame(enc.transform(projFeatures.iloc[:,2:]).toarray()) # convert the one-hot matrix into a dataframe

In [None]:
# this is what the OHM looks like

oneHotMatrix

## Merging one-hot matrix with the dataframe

In [None]:
projOHM = pd.concat([projFeatures.reset_index(drop = True), oneHotMatrix.reset_index(drop = True)], axis = 1)

projOHM.head(1)


In [None]:
dropCol = ['Project Type', 'Project Subject Category Tree', 'Project Grade Level Category',
           'Project Resource Category', 'Project Current Status',
           'School Metro Type', 'School State']

In [None]:
# because we wanted to keep the project cost as one of the features,
# we vectorized the rest of features and vertically stacked the project cost

projOHM = projOHM.drop(dropCol, axis = 1)

projOHM.head(1)

In [None]:
# data dictionary to track the one hot matrix

dataDict = enc.get_feature_names(dropCol) # please note that any new input should strictly should the indexes
                                            # mentioned in data dictionary

In [None]:
del projOHM

In [None]:
gc.collect()

## Building the recommendation system

In [None]:
# scale and normalize the dataset

ohmDF = pd.DataFrame(StandardScaler().fit_transform(projOHM.iloc[:,1:]))

In [None]:
# taking the first row as a sample row for the recommendation system

testRow = np.array(ohmDF.iloc[:1,:])

testRow

In [None]:
# getting the cosine similarity between our feature matrix and test sample

test2 = cosine_similarity(ohmDF, testRow)

test2

In [None]:
# tracing back the cosine similarity with their corresponding project ID
# so that we know what project does the similarity correspond to

test3 = pd.concat([projOHM.iloc[:,:1], pd.DataFrame(test2)], axis = 1)

test3 = test3.sort_values(by = 0, ascending = False) # sorting the similarity in descending order

In [None]:
# checking if the recommendation system fares well
# here the recommended project has the same features, except the project cost

projFeatures[projFeatures['Project ID'] == '7d889853a64bcb23d0abc52bb21770b7']

In [None]:
# checking if the recommendation system fares well v2

projFeatures[projFeatures['Project ID'] == 'e2beb818569f66adaa4ced21ca299ac6']

# Collaborative filtering

In [None]:
# scale and normalize the dataset
# Train dataset
train_ohmDF = pd.DataFrame(StandardScaler().fit_transform(traindonor_ohm.iloc[:,1:]))
# Validation dataset
val_ohmDF = pd.DataFrame(StandardScaler().fit_transform(valdonor_ohm.iloc[:,1:]))
# Normalizing gives memory error

In [None]:
# Valid donor selected : 00006084c3d92d904a22e0a70f5c119a

In [None]:
# taking the first row of test data as a sample row for the recommendation system

testRow = np.array(val_ohmDF.iloc[:1,:])

testRow

In [None]:
# getting the cosine similarity between our train donors matrix and test sample

test2 = cosine_similarity(train_ohmDF, testRow)

test2
# tracing back the cosine similarity with their corresponding donor ID
# so that we know which donor does the similarity correspond to

test3 = pd.concat([train_ohmDF.iloc[:,:1], pd.DataFrame(test2)], axis = 1)
test3.columns = [1,2]
test3.head()
test3 = test3.sort_values(by =2, ascending = False) # sorting the similarity in descending order

# checking if the recommendation system fares well
# here the recommended project has the same features, except the project cost

projFeatures[projFeatures['Project ID'] == '7d889853a64bcb23d0abc52bb21770b7']

In [None]:
# getting the cosine similarity between our train donors matrix and test sample

test2 = cosine_similarity(train_ohmDF, testRow)

test2

In [None]:
# tracing back the cosine similarity with their corresponding donor ID
# so that we know which donor does the similarity correspond to

test3 = pd.concat([train_ohmDF.iloc[:,:1], pd.DataFrame(test2)], axis = 1)
test3.columns = [1,2]
test3.head()

In [None]:
test3 = test3.sort_values(by =2, ascending = False) # sorting the similarity in descending order

In [None]:
# checking if the recommendation system fares well
# here the recommended project has the same features, except the project cost

projFeatures[projFeatures['Project ID'] == '7d889853a64bcb23d0abc52bb21770b7']

In [None]:
# checking if the recommendation system fares well v2

projFeatures[projFeatures['Project ID'] == 'e2beb818569f66adaa4ced21ca299ac6']