In [1]:
# Introduction

In [2]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

In [3]:
# Read all the csv files.
# `on_bad_lines` replaces the `error_bad_lines` / `warn_bad_lines` pair, which
# pandas deprecated in 1.3 and removed in 2.0 (this form needs pandas >= 1.3):
#   error_bad_lines=False                        -> on_bad_lines='warn'
#   error_bad_lines=False, warn_bad_lines=False  -> on_bad_lines='skip'
donations = pd.read_csv('Donations.csv')
donors = pd.read_csv('Donors.csv', low_memory=False)
schools = pd.read_csv('Schools.csv', on_bad_lines='warn')
teachers = pd.read_csv('Teachers.csv', on_bad_lines='warn')
projects = pd.read_csv('Projects.csv', on_bad_lines='warn')
# Malformed rows in Resources.csv are dropped silently, as before.
resources = pd.read_csv('Resources.csv', on_bad_lines='skip')

#### Preprocessing

In [12]:

# When True, only the first 10,000 merged rows are kept so the notebook runs quickly.
test_mode = True

# Attach donor attributes to every donation.
# NOTE: this rebinds `donations`; re-running this cell without re-running the
# load cell merges the donor columns a second time.
donations = donations.merge(donors, on='Donor ID', how='left')

# Missing values: this has to happen BEFORE `df` is derived from `donations`,
# otherwise the filled column never reaches `df` and the NaNs survive.
donations['Donation Amount'] = donations['Donation Amount'].fillna(0)

# Attach project attributes (left join keeps donations whose project is unknown).
df = donations.merge(projects, on='Project ID', how='left')

# Limiting the size of the data
if test_mode:
    # .copy() detaches the sample from the full merged frame so that adding a
    # column below does not raise SettingWithCopyWarning.
    df = df.head(10000).copy()

# `donations_df` is an alias of `df`, not a copy: the column added below is
# visible through both names.
donations_df = df

# Event strength: the raw donation amount is the interaction signal.
donations_df['eventStrength'] = donations_df['Donation Amount']


def eventstrength_log(x):
    """Compress a summed donation amount onto a log scale.

    Returns log2(1 + x), so a total of 0 maps to 0.0 and each doubling of
    (1 + x) adds one unit of strength.

    Uses math.log2 rather than math.log(..., 2): the dedicated base-2 routine
    avoids the rounding error of dividing two natural logarithms.

    Raises ValueError when x <= -1 (logarithm of a non-positive number).
    """
    return math.log2(1 + x)

# One row per (donor, project) pair: sum the event strength of every donation
# the donor made to the project, then compress the total with log2(1 + total).
# (The discarded mid-cell `df_final.head()` was removed: only the last
# expression of a cell is displayed, so it never showed anything.)
pair_strength = donations_df.groupby(['Donor ID', 'Project ID'])['eventStrength'].sum()
df_final = pair_strength.apply(eventstrength_log).reset_index()

# Update projects dataset: keep only the projects that occur in the (possibly
# truncated) merged frame, one row per distinct combination of project columns.
project_cols = projects.columns
projects = df[project_cols].drop_duplicates()

print('# of projects: %d' % len(projects))
print('# of unique user/project donations: %d' % len(df_final))

# of projects: 1889
# of unique user/project donations: 8648


In [19]:
# Hold out 20% of the donor/project interactions as a test set.
# The fixed random_state keeps the split identical across runs.
df_train, df_test = train_test_split(df_final, test_size=0.20, random_state=42)

print('# donations on Train set: %d' % len(df_train))
print('# donations on Test set: %d' % len(df_test))


# Donor-indexed views of the full, train and test frames for per-donor lookups.
df_index, df_train_index, df_test_index = (
    frame.set_index('Donor ID') for frame in (df, df_train, df_test)
)

# donations on Train set: 6918
# donations on Test set: 1730


#### 1. Process text data

In [20]:
# Preprocessing text data
text_data = ["Project Title", "Project Essay"]
for cols in text_data:
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal string 'nan', after which fillna('') has nothing left to fill and
    # 'nan' ends up in the vocabulary as a real token.
    projects[cols] = projects[cols].fillna('').astype(str).str.lower()

# One document per project: title followed by essay.
text = projects["Project Title"] + ' ' + projects["Project Essay"]

vectorizer = TfidfVectorizer(strip_accents='unicode',
                             analyzer='word',
                             lowercase=True, # Convert all uppercase to lowercase
                             stop_words='english', # Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
                             max_df = 0.9, # Ignore words that appear in more than 90% of all documents
                             # max_features=5000 # Maximum features to be extracted
                            )
# Row i of tfidf_matrix belongs to project_ids[i].
project_ids = projects['Project ID'].tolist()

tfidf_matrix = vectorizer.fit_transform(text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<1889x12326 sparse matrix of type '<class 'numpy.float64'>'
	with 176393 stored elements in Compressed Sparse Row format>

#### 2. Build donor profile

In [21]:
def get_project_profile(project_id):
    """Return the TF-IDF row of one project as a single-row slice of tfidf_matrix.

    The row is located by the position of `project_id` in the module-level
    `project_ids` list; list.index raises ValueError when the ID is absent.
    """
    row = project_ids.index(project_id)
    return tfidf_matrix[row:row + 1]

def get_project_profiles(ids):
    """Stack the TF-IDF rows of several projects into one sparse matrix.

    `ids` may be a single ID or a collection of IDs: it is wrapped in a list
    and flattened with np.ravel before the per-project lookups. Rows appear in
    the same order as the flattened IDs.
    """
    flat_ids = np.ravel([ids])
    return scipy.sparse.vstack([get_project_profile(pid) for pid in flat_ids])

def build_donors_profile(donor_id, donations_indexed_df):
    """Build the L2-normalised content profile of one donor.

    The profile is the eventStrength-weighted average of the TF-IDF vectors of
    the projects the donor gave to.

    Parameters
    ----------
    donor_id : label looked up in the index of `donations_indexed_df`.
    donations_indexed_df : DataFrame indexed by donor, with 'Project ID' and
        'eventStrength' columns.

    Returns
    -------
    A 2-D ndarray with a single row (one column per TF-IDF feature).
    """
    # .loc with a scalar label yields a Series when the donor has a single row;
    # get_project_profiles and the reshape below cope with both shapes.
    donations_donor_df = donations_indexed_df.loc[donor_id]
    donor_project_profiles = get_project_profiles(donations_donor_df['Project ID'])
    donor_project_strengths = np.array(donations_donor_df['eventStrength']).reshape(-1,1)
    # Weighted sum of project profiles by the donation strength.
    weighted_sum = np.sum(donor_project_profiles.multiply(donor_project_strengths), axis=0)
    # Summing a scipy sparse matrix returns an np.matrix, which scikit-learn
    # 1.2+ refuses as input (TypeError). Convert to a plain one-row ndarray.
    weighted_sum = np.asarray(weighted_sum).reshape(1, -1)
    # The +1 keeps the divisor non-zero when every strength is 0.
    donor_project_strengths_weighted_avg = weighted_sum / (np.sum(donor_project_strengths)+1)
    donor_profile_norm = sklearn.preprocessing.normalize(donor_project_strengths_weighted_avg)
    return donor_profile_norm

from tqdm import tqdm

def build_donors_profiles():
    """Build a content profile for every donor present in df_final.

    Only interactions whose project appears in `projects` are used. Returns a
    dict mapping donor ID to the profile produced by build_donors_profile.
    """
    has_known_project = df_final['Project ID'].isin(projects['Project ID'])
    donations_indexed_df = df_final[has_known_project].set_index('Donor ID')
    unique_donors = donations_indexed_df.index.unique()
    # tqdm renders a progress bar while the profiles are computed one by one.
    return {
        donor_id: build_donors_profile(donor_id, donations_indexed_df)
        for donor_id in tqdm(unique_donors)
    }

donor_profiles = build_donors_profiles()
print("# of donors with profiles: %d" % len(donor_profiles))

100%|██████████| 8015/8015 [00:29<00:00, 276.09it/s]


# of donors with profiles: 8015


In [22]:
# Two example donors used in the rest of the notebook.
# NOTE(review): these IDs must be keys of `donor_profiles`; presumably they were
# picked from the 10,000-row test sample, so confirm before changing test_mode.
mydonor1 = "6d5b22d39e68c656071a842732c63a0c"
mydonor2 = "0016b23800f7ea46424b3254f016007a"


def top_profile_tokens(donor_id, topn=10):
    """Return the `topn` highest-weighted TF-IDF tokens of a donor's profile.

    The result is a DataFrame with columns 'token' and 'relevance', sorted by
    decreasing relevance. The sort is stable, so tokens with equal weight keep
    the order they have in `tfidf_feature_names`.

    Raises KeyError when `donor_id` has no entry in `donor_profiles`.
    """
    weights = donor_profiles[donor_id].flatten().tolist()
    ranked = sorted(zip(tfidf_feature_names, weights), key=lambda x: -x[1])[:topn]
    return pd.DataFrame(ranked, columns=['token', 'relevance'])


mydonor1_profile = top_profile_tokens(mydonor1)
mydonor2_profile = top_profile_tokens(mydonor2)

In [23]:
# Ten highest-weighted TF-IDF tokens in donor 1's profile, strongest first.
mydonor1_profile

Unnamed: 0,token,relevance
0,music,0.450057
1,auditorium,0.355256
2,cart,0.272809
3,chair,0.223861
4,equipment,0.211338
5,musicians,0.179244
6,time,0.172908
7,moving,0.137749
8,ohms,0.134065
9,prepare,0.131274


In [24]:
# Ten highest-weighted TF-IDF tokens in donor 2's profile, strongest first.
mydonor2_profile

Unnamed: 0,token,relevance
0,pollinators,0.670222
1,plants,0.305398
2,module,0.223407
3,pollination,0.21187
4,seeds,0.180609
5,writing,0.166816
6,books,0.137455
7,reading,0.115003
8,weaved,0.111704
9,bees,0.101842


### Content Based Recommender

In [33]:
class ContentBasedRecommender:
    """Recommend projects whose TF-IDF text is closest to a donor's profile.

    Similarity is the cosine between the donor's entry in the module-level
    `donor_profiles` dict and every row of the module-level `tfidf_matrix`;
    row i of that matrix is identified by `project_ids[i]`.
    """

    MODEL_NAME = 'content-Based'

    # Smallest candidate pool fetched before filtering (the original fixed size).
    _MIN_CANDIDATES = 1000

    def __init__(self, projects_df=None):
        """Store the project metadata used to decorate recommendations.

        projects_df : optional DataFrame with 'Project ID', 'Project Title' and
            'Project Essay' columns. When None, recommendations carry only the
            project ID and the similarity score.
        """
        self.project_ids = project_ids
        self.projects_df = projects_df

    def get_model_name(self):
        """Return the display name of this model."""
        return self.MODEL_NAME

    def _get_similar_projects_donor_profile(self, donor_id, topn=1000):
        """Return up to `topn` (project_id, similarity) pairs, most similar first.

        Raises KeyError when `donor_id` has no entry in `donor_profiles`.
        """
        cosine_similarities = cosine_similarity(donor_profiles[donor_id], tfidf_matrix)
        # argsort is ascending, so the last `topn` positions hold the best matches.
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        similar_projects = sorted(
            [(project_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1],
        )
        return similar_projects

    def recommend_projects(self, donor_id, projects_to_ignore=None, topn=10, verbose=False):
        """Return the `topn` most similar projects for `donor_id` as a DataFrame.

        donor_id : key into `donor_profiles`.
        projects_to_ignore : optional iterable of project IDs to exclude
            (e.g. projects the donor already funded). None means exclude nothing.
        topn : maximum number of rows returned.
        verbose : accepted for interface compatibility; currently unused.

        Columns are 'recStrength' and 'Project ID', plus 'Project Title' and
        'Project Essay' when the recommender was built with a projects_df.
        """
        # None instead of a mutable [] default; a set gives O(1) membership tests.
        ignored = set(projects_to_ignore) if projects_to_ignore is not None else set()

        # Fetch enough candidates that `topn` rows can survive the filter even
        # when every ignored project ranks at the top.
        candidates_needed = max(self._MIN_CANDIDATES, topn + len(ignored))
        similar_projects = self._get_similar_projects_donor_profile(donor_id, topn=candidates_needed)

        similar_projects_filtered = [pair for pair in similar_projects if pair[0] not in ignored]
        recommendations_df = pd.DataFrame(
            similar_projects_filtered, columns=['Project ID', 'recStrength']
        ).head(topn)

        if self.projects_df is None:
            # No metadata to join: return the bare ranking instead of failing in merge().
            return recommendations_df[['recStrength', 'Project ID']]

        recommendations_df = recommendations_df.merge(
            self.projects_df, how='left', on='Project ID'
        )[['recStrength', 'Project ID', 'Project Title', 'Project Essay']]

        return recommendations_df

In [34]:
# Build the recommender with the project metadata used to decorate its results.
cbr_model = ContentBasedRecommender(projects)
# Ten (default topn) projects most similar to donor 1's profile. No projects are
# excluded, so projects the donor already funded can appear in the list.
cbr_model.recommend_projects(mydonor1)

Unnamed: 0,recStrength,Project ID,Project Title,Project Essay
0,1.0,000009891526c0ade7180f8423792063,ohms musician chair cart,the music students in our classes perform freq...
1,0.390997,00774002faa0df63829b6fc0847ee30b,support our new music program!,i have spent 12 years as an educator rebuildin...
2,0.338676,0038114e50521b17b92f66c7a3c6e677,"shake, rattle and drumroll","""music is what feelings sound like."" -g. cates..."
3,0.331034,004565b1b2a4204ce409f903a0977afc,composing in the music class,true music is created not by the teacher but b...
4,0.324355,000a314ed829aff456c23859ec1fc71a,first grade students love learning about music!,every morning my first grade students come to ...
5,0.322923,0045aff5baeed21cd0b8d2a2adc728b1,let's project music!,"in today's fast paced environment, students ne..."
6,0.31591,0020634b8bedd0c6693a292f7372c69b,"music: a language for the mind, body, and soul","""music is a moral law. it gives soul to the u..."
7,0.314845,002dd99ebd3334bd6799fd1e75ccdc17,inspiring future musicians!,i walk in the door so excited to get the stude...
8,0.310103,00822befbd6cf4f069d20d462c06675a,"1, 2, 3...play with me!!",some students have never put their hands on a ...
9,0.297516,0066cfb9ded063e2078cf3973e2fa6aa,music for oyler,"my students do not have money, but they do hav..."


In [35]:
# Ten (default topn) projects most similar to donor 2's profile; nothing is excluded.
cbr_model.recommend_projects(mydonor2)

Unnamed: 0,recStrength,Project ID,Project Title,Project Essay
0,1.0,004c7c5e1a8cbce0ee63d14574096aeb,power partnerships: plants and pollinators!,"my students are creative, curious, and excited..."
1,0.211962,0016309bd7290ade640f436ad894dab2,let's plant and learn,our school is a title 1 school. 100% of stude...
2,0.189111,004986d49a0b6a0f1b6bbe2e5f42b485,what time is it? it's time to plant,my students are active and eager learners who ...
3,0.188095,004b8c9575a1d1a37df067d8dc016df0,"don't plant it, clone it! the cloning of an a...",being a small rural school we do a lot of trad...
4,0.17352,00022a0f4f0062d861b26fcd96abc68c,pollinating their minds! stem in action,"""science is a way of life...science is the pro..."
5,0.159015,00236e176405ce085a6f7200e148dd7e,help us learn about life science,my second grade students love to come to schoo...
6,0.158071,006ad0535b78bb00ffee54200e747fa5,intriguing reading for intelligent writing,i teach 28 fourth graders in a neighborhood sc...
7,0.150389,006b49a52fdba1ef30d71c075ce0f203,bookworms rule the world,in my classroom we are working hard to become ...
8,0.144724,0062b388efbc3b5e23dcdf6faf6344ef,"read, read to learn!","as a teacher in a diverse, low-income, high-po..."
9,0.139937,0012f7359b9705f46355a1c2b8ecbc1d,leveled books to help us read!,"have you ever been told you need to read, but ..."


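#### Based on these recommendations, donor #1 is interested in music-related projects and donor #2 in plant-related projects. The top hit for each donor (recStrength 1.0) appears to be a project the donor already funded, because no projects were excluded via `projects_to_ignore`.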