In [1]:
# Widen width of notebook
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

import pandas as pd
import numpy as np
#import imblearn

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy 

from sklearn.metrics.pairwise import cosine_similarity
import random

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', 20)

import warnings
warnings.filterwarnings('ignore')

# data loading

In [2]:
df_full = pd.read_csv('df_19.csv')
del df_full['Unnamed: 0']
df_full.shape

(215004, 44)

In [3]:
df_full.head(1).append(df_full.tail(1))

Unnamed: 0,LOAN_ID,LOAN_NAME,ORIGINAL_LANGUAGE,DESCRIPTION,DESCRIPTION_TRANSLATED,FUNDED_AMOUNT,LOAN_AMOUNT,STATUS,IMAGE_ID,VIDEO_ID,ACTIVITY_NAME,SECTOR_NAME,LOAN_USE,COUNTRY_CODE,COUNTRY_NAME,TOWN_NAME,CURRENCY_POLICY,CURRENCY_EXCHANGE_COVERAGE_RATE,CURRENCY,PARTNER_ID,POSTED_TIME,PLANNED_EXPIRATION_TIME,DISBURSE_TIME,RAISED_TIME,LENDER_TERM,NUM_LENDERS_TOTAL,NUM_JOURNAL_ENTRIES,NUM_BULK_ENTRIES,TAGS,BORROWER_NAMES,BORROWER_GENDERS,BORROWER_PICTURED,REPAYMENT_INTERVAL,DISTRIBUTION_MODEL,funding_rate,funding_status,LENDERS,number_lender,len_des,len_loan_use,nb_tag,post_dur,raise_dur,year_disburse
0,1873913,Celesti,English,Celesti is a married woman with four children....,Celesti is a married woman with four children....,500.0,500.0,funded,3305762.0,,Food,Food,to buy ingredients for her food production bus...,PH,Philippines,"Guindulman, Bohol",standard,,PHP,145.0,2019-11-12 03:20:43+00:00,2020-01-31 18:10:25+00:00,2019-10-17 07:00:00+00:00,2019-12-18 06:30:52+00:00,8.0,16,1,1,,Celesti,female,True,monthly,field_partner,1.0,1.0,"dave6087, JY1024, antoine8599, olivier1537, an...",11,456.0,51.0,1,80.617847,36.132049,2019.0
215003,1856954,Mercy,English,Mercy is a 46-year-old maize farmer in Kiirua/...,Mercy is a 46-year-old maize farmer in Kiirua/...,125.0,125.0,funded,3272678.0,,Farming,Agriculture,to access premium seeds and high-quality ferti...,KE,Kenya,"Kiirua/Naari, Meru",shared,0.1,KES,596.0,2019-10-15 14:15:07+00:00,2019-11-14 22:50:10+00:00,2019-10-04 07:00:00+00:00,2019-10-15 23:30:31+00:00,7.0,2,1,1,volunteer_pick,Mercy,female,True,bullet,field_partner,1.0,1.0,"mark98525949, jen2255",2,475.0,161.0,1,30.357674,0.385694,2019.0


# strength analysis

In [4]:
df = df_full[['LOAN_ID','LENDERS']]

In [5]:
df.head(5)

Unnamed: 0,LOAN_ID,LENDERS
0,1873913,"dave6087, JY1024, antoine8599, olivier1537, an..."
1,1761599,"ashish5575, donandkathy6425, danielrothman"
2,1874026,"martha6847, davidandsusan9466, anonymous5138, ..."
3,1912468,"ann6727, patrick5166, jane2464, rabab9981, kat..."
4,1912242,"benjamin5505, ah5200, vnev5260, ral2153"


In [6]:
from itertools import chain

# return list from series of comma-separated strings
def chainer(s):
    return list(chain.from_iterable(s.str.split(',')))

# calculate lengths of splits
lens = df['LENDERS'].str.split(',').map(len)

# create new dataframe, repeating or chaining as appropriate
res = pd.DataFrame({'LOAN_ID': np.repeat(df['LOAN_ID'], lens),
                    'LENDERS': chainer(df['LENDERS'])})
res['count']=1
res.head(5)

Unnamed: 0,LOAN_ID,LENDERS,count
0,1873913,dave6087,1
0,1873913,JY1024,1
0,1873913,antoine8599,1
0,1873913,olivier1537,1
0,1873913,anonymous5138,1


In [7]:
# check for repeat donation
strength_df = res.groupby(['LENDERS', 'LOAN_ID'])['count'].sum().reset_index()
distribution_repeat_donation = pd.DataFrame(strength_df['count'].value_counts(normalize=True))

In [8]:
distribution_repeat_donation_1 = distribution_repeat_donation.iloc[0]
distribution_repeat_donation_other = distribution_repeat_donation.iloc[1:].sum()
repeat_donation = pd.concat([distribution_repeat_donation_1, distribution_repeat_donation_other])
repeat_donation = pd.DataFrame(repeat_donation).reset_index()
repeat_donation.iloc[0,0] = 'donate once per loan'
repeat_donation.iloc[1,0] = 'donate more than once per loan'
repeat_donation = repeat_donation.set_index('index')
repeat_donation

Unnamed: 0_level_0,0
index,Unnamed: 1_level_1
donate once per loan,0.992085
donate more than once per loan,0.007915


> 99.2% of lenders only donate once to one loan >> i.e. repeat donation is rare

In [9]:
freq = res[['LENDERS','count']].groupby(['LENDERS'])['count'].sum().reset_index()
freq.head(2)

Unnamed: 0,LENDERS,count
0,013023,1
1,0326lsw,29


In [10]:
len(df[df['LENDERS'].str.contains('0326lsw')])

29

In [11]:
freq['count'].value_counts(normalize=True)

1        0.438970
2        0.177200
3        0.088505
4        0.057692
5        0.038444
6        0.028779
7        0.021776
8        0.016923
9        0.013447
10       0.011602
11       0.009441
12       0.008170
13       0.006935
14       0.005923
15       0.005232
16       0.004564
17       0.004058
18       0.003632
19       0.003245
20       0.002932
21       0.002715
22       0.002423
23       0.002123
24       0.002078
25       0.001860
26       0.001704
27       0.001643
28       0.001469
29       0.001402
30       0.001313
32       0.001232
31       0.001180
33       0.001041
35       0.001028
34       0.000972
37       0.000887
36       0.000861
39       0.000800
40       0.000717
38       0.000693
41       0.000650
42       0.000637
43       0.000619
44       0.000604
46       0.000578
45       0.000563
47       0.000550
50       0.000543
49       0.000524
48       0.000515
52       0.000456
51       0.000396
53       0.000378
56       0.000376
55       0.000372
54       0

> for 43.9% lender who only donate once, there may not be enough information for content based recommendation
<br> hence, predictions (filtering) about the preference of loans that interest these donors by what other similar user donate

In [12]:
freq_count = pd.DataFrame(freq['count'].value_counts()).reset_index()
# bin the values into 12 separate bins
bins = [0, 1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, np.inf]
freq_count['bin'] = pd.cut(freq_count['index'], bins, 
                          labels=['1','2-5','6-10','11-15','16-20','21-25','26-30','31-35','36-40','41-45','46-50','>50'])
freq_count.head(2).append(freq_count.tail(2))

Unnamed: 0,index,count,bin
0,1,201969,1
1,2,81529,2-5
862,2641,1,>50
863,373,1,>50


In [13]:
freq_bin=freq_count[['count','bin']].groupby('bin').sum()
freq_bin.to_csv('freq_count.csv')

In [14]:
print('# of projects: %d' % strength_df.LOAN_ID.nunique())
print('# of unique user/project donations: %d' % strength_df.LENDERS.nunique())

# of projects: 215004
# of unique user/project donations: 460097


# Collaborative Filtering Model

 - Based on EDA - 99.2% lender only donate once per loan
 <br> the utility matrix built between loan and lender is mainly binary with most non-zero value being 1, providing little meaningful information gain, and it is also very sparse
 - Hence, not suitable to use collaborative filtering model in this case

# infreq doner - similar doner matching

In [15]:
int_grp = pd.read_csv (r'C:\Users\DHE00\Dropbox\My PC (LAPTOP-OGR1U1RQ)\Desktop\MSBA\Sem2\BT5153 Applied Machine Learning for Business Analytics\Group Project\kiva_ds_csv\19lender_interestgrp_all.csv')
lenders = pd.read_csv (r'C:\Users\DHE00\Dropbox\My PC (LAPTOP-OGR1U1RQ)\Desktop\MSBA\Sem2\BT5153 Applied Machine Learning for Business Analytics\Group Project\kiva_ds_csv\lenders_with_occupation.csv')
del lenders['Unnamed: 0']

In [16]:
lenders.tail(2)

Unnamed: 0,permanent_name,display_name,main_pic_id,city,state,country_code,member_since,personal_url,occupation,loan_because,other_info,loan_purchase_num,invited_by,num_invited,years_joined,occupation_cluster
1539524,jessica73733054,Jessica,,,,,2017-12-03 17:57:19,,,,,1,,0,11.929787,
1539525,matina8349,Matina,,,,,2017-12-03 18:21:59,,,,,19,,0,11.929834,


In [17]:
int_grp = int_grp.rename(columns={"ID": "permanent_name"})
del int_grp['link']
int_grp.head(2)

Unnamed: 0,permanent_name,team
0,john79603208,['BR']
1,wdon3007,no_team


In [18]:
lender = pd.merge(lenders, int_grp, on=['permanent_name'])
lender.head(2)

Unnamed: 0,permanent_name,display_name,main_pic_id,city,state,country_code,member_since,personal_url,occupation,loan_because,other_info,loan_purchase_num,invited_by,num_invited,years_joined,occupation_cluster,team
0,john3382,John,1665284.0,La Quinta,CA,US,2009-01-19 19:08:55,www.anderholtwhittaker.com,attorney,,,15,,2,3.053211,98.0,no_team
1,brandon4842,Brandon,259032.0,Toronto,Ontario,CA,2009-01-20 16:31:41,,writer,There are good people who need help.,"I'm a storyteller, hoping to share my stories ...",13,Karen,0,3.055652,30.0,no_team


In [19]:
lender.isnull().sum()

permanent_name             0
display_name             316
main_pic_id           229441
city                  224548
state                 238956
country_code          189285
member_since               0
personal_url          368321
occupation            279221
loan_because          323986
other_info            353735
loan_purchase_num          0
invited_by            298494
num_invited                0
years_joined               0
occupation_cluster    279221
team                       0
dtype: int64

> interest group & loan_because are strong indicator of lender's preference
<br> lender with similar interest group & loan_because may be interested in similar loan

In [20]:
# replace personal URL with 1,0 (1 for those with URL, 0 for those without)
lender['personal_url'] = lender['personal_url'].notnull().astype('int')

# rename frequency dataframe that contain the donation count of each doner, merge with lender info for lender with team
freq = freq.rename(columns={"LENDERS": "permanent_name"})
lender_wteam = lender[lender['team'] != 'no_team']
lender_wteam = lender_wteam.merge(freq, on='permanent_name', how='left')
lender_wteam = lender_wteam[lender_wteam['count'].notna()] # drop those didn't donate in 2019
lender_wteam.head(2)

Unnamed: 0,permanent_name,display_name,main_pic_id,city,state,country_code,member_since,personal_url,occupation,loan_because,other_info,loan_purchase_num,invited_by,num_invited,years_joined,occupation_cluster,team,count
13,annekathrinundaxel6117,Annekathrin and Axel,266741.0,Brussels,,BE,2009-01-19 22:30:40,0,research,We travel a lot and have been impressed by the...,We work in the area of European environmental ...,864,,3,3.053595,43.0,"['Kiva Team Germany', 'Team Europe', 'Berlin',...",1.0
24,teddy7662,Teddy,2330634.0,,,US,2016-10-07 19:21:55,0,,,,12,,0,10.773784,,['PayPal Customers'],1.0


In [21]:
# lender with donation in 2019
lender_wteam.permanent_name.nunique()

21366

In [22]:
lender_wteam.describe()

Unnamed: 0,main_pic_id,personal_url,loan_purchase_num,num_invited,years_joined,occupation_cluster,count
count,13929.0,21366.0,21366.0,21366.0,21366.0,8883.0,21366.0
mean,1431487.0,0.120144,392.416222,2.717823,6.304134,42.784983,6.654357
std,966329.6,0.325138,1908.838427,170.621167,3.627404,26.671624,112.170315
min,1565.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,667528.0,0.0,34.0,0.0,3.25673,24.0,1.0
50%,1228121.0,0.0,100.0,0.0,5.752076,37.0,1.0
75%,2211158.0,0.0,279.0,1.0,9.076603,64.0,3.0
max,4112534.0,1.0,151734.0,24921.0,14.206017,99.0,10691.0


In [23]:
# delete columns without much information gain
del lender_wteam['display_name']
del lender_wteam['occupation']
del lender_wteam['main_pic_id']
del lender_wteam['member_since']
del lender_wteam['city']
del lender_wteam['state']

In [24]:
lender_wteam = lender_wteam.fillna(0)

> filter at least one same interest group + similar reason (cos>0.2)

In [25]:
# define list of freq and infreq doner based on donation count
# donated more than 10 times are considered frequent doner
freq_wteam = lender_wteam[lender_wteam['count']>100].reset_index(drop=True)
infreq_wteam = lender_wteam[lender_wteam['count']<=100].reset_index(drop=True)
print('freq', len(freq_wteam), 'infreq', len(infreq_wteam))

freq 114 infreq 21252


In [26]:
# random select 1000 infreq doner profile as test sample
test1000 = infreq_wteam.sample(n = 1000)
test1000['similar_doner'] = 'no'
test1000['find_shared_loan'] = 0
test1000 = test1000.set_index('permanent_name')
test1000.head(2)

Unnamed: 0_level_0,country_code,personal_url,loan_because,other_info,loan_purchase_num,invited_by,num_invited,years_joined,occupation_cluster,team,count,similar_doner,find_shared_loan
permanent_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
greg2009,AU,0,0,0,111,0,1,6.663829,0.0,['Bendigo'],1.0,no,0
trond2460,NO,0,0,0,79,0,0,11.365866,0.0,"['Norway', 'Nerdfighters']",4.0,no,0


In [27]:
for a in range(1000):
    target_infreq = test1000.index[a]
    
    # generate list of the team the infreq doner belong to
    c= infreq_wteam[infreq_wteam['permanent_name'] == target_infreq]['team'].tolist()[0][1:-1]
    c = c.split(', ')

    # filter list of freq doner who are at least join one common team as the infreq doner
    same1 = freq_wteam[freq_wteam['team'].str.contains('|'.join(map(re.escape, c)))]['permanent_name'].tolist()
    
    # identify freq doner that share similar loan reason as the infreq doner by using cosine similarity.
    similar_reason=[]
    sw = stopwords.words('english') # sw contains the list of stopwords

    X = infreq_wteam[infreq_wteam['permanent_name'] == target_infreq]['loan_because'].to_string().lower()
    X_list = word_tokenize(X) # tokenization
    l1 =[]
    X_set = {w for w in X_list if not w in sw} # remove stop words from the string

    for j in same1:
        Y = freq_wteam[freq_wteam['permanent_name'] == j]['loan_because'].to_string().lower()
        Y_list = word_tokenize(Y) # tokenization
        l2 =[]
        Y_set = {w for w in Y_list if not w in sw}  # remove stop words from the string

        # form a set containing keywords of both strings 
        rvector = X_set.union(Y_set) 
        for w in rvector:
            if w in X_set: l1.append(1) # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0

        # cosine formula 
        for i in range(len(rvector)):
                c+= l1[i]*l2[i]
        cosine = c / float((sum(l1)*sum(l2))**0.5)

        if cosine >= 0.2:
            similar_reason.append(j)
    
    if similar_reason == []:
        similar_reason = same1 
    
    # test result, count the number of shared LOANID
    count = 0
    for k in similar_reason:
        trial = df[(df['LENDERS'].str.contains(k)) & (df['LENDERS'].str.contains(target_infreq))]
        if trial.empty:
            continue
        else:
            count +=1

    test1000['find_shared_loan'][target_infreq]=count
    test1000['similar_doner'][target_infreq]= similar_reason
            

In [30]:
print('number of infrequent doners who have donated in the same loan as the similar frequent doner identified ', len(test1000[test1000['find_shared_loan'] != 0]))

number of infrequent doners who have donated in the same loan as the similar frequent doner identified  370


In [31]:
test1000['find_shared_loan'].describe()

count    1000.00000
mean        0.95400
std         1.83337
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max        18.00000
Name: find_shared_loan, dtype: float64

In [32]:
test1000['find_shared_loan'].value_counts(normalize=True)

0     0.630
1     0.153
2     0.085
3     0.048
4     0.031
5     0.022
7     0.010
6     0.009
9     0.003
8     0.003
13    0.002
11    0.002
18    0.001
12    0.001
Name: find_shared_loan, dtype: float64

## Samples for illustration

In [33]:
# create sample dataframe for some infrequent doner to illustrate the model output
sample = test1000[['similar_doner']]
sample['shared_loan'] = 0

# 5 infrequent lenders were picked
infreqlst = ['hassan76586349', 'bettina4357','jenny5213', 'reg9953','jeremy79228168']
sample = sample[sample.index.isin(infreqlst)]
sample

Unnamed: 0_level_0,similar_doner,shared_loan
permanent_name,Unnamed: 1_level_1,Unnamed: 2_level_1
jenny5213,"['ryana9900', 'sharon9045']",0
jeremy79228168,"['am8748', 'areef7365', 'joaquin9318', 'mike00...",0
bettina4357,"['heg', 'chipinforchange']",0
hassan76586349,['tim4327'],0
reg9953,"['shirley1905', 'mark2704']",0


In [34]:
# identify the common loan shared between the infrequent lender and the similar doner identified
for i in infreqlst:
    similar = sample.loc[i]['similar_doner'][1:-1].split(',')[0][1:-1]
    common_loan_lst = df[(df['LENDERS'].str.contains(i)) & (df['LENDERS'].str.contains(similar))]['LOAN_ID'].tolist()
    if not common_loan_lst:
        similar = sample.loc[i]['similar_doner'][1:-1].split(',')[1][2:-1]
        common_loan_lst = df[(df['LENDERS'].str.contains(i)) & (df['LENDERS'].str.contains(similar))]['LOAN_ID'].tolist()
    common_loan = ' '.join([str(elem) for elem in common_loan_lst])
    sample['shared_loan'].loc[i] = common_loan

sample

Unnamed: 0_level_0,similar_doner,shared_loan
permanent_name,Unnamed: 1_level_1,Unnamed: 2_level_1
jenny5213,"['ryana9900', 'sharon9045']",1772382
jeremy79228168,"['am8748', 'areef7365', 'joaquin9318', 'mike00...",1779590 1864910
bettina4357,"['heg', 'chipinforchange']",1842288 1819217
hassan76586349,['tim4327'],1659603 1808367 1864467
reg9953,"['shirley1905', 'mark2704']",1816366 1830422 1809381 1769462


For example
- the similar frequent doner identified for the infrequent doner 'reg9953' are 'shirley1905' and 'mark2704'.
- there are 4 loans in common which have been contributed by both parties
- the common loan IDs are 1816366 1830422 1809381 1769462