In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import matplotlib
import matplotlib.pyplot as plt


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
# Load the Kiva loans table from the Kaggle input directory into a DataFrame.
loans = pd.read_csv("/kaggle/input/data-science-for-good-kiva-crowdfunding/kiva_loans.csv")

In [None]:
# Candidate explanatory columns. The list is not read in this cell; a later cell
# reassigns `variables` before it is first used.
variables = ["loan_amount", "sector", "country_code", "term_in_months", "borrower_genders", "repayment_interval"]
# Preview the first rows of the raw table.
loans.head()

In [None]:
## create a "funded" column

# A loan counts as funded once the amount raised covers the amount requested.
# The comparison is done directly (rather than through a 0/0 -> NaN -> bool
# round-trip), so over-funded loans are also flagged as funded and rows with a
# missing amount are flagged as not funded.
loans["funded"] = loans["funded_amount"] >= loans["loan_amount"]

# Preview loans that fell short of the requested amount.
loans[loans["funded_amount"] < loans["loan_amount"]].head()


## how many loans get funded?

In [None]:
# Count funded loans and report them as a share of all loans.
n_tot = len(loans)
n_funded = int((loans["funded"] == True).sum())
funded_pct = round(n_funded / n_tot * 100)

print(f"{n_funded} out of {n_tot} loans got funded ({funded_pct} %)")


## transform gender variable to has_female, has_male

In [None]:
## get male/female
# Cast to str first: missing values become the literal "nan", which contains
# neither gender token, so both flags are False for them.
loans["borrower_genders"] = loans["borrower_genders"].astype("str")

# At least one female borrower in the (comma-separated) borrower list.
loans["has_female"] = loans["borrower_genders"].str.contains("female", regex=False)

# At least one male borrower. "female" contains "male" as a substring, so match
# the whole word. String-length arithmetic is not reliable here: four males
# ("male, male, male, male", length 22) have the same length modulo 8 as an
# all-female list and would be reported as having no male borrower.
loans["has_male"] = loans["borrower_genders"].str.contains(r"\bmale\b", regex=True)

## what is the gender distribution for funded/not funded?


## We want to investigate:
- Can we predict the probability of a loan getting funded?
- Which variables are more important for getting funded?

# 1. Variable analysis

We want to answer:
- Which variables do we have, and is each one numerical or categorical?
- What are their distributions?

## Variables
We have 6 variables: 
- "loan_amount" (numerical)
- "sector" (categorical) 
- "country_code" (categorical)
- "term_in_months" (numerical)
- "borrower_genders" (categorical)
- "repayment_interval" (categorical)


In [None]:
# Columns kept for exploration and modelling; "funded" is the target.
variables = [
    "loan_amount",
    "sector",
    "term_in_months",
    "has_male",
    "has_female",
    "repayment_interval",
    "funded",
]

# Narrow the raw table down to those columns and preview the result.
loans_explore = loans.loc[:, variables]
loans_explore.head()


In [None]:
loans_explore["loan_amount"].describe()

In [None]:
loans_explore["sector"].describe()

In [None]:
# loans_explore["country_code"].hist()

In [None]:
loans_explore["repayment_interval"].describe()

In [None]:
loans_explore["term_in_months"].describe() 

In [None]:
loans_explore["loan_amount"].describe()

In [None]:
# Report the table's dimensions and its total number of cells (rows x columns).
n_rows, n_cols = loans_explore.shape
print((n_rows, n_cols))
print(f"{n_rows * n_cols} datapoints")

# Data preparation

## Variables
We have 6 variables: 
- "loan_amount" (numerical)
- "sector" (categorical) -> dummy
- "country_code" (categorical) --> dummy
- "term_in_months" (numerical)
- "borrower_genders" (categorical) --> dummy variables: "has_female", "has_male"
- "repayment_interval" (categorical) --> dummy

In [None]:
l = loans_explore.copy()

In [None]:
## create dummy variables
# Each categorical column is replaced by its one-hot columns, appended at the end
# of the frame and prefixed with the original column name.
# country_code is left out (kept for reference):
# pd.concat([l.drop("country_code", axis = 1), pd.get_dummies(l["country_code"], prefix = "country")], axis=1, sort=False)
for categorical in ("sector", "repayment_interval"):
    one_hot = pd.get_dummies(l[categorical], prefix=categorical)
    l = pd.concat([l.drop(columns=categorical), one_hot], axis=1, sort=False)

In [None]:
# convert bools to 1/0
# astype(int) performs the conversion; pd.to_numeric returns a bool Series
# unchanged because bool already counts as a numeric dtype.
for flag in ("has_male", "has_female", "funded"):
    l[flag] = l[flag].astype(int)

In [None]:
# normalize loan amount, term in month
# Each column is divided by its own maximum (loan_amount: max is 100k,
# term_in_months: max is 158); the raw columns are then dropped.
for raw_col in ("loan_amount", "term_in_months"):
    l[raw_col + "_norm"] = l[raw_col] / l[raw_col].max()

l = l.drop(columns=["loan_amount", "term_in_months"])
l.head()

## inflate data labelled as false

In [None]:
# Rows of `l` whose loan was not funded (the minority-class rows).
l_false = l[l["funded"] == False]

In [None]:
# Oversample the unfunded (False) class so both classes end up roughly equal in size.
# NOTE(review): rows are duplicated before any train/test split; every copy keeps
# the index label of the row it was copied from.
n_false_labels = l_false.shape[0]
n_true_labels = l.shape[0] - n_false_labels

# Number of extra copies of each unfunded row. Clamp to 0 when there is nothing to
# copy or the classes are already balanced, instead of failing with
# ZeroDivisionError or "No objects to concatenate".
factor = max(round(n_true_labels / n_false_labels) - 1, 0) if n_false_labels else 0
print("inflate all false labels with a factor of {}".format(factor))

if factor > 0:
    false_inflated = pd.concat([l_false] * factor)
    l = pd.concat([l, false_inflated])
print("now we have {} data points".format(l.shape[0]))

# Check correlations among variables


In [None]:
# Funding rate among loans with at least one female borrower.
# NOTE(review): `l` may already contain the duplicated unfunded rows at this point,
# so these shares would describe the rebalanced data, not the raw loans - confirm.
l_fem = l[l["has_female"] == True]
fem_tot = l_fem.shape[0]
fem_funded = l_fem[l_fem["funded"] == True].shape[0]
fem_pct = round(100 * fem_funded / fem_tot)

print(f"{fem_funded}/{fem_tot} females funded ({fem_pct} %)")

# Same figure for loans with at least one male borrower.
l_m = l[l["has_male"] == True]
m_tot = l_m.shape[0]
m_funded = l_m[l_m["funded"] == True].shape[0]
m_pct = round(100 * m_funded / m_tot)

print(f"{m_funded}/{m_tot} males funded ({m_pct} %)")


In [None]:
cm = l.corr()

In [None]:
# Heat-map of the correlation matrix `cm`, one labelled tick per column.
# Labels are taken from `cm` itself so they always match the plotted cells.
tick_labels = list(cm.columns)
tick_positions = np.arange(len(tick_labels))

fig_cm = plt.figure(figsize = (15, 15))
ax = fig_cm.add_subplot(111)
cax = ax.matshow(cm, interpolation='nearest', cmap = 'Blues')
fig_cm.colorbar(cax)

# Ticks are placed explicitly at 0..n-1, so the labels map one-to-one onto them
# (no leading empty label is needed).
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(tick_labels, rotation='vertical')
ax.set_yticklabels(tick_labels)

plt.show()

### learnings from the above plot:

- high correlation between having a female borrower and getting funded
- negative correlation between having a male borrower and getting funded
- high correlation between irregular payments and funded (although there is also a high correlation between female and irregular payment, so this might explain it)
- loan term and loan amount are both negatively correlated with funded ==> shorter and smaller loans get funded more easily (it's called microfinance for a reason I guess)


# Training models

In [None]:
## let's import scikit learn
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

## Split the data into train and test

In [None]:
# Option A - restrict the model to a hand-picked subset of features:
# variables_used = ["term_in_months_norm", "loan_amount_norm", "has_male", "has_female", "repayment_interval_bullet", "repayment_interval_irregular", "repayment_interval_monthly", "repayment_interval_weekly"]

# Option B (active) - use every column except the target.
variables_used = l.columns.drop("funded")

In [None]:
# Feature matrix and target vector.
x = l[variables_used]
y = l["funded"]

# Oversampled rows carry the same index label as the row they were copied from.
# Splitting on the unique labels (instead of on rows) keeps a loan and all of its
# copies on the same side of the split, so the test set never contains exact
# duplicates of training rows.
train_labels, test_labels = train_test_split(
    l.index.unique().to_numpy(), test_size=0.2, random_state=42
)
in_test = l.index.isin(test_labels)

x_train, x_test = x[~in_test], x[in_test]
y_train, y_test = y[~in_test], y[in_test]

In [None]:
x_train.head()

In [None]:
y_train.head()

# 0. check current true/false ratio

In [None]:
# Class balance of the target: number (and share) of funded vs. unfunded rows.
n = y.shape[0]
n_true = int((y == True).sum())
n_false = int((y == False).sum())
true_pct = round(100 * n_true / n)

print(f"{n_true} ({true_pct} %) true, {n_false} false")

## 1. Train a Random Forest classifier
### 1.1 train on 80% of the data

In [None]:
# Random forest with 100 entropy-criterion trees; fixed seed for repeatable fits.
rf_model = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
rf_model.fit(x_train, y_train)

### 1.2. Validate by getting the accuracy and confusion matrix

In [None]:
# Accuracy of the random forest on the training split and on the held-out split.
train_score = rf_model.score(x_train, y_train)
print(f"train_score: {train_score}")

test_score = rf_model.score(x_test, y_test)
print(f"test_score: {test_score}")

In [None]:
# Predict on the held-out split and tabulate true vs. predicted labels.
y_pred = rf_model.predict(x_test)
# Note: this rebinds `cm`, which an earlier cell used for the correlation matrix.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Confusion matrix of the random forest on the held-out split, normalised per true class.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement - confirm the
# target scikit-learn version before upgrading.
plot_confusion_matrix(rf_model, x_test, y_test, normalize = 'true')

## 2. Train a logistic regression

In [None]:
# Logistic regression baseline; max_iter is set to 1000, presumably to give the
# solver enough iterations to converge on this data.
lr_model = LogisticRegression(max_iter = 1000)
lr_model.fit(x_train, y_train)

In [None]:
# Accuracy of the logistic regression on the training split and on the held-out split.
train_score = lr_model.score(x_train, y_train)
print(f"train_score: {train_score}")

test_score = lr_model.score(x_test, y_test)
print(f"test_score: {test_score}")

In [None]:
# Confusion matrix of the logistic regression on the held-out split, normalised per true class.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed
# in 1.2; ConfusionMatrixDisplay.from_estimator is the replacement - confirm the
# target scikit-learn version before upgrading.
plot_confusion_matrix(lr_model, x_test, y_test, normalize = 'true')

# 3. Analyze the results 

## 3.1 Get most important variables


In [None]:
# Feature importances reported by the fitted random forest, with matching column names.
importance = rf_model.feature_importances_
feature = x.columns

# Show the raw (unsorted) values first.
print(importance)
print(feature)

# Reorder both sequences by ascending importance and keep them as plain lists.
sort_order = np.argsort(importance)
importance = [importance[i] for i in sort_order]
feature = [feature[i] for i in sort_order]

In [None]:
%matplotlib inline
fig = plt.figure(figsize = (12, 12))

plt.barh(feature, importance, figure = fig)
plt.show()

## 3.2 See distributions

Since some features are more important than others, let's look at which values of those features are more common among funded loans than among unfunded ones.




In [None]:
## split loans_explore into funded and not funded
is_funded = loans_explore["funded"] == True
is_not_funded = loans_explore["funded"] == False

lexp_funded = loans_explore[is_funded]
lexp_nfunded = loans_explore[is_not_funded]


### 3.2.1 loan amount of funded

In [None]:

lexp_funded["loan_amount"].hist(bins = 100, figsize = (12, 12), range = [0, 10000])

In [None]:
lexp_funded["loan_amount"].describe()

### 3.2.2 loan amount of not funded

In [None]:
lexp_nfunded["loan_amount"].hist(bins = 100, figsize = (12, 12), range = [0, 10000])

In [None]:
lexp_nfunded["loan_amount"].describe()

### 3.2.3 term in months of funded

In [None]:
lexp_funded["term_in_months"].hist(bins = 48, figsize = (12, 12), range = [0, 48])

In [None]:
lexp_funded["term_in_months"].describe()

### 3.2.4 loan term in months, not funded

In [None]:
lexp_nfunded["term_in_months"].hist(bins = 48, figsize = (12, 12), range = [0, 48])

In [None]:
lexp_nfunded["term_in_months"].describe()

## Propositions

- Sub - Proposition 1: On average, loans consisting of longer loan periods are more likely to get funded through the Kiva Platform 

- Sub - Proposition 2: On average, loans consisting of larger amounts are more likely to get funded

- Sub - Proposition 3: On average, women take smaller loans than men 

- Sub - Proposition 4: On average, women taking loans are more likely to have longer loan periods

- Sub - Proposition 5: Sector-related proposition (TODO: still to be formulated)

In [None]:
# - Sub - Proposition 1: On average, loans consisting of longer loan periods are more likely to get funded through the Kiva Platform 

## 1. Loan term and loan amount are the most important variables

## 2. Correlation matrix of the prepared data (values noted below)
l.corr()

# Values recorded from a previous run:
# corr loan_amount <-> funded = -0.214468
# corr loan_term <-> funded = -0.276060
# NOTE(review): both recorded correlations are negative, which points against
# the proposition as worded (longer terms -> more likely funded) - confirm.



In [None]:
# - Sub - Proposition 2: On average, loans consisting of larger amounts are more likely to get funded

## TODO: check the loan-amount distributions for funded/not funded and run a statistical test on whether they differ





In [None]:
# - Sub - Proposition 3: On average, women take smaller loans than men 

## TODO: check the distributions of men's/women's loan size and run a statistical test on whether they differ




In [None]:
# - Sub - Proposition 4: On average, women taking loans are more likely to have longer loan periods

## TODO: check the distributions of men's/women's loan terms and run a statistical test on whether they differ




In [None]:
# Sectors  - corr small amounts
l.corr()["loan_amount_norm"].sort_values()

In [None]:
# Sectors to compare, in the order they are drawn and listed in the legend.
names = ['Personal Use', 'Agriculture', 'Housing', 'Retail', 'Clothing',
         'Entertainment', 'Services']
# One colour per sector, same order as `names`.
colors = ['pink', '#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00', 'purple']

# Loan amounts per sector, derived from `names` so data and labels stay aligned.
amounts_by_sector = [
    loans.loc[loans['sector'] == sector, 'loan_amount'].to_numpy()
    for sector in names
]

plt.figure(figsize=(20,10))

# One normalised histogram per sector, drawn side by side.
# density=True replaces the `normed` argument, which Matplotlib no longer accepts.
plt.hist(amounts_by_sector, density=True,
         color = colors, label=names, range=(0, 10000))

# Plot formatting
plt.legend()
plt.xlabel('Loan Amount (USD)')
plt.ylabel('Normalized frequency')
plt.title('Loan amount per Sector');

In [None]:
# Sectors - corr loan terms

l.corr()["term_in_months_norm"].sort_values()

In [None]:
# Sectors to compare, in the order they are drawn and listed in the legend.
# Every entry needs its trailing comma: adjacent string literals are silently
# concatenated, which would leave fewer labels than datasets.
names = [
    'Personal Use',
    'Retail',
    'Food',
    'Clothing',
    'Health',
    'Housing',
    'Education',
]
# One colour per sector, same order as `names`.
colors = ['pink', '#E69F00', '#56B4E9', '#F0E442', '#009E73', '#D55E00', 'purple']

# Loan terms per sector, derived from `names` so data and labels stay aligned.
terms_by_sector = [
    loans.loc[loans['sector'] == sector, 'term_in_months'].to_numpy()
    for sector in names
]

plt.figure(figsize=(20,10))

# One normalised histogram per sector, drawn side by side.
# density=True replaces the `normed` argument, which Matplotlib no longer accepts.
plt.hist(terms_by_sector, density=True, bins = 10,
         color = colors, label=names, range=(0, 50))

# Plot formatting
plt.legend()
plt.xlabel('Loan term (months)')
plt.ylabel('Normalized frequency')
plt.title('Loan term per Sector');

In [None]:
loans_explore.head()

In [None]:
# Loan amounts for loans with at least one male / at least one female borrower.
# Loans with borrowers of both genders appear in both samples.
male_amounts = loans_explore.loc[loans_explore['has_male'] == True, 'loan_amount'].to_numpy()
female_amounts = loans_explore.loc[loans_explore['has_female'] == True, 'loan_amount'].to_numpy()

plt.figure(figsize=(20,10))

# One colour and legend label per gender group, in plotting order.
colors = ['#F0E442', '#009E73']
names = ['Male', 'Female']

# Normalised histograms of both groups, drawn side by side.
# density=True replaces the `normed` argument, which Matplotlib no longer accepts.
plt.hist([male_amounts, female_amounts], density=True,
         color = colors, label=names, range=(0, 10000))

# Plot formatting
plt.legend()
plt.xlabel('Loan Amount (USD)')
plt.ylabel('Normalized frequency')
plt.title('Loan amount per Gender');

In [None]:
# Loan terms for loans with at least one male / at least one female borrower.
# Loans with borrowers of both genders appear in both samples.
male_terms = loans_explore.loc[loans_explore['has_male'] == True, 'term_in_months'].to_numpy()
female_terms = loans_explore.loc[loans_explore['has_female'] == True, 'term_in_months'].to_numpy()

plt.figure(figsize=(20,10))

# One colour and legend label per gender group, in plotting order.
colors = ['#F0E442', '#009E73']
names = ['Male', 'Female']

# Normalised histograms of both groups, drawn side by side.
# density=True replaces the `normed` argument, which Matplotlib no longer accepts.
plt.hist([male_terms, female_terms], density=True,
         color = colors, label=names, range=(0, 50))

# Plot formatting
plt.legend()
plt.xlabel('Loan term (Months)')
plt.ylabel('Normalized frequency')
plt.title('Loan Term per Gender');

In [None]:
from scipy import stats

# Two-sample Kolmogorov-Smirnov test: do loan terms differ between male and
# female borrowers?
#
# The two samples are passed to the test directly. Joining them on the row index
# would keep only loans that have both male and female borrowers and pair every
# term with itself, which makes the two columns identical and the test vacuous.
#
# Mixed-gender loans are excluded so that the two samples are disjoint, as the
# test assumes independent samples.
with_male = loans_explore['has_male'] == True
with_female = loans_explore['has_female'] == True

male_only_terms = loans_explore.loc[with_male & ~with_female, 'term_in_months']
female_only_terms = loans_explore.loc[with_female & ~with_male, 'term_in_months']

stats.ks_2samp(male_only_terms, female_only_terms)