# This file carves out recent Kiva loans from the raw dataset downloaded from the Kiva website.  It also undersamples the funded loans to make modelling feasible on an imbalanced dataset.
# Source: https://www.kiva.org/build/data-snapshots

In [1]:
import pandas as pd
import numpy as np


## loans.csv was downloaded from the Kiva site. Load it into a DataFrame to pare it down for the project

In [2]:
# Load the raw Kiva loans snapshot into a DataFrame
loans_csv_path = './Kiva/kiva_ds_csv/loans.csv'
kiva = pd.read_csv(loans_csv_path)

Variables of interest: 



In [34]:
# Add a string 'year' column built from the first four characters of
# POSTED_TIME, so the recent loans can be selected by year afterwards
posted_time = kiva['POSTED_TIME']
kiva['year'] = posted_time.str.slice(stop=4)

In [39]:
# Number of loans posted in each year, most frequent year first
loans_per_year = kiva['year'].value_counts()
loans_per_year

2018    229633
2017    225476
2019    219005
2016    197236
2015    181835
2014    174232
2013    140035
2012    133847
2011    113009
2020    110418
2010     92902
2009     81629
2008     50697
2007     26314
2006      3200
Name: year, dtype: int64

### Data runs from 2006 through 2020 (2020 appears to be a partial year).  Keep only the recent years 2015-2019, since older loans are less relevant for the model build.

In [57]:
# Keep only loans posted in 2015-2019; 'year' holds strings, so the
# comparison values are string labels as well
recent_years = ['2015', '2016', '2017', '2018', '2019']
posted_recently = kiva['year'].isin(recent_years)
kiva_recent = kiva[posted_recently]

In [59]:
# Confirm that only the selected years remain after filtering
recent_year_counts = kiva_recent['year'].value_counts()
recent_year_counts

2018    229633
2017    225476
2019    219005
2016    197236
2015    181835
Name: year, dtype: int64

In [60]:
# Distribution of loan outcomes (STATUS) in the recent sample
status_counts = kiva_recent['STATUS'].value_counts()
status_counts

funded      980589
expired      69156
refunded      3440
Name: STATUS, dtype: int64

## Dataset is imbalanced, with expired loans being ~7% of the recent data.  Undersampling the majority class to build a better model: keeping all expired rows and combining them with 350k sampled funded rows to create an integrated dataset (refunded loans are dropped).

In [61]:
#Getting 350k rows from funded dataset
kivafunded = kiva_recent[kiva_recent['STATUS'] == 'funded']

# Undersample the majority (funded) class.
# - random_state is fixed so re-running the notebook draws the same rows
#   instead of a different sample every time.
# - n is capped at the number of funded rows available, because sample()
#   raises when asked for more rows than exist (without replacement).
funded_sample_size = min(350_000, len(kivafunded))
kivaf_sample = kivafunded.sample(n=funded_sample_size, random_state=42)

In [62]:
# Keep every expired loan from the recent sample (no undersampling here)
is_expired = kiva_recent['STATUS'] == 'expired'
kiva_e = kiva_recent[is_expired]

In [63]:
#Merging and shuffling funded and expired
from sklearn.utils import shuffle

# Stack the sampled funded loans and all expired loans into one frame, then
# shuffle the rows so the two classes are interleaved rather than grouped.
# random_state is fixed so the resulting row order is the same on every run.
kivamix = pd.concat([kivaf_sample, kiva_e])
kivamix = shuffle(kivamix, random_state=42)

In [64]:
# Check the funded/expired mix in the combined dataset
mix_status_counts = kivamix['STATUS'].value_counts()
mix_status_counts


funded     350000
expired     69156
Name: STATUS, dtype: int64

In [65]:
# Encode the loan outcome as an integer label: funded -> 1, expired -> 0
status_codes = {'funded': 1, 'expired': 0}
kivamix['STATUS'] = kivamix['STATUS'].map(status_codes)

In [28]:
#Shuffling the data to randomize the mix of funded and expired loans
# NOTE: the merged frame is also shuffled in the cell that builds it, so this
# is a second pass over the same rows. random_state is fixed so the row order
# written out afterwards is identical on every run.
from sklearn.utils import shuffle

kivamix = shuffle(kivamix, random_state=42)

In [29]:
# Write the combined recent-loans dataset to disk, without the index column
output_csv_path = 'kivamix.csv'
kivamix.to_csv(output_csv_path, index=False)