In [1]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('kiva_loans_20181016.csv')
df.head()

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,country,currency,gender,borrower_genders,lender_count,term_in_months,repayment_interval,tags
0,653051,1/1/14,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,Pakistan,PKR,female,female,12,12,irregular,
1,653053,1/1/14,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,Pakistan,PKR,group,"female, female",14,11,irregular,
2,653068,1/1/14,Transportation,Transportation,To repair their old cycle-van and buy another ...,150,150,0,1,IN,India,INR,female,female,6,43,bullet,"user_favorite, user_favorite"
3,653063,1/1/14,Embroidery,Arts,to purchase an embroidery machine and a variet...,200,200,0,1,PK,Pakistan,PKR,female,female,8,11,irregular,
4,653084,1/1/14,Milk Sales,Food,to purchase one buffalo.,400,400,0,1,PK,Pakistan,PKR,female,female,16,14,monthly,


In [3]:
# Convert to datetime type
# df['date'] = pd.to_datetime(df['date'])

df['date'] = pd.to_datetime(df['date'])
df['newdate'] = [d.date() for d in df['date']]

In [4]:
# Convert to String and then to int

df['year'] = df['newdate'].apply(lambda date: str(date).split('-')[0]).astype(str).astype(int)
df['month'] = df['newdate'].apply(lambda date: str(date).split('-')[1]).astype(str).astype(int)
df['day_of_month'] = df['newdate'].apply(lambda date: str(date).split('-')[2]).astype(str).astype(int)
df['weekday'] = df['date'].dt.day_name()

In [5]:
df.head(2)

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,...,borrower_genders,lender_count,term_in_months,repayment_interval,tags,newdate,year,month,day_of_month,weekday
0,653051,2014-01-01,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,...,female,12,12,irregular,,2014-01-01,2014,1,1,Wednesday
1,653053,2014-01-01,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,...,"female, female",14,11,irregular,,2014-01-01,2014,1,1,Wednesday


In [6]:
days = {'Monday':'Mon','Tuesday':'Tue','Wednesday':'Wed','Thursday':'Thu','Friday':'Fri','Saturday':'Sat','Sunday':'Sun'}
df['weekday_short'] = df['weekday'].apply(lambda x: days[x])

In [7]:
df.head(2)

Unnamed: 0,id,date,activity,sector,use,funded_amount,loan_amount,diff_funded_loan,status,country_code,...,lender_count,term_in_months,repayment_interval,tags,newdate,year,month,day_of_month,weekday,weekday_short
0,653051,2014-01-01,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",300,300,0,1,PK,...,12,12,irregular,,2014-01-01,2014,1,1,Wednesday,Wed
1,653053,2014-01-01,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,575,575,0,1,PK,...,14,11,irregular,,2014-01-01,2014,1,1,Wednesday,Wed


In [8]:
df.year.value_counts()

2016    197236
2015    181833
2014    174234
2017    117902
Name: year, dtype: int64

In [11]:
df.shape

(671205, 25)

In [12]:
df.status.value_counts()

1    622877
0     48328
Name: status, dtype: int64

In [14]:
df.dtypes

id                             int64
date                  datetime64[ns]
activity                      object
sector                        object
use                           object
funded_amount                  int64
loan_amount                    int64
diff_funded_loan               int64
status                         int64
country_code                  object
country                       object
currency                      object
gender                        object
borrower_genders              object
lender_count                   int64
term_in_months                 int64
repayment_interval            object
tags                          object
newdate                       object
year                           int64
month                          int64
day_of_month                   int64
weekday                       object
weekday_short                 object
continent                     object
dtype: object

In [27]:
df.isnull().sum()

id                         0
date                       0
activity                   0
sector                     0
use                     4232
funded_amount              0
loan_amount                0
diff_funded_loan           0
status                     0
country_code               8
country                    0
currency                   0
gender                  4221
borrower_genders        4221
lender_count               0
term_in_months             0
repayment_interval         0
tags                  171416
newdate                    0
year                       0
month                      0
day_of_month               0
weekday                    0
weekday_short              0
continent                  0
dtype: int64

In [24]:
df1 = df[['status','funded_amount', 'loan_amount', 'activity', 'sector',  'country,
         'currency','gender','term_in_months']]

In [25]:
df1.head(2)

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,continent,currency,gender,term_in_months
0,1,300,300,Fruits & Vegetables,Food,Pakistan,Oceania,PKR,female,12
1,1,575,575,Rickshaw,Transportation,Pakistan,Oceania,PKR,group,11


In [30]:
df2 = df1.dropna()
df2.head()

Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,continent,currency,gender,term_in_months
0,1,300,300,Fruits & Vegetables,Food,Pakistan,Oceania,PKR,female,12
1,1,575,575,Rickshaw,Transportation,Pakistan,Oceania,PKR,group,11
2,1,150,150,Transportation,Transportation,India,Asia,INR,female,43
3,1,200,200,Embroidery,Arts,Pakistan,Oceania,PKR,female,11
4,1,400,400,Milk Sales,Food,Pakistan,Oceania,PKR,female,14


In [32]:
df2.shape

(666984, 10)

# Applying Label Encoding

In [36]:
# Convert features (columns) values to number to prepare for Machine learning Modeling process. Label Encoder (le)

from sklearn import preprocessing
def encode_features(df2):
    features = ['status','funded_amount', 'loan_amount', 'activity', 'sector',  'country',
                'currency','gender','term_in_months']
    df2_combined = pd.concat([df2])
    
    for feature in features:
        le = preprocessing.LabelEncoder()
        le = le.fit(df2_combined[feature])
        df2[feature] = le.transform(df2[feature])
    return df2
    
data = encode_features(df2)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,status,funded_amount,loan_amount,activity,sector,country,continent,currency,gender,term_in_months
0,1,48,11,67,6,54,6,43,0,11
1,1,74,22,134,13,54,6,43,1,10
2,1,28,5,148,13,29,3,21,0,42
3,1,36,7,52,1,54,6,43,0,10
4,1,61,15,95,6,54,6,43,0,13


In [40]:
df2.continent.value_counts()

3    222376
6    168944
0    152630
1     78809
2     20691
5     19473
4      4061
Name: continent, dtype: int64

In [43]:
X = df2[['loan_amount', 'activity', 'sector',  'country','gender','term_in_months']]
y = df2['status']

In [44]:
ss = StandardScaler()
lr = LogisticRegression()
lr_pipe = Pipeline([('sscale', ss), ('logreg', lr)])

In [45]:
lr_pipe.fit(X, y)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [46]:
lr_pipe.score(X,y)

  Xt = transform.transform(Xt)


0.926933779520948

# Divide the dataset into separate training (80% of the data) and test (20% of the data) datasets.

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

# Chain the StandardScaler and Logistic Regression objects in a pipeline.

In [48]:
lr_pipe.fit(X_train, y_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Pipeline(memory=None,
     steps=[('sscale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [49]:
lr_pipe.score(X_test, y_test)  # prediction accuracy score

  Xt = transform.transform(Xt)


0.9261827477379552

In [50]:
lr_pipe.score(X_train, y_train)

  Xt = transform.transform(Xt)


0.9272920817036397

In [51]:
y_pred = lr_pipe.predict(X_test)

  Xt = transform.transform(Xt)


In [52]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [53]:
print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro")) 

0.4899962975585071
0.5805765364906476
0.5035496314353719
