In [1]:
import numpy as np
import pandas as pd
import time
import warnings
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [2]:
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [3]:
file_loc = 'loan_prediction.csv'

In [4]:
df = pd.read_csv(file_loc)

In [5]:
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,5849,0.0,0.0,360.0,1.0,1
1,4583,1508.0,128.0,360.0,1.0,0
2,3000,0.0,66.0,360.0,1.0,1
3,2583,2358.0,120.0,360.0,1.0,1
4,6000,0.0,141.0,360.0,1.0,1


In [13]:
df.tail()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
609,2900,0.0,71.0,360.0,1.0,1
610,4106,0.0,40.0,180.0,1.0,1
611,8072,240.0,253.0,360.0,1.0,1
612,7583,0.0,187.0,360.0,1.0,1
613,4583,0.0,133.0,360.0,0.0,0


In [6]:
df.shape

(614, 6)

In [14]:
from sklearn.tree import DecisionTreeClassifier as dt
clf = dt()

In [15]:
clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [16]:
X = df.iloc[:,0:len(df.columns)-1].values
Y = df.iloc[:,-1].values

In [17]:
X.shape

(614, 5)

In [18]:
Y.shape

(614,)

In [19]:
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [20]:
X_train.shape

(460, 5)

In [21]:
X_test.shape

(154, 5)

In [22]:
scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='f1_macro')

In [23]:
scores.mean()

0.6367477523401855

In [24]:
# Fit the model
clf.fit(X_train, Y_train)
# Make predictions
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)

In [25]:
clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [26]:
train_cols = df.columns[0:len(df.columns)-1]

In [27]:
target_cols = df.columns[-1]

In [28]:
print('The Training F1 Score is', f1_score(train_predictions, Y_train))
print('The Testing F1 Score is', f1_score(test_predictions, Y_test))

The Training F1 Score is 1.0
The Testing F1 Score is 0.7545454545454546


In [29]:
parameters = {'max_depth':[1,2,3,4,5], 
              'min_samples_leaf':[1,2,3,4,5], 
              'min_samples_split':[2,3,4,5],
              'criterion' : ['gini','entropy']}
scorer = make_scorer(f1_score)

In [30]:
@timeit
def generate_clf_from_search(grid_or_random, clf, parameters, scorer, X, y):
    if grid_or_random == "Grid":
        search_obj = GridSearchCV(clf, parameters, scoring=scorer)
    elif grid_or_random == "Random":
        search_obj = RandomizedSearchCV(clf, parameters, scoring=scorer)
    fit_obj = search_obj.fit(X, y)
    best_clf = fit_obj.best_estimator_
    return best_clf

In [31]:
best_clf_grid = generate_clf_from_search("Grid", 
                                         clf, 
                                         parameters, 
                                         scorer, 
                                         X_train, 
                                         Y_train)

'generate_clf_from_search'  1473.06 ms


In [32]:
scores = cross_val_score(best_clf_grid, X_train, Y_train, cv=5, scoring='f1_macro')
scores.mean()

0.7058924321624135

In [33]:
best_clf_grid.fit(X_train, Y_train)
# Make predictions using the new model.
best_train_predictions = best_clf_grid.predict(X_train)
best_test_predictions = best_clf_grid.predict(X_test)

# Calculate the f1_score of the new model.
print('The training F1 Score is', f1_score(best_train_predictions, Y_train))
print('The testing F1 Score is', f1_score(best_test_predictions, Y_test))

The training F1 Score is 0.8360902255639098
The testing F1 Score is 0.8620689655172413


In [34]:
best_clf_random = generate_clf_from_search("Random", 
                                           clf, 
                                           parameters, 
                                           scorer, 
                                           X_train, 
                                           Y_train)

'generate_clf_from_search'  89.78 ms


In [35]:
scores = cross_val_score(best_clf_random, X_train, Y_train, cv=5, scoring='f1_macro')
scores.mean()

0.6952617779438539

In [29]:
best_clf_random.fit(X_train, Y_train)
# Make predictions using the new model.
best_train_predictions = best_clf_random.predict(X_train)
best_test_predictions = best_clf_random.predict(X_test)

# Calculate the f1_score of the new model.
print('The training F1 Score is', f1_score(best_train_predictions, Y_train))
print('The testing F1 Score is', f1_score(best_test_predictions, Y_test))

The training F1 Score is 0.8360902255639098
The testing F1 Score is 0.8620689655172413
