In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

%matplotlib inline

In [None]:
# Load the training and test tables from the zipped competition CSV files.
df = pd.read_csv('../input/loan-default-prediction/train_v2.csv.zip')
test = pd.read_csv('../input/loan-default-prediction/test_v2.csv.zip')

In [None]:
# Preview the first rows of the training table.
df.head()

In [None]:
# Preview the first rows of the test table.
test.head()

Preprocessing

In [None]:
# Print every training column whose share of missing values rounds to a
# non-zero whole percent, together with that rounded percentage.
for col in df.columns:
    share_pct = round(df[col].isnull().mean() * 100)
    if share_pct != 0.0:
        print('{} - {}%'.format(col, share_pct))

In [None]:
# Print every test column whose share of missing values rounds to a
# non-zero whole percent, together with that rounded percentage.
for col in test.columns:
    share_pct = round(test[col].isnull().mean() * 100)
    if share_pct != 0.0:
        print('{} - {}%'.format(col, share_pct))

In [None]:
# Median-impute the numeric training columns. For each column that has gaps,
# a boolean '<col>_ismissing' column records which rows were filled.
numeric_cols = df.select_dtypes(include=[np.number]).columns.values

for col in numeric_cols:
    missing = df[col].isnull()
    if not missing.any():
        continue
    # Store the mask before the gaps are overwritten.
    df['{}_ismissing'.format(col)] = missing
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Median-impute the numeric test columns (using the test table's own medians).
# For each column that has gaps, a boolean '<col>_ismissing' column records
# which rows were filled.
test_numeric_cols = test.select_dtypes(include=[np.number]).columns.values

for col in test_numeric_cols:
    missing = test[col].isnull()
    if not missing.any():
        continue
    # Store the mask before the gaps are overwritten.
    test['{}_ismissing'.format(col)] = missing
    test[col] = test[col].fillna(test[col].median())

In [None]:
# Re-check the training table after numeric imputation: print each column
# whose share of missing values still rounds to a non-zero whole percent.
for col in df.columns:
    share_pct = round(df[col].isnull().mean() * 100)
    if share_pct != 0.0:
        print('{} - {}%'.format(col, share_pct))

In [None]:
# Re-check the test table after numeric imputation: print each column
# whose share of missing values still rounds to a non-zero whole percent.
for col in test.columns:
    share_pct = round(test[col].isnull().mean() * 100)
    if share_pct != 0.0:
        print('{} - {}%'.format(col, share_pct))

In [None]:
# Drop the '<col>_ismissing' indicator columns that were added during
# imputation. They are selected by their suffix instead of by hard-coded
# positions (previously iloc[:, 771:1284]), so the cell stays correct when
# the number of source columns or of imputed columns changes.
ismissing_cols = [c for c in df.columns if str(c).endswith('_ismissing')]
df.drop(ismissing_cols, axis=1, inplace=True)

In [None]:
# Drop the '<col>_ismissing' indicator columns that were added to the test
# table during imputation. They are selected by their suffix instead of by
# hard-coded positions (previously iloc[:, 770:1288]), so the cell stays
# correct when the number of source columns or of imputed columns changes.
test_ismissing_cols = [c for c in test.columns if str(c).endswith('_ismissing')]
test.drop(test_ismissing_cols, axis=1, inplace=True)

In [None]:
# Compute the pairwise correlation matrix once and reuse it: the signed
# version feeds the heatmap, the absolute version is kept in `corr_matrix`
# for the correlated-column pruning. (The original computed df.corr() twice.)
corr_signed = df.corr()
corr_matrix = corr_signed.abs()
sns.heatmap(corr_signed)

In [None]:
# Drop one column of every pair whose absolute correlation exceeds 0.95.
# Only the strict upper triangle is inspected so each pair is seen once and
# the diagonal (self-correlation) is ignored.
# The mask is built with the builtin `bool`: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
df.drop(to_drop, axis=1, inplace=True)
# The same columns are removed from the test table so both keep the same features.
test.drop(to_drop, axis=1, inplace=True)

In [None]:
# Binary target: 1 when the row has any non-zero loss, 0 otherwise.
# A single vectorised comparison replaces the previous Python loop, which
# called Series.replace once per non-zero row (quadratic in the row count).
# The result keeps the dtype of the 'loss' column.
loss_fact = (df['loss'] != 0).astype(df['loss'].dtype)
df['loss_fact'] = loss_fact

In [None]:
# Summary statistics of the raw loss column.
df['loss'].describe()

In [None]:
# Histogram of the loss values: 36 bins, no KDE curve, black bar outlines.
loss_ax = sns.distplot(df['loss'], hist=True, kde=False,
                       bins=36, color='blue',
                       hist_kws={'edgecolor': 'black'})

loss_ax.set_title('Loss Distribution')
loss_ax.set_xlabel('Loss (in %)')
loss_ax.set_ylabel('Count')

In [None]:
# Summary statistics of the binary loss indicator.
df['loss_fact'].describe()

In [None]:
# Histogram of the binary loss indicator: 36 bins, no KDE curve, black bar outlines.
loss_fact_ax = sns.distplot(df['loss_fact'], hist=True, kde=False,
                            bins=36, color='blue',
                            hist_kws={'edgecolor': 'black'})

loss_fact_ax.set_title('Loss Fact')
loss_fact_ax.set_xlabel('Loss Happened or Not')
loss_fact_ax.set_ylabel('Count')

Building ML models

In [None]:
#Training on float64

In [None]:
# Modelling table for the float experiment: all float64 columns of the
# training data, with the binary target appended as the last column.
float_features = df.select_dtypes(include=[np.float64])
trainfloatloss = pd.concat([float_features, df['loss_fact']], axis=1)

In [None]:
# Float64 columns of the test table.
testfloat = test.select_dtypes(include=[np.float64])

In [None]:
#trainfloatlossdna = trainfloatloss.dropna(how='any')

In [None]:
# Names of the float64 columns in the training modelling table
# (this leaves out any column stored with another dtype).
float_columns = [name for name, dtype in trainfloatloss.dtypes.items()
                 if dtype.name == 'float64']

In [None]:
# Test features for the float model: the float64 test columns minus 'f5' and 'f531'.
# NOTE(review): presumably these two columns are float in the test table but
# not among the float training features - confirm against float_columns.
test_float_train = testfloat.drop(['f5', 'f531'], axis=1)

In [None]:
# Names of the float64 test columns, with 'f5' and 'f531' taken out.
float_columns_test = [name for name, dtype in testfloat.dtypes.items()
                      if dtype.name == 'float64']
for excluded in ('f5', 'f531'):
    float_columns_test.remove(excluded)

In [None]:
# For every float feature, add '<col>_no_out': the same values with upper
# outliers removed. A value counts as an outlier when it lies more than two
# standard deviations above the mean of its own loss_fact class (0 or 1).
# The kept values are assigned back by index, so removed rows become NaN.
for c in float_columns:
    kept_per_class = []
    for label in (0, 1):
        values = trainfloatloss.loc[trainfloatloss['loss_fact'] == label, c]
        upper_limit = values.mean() + 2 * values.std()
        kept_per_class.append(values[~(values > upper_limit)])
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    trainfloatloss[c + '_no_out'] = pd.concat(kept_per_class)

In [None]:
# Count the NaN entries in f756_no_out (rows removed by the outlier filter show up as NaN).
trainfloatloss[['f756_no_out']].isna().values.sum()

In [None]:
# Distribution of the outlier-filtered f756 values for each loss_fact class.
sns.violinplot(x='loss_fact', y='f756_no_out', data=trainfloatloss)

In [None]:
# Distribution of the original (unfiltered) f756 values for each loss_fact class.
sns.violinplot(x='loss_fact', y='f756', data=trainfloatloss)

In [None]:
# Keep only rows without any NaN; a row that was filtered as an outlier in
# at least one '_no_out' column is therefore discarded entirely.
trainfloatlossdna = trainfloatloss.dropna(how='any')

In [None]:
# Keep the columns whose name contains 'no_out' plus the binary target.
trainfloatlossdnanoout = pd.concat([trainfloatlossdna.filter(regex='no_out'), trainfloatlossdna['loss_fact']], axis=1)

In [None]:
# Separate the binary target from the outlier-filtered float features.
y = trainfloatlossdnanoout['loss_fact']
X = trainfloatlossdnanoout.drop('loss_fact', axis=1)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17) 

In [None]:
# Row and column counts of the training and validation splits.
X_train.shape, X_valid.shape

In [None]:
# Baseline decision tree with default hyper-parameters and a fixed seed.
first_tree = DecisionTreeClassifier(random_state=17)

In [None]:
# Mean 5-fold cross-validated score of the baseline tree on the training split.
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

In [None]:
# Baseline k-nearest-neighbours classifier with default settings.
first_knn = KNeighborsClassifier()

In [None]:
# Mean 5-fold cross-validated score of the baseline k-NN on the training split.
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

In [None]:
#Bernoulli Naive Bayes Classifier
def bernoulli_naive_bayes(X, y, random_state=None):
    """Fit a Bernoulli naive Bayes model on a random 70/30 split and score it.

    Features are binarised at 0.01 by ``BernoulliNB``. The hold-out accuracy
    is appended to the module-level list ``acc`` (which must exist before the
    call) and is also returned.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : labels passed to ``train_test_split``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random on every call.

    Returns
    -------
    Fraction of hold-out rows that were predicted correctly.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    model = BernoulliNB(binarize=0.01)
    predict = model.fit(X_train, y_train).predict(X_test)
    # scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the
    # true labels first keeps rows = actual class, columns = predicted class.
    cm = confusion_matrix(y_test, predict)
    # Correct predictions sit on the diagonal.
    accuracy = cm.trace() / cm.sum()
    acc.append(accuracy)
    return accuracy
    
#     print('Accuracy:', format(accuracy, '.2f'))
#     print('Confusion Matrix:', '\n', confusion_matrix(predict,y_test))
#     print('Classification Report:', '\n', classification_report(predict,y_test))
    
# Evaluate the model on 20 independent random splits and report the mean accuracy.
print('Bernoulli Naive Bayes Classifier')
acc = []  # receives one accuracy per call of bernoulli_naive_bayes
for i in range(20):
    bernoulli_naive_bayes(X, y)

acc_ = np.asarray(acc)
print('Average accuracy in 20 iterations is:', acc_.mean())

In [None]:
#Logistic Regression
def logistic_regression(X, y, random_state=None):
    """Fit an L2 logistic regression (lbfgs) on a random 70/30 split and score it.

    The hold-out accuracy is appended to the module-level list ``acc``
    (which must exist before the call) and is also returned.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : labels passed to ``train_test_split``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random on every call.

    Returns
    -------
    Fraction of hold-out rows that were predicted correctly.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    model = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
    predict = model.fit(X_train, y_train).predict(X_test)
    # scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the
    # true labels first keeps rows = actual class, columns = predicted class.
    cm = confusion_matrix(y_test, predict)
    # Correct predictions sit on the diagonal.
    accuracy = cm.trace() / cm.sum()
    acc.append(accuracy)
    return accuracy
    
#     print('Accuracy:', format(accuracy, '.2f'))
#     print('Confusion Matrix:', '\n', confusion_matrix(predict,y_test))
#     print('Classification Report:', '\n', classification_report(predict,y_test))
    
# Evaluate the model on 20 independent random splits and report the mean accuracy.
print('Logistic Regression')
acc = []  # receives one accuracy per call of logistic_regression
for i in range(20):
    logistic_regression(X, y)

acc_ = np.asarray(acc)
print('Average accuracy in 20 iterations is:', acc_.mean())

In [None]:
# Fit the logistic regression whose predictions are written to the submission
# file. Note that this re-splits X/y (unseeded, 70/30) and overwrites the
# X_train / y_train variables of the earlier seeded split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
classifier = model.fit(X_train, y_train)
predict = classifier.predict(X_test)
# scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the
# true labels first keeps rows = actual class, columns = predicted class.
cm = confusion_matrix(y_test, predict)
# Correct predictions sit on the diagonal.
accuracy = cm.trace()/cm.sum()

In [None]:
def write_to_submission_file(predicted_labels, out_file, train_num=105471,
                    target='loss', index_label="id"):
    """Write predictions to ``out_file`` as a two-column CSV.

    The rows are numbered consecutively starting at ``train_num + 1``; that
    numbering is written as the ``index_label`` column and the predictions
    as the ``target`` column.

    Parameters
    ----------
    predicted_labels : array with a ``shape`` attribute, one entry per row.
    out_file : path or writable buffer handed to ``DataFrame.to_csv``.
    train_num : id offset; the first written id is ``train_num + 1``.
    target : header of the prediction column.
    index_label : header of the id column.
    """
    first_id = train_num + 1
    row_ids = np.arange(first_id, first_id + predicted_labels.shape[0])
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [None]:
# Predict on the test rows using exactly the features the classifier was
# fitted on. The training columns are named '<feature>_no_out', so the
# matching test columns are looked up by their base name and relabelled.
# This aligns features by name instead of relying on test_float_train having
# the same column count and order, and avoids scikit-learn's feature-name
# mismatch check.
test_features = test[[name[:-len('_no_out')] for name in X.columns]]
test_features.columns = X.columns
classifier.predict(test_features)

In [None]:
# Write the test predictions to the submission CSV. The test features are
# selected by the base name of each training column ('<feature>_no_out') and
# relabelled, so they match the fitted classifier by name rather than by
# column position, and scikit-learn's feature-name mismatch check passes.
test_features = test[[name[:-len('_no_out')] for name in X.columns]]
test_features.columns = X.columns
write_to_submission_file(classifier.predict(test_features), out_file="logistic_regression_loan_default_prediction.csv")

In [None]:
#Training on int64

In [None]:
# Features for the integer experiment: every int64 column except the two
# target columns. The label is the binary loss indicator.
# NOTE(review): any int64 identifier column (e.g. a row id, if present) is
# kept as a feature here - confirm that is intended.
int_features = df.select_dtypes(include=[np.int64])
X = int_features.drop(['loss', 'loss_fact'], axis=1)
y = df['loss_fact']

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17) 

In [None]:
# Row and column counts of the training and validation splits.
X_train.shape, X_valid.shape

In [None]:
#Bernoulli Naive Bayes Classifier
def bernoulli_naive_bayes(X, y, random_state=None):
    """Fit a Bernoulli naive Bayes model on a random 70/30 split and score it.

    Features are binarised at 0.01 by ``BernoulliNB``. The hold-out accuracy
    is appended to the module-level list ``acc`` (which must exist before the
    call) and is also returned.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : labels passed to ``train_test_split``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the split random on every call.

    Returns
    -------
    Fraction of hold-out rows that were predicted correctly.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    model = BernoulliNB(binarize=0.01)
    predict = model.fit(X_train, y_train).predict(X_test)
    # scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the
    # true labels first keeps rows = actual class, columns = predicted class.
    cm = confusion_matrix(y_test, predict)
    # Correct predictions sit on the diagonal.
    accuracy = cm.trace() / cm.sum()
    acc.append(accuracy)
    return accuracy
    
#     print('Accuracy:', format(accuracy, '.2f'))
#     print('Confusion Matrix:', '\n', confusion_matrix(predict,y_test))
#     print('Classification Report:', '\n', classification_report(predict,y_test))
    
# Evaluate the model on 20 independent random splits and report the mean accuracy.
print('Bernoulli Naive Bayes Classifier')
acc = []  # receives one accuracy per call of bernoulli_naive_bayes
for i in range(20):
    bernoulli_naive_bayes(X, y)

acc_ = np.asarray(acc)
print('Average accuracy in 20 iterations is:', acc_.mean())

In [None]:
#Logistic Regression
def logistic_regression(X, y, random_state=None):
    """Fit an L2 logistic regression (sag) on a random 70/30 split and score it.

    The hold-out accuracy is appended to the module-level list ``acc``
    (which must exist before the call) and is also returned.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : labels passed to ``train_test_split``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` and to the stochastic 'sag'
        solver. The default ``None`` leaves both unseeded.

    Returns
    -------
    Fraction of hold-out rows that were predicted correctly.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    model = LogisticRegression(penalty='l2', solver='sag', max_iter=1000,
                               random_state=random_state)
    predict = model.fit(X_train, y_train).predict(X_test)
    # scikit-learn's signature is confusion_matrix(y_true, y_pred); passing the
    # true labels first keeps rows = actual class, columns = predicted class.
    cm = confusion_matrix(y_test, predict)
    # Correct predictions sit on the diagonal.
    accuracy = cm.trace() / cm.sum()
    acc.append(accuracy)
    return accuracy
    
#     print('Accuracy:', format(accuracy, '.2f'))
#     print('Confusion Matrix:', '\n', confusion_matrix(predict,y_test))
#     print('Classification Report:', '\n', classification_report(predict,y_test))
    
# Evaluate the model on 20 independent random splits and report the mean accuracy.
print('Logistic Regression')
acc = []  # receives one accuracy per call of logistic_regression
for i in range(20):
    logistic_regression(X, y)

acc_ = np.asarray(acc)
print('Average accuracy in 20 iterations is:', acc_.mean())

In [None]:
# Baseline decision tree with default hyper-parameters and a fixed seed.
first_tree = DecisionTreeClassifier(random_state=17)

In [None]:
# Mean 5-fold cross-validated score of the baseline tree on the training split.
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

In [None]:
# Baseline k-nearest-neighbours classifier with default settings.
first_knn = KNeighborsClassifier()

In [None]:
# Mean 5-fold cross-validated score of the baseline k-NN on the training split.
np.mean(cross_val_score(first_knn, X_train, y_train, cv=5))

In [None]:
# Search grid for the decision tree: depths 1..10 and the fraction of
# features considered at each split.
# max_features is written as a float throughout: scikit-learn reads an int 1
# as "exactly one feature per split", whereas 1.0 means "all features",
# which is what the 0.5 / 0.7 / 1 progression intends.
tree_params = {'max_depth': np.arange(1, 11), 'max_features': [.5, .7, 1.0]}

In [None]:
# 5-fold grid search over tree_params; n_jobs=-1 uses all available CPU cores.
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [None]:
%%time 
# Run the tree grid search on the training split (the cell magic above reports the run time).
tree_grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score found by the tree grid search.
tree_grid.best_score_

In [None]:
# Hyper-parameter combination with the best mean cross-validated score.
tree_grid.best_params_

In [None]:
#knn_params = {'n_neighbors':[1,2,3,4,]+ list(range(10,100,10))} 

In [None]:
# Candidate neighbourhood sizes for k-NN: 10 through 29.
knn_params = {'n_neighbors':range(10,30,1)} 

In [None]:
# 5-fold grid search over knn_params.
knn_grid = GridSearchCV(first_knn, knn_params, cv=5)

In [None]:
%%time 
# Run the k-NN grid search on the training split (the cell magic above reports the run time).
knn_grid.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the k-NN search and the n_neighbors value that produced it.
knn_grid.best_score_, knn_grid.best_params_ 

In [None]:
# The decision tree configured with the best parameters from the grid search.
tree_grid.best_estimator_ 

In [None]:
# Predict the validation labels with the best tree found by the grid search.
tree_grid.predict(X_valid)

In [None]:
# Store the tuned tree's validation predictions.
tree_valid_pred = tree_grid.predict(X_valid)

In [None]:
# Score of the tuned tree on the held-out validation split.
tree_grid.score(X_valid, y_valid)

In [None]:
# Share of rows with loss_fact == 0, i.e. the accuracy of always predicting "no loss".
1 - np.mean(y)

In [None]:
# Export the tuned tree to a Graphviz .dot file, labelling splits with the feature names.
export_graphviz(tree_grid.best_estimator_, out_file='loan_tree.dot', feature_names=X.columns, filled=True)

In [None]:
#!ls *.dot
# Render the exported .dot file to a PNG by shelling out to the Graphviz `dot` command.
!dot -Tpng loan_tree.dot -o loan_tree.png