In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB

%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, entry) for entry in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the zipped train/test CSVs.  `low_memory=False` makes pandas read each
# file in one pass instead of chunk-by-chunk dtype inference.
csv_options = {'low_memory': False}
train = pd.read_csv('../input/loan-default-prediction/train_v2.csv.zip', **csv_options)
test = pd.read_csv('../input/loan-default-prediction/test_v2.csv.zip', **csv_options)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Median-impute the numeric training columns.
# Every numeric column that contains at least one null gets a boolean
# `<col>_ismissing` indicator column appended to `train`, and its nulls are
# replaced by that column's median.
train_numerical = train.select_dtypes(include=[np.number])
numerical_cols = train_numerical.columns.values

for col in numerical_cols:
    missing = train[col].isnull()
    if not missing.any():
        # Nothing to impute for this column.
        continue
    train['{}_ismissing'.format(col)] = missing
    train[col] = train[col].fillna(train[col].median())

In [None]:
# Median-impute the numeric test columns, mirroring the training-frame step.
# Every numeric column that contains at least one null gets a boolean
# `<col>_ismissing` indicator column appended to `test`, and its nulls are
# replaced by that column's own (test-set) median.
test_numerical = test.select_dtypes(include=[np.number])
test_numerical_cols = test_numerical.columns.values

for col in test_numerical_cols:
    missing = test[col].isnull()
    if not missing.any():
        # Nothing to impute for this column.
        continue
    test['{}_ismissing'.format(col)] = missing
    test[col] = test[col].fillna(test[col].median())

In [None]:
# Drop, in place, the columns at positions 771 up to (not including) 1284.
# NOTE(review): presumably these are the `_ismissing` indicator columns that
# the imputation step appended — confirm against the raw column count.
train.drop(columns=train.columns[771:1284], inplace=True)

In [None]:
# Drop, in place, the columns at positions 770 up to (not including) 1288.
# NOTE(review): presumably these are the `_ismissing` indicator columns that
# the imputation step appended — confirm against the raw column count.
test.drop(columns=test.columns[770:1288], inplace=True)

In [None]:
# Prune highly correlated features.
#
# 1. Compute absolute pairwise correlations over the numeric/boolean columns.
#    The columns are selected explicitly because pandas >= 2.0 no longer
#    silently skips non-numeric columns in `DataFrame.corr()`.
# 2. Keep only the strict upper triangle so each pair is inspected once and a
#    column is only ever compared with the columns that precede it.
# 3. Mark every column whose correlation with an earlier column exceeds 0.95.
#    The target `loss` is never marked: it must survive for the modelling
#    steps, and it does not exist in `test`, so dropping it there would fail.
# 4. Drop the marked columns from both frames, in place.
corr_matrix = train.select_dtypes(include=[np.number, 'bool']).corr().abs()
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the replacement.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [
    column
    for column in upper.columns
    if column != 'loss' and (upper[column] > 0.95).any()
]
train.drop(to_drop, axis=1, inplace=True)
test.drop(to_drop, axis=1, inplace=True)

In [None]:
# Binary default flag derived from `loss`: 1 where the recorded loss is
# non-zero, 0 where it is zero.
#
# The previous version iterated over every row and called `Series.replace`
# once per non-zero value, i.e. one full pass over the column per defaulted
# row.  A single vectorized comparison produces the same flags.  Casting back
# to the dtype of `loss` keeps the resulting dtype identical to what the
# replace-based version produced.
loss_fact = (train['loss'] != 0).astype(train['loss'].dtype)

train['loss_fact'] = loss_fact


In [None]:
# Training frame restricted to the float64 feature columns, with the binary
# `loss_fact` flag attached as an extra column.
trainfloatloss = pd.concat(
    [
        train.select_dtypes(include=[np.float64]),
        train['loss_fact'],
    ],
    axis='columns',
)

In [None]:
# Test frame restricted to its float64 columns.
testfloat = test.select_dtypes(include=['float64'])

In [None]:
# Names of the float64 columns of `trainfloatloss`, in column order.
float_columns = list(
    filter(
        lambda column: trainfloatloss[column].dtype.name == 'float64',
        trainfloatloss.columns,
    )
)

In [None]:
# Float test features handed to the classifier: `testfloat` without `f5`.
# (A matching drop of `f531` was left disabled in the original cell.)
test_float_train = testfloat.drop(columns=['f5'])

In [None]:
# Names of the float64 columns of `testfloat`, in column order, minus `f5`.
# (A matching removal of `f531` was left disabled in the original cell.)
float_columns_test = list(
    filter(
        lambda column: testfloat[column].dtype.name == 'float64',
        testfloat.columns,
    )
)
float_columns_test.remove('f5')

In [None]:
# Build a `<col>_no_out` copy of every float column in which high outliers
# are blanked out (set to NaN).
#
# Outliers are judged separately inside each class of `loss_fact` (0 and 1):
# a value is an outlier when it exceeds that class's mean + 2 * std for the
# column.  Only the upper tail is trimmed.
#
# Changes from the previous version:
# * `Series.append` was removed in pandas 2.0, so the two per-class pieces
#   are combined with `pd.concat`.
# * The class masks do not depend on the column, so they are computed once,
#   and `.loc[mask, column]` pulls just the needed column instead of
#   boolean-indexing the whole (growing) frame twice per iteration.
# * The initial `<col>_no_out = <col>` assignment was immediately overwritten
#   and has been removed.
class_masks = (
    trainfloatloss['loss_fact'] == 0,
    trainfloatloss['loss_fact'] == 1,
)

for c in float_columns:
    kept_parts = []
    for class_mask in class_masks:
        class_values = trainfloatloss.loc[class_mask, c]
        upper_bound = class_values.mean() + 2 * class_values.std()
        # `~(x > bound)` rather than `x <= bound` so that NaN entries and
        # NaN bounds are treated as "not an outlier", as before.
        kept_parts.append(class_values[~(class_values > upper_bound)])
    # Assignment aligns on the row index, so rows removed as outliers end up
    # as NaN in the new column.
    trainfloatloss[c + '_no_out'] = pd.concat(kept_parts)

In [None]:
# Number of rows whose `f756_no_out` value is NaN.
trainfloatloss['f756_no_out'].isna().sum()

In [None]:
# Keep only the rows that have no NaN in any column.
trainfloatlossdna = trainfloatloss.dropna(axis='index', how='any')

In [None]:
# Modelling frame: every column whose name contains `no_out`, plus the
# `loss_fact` target flag.
trainfloatlossdnanoout = pd.concat(
    [
        trainfloatlossdna.filter(regex='no_out'),
        trainfloatlossdna['loss_fact'],
    ],
    axis='columns',
)

In [None]:
# Target vector is the `loss_fact` flag; the feature matrix is everything else.
y = trainfloatlossdnanoout['loss_fact']
X = trainfloatlossdnanoout.drop(columns=['loss_fact'])

In [None]:
# 70/30 train/validation split; the fixed seed makes the partition repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Show the (rows, columns) shape of the training and validation feature sets.
X_train.shape, X_valid.shape

In [None]:
# Fit a Bernoulli naive Bayes classifier on an 85/15 split and report the
# hold-out accuracy as a percentage.
#
# * `random_state=17` (the seed already used for the 70/30 split in this
#   notebook) makes the split, and therefore the printed accuracy, repeatable.
# * `binarize=0.01` is the threshold BernoulliNB uses to turn each feature
#   into a boolean before fitting.
# * `confusion_matrix` expects `(y_true, y_pred)`; the arguments were
#   previously swapped, which transposed the matrix.
# Note that this rebinds `X_train` / `y_train` from the earlier 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=17)
model = BernoulliNB(binarize = 0.01)
classifier = model.fit(X_train,y_train)
predict = classifier.predict(X_test)
cm = confusion_matrix(y_test, predict)
acc = accuracy_score(y_test, predict)
print(acc*100)

In [None]:
def bernoulli_naive_bayes(X, y, random_state=None):
    """Fit BernoulliNB on a random 85/15 split and return hold-out accuracy.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``BernoulliNB``.
    y : target labels aligned with ``X``.
    random_state : optional seed forwarded to ``train_test_split``; ``None``
        (the default) draws a fresh random split on every call.

    Returns
    -------
    float
        Fraction of hold-out rows that were classified correctly.

    Notes
    -----
    The previous version returned nothing and appended the accuracy to a
    module-level ``acc`` list, so it raised ``NameError`` unless that list
    had been created first.  The score is now returned and the caller decides
    where to collect it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=random_state)
    # binarize=0.01 is the threshold used to map each feature to a boolean.
    fitted = BernoulliNB(binarize=0.01).fit(X_train, y_train)
    predict = fitted.predict(X_test)
    return accuracy_score(y_test, predict)


print('Bernoulli Naive Bayes Classifier')
# One distinct seed per repetition: 20 different splits, repeatable run-to-run.
acc = [bernoulli_naive_bayes(X, y, random_state=seed) for seed in range(20)]

acc_ = np.array(acc)
print('Average accuracy in 20 iterations is:', np.average(acc_))

In [None]:
def write_to_submission_file(predicted_labels, out_file, train_num=105471,
                    target='loss', index_label="id"):
    """Write predictions to a CSV file with consecutive integer ids.

    Parameters
    ----------
    predicted_labels : array-like
        Predicted values, one per row.  Any array-like is accepted (NumPy
        array, list, tuple, pandas Series); it is converted with
        ``np.asarray`` so values are always taken positionally.
    out_file : str, path or writable buffer
        Destination passed straight to ``DataFrame.to_csv``.
    train_num : int, default 105471
        Offset for the ids: the first written row gets id ``train_num + 1``.
    target : str, default 'loss'
        Header of the prediction column.
    index_label : str, default "id"
        Header of the id column.
    """
    # Coerce first: a plain list has no `.shape`, and a pandas Series would be
    # re-aligned on the new id index (yielding NaN) instead of used in order.
    labels = np.asarray(predicted_labels)
    first_id = train_num + 1
    row_ids = np.arange(first_id, first_id + labels.shape[0])
    predicted_df = pd.DataFrame(labels, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
# Preview the fitted classifier's predictions for the float test features.
# NOTE(review): `classifier` was fitted on columns named `*_no_out`, while
# `test_float_train` keeps the raw test column names — confirm that the column
# count and order match, and that the installed scikit-learn accepts the
# differing feature names.
classifier.predict(test_float_train)

In [None]:
# Predict on the float test features and save the result as the submission CSV.
test_predictions = classifier.predict(test_float_train)
write_to_submission_file(
    test_predictions,
    out_file="Naive_Bayes_loan_default_prediction.csv",
)