In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Import data
# NOTE(review): no statement in this file loads `data`; it has to exist as a
# pandas DataFrame before this cell runs (presumably via pd.read_csv on a file
# under /kaggle/input -- confirm and restore the load step).

# Work on a small subset: the first 100 rows, then a reproducible 10% random
# sample of those. The trailing copy() makes `data` an independent frame so the
# in-place clean-up below cannot write into a view of the original.
# NOTE(review): this leaves about 10 rows, which is too few for the 5-fold
# cross-validation performed later -- looks like a debugging leftover.
data = data.head(n=100)
data = data.sample(frac=0.1, random_state=1).copy()
print(data)

# Data Cleaning
# Some values need to be either transformed or removed.

# Education is documented as 1 = graduate school; 2 = university;
# 3 = high school; 4 = others, but the column also contains 5 and 6.
# Fold every value above 4 into 4 ("others").
# A boolean mask with .loc is used because the sampled frame keeps its original
# (shuffled) row labels, so positional integers are not valid labels, and
# chained assignment (data['education'][i] = ...) may not write through.
data.loc[data['education'] > 4, 'education'] = 4

# Marital status is documented as 1 = married; 2 = single; 3 = others.
# The value 0 (unknown) also occurs; drop those rows.
data = data[data.marriage != 0]
print(data.shape)

# Descriptive Analysis
observations, features = data.shape
print("Dataset has", observations, "observations and", features, "features.")
print('Average age of the customer is', data['age'].mean())
print(data.describe())

# Distribution of balance_limit.
# The original plotting call was commented out, which left an empty figure;
# a pandas histogram is drawn instead so the figure actually shows the data.
data['balance_limit'].plot(kind='hist', title='Distribution of balance_limit')
plt.xlabel('balance_limit')
plt.show()
print(data.balance_limit.skew())

# Distribution of age (same treatment as above).
data['age'].plot(kind='hist', title='Distribution of age')
plt.xlabel('age')
plt.show()
# describe() returns a summary; it has to be printed to be visible in a script.
print(data.age.describe())
print(data.age.skew())

# Number of observations per value of the target Y, with the count written
# above each bar. enumerate() labels every bar, however many classes exist.
def_cnt = data.Y.value_counts()
def_cnt.plot.bar()
for position, count in enumerate(def_cnt):
    plt.text(position, count, str(count), fontsize=12)
plt.show()
# Bivariate analysis

# Scatter of age against the target Y.
plt.scatter(data['age'], data['Y'])
plt.show()

# Box plots (with the mean marked) for the limit, demographic and
# payment-history columns; x tick labels are rotated so they stay readable.
boxplot_columns = [
    'balance_limit', 'sex', 'education', 'marriage', 'age',
    'pay_hist_apr', 'pay_hist_may', 'pay_hist_jun',
    'pay_hist_jul', 'pay_hist_aug', 'pay_hist_sep',
]
boxplot_frame = data[boxplot_columns]
ax = boxplot_frame.plot(kind='box', title='boxplot', showmeans=True)
plt.xticks(rotation=90)
plt.show()

# Relationship between selected features and the target Y.
# The four scatter plots are laid out side by side in a 1 x 4 grid. Each
# add_subplot call is given an explicit grid position; without one, every call
# targets the same single-cell location and the plots end up on top of each
# other.
scatter_features = ['balance_limit', 'age', 'bill_amt_sep', 'prev_amt_paid_apr']
f = plt.figure(figsize=(10, 3))  # Fix plot size
for position, feature in enumerate(scatter_features, start=1):
    panel = f.add_subplot(1, len(scatter_features), position)
    panel.scatter(data[feature], data.Y)
    panel.set_xlabel(feature)
    panel.set_ylabel('Y')
    panel.set_title(feature + ' vs Y')
f.tight_layout()  # keep titles and axis labels of neighbouring panels apart
plt.show()

# No sharp linear relation between the Xs and Y shows up in the plots, so
# several different models are tried below to see which one works best.

# One-hot encode the categorical columns (the first level of each is dropped),
# then separate the target Y from the predictors.
categorical_columns = [
    'sex', 'education', 'marriage',
    'pay_hist_apr', 'pay_hist_may', 'pay_hist_jun',
    'pay_hist_jul', 'pay_hist_aug', 'pay_hist_sep',
]
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)
y = data['Y']
X = data.drop(columns=['Y'])
# Class balance noted by the original author (y.value_counts()): skewed, with
# 23364 non-events (0) and 6636 events (1).

print('######################### Checking LOGISTIC REGRESSION model results #########################')

# Outer split: test_size=0.8 puts 80% of the rows in the test set and keeps
# the remaining 20% for training/validation.
# NOTE(review): the KNN section further down uses test_size=0.2 -- confirm
# that an 80% test share is really intended here.
lr_split_seed = 7283
outer_split = train_test_split(X, y, test_size=0.8, random_state=lr_split_seed)
X_train_valid, X_test, y_train_valid, y_test = outer_split

# Inner split: 37% of the train/validation rows are held out for validation.
inner_split = train_test_split(
    X_train_valid, y_train_valid, test_size=0.37, random_state=lr_split_seed
)
X_train, X_valid, y_train, y_valid = inner_split

# Logistic regression: tune the classification threshold and the
# regularisation setting with 5-fold cross-validation on the train/valid set,
# scoring every (threshold, C) pair by its mean validation accuracy.

print('##################### LOGISTIC REGRESSION *Threshold , weights and alpha Tuning* #####################') 
print('############# Performing 5 Fold CV #######################')

# Candidate hyperparameters
thresholds = np.linspace(0, 1, 11)
thresholds = np.delete(thresholds, [10])  # remove threshold = 1
lambdas = np.logspace(-4, 4, 9)  # values passed to LogisticRegression as C

# shuffle is keyword-only in current scikit-learn; KFold(5, False) raises there.
n_folds = 5
kfold = KFold(n_splits=n_folds, shuffle=False)
# Every (threshold, C) pair, threshold-major -- the same order as the scores below.
hyperparameter_triplet = list(itertools.product(thresholds, lambdas))

# fold_accuracy[i, j, k]: validation accuracy for thresholds[i], lambdas[j], fold k.
fold_accuracy = np.empty((len(thresholds), len(lambdas), n_folds))
for fold, (train_index, valid_index) in enumerate(kfold.split(X_train_valid)):
    X_train, Y_train = X_train_valid.iloc[train_index], y_train_valid.iloc[train_index]  # Training set
    X_valid, Y_valid = X_train_valid.iloc[valid_index], y_train_valid.iloc[valid_index]  # Validation set

    # Scale once per fold, using statistics from the training part only.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train))
    X_valid = pd.DataFrame(scaler.transform(X_valid))

    for j, c_value in enumerate(lambdas):
        # The fitted model depends only on C, not on the threshold, so it is
        # fitted once per (fold, C) and reused for every threshold.
        lr = LogisticRegression(solver='liblinear', C=c_value, penalty='l1')
        lr.fit(X_train, Y_train)
        y_pred = lr.predict_proba(X_valid)[:, 1]  # predicted probability of class 1

        for i, threshold in enumerate(thresholds):
            y_hat = np.where(y_pred > threshold, 1, 0)  # classify at this threshold
            confmat = confusion_matrix(Y_valid, y_hat, labels=[1, 0])
            # Accuracy = (TP + TN) / all predictions = diagonal / total.
            fold_accuracy[i, j, fold] = (confmat[0, 0] + confmat[1, 1]) / confmat.sum()

# Mean accuracy over folds, flattened threshold-major so that position p
# corresponds to hyperparameter_triplet[p].
validation_scores = fold_accuracy.mean(axis=2).ravel().tolist()

print("")
print('MAX ACCURACY ##############################################################')
acc_max = max(validation_scores)
best_triplet = hyperparameter_triplet[np.argmax(validation_scores)]
best_threshold, best_alpha = best_triplet
print('Maximum Accuracy is ',acc_max, 'at the threshold value', best_threshold, 'for alpha ',best_alpha ) 
# Result recorded by the original author: max accuracy 82% at threshold 0.5 and alpha 100.

# Final logistic regression model: refit on the whole train/valid set with the
# selected C and threshold, then evaluate on the held-out test set.

# A fresh scaler is fitted here rather than reusing whichever scaler object the
# cross-validation loop happened to leave behind.
scaler = StandardScaler().fit(X_train_valid)
X_train_valid = pd.DataFrame(scaler.transform(X_train_valid))  # Transform the data
X_test = pd.DataFrame(scaler.transform(X_test))

# Fit the model for max accuracy
final_model_acc_max = LogisticRegression(
    max_iter=10000, random_state=1, C=best_alpha, penalty='l1', solver='liblinear'
)
final_model_acc_max.fit(X_train_valid, y_train_valid)
test_proba = final_model_acc_max.predict_proba(X_test)[:, 1]  # probability of class 1
y_hat = np.where(test_proba > best_threshold, 1, 0)  # classify with the tuned threshold

# With labels=[1, 0] the rows/columns are ordered (1, 0), so the layout is
# [[TP, FN], [FP, TN]].
final_confmat = confusion_matrix(y_test, y_hat, labels=[1, 0])
(TP, FN), (FP, TN) = final_confmat

Acc = (TP + TN) / final_confmat.sum()  # Accuracy
Precision = TP / (TP + FP)  # Precision
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)

print('Final Confusion Matrix for Max accuracy:')
print(final_confmat)

# The figures in the trailing comments are results recorded by the original
# author from an earlier run.
print('TPR is ', TPR)  # 35%: share of events correctly classified as events
print('FPR is ' , FPR)  # 5%: share of non-events incorrectly classified as events
print('Accuracy of the model is', Acc)  # 81%
print('Precision of the model is', Precision)  # 67%
print('MSE of Logistic model is ', metrics.mean_squared_error(y_test, y_hat))  # 0.18
# Log loss is defined on predicted probabilities; feeding it the hard 0/1
# predictions (as before, which gave 6.2) only penalises each error maximally.
print('Loss function of Logistic model is ', metrics.log_loss(y_test, test_proba, labels=[0, 1]))

# Coefficient of every feature in the final model (0 means the L1 penalty
# removed the feature).
pd.set_option("display.max_rows", None, "display.max_columns", None)
print(pd.DataFrame(
    zip(final_model_acc_max.coef_[0], X.columns.values),
    columns=['coefficient', 'feature'],
))

# Odds ratios: exponentiate coefficient values that were copied in by hand
# (they are literals here, not read from the fitted model).
for coefficient in (0.23, 0.009, 0.08, 0.1, 0.06):
    print(np.exp(coefficient))

# Interpreting variables (as stated by the original analysis)
# - A few variables were eliminated: pay_hist_sep_8, pay_hist_aug_8,
#   pay_hist_jul_6, etc.
# - Balance_limit: a unit change in Balance_limit is associated with a decrease
#   in the odds of the customer being a defaulter by a factor of exp(0.23),
#   about 1.26.
# - Age: a unit change in Age is associated with an increase in the odds of the
#   customer being a defaulter by a factor of exp(0.009), about 1.009.
# - Sex: a female customer has exp(0.08), about 1.083, times the odds of
#   defaulting of a male customer.
# - Education: customers with a university or high-school degree have
#   exp(0.1), about 1.105, and exp(0.06), about 1.062, times the odds of
#   defaulting of a customer with a graduate degree.
# NOTE(review): the model is fitted on standardised features, so "a unit
# change" for the numeric columns means one standard deviation -- confirm the
# intended wording.

print('')
print('##################### KNN #####################') 
# Fresh 80/20 split of the unscaled data for the KNN experiments.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state = 283)

# Candidate hyperparameters: k = 1, 4, ..., 19 and leaf_size = 1..49.
# NOTE(review): per the scikit-learn documentation leaf_size influences the
# speed and memory of the neighbour search; it is not expected to change the
# predictions, so tuning it for accuracy is probably unnecessary -- confirm.
n_neighbors = np.arange(1,21,3)
leaf_size = list(range(1,50))
# shuffle is keyword-only in current scikit-learn; KFold(5, False) raises there.
kfold = KFold(n_splits=5, shuffle=False)
hyperparameter_couple = list(itertools.product(n_neighbors, leaf_size))

# The folds and their scaling do not depend on the hyperparameters, so each
# fold is split and standardised (on its training part only) exactly once.
scaled_folds = []
for train_index, valid_index in kfold.split(X_train_valid):
    X_train, Y_train = X_train_valid.iloc[train_index], y_train_valid.iloc[train_index]  # Training set
    X_valid, Y_valid = X_train_valid.iloc[valid_index], y_train_valid.iloc[valid_index]  # Validation set
    ss = StandardScaler().fit(X_train)
    scaled_folds.append((
        pd.DataFrame(ss.transform(X_train)), Y_train,
        pd.DataFrame(ss.transform(X_valid)), Y_valid,
    ))

# Mean validation accuracy of every (n_neighbors, leaf_size) pair, in the same
# order as hyperparameter_couple.
valid_acc = []
for couple in hyperparameter_couple:
    valid_acc_tmp = []
    for X_train, Y_train, X_valid, Y_valid in scaled_folds:
        knn = neighbors.KNeighborsClassifier(n_neighbors = couple[0], leaf_size = couple[1])
        knn.fit(X_train, Y_train)
        y_hat = knn.predict(X_valid)
        score = metrics.accuracy_score(Y_valid, y_hat)
        valid_acc_tmp.append(score)
    valid_acc.append(np.mean(valid_acc_tmp))

# Pick the pair with the highest mean validation accuracy. valid_acc and
# hyperparameter_couple share the same ordering, so the argmax position is used
# as is (adding 1 to it selected the neighbouring pair and could run past the
# end of the list).
best_couple = hyperparameter_couple[np.argmax(valid_acc)]
bestK, best_leaf_size = best_couple

# Final accuracy on the testing set.
# Scale with statistics from the full train/valid set, then apply to both sets.
ss = StandardScaler().fit(X_train_valid)
X_train_valid_knn = pd.DataFrame(ss.transform(X_train_valid))
X_test_knn = pd.DataFrame(ss.transform(X_test))

knn = neighbors.KNeighborsClassifier(n_neighbors = bestK, leaf_size =  best_leaf_size)
knn.fit(X_train_valid_knn, y_train_valid)
y_hat = knn.predict(X_test_knn)
score = metrics.accuracy_score(y_test, y_hat)
# The figures in the trailing comments were recorded by the original author
# from an earlier run (made with the off-by-one selection).
print(best_couple) # best K = 19, Best leaf size = 2
print('Accuracy Score of KNN Classifier is ',score) # ~ 0.82
print('MSE of KNN model is ',metrics.mean_squared_error(y_test, y_hat)) # 0.17

print('##################### Random Forest #####################') 
# Candidate hyperparameters: number of trees and maximum tree depth.
n_estimators = [50,100,150,200,250,300]
max_depth = range(10,20)
hyperparameter_triplets = list(itertools.product(n_estimators, max_depth))  # (n_estimators, max_depth) pairs

# Hold out part of the train/valid data for model selection. Previously the
# candidates were fitted on X_train / y_train left over from two different
# earlier splits (so rows and labels did not correspond) and were scored on the
# test set, which leaks test information into the choice of hyperparameters.
X_rf_train, X_rf_valid, y_rf_train, y_rf_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.25, random_state=283
)

validation_scores = []  # validation accuracy per hyperparameter pair
for triplets in hyperparameter_triplets:
    rf = RandomForestClassifier(n_estimators = triplets[0], max_depth = triplets[1])
    rf.fit(X_rf_train, y_rf_train)  # fit on the training part only
    accuracy = accuracy_score(y_rf_valid, rf.predict(X_rf_valid))  # accuracy on the validation part
    validation_scores.append(accuracy)

# Scores are accuracies, so the best pair is the one with the HIGHEST score
# (argmin picked the worst one).
best_triplet = hyperparameter_triplets[np.argmax(validation_scores)]
# Figures in the trailing comments were recorded by the original author.
print('Final Tunes Parameters are : n_estimators: ', best_triplet[0], 'max_depth: ', best_triplet[1]) #200, 18

# Refit on the whole train/valid set with the selected pair and evaluate once
# on the untouched test set.
rf = RandomForestClassifier(n_estimators = best_triplet[0], max_depth = best_triplet[1])
rf.fit(X_train_valid, y_train_valid)
rf_test_pred = rf.predict(X_test)
print('MSE of Random Forest Model on Test set is ',mean_squared_error(rf_test_pred, y_test)) #0.13
print(sorted(zip(rf.feature_importances_,X.columns.values), reverse = True))
print('Accuracy score of Random forest is ' ,accuracy_score(y_test, rf_test_pred)) # ~ 0.86

# Top 2 important features noted by the original author: age & balance_limit