In [42]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

from sklearn.metrics import roc_curve, auc

from imblearn.over_sampling import SMOTE

from imblearn.combine import SMOTEENN


In [43]:
df =pd.read_csv('UCI_Credit_Card.csv')


In [44]:
df['default.payment.next.month'].value_counts()

0    23364
1     6636
Name: default.payment.next.month, dtype: int64

In [45]:
df.apply(lambda x:sum(x.isnull()))

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [46]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,30000.0,15000.5,8660.398374,1.0,7500.75,15000.5,22500.25,30000.0
LIMIT_BAL,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
SEX,30000.0,1.603733,0.489129,1.0,1.0,2.0,2.0,2.0
EDUCATION,30000.0,1.853133,0.790349,0.0,1.0,2.0,2.0,6.0
MARRIAGE,30000.0,1.551867,0.52197,0.0,1.0,2.0,2.0,3.0
AGE,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
PAY_0,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
PAY_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
PAY_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
PAY_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0


# Smoteenn  - hybrid sampling method

In [47]:
df_smote=df.copy()
y=df_smote['default.payment.next.month']
X=df_smote.drop('default.payment.next.month', axis=1)
sm=SMOTEENN(random_state=1)
X_res, y_res=sm.fit_resample(X,y)
df_balanced=X_res
df_balanced['default.payment.next.month'] = y_res

In [48]:
df_balanced['default.payment.next.month'].value_counts()
new_df=df_balanced.copy()

# data processing and normalization

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [50]:
scaler=MinMaxScaler()
scaler.fit(new_df)
scaled_features = scaler.transform(new_df)
new_df = pd.DataFrame(data=scaled_features, columns= new_df.columns.values)

In [51]:
new_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,27501.0,0.49992,0.277341,0.0,0.264277,0.502717,0.733456,1.0
LIMIT_BAL,27501.0,0.190751,0.170299,0.0,0.054054,0.139038,0.283784,1.0
SEX,27501.0,0.47391,0.499328,0.0,0.0,0.0,1.0,1.0
EDUCATION,27501.0,0.289365,0.117573,0.0,0.166667,0.333333,0.333333,1.0
MARRIAGE,27501.0,0.472746,0.168863,0.0,0.333333,0.333333,0.666667,1.0
AGE,27501.0,0.264679,0.159357,0.0,0.148148,0.240741,0.37037,1.0
PAY_0,27501.0,0.227875,0.113216,0.0,0.2,0.2,0.3,1.0
PAY_2,27501.0,0.240472,0.135164,0.0,0.111111,0.222222,0.333333,1.0
PAY_3,27501.0,0.21004,0.121188,0.0,0.1,0.2,0.2,1.0
PAY_4,27501.0,0.226873,0.135486,0.0,0.111111,0.222222,0.222222,1.0


In [52]:
new_df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0.000100,0.054054,0.0,0.333333,0.333333,0.666667,0.1,0.222222,0.1,0.222222,...,0.239064,0.164181,0.306145,0.002290,0.021779,0.011248,0.014493,0.001615,0.001284,0.0
1,0.000133,0.054054,0.0,0.166667,0.666667,0.296296,0.2,0.222222,0.2,0.222222,...,0.237128,0.164954,0.307394,0.002862,0.001078,0.000739,0.001610,0.002345,0.001513,0.0
2,0.000200,0.121622,1.0,0.333333,0.666667,0.037037,0.2,0.111111,0.1,0.222222,...,0.213123,0.132638,0.280167,0.000435,0.000357,0.000000,0.000936,0.003955,0.002917,0.0
3,0.000233,0.175676,1.0,0.500000,0.333333,0.129630,0.2,0.222222,0.4,0.222222,...,0.228135,0.152167,0.284578,0.003811,0.000000,0.000486,0.001610,0.002345,0.001892,0.0
4,0.000267,0.013514,0.0,0.500000,0.666667,0.259259,0.0,0.000000,0.0,0.000000,...,0.212846,0.154150,0.298842,0.000000,0.000000,0.000000,0.020945,0.002631,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27496,0.493282,0.324442,1.0,0.166667,0.333333,0.055556,0.4,0.222222,0.2,0.222222,...,0.299823,0.200506,0.303167,0.004077,0.001139,0.001704,0.003224,0.018746,0.000574,1.0
27497,0.162494,0.013514,1.0,0.166667,0.666667,0.481481,0.2,0.333333,0.3,0.444444,...,0.220097,0.143026,0.287954,0.000254,0.000912,0.000000,0.001087,0.000419,0.003946,1.0
27498,0.934289,0.256757,0.0,0.166667,0.333333,0.314815,0.4,0.444444,0.4,0.333333,...,0.322289,0.278767,0.409263,0.004719,0.001489,0.003374,0.005850,0.011392,0.002938,1.0
27499,0.468311,0.027027,1.0,0.333333,0.333333,0.111111,0.3,0.555556,0.4,0.333333,...,0.247046,0.180048,0.319904,0.000000,0.000000,0.001607,0.003513,0.001688,0.003465,1.0


# Feature Selection

In [53]:
from sklearn.decomposition import PCA

In [54]:
pca=PCA(n_components=24)
pca.fit(new_df)

PCA(n_components=24)

In [55]:
print(pca.explained_variance_ratio_)

[3.85054697e-01 2.40868357e-01 9.66369424e-02 8.26262729e-02
 5.90298135e-02 4.30229510e-02 2.12621098e-02 2.05965639e-02
 1.30826901e-02 1.24651451e-02 6.72202004e-03 5.11666360e-03
 3.41579863e-03 2.64696468e-03 2.46465296e-03 1.26645614e-03
 1.08738526e-03 6.69548069e-04 6.16485509e-04 5.17355094e-04
 2.89240511e-04 2.18501127e-04 1.55716962e-04 1.29215722e-04]


In [56]:
new_df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0.000100,0.054054,0.0,0.333333,0.333333,0.666667,0.1,0.222222,0.1,0.222222,...,0.239064,0.164181,0.306145,0.002290,0.021779,0.011248,0.014493,0.001615,0.001284,0.0
1,0.000133,0.054054,0.0,0.166667,0.666667,0.296296,0.2,0.222222,0.2,0.222222,...,0.237128,0.164954,0.307394,0.002862,0.001078,0.000739,0.001610,0.002345,0.001513,0.0
2,0.000200,0.121622,1.0,0.333333,0.666667,0.037037,0.2,0.111111,0.1,0.222222,...,0.213123,0.132638,0.280167,0.000435,0.000357,0.000000,0.000936,0.003955,0.002917,0.0
3,0.000233,0.175676,1.0,0.500000,0.333333,0.129630,0.2,0.222222,0.4,0.222222,...,0.228135,0.152167,0.284578,0.003811,0.000000,0.000486,0.001610,0.002345,0.001892,0.0
4,0.000267,0.013514,0.0,0.500000,0.666667,0.259259,0.0,0.000000,0.0,0.000000,...,0.212846,0.154150,0.298842,0.000000,0.000000,0.000000,0.020945,0.002631,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27496,0.493282,0.324442,1.0,0.166667,0.333333,0.055556,0.4,0.222222,0.2,0.222222,...,0.299823,0.200506,0.303167,0.004077,0.001139,0.001704,0.003224,0.018746,0.000574,1.0
27497,0.162494,0.013514,1.0,0.166667,0.666667,0.481481,0.2,0.333333,0.3,0.444444,...,0.220097,0.143026,0.287954,0.000254,0.000912,0.000000,0.001087,0.000419,0.003946,1.0
27498,0.934289,0.256757,0.0,0.166667,0.333333,0.314815,0.4,0.444444,0.4,0.333333,...,0.322289,0.278767,0.409263,0.004719,0.001489,0.003374,0.005850,0.011392,0.002938,1.0
27499,0.468311,0.027027,1.0,0.333333,0.333333,0.111111,0.3,0.555556,0.4,0.333333,...,0.247046,0.180048,0.319904,0.000000,0.000000,0.001607,0.003513,0.001688,0.003465,1.0


In [57]:
from scipy.stats import pearsonr

In [58]:
for eachValue in new_df.columns:    
    corr, _ = pearsonr(new_df[eachValue], new_df['default.payment.next.month'])
    print('{0} : {1}'.format( eachValue, corr))

ID : -0.03258462535330245
LIMIT_BAL : -0.3260097860725663
SEX : -0.2277106466185799
EDUCATION : -0.07161478120688382
MARRIAGE : -0.20178365144416355
AGE : -0.017251174503348482
PAY_0 : 0.38564706464376014
PAY_2 : 0.3363947939394312
PAY_3 : 0.3142304162623036
PAY_4 : 0.29143631467480724
PAY_5 : 0.278698973492368
PAY_6 : 0.2618801607395705
BILL_AMT1 : -0.03879643705307247
BILL_AMT2 : -0.0269700202457076
BILL_AMT3 : -0.02490673760418721
BILL_AMT4 : -0.0167023661278439
BILL_AMT5 : -0.009897695052861477
BILL_AMT6 : -0.0073358238541098375
PAY_AMT1 : -0.18181157936757408
PAY_AMT2 : -0.14770036078268609
PAY_AMT3 : -0.1433888848755252
PAY_AMT4 : -0.15899913695473944
PAY_AMT5 : -0.15206386704900843
PAY_AMT6 : -0.14028728478732824
default.payment.next.month : 0.999999999999997


In [59]:
new_df=new_df.drop('ID',axis=1)
new_df=new_df.drop('LIMIT_BAL',axis=1)
new_df=new_df.drop('SEX',axis=1)
new_df=new_df.drop('EDUCATION',axis=1)
new_df=new_df.drop('MARRIAGE',axis=1)
new_df=new_df.drop('AGE',axis=1)
new_df=new_df.drop('BILL_AMT1',axis=1)
new_df=new_df.drop('BILL_AMT2',axis=1)
new_df=new_df.drop('BILL_AMT3',axis=1)
new_df=new_df.drop('BILL_AMT4',axis=1)
new_df=new_df.drop('BILL_AMT5',axis=1)
new_df=new_df.drop('BILL_AMT6',axis=1)
new_df=new_df.drop('PAY_AMT1',axis=1)
new_df=new_df.drop('PAY_AMT2',axis=1)
new_df=new_df.drop('PAY_AMT3',axis=1)
new_df=new_df.drop('PAY_AMT4',axis=1)
new_df=new_df.drop('PAY_AMT5',axis=1)
new_df=new_df.drop('PAY_AMT6',axis=1)

In [60]:
new_df

Unnamed: 0,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,default.payment.next.month
0,0.1,0.222222,0.1,0.222222,0.222222,0.2,0.0
1,0.2,0.222222,0.2,0.222222,0.222222,0.2,0.0
2,0.2,0.111111,0.1,0.222222,0.222222,0.1,0.0
3,0.2,0.222222,0.4,0.222222,0.222222,0.2,0.0
4,0.0,0.000000,0.0,0.000000,0.111111,0.1,0.0
...,...,...,...,...,...,...,...
27496,0.4,0.222222,0.2,0.222222,0.222222,0.2,1.0
27497,0.2,0.333333,0.3,0.444444,0.444444,0.4,1.0
27498,0.4,0.444444,0.4,0.333333,0.333333,0.3,1.0
27499,0.3,0.555556,0.4,0.333333,0.333333,0.4,1.0


In [61]:
y_original=new_df['default.payment.next.month']
X_original=new_df.drop('default.payment.next.month', axis=1)

In [62]:
X, x_test, y,y_test=train_test_split(X_original,y_original,test_size=0.2)

# importing models

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score,f1_score 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler

In [84]:
n_splits=10
kf=KFold(n_splits=n_splits, random_state=None)

In [85]:
type(X)

pandas.core.frame.DataFrame

In [86]:
type(y)

pandas.core.series.Series

In [87]:

def k_fold_cross_validated_score(model,X,y):
    best_model=''
    best_accuracy=0
    average_accuracy=0
    scores=cross_val_score(model, X, y, cv=10 )
    best_accuracy=scores.max()
    average_accuracy=scores.mean()
    return best_model, best_accuracy, average_accuracy


In [93]:
def k_fold_cross_validated_result(model):
    best_model=''
    best_accuracy=0
    average_accuracy=0
    acc_score=[]
    
    for train_index, test_index in kf.split(X):
    
        print(len(train_index))
        x_train, x_heldout=X.iloc[train_index, : ], X.iloc[test_index, :]
        y_train, y_heldout=y[train_index, :], y[test_index, :]
        model.fit(x_train, y_train)
        pred_values = model.predict(x_heldout)
        
        acc = accuracy_score(pred_values, y_heldout)
        if acc > best_accuracy:
            best_accuracy=acc
            best_model=model
        acc_score.append(acc)
    average_accuracy=sum(acc_score)/len(acc_score)

    return best_model, best_accuracy, average_accuracy

# logistic regression

In [89]:
model=LogisticRegression()

In [90]:
k_fold_cross_validated_score(model, X, y)

('', 0.6872727272727273, 0.6752272727272727)

In [94]:
m, best_acc, avg_acc=k_fold_cross_validated_result(model)

print('Model : {0}, best Accuracy : {1}, Average Accuracy : {2}'.format(m, best_acc, avg_acc))

TypeError: object of type 'generator' has no len()