In [None]:
import pickle
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt 
from plotly.offline import iplot, init_notebook_mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import seaborn as sns

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

from imblearn.over_sampling import SMOTE

In [None]:
# Single seed shared by every stochastic step in the notebook.
random_seed = 8789
# Seed NumPy's legacy global generator.
np.random.seed(seed=random_seed)

In [None]:
# Read both competition files and promote the unnamed leading CSV column to the index.
df_train = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-training.csv").set_index("Unnamed: 0")
df_test = pd.read_csv("/kaggle/input/GiveMeSomeCredit/cs-test.csv").set_index("Unnamed: 0")

# EDA

In [None]:
# Preview the first ten training rows.
df_train.head(n=10)

In [None]:
# Per-column summary statistics of the training frame.
train_summary = df_train.describe()
train_summary

In [None]:
# Number of missing values per column.
null_counts = df_train.isna().sum()
print(null_counts)

**Here, we have some null values. Nulls in MonthlyIncome account for 19.82% of the rows in the training set, and nulls in NumberOfDependents account for 2.62% of the rows. We will try to impute them.**

In [None]:
# Split the training rows by target value so each class can be plotted separately.
target = df_train["SeriousDlqin2yrs"]
df_class0 = df_train.loc[target == 0]
df_class1 = df_train.loc[target == 1]

In [None]:
def proc_pkl(name, data):
    """Pickle ``data`` into the file ``<name>.pkl``.

    Parameters
    ----------
    name : str
        Path of the output file without the ``.pkl`` extension.
    data : object
        Any picklable object.

    Raises
    ------
    OSError
        If the file cannot be opened for writing.
    pickle.PicklingError
        If ``data`` cannot be pickled.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous open()/close() pair did not guarantee.
    with open(name+'.pkl', 'wb') as pkl_file:
        pickle.dump(data, pkl_file)

In [None]:
def func_np2df(data, source, add_target=False, label_name=None, copy_columns=False):
    """Wrap an array in a DataFrame that reuses the index of ``source``.

    Parameters
    ----------
    data : array-like
        Values to wrap; must have as many rows as ``source``.
    source : pandas.DataFrame
        Frame providing the index and, optionally, the label column and the
        column names.
    add_target : bool, default False
        When true, append ``source[label_name]`` as a column named ``"label"``.
    label_name : str, optional
        Column of ``source`` to copy when ``add_target`` is true.
    copy_columns : bool, default False
        When true, overwrite the column names with ``source.columns``. This is
        applied after the optional ``"label"`` column has been appended, so the
        column counts must match at that point.

    Returns
    -------
    pandas.DataFrame
        The new frame.
    """
    df = pd.DataFrame(data)
    df.index = source.index
    if add_target:
        df["label"] = source[label_name]
    if copy_columns:
        df.columns = source.columns
    # The former bare ``df.head(10)`` was removed: inside a function its
    # result was discarded, so it only cost time.
    return df

In [None]:
# Bar chart of the class balance, annotated with each class's share of the training rows.
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['g', 'r']

dict_count = {
    "No issue": df_class0.shape[0],
    "Fail in 2 years": df_class1.shape[0],
}

bars = ax.bar(list(dict_count.keys()), list(dict_count.values()))

for bar, color in zip(bars, colors):
    bar.set_color(color)
    # Percentage of all training rows represented by this bar, written just above it.
    share = round((bar.get_height() * 100 / df_train.shape[0]), 4)
    ax.text(bar.get_x() + bar.get_width() / 2.0, bar.get_height() * 1.025, str(share)+'%',
            fontsize=12, fontweight='bold', color=color, ha='center')

ax.set_title("Class balance of SeriousDlqin2yrs")
ax.set_ylabel("Number of rows")

plt.show()
# Close the figure explicitly; calling plt.clf() after plt.show() would
# create a fresh empty figure and leave it open.
plt.close(fig)

**We have a fairly imbalanced dataset where the positive class accounts for 6.68% of the dataset.**

In [None]:
# Annotated heatmap of the pairwise correlations between all training columns.
corr = df_train.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, cmap="Reds", annot=True, ax=ax)
ax.set_title("Correlation matrix of the training set")
plt.show()
# Close the figure explicitly; calling plt.clf() after plt.show() would
# create a fresh empty figure and leave it open.
plt.close(fig)

In [None]:
# Frequency table of each of the three past-due counters.
for past_due_column in (
    "NumberOfTime30-59DaysPastDueNotWorse",
    "NumberOfTime60-89DaysPastDueNotWorse",
    "NumberOfTimes90DaysLate",
):
    print(df_train[past_due_column].value_counts())

In [None]:
# Show the three past-due counters for rows where NumberOfTimes90DaysLate is 96 or 98.
past_due_columns = [
    "NumberOfTime30-59DaysPastDueNotWorse",
    "NumberOfTime60-89DaysPastDueNotWorse",
    "NumberOfTimes90DaysLate",
]
has_sentinel = df_train["NumberOfTimes90DaysLate"].isin([96, 98])
df_train.loc[has_sentinel, past_due_columns]

**Three features are highly correlated with each other: NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse and NumberOfTimes90DaysLate. They appear to always be equal when one of them is 96 or 98, so something looks off with those values. Not many rows are affected.**

In [None]:
# One row of boxplots per feature: all rows, class 0 only, class 1 only.
# columns[1:] skips the first column — assumes that is the SeriousDlqin2yrs target; TODO confirm.
for feature in df_train.columns[1:]:
    fig, axs = plt.subplots(1, 3, figsize=(10,5))
    sns.boxplot(y=df_train[feature], ax=axs[0])
    sns.boxplot(y=df_class0[feature], ax=axs[1], color="green")
    sns.boxplot(y=df_class1[feature], ax=axs[2], color="red")
    axs[0].set_title("All rows")
    axs[1].set_title("SeriousDlqin2yrs == 0")
    axs[2].set_title("SeriousDlqin2yrs == 1")
    plt.show()
    # Close each figure once shown; plt.clf() here would instead create an
    # empty figure on every iteration and leave it open.
    plt.close(fig)
print(df_train.columns[1:])

In [None]:
# Summary statistics restricted to the rows reporting a MonthlyIncome of exactly 0.
zero_income_rows = df_train.loc[df_train["MonthlyIncome"] == 0]
zero_income_rows.describe()

In [None]:
# One row of histograms (with KDE) per feature: all rows, class 0 only, class 1 only.
# columns[1:] skips the first column — assumes that is the SeriousDlqin2yrs target; TODO confirm.
for feature in df_train.columns[1:]:
    fig, axs = plt.subplots(1, 3, figsize=(20,5))
    sns.histplot(df_train[feature], kde=True, bins=100, ax=axs[0])
    sns.histplot(df_class0[feature], kde=True, bins=100, color="green", ax=axs[1])
    sns.histplot(df_class1[feature], kde=True, bins=100, color="red", ax=axs[2])
    axs[0].set_title("All rows")
    axs[1].set_title("SeriousDlqin2yrs == 0")
    axs[2].set_title("SeriousDlqin2yrs == 1")

    plt.show()
    # Close each figure once shown; plt.clf() here would instead create an
    # empty figure on every iteration and leave it open.
    plt.close(fig)
print(df_train.columns[1:])

In [None]:
# 95th percentiles of the two features, for reference when reading the plot.
print(df_train.DebtRatio.quantile(q=0.95))
print(df_train.MonthlyIncome.quantile(q=0.95))

# Log-log scatter of MonthlyIncome against DebtRatio, coloured by class.
fig, ax = plt.subplots(figsize=(15, 10))
sns.scatterplot(x=df_class0.DebtRatio, y=df_class0.MonthlyIncome, color='g', alpha=0.6, ax=ax)
sns.scatterplot(x=df_class1.DebtRatio, y=df_class1.MonthlyIncome, color='r', alpha=0.6, ax=ax)
# Reference lines at DebtRatio = 1 (black) and DebtRatio = 10 (blue).
income_max = df_train.MonthlyIncome.max()
ax.vlines(1, 0, income_max, colors="black")
ax.vlines(10, 0, income_max, colors="blue")
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_title("MonthlyIncome vs DebtRatio (log-log)")
plt.show()
# Close the figure explicitly; calling plt.clf() after plt.show() would
# create a fresh empty figure and leave it open.
plt.close(fig)

In [None]:
# Inspect rows whose MonthlyIncome lies in [0, 100): value frequencies, then a preview.
income = df_train["MonthlyIncome"]
low_income_mask = (income >= 0) & (income < 100)
mi = df_train.loc[low_income_mask]
print(mi["MonthlyIncome"].value_counts().sort_index())
mi.head(n=10)

## EDA Conclusions
**1. There are null values to handle.**

**2. Values of 96 and 98 for [NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate] seem wrong.**

**3. Some values are off the chart for certain features according to boxplots : [RevolvingUtilizationOfUnsecuredLines, DebtRatio, NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate]**

**4. There is a non-negligible number of rows with a MonthlyIncome of 0 or 1, which yield a very high debt ratio.**

**5. A few things definitely need more digging, but considering the age of the competition, that will be difficult. According to forum discussions, something might be wrong with some feature values. I will assume it is better to get rid of some rows instead of imputing or engineering extra features.**

# Imputation

In [None]:
from sklearn.impute import SimpleImputer

imp_monthlyincome = SimpleImputer(missing_values=np.nan, strategy='median')
imp_dependents = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Both imputers are fitted on the raw frame. Previously the mode imputer was
# fitted on the output of the median imputer, which no longer contained any
# NaN, so it never imputed anything and NumberOfDependents received the median.
train_median_filled = imp_monthlyincome.fit_transform(df_train)
# Fitted on a plain array so that later transform() calls on arrays do not
# raise feature-name warnings.
train_mode_filled = imp_dependents.fit_transform(df_train.to_numpy())

# Median-imputed values for every column ...
X_train = pd.DataFrame(train_median_filled, columns=df_train.columns, index=df_train.index)
# ... except NumberOfDependents, which takes the most frequent value.
dependents_pos = df_train.columns.get_loc("NumberOfDependents")
X_train["NumberOfDependents"] = train_mode_filled[:, dependents_pos]

In [None]:
# Apply the imputers fitted on the training data to the test frame. Each
# imputer transforms the raw test data; chaining them would leave nothing for
# the second one to fill.
test_median_filled = imp_monthlyincome.transform(df_test)
test_mode_filled = imp_dependents.transform(df_test.to_numpy())

# Median-imputed values for every column ...
X_test = pd.DataFrame(test_median_filled, columns=df_test.columns, index=df_test.index)
# ... except NumberOfDependents, which takes the most frequent training value.
dependents_pos = df_test.columns.get_loc("NumberOfDependents")
X_test["NumberOfDependents"] = test_mode_filled[:, dependents_pos]

# Cleaning

In [None]:
# Drop training rows whose MonthlyIncome is not strictly positive, printing
# the row count before and after.
print(X_train.shape[0])
# .copy() makes the filtered frame independent of its parent, so later
# in-place .loc assignments on X_train are unambiguous and do not trigger
# SettingWithCopyWarning.
X_train = X_train.loc[X_train.MonthlyIncome > 0].copy()
print(X_train.shape[0])

# Outliers transformation

In [None]:
# Replacement value written wherever a past-due counter holds 96 or 98.
# NOTE(review): the replacement values 14 / 12 / 18 are kept exactly as they were; their rationale is not documented here.
sentinel_replacements = {
    'NumberOfTime30-59DaysPastDueNotWorse': 14,
    'NumberOfTime60-89DaysPastDueNotWorse': 12,
    'NumberOfTimes90DaysLate': 18,
}

# Apply the same in-place rules to the training frame, then to the test frame.
for frame in (X_train, X_test):
    # Any utilisation above 13 is set to 14.
    frame.loc[frame['RevolvingUtilizationOfUnsecuredLines'] > 13, 'RevolvingUtilizationOfUnsecuredLines'] = 14
    for column, replacement in sentinel_replacements.items():
        frame.loc[frame[column].isin([96, 98]), column] = replacement

In [None]:
# Persist the cleaned frames as x_train.pkl and x_test.pkl.
for pkl_name, frame_to_save in (("x_train", X_train), ("x_test", X_test)):
    proc_pkl(pkl_name, frame_to_save)

In [None]:
# Standardise the training features; the target column is excluded.
train_features = X_train.drop(columns=["SeriousDlqin2yrs"])
scaler = StandardScaler()
X_standard = scaler.fit_transform(train_features)

# PCA & t-SNE

In [None]:
# pca = PCA(n_components=3)
# X_pca = pca.fit_transform(X_standard)

# if n_comp == 3:
#     X_train["pca_x"] = X_pca[:,0]
#     X_train["pca_y"] = X_pca[:,1]
#     X_train["pca_z"] = X_pca[:,2]

#     trace_label0 = go.Scatter3d(
#         x=X_train.loc[X_train.SeriousDlqin2yrs == 0]['pca_x'],
#         y=X_train.loc[X_train.SeriousDlqin2yrs == 0]['pca_y'],
#         z=X_train.loc[X_train.SeriousDlqin2yrs == 0]['pca_z'],
#         mode='markers',
#         marker=dict(
#             size=2,
#             color='rgb(255,0,0)',    
#         )
#     )

#     trace_label1 = go.Scatter3d(
#         x=X_train.loc[X_train.SeriousDlqin2yrs == 1]['pca_x'],
#         y=X_train.loc[X_train.SeriousDlqin2yrs == 1]['pca_y'],
#         z=X_train.loc[X_train.SeriousDlqin2yrs == 1]['pca_z'],
#         mode='markers',
#         marker=dict(
#             size=2,
#             color='rgb(0,255,0)',    
#         )
#     )

#     data = [trace_label0, trace_label1]
#     layout = go.Layout(
#         margin=dict(
#             l=0,
#             r=0,
#             b=0,
#             t=0  
#         )

#     )
#     fig = go.Figure(data=data, layout=layout)
#     iplot(fig)

In [None]:
def _tsne_class_trace(frame, label, columns, color, name):
    """Build the 3-D scatter trace for the rows of ``frame`` whose target equals ``label``."""
    # Filter once and reuse the subset for all three axes.
    subset = frame.loc[frame.SeriousDlqin2yrs == label]
    return go.Scatter3d(
        x=subset[columns[0]],
        y=subset[columns[1]],
        z=subset[columns[2]],
        mode='markers',
        name=name,
        marker=dict(
            size=2,
            color=color,
        )
    )


def func_tsne(index, X_source, dataset):
    """Run a 3-component t-SNE and prepare a plotly 3-D scatter of the result.

    Parameters
    ----------
    index : str
        Tag used in the pickle file name (``tsne_<index>.pkl``) and in the
        names of the columns added to ``X_source``.
    X_source : pandas.DataFrame
        Frame with a ``SeriousDlqin2yrs`` column and one row per row of
        ``dataset``. It is modified in place: columns ``tsne<index>_x``,
        ``tsne<index>_y`` and ``tsne<index>_z`` are added.
    dataset : array-like
        Feature matrix to embed.

    Returns
    -------
    tuple
        ``(data, layout)`` where ``data`` is a list of two ``go.Scatter3d``
        traces (class 0, then class 1) and ``layout`` is a ``go.Layout`` with
        zero margins.
    """
    # NOTE(review): recent scikit-learn releases appear to rename n_iter to
    # max_iter — confirm against the installed version.
    tsne = TSNE(n_components=3, verbose=2, n_iter=500)
    X_tsne = tsne.fit_transform(dataset)
    proc_pkl("tsne_"+index, X_tsne)

    # Store the three embedding coordinates on the source frame.
    columns = ['tsne'+index+'_x', 'tsne'+index+'_y', 'tsne'+index+'_z']
    for position, column in enumerate(columns):
        X_source[column] = X_tsne[:, position]

    # Colours follow the other plots in this notebook: class 0 is green and
    # class 1 is red (they were previously the other way round here).
    trace_label0 = _tsne_class_trace(X_source, 0, columns, 'rgb(0,255,0)', "No issue")
    trace_label1 = _tsne_class_trace(X_source, 1, columns, 'rgb(255,0,0)', "Fail in 2 years")

    data = [trace_label0, trace_label1]
    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )

    return data, layout

In [None]:
# data, layout = func_tsne("pca", X_train, X_pca) 
# fig = go.Figure(data=data, layout=layout)
# iplot(fig)

In [None]:
# X_t0 = X_train.loc[X_train.SeriousDlqin2yrs == 0].sample(frac=0.3, random_state=random_seed)
# X_t1 = X_train.loc[X_train.SeriousDlqin2yrs == 1].sample(frac=0.3, random_state=random_seed)

# X_tsne = pd.concat([X_t0, X_t1])
# X_tsne.describe()

# data, layout = func_tsne("Curated Dataset", X_tsne, X_tsne.drop(columns=["SeriousDlqin2yrs"]))
# fig = go.Figure(data=data, layout=layout)
# iplot(fig)

In [None]:
# oversample = SMOTE()
# X_smote, y_smote = oversample.fit_resample(X_train, X_train.SeriousDlqin2yrs)
# XY_smote = pd.DataFrame(X_smote)
# XY_smote.columns = X_train.columns
# XY_smote['label'] = y_smote

# X_t0 = XY_smote.loc[XY_smote.label == 0].sample(frac=0.3, random_state=random_seed)
# X_t1 = XY_smote.loc[XY_smote.label == 1].sample(frac=0.3, random_state=random_seed)
# X_tsne = pd.concat([X_t0, X_t1])
# X_tsne.describe()

# data, layout = func_tsne("SMOTE", X_tsne, X_tsne.drop(columns=["SeriousDlqin2yrs"]))
# fig = go.Figure(data=data, layout=layout)
# iplot(fig)

**Unfortunately, PCA and t-SNE (commented out to save time) do not yield interesting visual results.**

In [None]:
# A fractional n_components keeps as many components as are needed to explain
# that share of the variance.
pca = PCA(n_components=0.95)

# Training set: project, re-attach index and target, then pickle.
train_components = pca.fit_transform(X_standard)
X_pca = func_np2df(train_components, X_train, add_target=True, label_name="SeriousDlqin2yrs")
proc_pkl("pca", X_pca)

# Test set: reuse the scaler and PCA fitted on the training data.
test_features = X_test.drop(columns=["SeriousDlqin2yrs"])
X_standard_test = scaler.transform(test_features)
X_pca_test = func_np2df(pca.transform(X_standard_test), X_test, add_target=False)
proc_pkl("pca_test", X_pca_test)