# Home Credit Default Risk


This project predicts how capable each applicant is of repaying a loan, using data from the Kaggle competition [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk/overview).


In [None]:
# import library 
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import isnan
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler

from sklearn.model_selection import  train_test_split

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts is accepted; rows are the
                  true labels, columns the predicted labels)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low'];
                  pass None to keep the default numeric ticks

    title:        the text to display at the top of the matrix

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues
                  (defaults to 'Blues' when None)

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row without any sample is shown as zeros)

    Notes
    -----
    The accuracy and misclassification rate printed under the x axis are
    always computed from the raw counts. The colours of the heat map and the
    cell annotations are both taken from the same matrix (normalized or raw),
    so the colour bar matches the printed numbers.

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    counts = np.asarray(cm)

    # Summary figures come from the raw counts; guard the empty-matrix case.
    total = counts.sum()
    accuracy = np.trace(counts) / float(total) if total else 0.0
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Decide what is displayed BEFORE drawing, so that the image, the colour
    # bar and the annotations all describe the same numbers.
    if normalize:
        row_totals = counts.sum(axis=1, keepdims=True)
        # `where` skips rows whose total is zero; those rows keep the zeros
        # from `out` instead of becoming NaN.
        shown = np.divide(counts, row_totals,
                          out=np.zeros(counts.shape, dtype=float),
                          where=row_totals != 0)
        cell_format = "{:0.4f}"
        thresh = shown.max() / 1.5
    else:
        shown = counts
        cell_format = "{:,}"
        thresh = shown.max() / 2

    plt.figure(figsize=(8, 6))
    plt.imshow(shown, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # White text on dark cells, black text on light cells.
    for i, j in np.ndindex(*shown.shape):
        plt.text(j, i, cell_format.format(shown[i, j]),
                 horizontalalignment="center",
                 color="white" if shown[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Called after the labels are set so they are included in the layout.
    plt.tight_layout()
    plt.show()

In [None]:
# Load the Kaggle application tables: the labelled training set (it carries
# the TARGET column) and the test set that is scored at the end of the notebook.
train= pd.read_csv('../input/home-credit-default-risk/application_train.csv')
test= pd.read_csv('../input/home-credit-default-risk/application_test.csv')


In [None]:
# Report the (rows, columns) shape of both tables.
for label, frame in (('shape train data', train), ('shape test data', test)):
    print(label, frame.shape)

# dealing with missing data

First, we will drop the columns that contain more than 50% NaN values. Before we drop them, we will check whether these features are correlated with the target, i.e. whether the missing values themselves carry meaning.

In [None]:
# Share of missing values per column of the raw training table,
# plotted from the least to the most incomplete column.
missing_value_train = train.isna().mean()
fig, ax = plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
ax.plot(missing_value_train.sort_values())


In [None]:
# Working feature frames: an independent copy of the test table and the
# training table without its label column.
df_test = test.copy()
df_train = train.drop(columns=['TARGET'])

In [None]:
# Display the raw training table (TARGET column included).
train

In [None]:

# Impute missing values in df_train and df_test.
#
# The rule for each column is decided from the TRAINING missing-value share:
#   * share < 50%:  categorical (object) -> most frequent training value
#                   numeric              -> training mean
#   * share >= 50%: categorical (object) -> the literal 'missing'
#                   numeric              -> left as NaN here
#
# The fill value computed on the training column is applied to the test
# column as well, so both frames are imputed consistently and no statistic
# is estimated from the test set.
#
# Columns are reassigned (`df[col] = df[col].fillna(...)`) instead of calling
# `fillna(..., inplace=True)` on a selected column: that form is chained
# assignment and is not guaranteed to write back to the parent frame.
missing_value_train = df_train.isnull().mean()
# Kept for inspection only; the thresholds above come from the training data.
missing_value_test = test.isnull().mean()

for col in list(missing_value_train.index):
    is_categorical = df_train[col].dtype == 'object'

    if missing_value_train[col] < 0.5:
        if is_categorical:
            fill_value = df_train[col].value_counts().index[0]
        else:
            fill_value = df_train[col].mean()
    elif is_categorical:
        fill_value = 'missing'
    else:
        # Mostly-missing numeric column: nothing to fill at this stage.
        continue

    df_train[col] = df_train[col].fillna(fill_value)
    df_test[col] = df_test[col].fillna(fill_value)
    



In [None]:
# Share of missing values per training column after imputation, ascending.
missing_value_train2 = df_train.isna().mean()
fig, ax = plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
ax.plot(missing_value_train2.sort_values())

In this part, we transform the features with more than 50% missing data into binary indicator columns:

- 0 if the value is not NaN
- 1 if the value is NaN

In [None]:

# Turn every column that is at least 50% missing into a binary
# "value was missing" indicator: 0 where a value is present, 1 where it is NaN.
#
# The selection uses the SHARE of missing values (`.mean()`). With raw counts
# (`.sum()`), the test `> 0.5` would match any column holding a single NaN.
# `>= 0.5` mirrors the imputation cell, which leaves NaN only in columns whose
# missing share is not below 50%.
nan_values = df_train.isnull().mean()
pourcentage = pd.DataFrame(np.array(nan_values), index=nan_values.index)
pour = pourcentage.loc[pourcentage[0] >= 0.5]

# Build the indicators as a new frame rather than writing into a slice of
# df_train; cast to float so the columns hold 0.0 / 1.0.
a = df_train[pour.index].isnull().astype(float)
df_train[pour.index] = a

As the next plot shows, there are no NaN values left.

In [None]:
# Share of missing values per training column after the indicator transform, ascending.
missing_value_train3 = df_train.isna().mean()
fig, ax = plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
ax.plot(missing_value_train3.sort_values())

Now we will label-encode every categorical feature in the data.

In [None]:
# Integer-encode every object-dtype column. Each encoder is fitted on the
# training column, reused for the matching test column, and stored under the
# column name.
categorical_cols = [name for name in df_train.columns
                    if df_train[name].dtype == 'object']

label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    label_encoders[col] = encoder
    df_train[col] = encoder.fit_transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])

In [None]:
# List the training feature columns at this point of the pipeline.
df_train.columns

In [None]:
# Attach the label to the missing-indicator frame `a` so that the following
# cells can correlate the indicators with TARGET.
a['TARGET']=train['TARGET']

In [None]:
# Display the missing-indicator frame together with TARGET.
a

In [None]:
# Pairwise correlation matrix of the columns of `a`
# (the missing-value indicators plus TARGET).
corrolation=a.corr()
corrolation

In [None]:
# Heat map of the indicator/TARGET correlations. The upper triangle
# (diagonal included) is masked because the matrix is symmetric.
corrolation = a.corr()
cmap = sns.diverging_palette(230, 20, as_cmap=True)
mask = np.triu(np.ones_like(corrolation, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corrolation, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, ax=ax)

After transforming the columns with more than 50% NaN values into binary columns, it turns out there is no correlation between these features and the target, so we will delete them.

In [None]:
# Remove the mostly-missing (indicator) columns from both feature frames.
indicator_cols = pour.index
df_train = df_train.drop(columns=indicator_cols)
df_test = df_test.drop(columns=indicator_cols)

In [None]:
# Re-attach the label, then read the TARGET column of the correlation matrix:
# one correlation value per remaining column (TARGET itself included).
df_train['TARGET'] = train['TARGET']
target_coo = df_train.corr().loc[:, 'TARGET']
target_coo

In [None]:
# Correlation of every column with TARGET, in ascending order.
fig, ax = plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
ax.plot(target_coo.sort_values())



After exploring the correlations with the target, we will drop the columns whose absolute correlation with the target is below 0.05.

In [None]:
# Columns whose correlation with TARGET lies strictly between -0.05 and 0.05
# are removed from both feature frames.
is_weak = target_coo.abs() < 0.05
feauture_todrop = target_coo[is_weak]
df_train = df_train.drop(columns=feauture_todrop.index)
df_test = df_test.drop(columns=feauture_todrop.index)

In [None]:
# Correlation with TARGET for the columns that survived the filter, ascending.
after_drop = df_train.corr().loc[:, 'TARGET']
fig, ax = plt.subplots(figsize=(20, 15))
plt.xticks(rotation=90)
ax.plot(after_drop.sort_values())


In [None]:
# Heat map of the correlations among the retained columns. The matrix is
# computed once and its upper triangle (diagonal included) is masked.
feature_corr = df_train.corr()
cmap = sns.diverging_palette(230, 20, as_cmap=True)
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(feature_corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, ax=ax)

In [None]:
# Number of training rows per TARGET class.
df_train['TARGET'].value_counts()

In [None]:
# Class balance of TARGET as a bar chart.
# value_counts() orders its result by frequency, not by class label, so the
# counts are looked up by label (0, then 1) to make sure each bar sits at the
# x position of its own class; a class that never occurs is drawn as 0.
class_counts = df_train['TARGET'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(class_counts.index, height=class_counts.values)

In [None]:
# Split the training frame into the label vector and the feature matrix,
# both as plain NumPy arrays.
y = df_train['TARGET'].values
X = df_train.drop(columns=['TARGET']).values

In [None]:
# Feature-matrix shape before any resampling.
X.shape

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Resample X, y with a seeded random under-sampler; X and y are overwritten
# with the resampled arrays.
rus = RandomUnderSampler(random_state=0)
X,y = rus.fit_resample(X,y)

In [None]:
# Feature-matrix shape after under-sampling.
X.shape

In [None]:
from imblearn.over_sampling import SMOTE
# Seeded (like the under-sampler) so that a re-run generates the same
# synthetic samples.
# NOTE(review): X, y were already resampled by the under-sampling cell, so
# with default settings SMOTE presumably has little or nothing left to
# balance here - confirm whether this step is still needed.
oversample = SMOTE(random_state=0)
X, y = oversample.fit_resample(X, y)

In [None]:
# Standardise the feature matrix; the fitted scaler is reused later for the
# Kaggle test features.
# NOTE(review): the scaler is fitted on all of X before the train/test split
# in the next cell, so the hold-out rows also contribute to the fitted
# statistics - consider fitting on the training part only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded so the notebook
# gives the same partition on every run, and stratified on y so both parts
# keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=0)


In [None]:
# Spot Check Algorithms
# Candidate classifiers with default hyper-parameters, as (short name, estimator)
# pairs. The randomised estimators are seeded so repeated runs of the
# comparison give the same scores.
models = [
    ('RFC', RandomForestClassifier(random_state=0)),
    ('KNC', KNeighborsClassifier()),
    ('DTC', DecisionTreeClassifier(random_state=0)),
    ('GBC', GradientBoostingClassifier(random_state=0)),
]


In [None]:
# Score every candidate with 10-fold stratified cross-validation (accuracy).
# The splitter is seeded, so each model is evaluated on the same folds.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    names.append(name)
    results.append(cv_results)
    # name: mean accuracy (standard deviation across folds)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Compare Algorithms
from matplotlib import pyplot

# One box per model, built from its per-fold accuracies.
fig, ax = pyplot.subplots()
ax.boxplot(results, labels=names)
ax.set_title('Algorithm Comparison')
pyplot.show()

In [None]:

# Fit the gradient-boosting classifier on the training part and predict the
# hold-out part. Seeded so the fitted model (and every prediction made with
# it below) is reproducible.
gbc = GradientBoostingClassifier(random_state=0)
gbc.fit(X_train, y_train)
pred = gbc.predict(X_test)

In [None]:
# Confusion matrix of the hold-out predictions, drawn with the helper's
# default settings (proportions rather than raw counts).
cm = confusion_matrix(y_true=y_test, y_pred=pred)
plot_confusion_matrix(cm=cm, target_names=[0, 1])


In [None]:
# Scale the Kaggle test features with the scaler fitted on the training matrix.
# The columns are selected explicitly in the training feature order (a missing
# column fails loudly here) and passed as a plain array, matching how the
# scaler was fitted (on `.values`, i.e. without column names).
feature_cols = df_train.columns.drop('TARGET')
t = scaler.transform(df_test[feature_cols].values)

In [None]:
# Predict a class label for every row of the scaled test features.
test_predict =gbc.predict(t)

In [None]:
# Display the predicted labels.
test_predict

In [None]:
# Store the predictions on the raw test table as a new 'target' column.
test['target']= test_predict

In [None]:
# Distribution of the predicted classes as a bar chart.
# value_counts() is ordered by frequency and omits classes that were never
# predicted, so the counts are looked up by label (0, then 1): each bar sits
# at the x position of its own class and a missing class is drawn as 0
# instead of breaking the plot.
predicted_counts = test['target'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(predicted_counts.index, predicted_counts.values)

What I have learned:

- Imbalanced data may cause the model to overfit on one target class and ignore the other.
- Balancing the data by oversampling causes overfitting on the generated target class.
- The best idea is to undersample the majority target class.