In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every row and every column when a DataFrame is displayed (None = no limit).
# The options are set through the documented `pd.set_option` entry point instead of
# `pd.pandas.set_option`, which is not part of the documented pandas API.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Read the training and test application tables from the competition input folder.
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/home-credit-default-risk/application_train.csv',
        '/kaggle/input/home-credit-default-risk/application_test.csv',
    )
)


In [None]:
application_train.shape

In [None]:
application_test.shape

# **Feature Engineering**

### Change days to absolute

In [None]:
# Replace each day-offset column of the training frame with its absolute value.
for days_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    application_train[days_col] = application_train[days_col].abs()

In [None]:
# Replace each day-offset column of the test frame with its absolute value.
for days_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                 'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    application_test[days_col] = application_test[days_col].abs()

### Handling anomalous data in the train/test sets


In [None]:
# Treat the value 365243 in DAYS_EMPLOYED as missing by turning it into NaN.
# The result is assigned back to the column: calling `replace(..., inplace=True)` on
# a selected column is a chained in-place update, which recent pandas warns about
# and which leaves the frame unchanged once copy-on-write is enabled.
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].replace({365243: np.nan})

In [None]:

# Treat the value 365243 in DAYS_EMPLOYED as missing by turning it into NaN.
# The result is assigned back to the column: calling `replace(..., inplace=True)` on
# a selected column is a chained in-place update, which recent pandas warns about
# and which leaves the frame unchanged once copy-on-write is enabled.
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].replace({365243: np.nan})

### Handling missing values

In [None]:
def missing_data(data):
    """Summarise missing values per column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns: ``Total`` (number of
        null entries) and ``Percent`` (null entries as a percentage of the row
        count). Both are sorted in descending order before being joined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # `count()` on the boolean mask is the number of rows, since the mask has no nulls.
    null_percent = null_counts / null_mask.count() * 100
    sorted_parts = [
        null_counts.sort_values(ascending=False),
        null_percent.sort_values(ascending=False),
    ]
    return pd.concat(sorted_parts, axis=1, keys=['Total', 'Percent'])

In [None]:
missing_data(application_train).head(30)

Features that have more than 60% missing values:

In [None]:
# Columns to drop because of their share of missing values, grouped by name suffix.
missing_above_60 = [
    'OWN_CAR_AGE',
    # *_AVG
    'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'FLOORSMIN_AVG',
    'LIVINGAPARTMENTS_AVG', 'NONLIVINGAPARTMENTS_AVG',
    # *_MODE
    'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'FLOORSMIN_MODE',
    'LIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_MODE',
    # *_MEDI
    'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI', 'FLOORSMIN_MEDI',
    'LIVINGAPARTMENTS_MEDI', 'NONLIVINGAPARTMENTS_MEDI',
    'FONDKAPREMONT_MODE',
]

Dropping the features that have more than 60% missing values:

In [None]:
# Remove the columns listed in `missing_above_60` from the training frame.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated and no longer accepts in current releases.
application_train = application_train.drop(columns=missing_above_60)
application_train.shape

In [None]:
# Remove the columns listed in `missing_above_60` from the test frame.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated and no longer accepts in current releases.
application_test = application_test.drop(columns=missing_above_60)
application_test.shape

Features that have missing values, but fewer than 60% of them missing:

In [None]:
# Columns that contain missing values but are kept; grouped by name suffix.
missing_below_60 = [
    'OCCUPATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_3',
    # *_AVG
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
    'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG',
    'LANDAREA_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAREA_AVG',
    # *_MODE
    'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
    'LANDAREA_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAREA_MODE',
    # *_MEDI
    'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
    'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI',
    'LANDAREA_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAREA_MEDI',
    # building descriptors
    'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
    # AMT_REQ_CREDIT_BUREAU_*
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    'NAME_TYPE_SUITE', 'DAYS_LAST_PHONE_CHANGE', 'EXT_SOURCE_2',
]

In [None]:
# Training-frame view restricted to (and ordered by) the columns in `missing_below_60`.
train_miss = application_train.reindex(columns=missing_below_60)
train_miss.head()

In [None]:
# Test-frame view restricted to (and ordered by) the columns in `missing_below_60`.
test_miss = application_test.reindex(columns=missing_below_60)
test_miss.head()

In [None]:
train_miss.describe()

In [None]:
train_miss.shape

In [None]:
test_miss.describe()

In [None]:
test_miss.shape

In [None]:
train_miss.select_dtypes('number').columns

In [None]:
train_miss.select_dtypes('number').skew().abs()<1

In [None]:
test_miss.select_dtypes('number').skew().abs()<1

handling missing values with mean

In [None]:
# Columns whose missing values are filled with the column mean.
miss_mean = [
    'EXT_SOURCE_1',
    'EXT_SOURCE_3',
    'DAYS_LAST_PHONE_CHANGE',
    'EXT_SOURCE_2',
]

In [None]:
 for feature in miss_mean:
    application_train[feature].fillna(application_train[feature].mean(),inplace=True)
    application_test[feature].fillna(application_test[feature].mean(),inplace=True)

handling missing values with median

In [None]:
# Columns whose missing values are filled with the column median; grouped by suffix.
miss_median = [
    # *_AVG
    'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG',
    'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG',
    'LANDAREA_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAREA_AVG',
    # *_MODE
    'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
    'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE',
    'LANDAREA_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAREA_MODE',
    # *_MEDI
    'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI',
    'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI',
    'LANDAREA_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAREA_MEDI',
    'TOTALAREA_MODE',
    # AMT_REQ_CREDIT_BUREAU_*
    'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY',
    'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON',
    'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR',
    'DEF_60_CNT_SOCIAL_CIRCLE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DAYS_EMPLOYED',
]

In [None]:
# Fill the missing values of every column in `miss_median` with that column's median.
# Each frame is filled with its own median, exactly as before.
# NOTE(review): imputing the test frame with the training median would keep both
# frames on the same reference value -- confirm which behaviour is intended.
# The filled column is assigned back instead of calling `fillna(..., inplace=True)`
# on a selected column: that chained in-place update triggers warnings on recent
# pandas and has no effect once copy-on-write is enabled.
for feature in miss_median:
    application_train[feature] = application_train[feature].fillna(application_train[feature].median())
    application_test[feature] = application_test[feature].fillna(application_test[feature].median())

In [None]:
train_miss.select_dtypes('object').columns

In [None]:
test_miss.select_dtypes('object').columns

handling missing values with mode

In [None]:
# Columns whose missing values are filled with the most frequent value.
miss_mode = [
    'OCCUPATION_TYPE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
    'NAME_TYPE_SUITE',
]

In [None]:
# Fill the missing values of every column in `miss_mode` with that column's most
# frequent value (`mode()[0]` is the first entry of the mode result).
# Each frame is filled with its own mode, exactly as before.
# The filled column is assigned back instead of calling `fillna(..., inplace=True)`
# on a selected column: that chained in-place update triggers warnings on recent
# pandas and has no effect once copy-on-write is enabled.
for feature in miss_mode:
    application_train[feature] = application_train[feature].fillna(application_train[feature].mode()[0])
    application_test[feature] = application_test[feature].fillna(application_test[feature].mode()[0])

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# The frame still contains string-valued columns at this point (they are label-encoded
# further down), and current pandas raises inside `corr()` instead of silently
# skipping them, so the non-numeric columns are filtered out explicitly.
application_train_corr = application_train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Draw the correlation matrix as a square-celled heatmap on a 30x30 inch figure.
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(application_train_corr, square=True, ax=ax)
plt.show()

- 'CNT_CHILDREN' and 'CNT_FAM_MEMBERS' have a high correlation.

- 'AMT_GOODS_PRICE' and 'AMT_CREDIT' are perfectly correlated.

- 'AMT_GOODS_PRICE' and 'AMT_ANNUITY' have a high correlation.

- 'AMT_ANNUITY' and 'AMT_CREDIT' have a high correlation.

- 'OBS_30_CNT_SOCIAL_CIRCLE' and 'OBS_60_CNT_SOCIAL_CIRCLE' are perfectly correlated.

- 'DEF_30_CNT_SOCIAL_CIRCLE' and 'DEF_60_CNT_SOCIAL_CIRCLE' have a high correlation.


We need to remove one feature from each correlated pair. 'AMT_GOODS_PRICE', 'AMT_CREDIT' and 'AMT_ANNUITY' are all correlated with one another, so only one of the three is kept ('AMT_CREDIT').

In [None]:
# One feature from each highly correlated pair is removed from both frames.
remove_features = ['CNT_FAM_MEMBERS', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE']
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated and no longer accepts in current releases.
application_train = application_train.drop(columns=remove_features)
application_test = application_test.drop(columns=remove_features)

In [None]:
missing_data(application_train).head()

In [None]:
missing_data(application_test).head()

In [None]:
application_train.shape

In [None]:
application_test.shape

### Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Sub-frame holding only the object-dtype columns of the training frame;
# the cell displays their names.
cat_features = application_train.select_dtypes('object')
cat_features.columns

In [None]:
# Label-encode every object-dtype column: the encoder is fitted on the training
# column and the same mapping is then applied to the test column.
# The columns are selected with `select_dtypes` instead of comparing against
# `np.object`, an alias that has been removed from NumPy; this also avoids building
# an encoder for columns that are never encoded.
# NOTE(review): `transform` is expected to raise if the test column contains a
# label that was not present in the training column -- confirm that cannot happen.
for col in application_train.select_dtypes(include='object').columns:
    le = LabelEncoder()
    application_train[col] = le.fit_transform(application_train[col])
    application_test[col] = le.transform(application_test[col])

In [None]:
application_train.sample(10)

In [None]:
application_test.sample(10)

In [None]:
# Feature matrices: the identifier column is removed from both frames and the
# target column from the training frame; `y` is the training target.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which pandas
# deprecated and no longer accepts in current releases.
Xtrn = application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
Xtst = application_test.drop(columns=['SK_ID_CURR'])
y = application_train['TARGET']

In [None]:
Xtrn.shape

In [None]:
y.value_counts()


### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the rows as a validation set; the split is stratified on the
# target and uses a fixed random_state.
X_train,X_val,y_train,y_val = train_test_split(Xtrn,y,stratify = y,test_size = 0.05,random_state = 10)

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same scaling to the
# validation split and to the test features.
scaler = StandardScaler()
train_feature_names = Xtrn.columns

X_train_transform = pd.DataFrame(scaler.fit_transform(X_train), columns=train_feature_names)
X_val_transform = pd.DataFrame(scaler.transform(X_val), columns=train_feature_names)
X_test_transform = pd.DataFrame(scaler.transform(Xtst), columns=Xtst.columns)

X_train_transform.head()

# Over Sampling

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE oversampler configured with a fixed random_state.
oversampling = SMOTE(random_state = 10)

In [None]:
# Resample the scaled training split only (the validation split is not passed in),
# then display the class counts of the resampled target.
X_over, y_over = oversampling.fit_resample(X_train_transform, y_train)
y_over.value_counts()

In [None]:
# Share of each class in the resampled target.
# The bar labels come from the value_counts index (sorted by class label) rather
# than a hard-coded [0, 1]: value_counts orders its rows by frequency, so a fixed
# label list can end up attached to the wrong bar.
class_share = y_over.value_counts(normalize=True).sort_index()
sns.barplot(x=class_share.index, y=class_share.values)

# DeepLearning Model Using Pytorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

**Converting the data to PyTorch tensors**

In [None]:

# Features become float32 tensors and targets int64 tensors; all are moved to `device`.
pt_Xtrain = torch.tensor(X_over.values, dtype=torch.float32).to(device)
pt_Xval = torch.tensor(X_val_transform.values, dtype=torch.float32).to(device)
pt_Xtest = torch.tensor(X_test_transform.values, dtype=torch.float32).to(device)

# `squeeze()` removes any size-1 dimension from the target tensors.
pt_ytrain = torch.tensor(y_over.values, dtype=torch.int64).squeeze().to(device)
pt_yval = torch.tensor(y_val.values, dtype=torch.int64).squeeze().to(device)

**Splitting the dataset into mini-batches**

In [None]:
from torch.utils.data import Dataset, DataLoader
class Data(Dataset):
    """Dataset pairing a feature tensor with a target tensor, indexed row by row.

    Parameters
    ----------
    x : tensor, optional
        Feature tensor. Defaults to the notebook-level ``pt_Xtrain``.
    y : tensor, optional
        Target tensor. Defaults to the notebook-level ``pt_ytrain``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows.

    Calling ``Data()`` without arguments behaves as before; passing ``x`` and ``y``
    allows the same class to wrap any other pair of tensors.
    """

    def __init__(self, x=None, y=None):
        # The globals are only looked up when no explicit tensor is supplied.
        self.x = pt_Xtrain if x is None else x
        self.y = pt_ytrain if y is None else y
        if self.x.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"x and y must have the same number of rows, got {self.x.shape[0]} and {self.y.shape[0]}"
            )
        self.len = self.x.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.len

In [None]:
data_set = Data()
# Serve the training data in batches of 512 rows. `shuffle=True` draws the rows in
# a new random order every epoch, so a batch does not simply follow the row order
# of the resampled training frame.
# NOTE(review): if the oversampler appends its synthetic rows at the end, unshuffled
# batches near the end would contain a single class -- confirm against imblearn.
trainloader = DataLoader(dataset=data_set, batch_size=512, shuffle=True)

**ANN model with 2 hidden layers**

In [None]:
class ANN(nn.Module):
    """Fully connected network: ``in_features`` -> 64 -> 42 -> 2 output scores.

    Leaky-ReLU is applied after the first two linear layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self, in_features):
        """Create the three linear layers for inputs with ``in_features`` columns."""
        super().__init__()
        self.in_layer = nn.Linear(in_features, 64)
        self.hid_layer1 = nn.Linear(64, 42)
        self.out_layer = nn.Linear(42, 2)

    def forward(self, x):
        """Return the two-column output of the network for the batch ``x``."""
        activations = x
        for layer in (self.in_layer, self.hid_layer1):
            activations = F.leaky_relu(layer(activations))
        return self.out_layer(activations)


In [None]:
# Build the network with one input per column of the training tensor and move it
# to the selected device.
ann = ANN(pt_Xtrain.shape[1]).to(device)

**Crossentropy loss as loss function**

In [None]:
# Loss applied to the network's two-column output and the integer class targets.
criterion = nn.CrossEntropyLoss()

**Adam as optimizer**

In [None]:
# Adam optimizer over all parameters of `ann`, with a learning rate of 0.001.
optimizer = torch.optim.Adam(ann.parameters(), lr=0.001)

**Training ANN**

In [None]:
epochs = 10
losses = []  # one plain float per epoch: the sample-weighted mean training loss

ann.train()
for i in range(epochs):
    running_loss = 0.0
    samples_seen = 0
    # The batch variables have their own names so that the loop does not overwrite
    # the notebook-level target `y`.
    for batch_x, batch_y in trainloader:
        # Train on the current mini-batch. Previously every iteration ignored the
        # batch and ran a forward/backward pass over the full `data_set.x` /
        # `data_set.y`, so the loader only controlled how many full-data steps ran.
        logits = ann(batch_x)
        loss = criterion(logits, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # `.item()` extracts a Python float, so no autograd graph or device tensor
        # is kept alive by the bookkeeping.
        batch_size = batch_x.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    epoch_loss = running_loss / samples_seen
    losses.append(epoch_loss)
    print(f'epoch: {i:2}  loss: {epoch_loss:10.8f}')

In [None]:
### Plot the loss function
# `float(...)` turns every recorded loss into a plain number. It accepts both Python
# floats and single-element tensors, including tensors that require grad or live on
# the GPU, which matplotlib cannot convert on its own.
loss_values = [float(epoch_loss) for epoch_loss in losses]
plt.figure(2)
# The x axis follows the number of recorded losses, so an interrupted training run
# can still be plotted.
plt.plot(range(len(loss_values)), loss_values)
plt.ylabel('Loss')
plt.xlabel('epoch');

**ANN Accuracy**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

classes = ['0', '1']

# Score the validation set without tracking gradients and move everything to the CPU
# for scikit-learn. The predicted class is the column index of the larger output.
with torch.no_grad():
    y_pred = ann(pt_Xval).cpu()
    predicted = y_pred.max(dim=1).indices
    pt_yval = pt_yval.cpu()

report = classification_report(pt_yval, predicted, target_names=classes)
print(report)

**Confusion Matrix**

In [None]:
# Confusion matrix of the validation predictions, labelled with the class names.
cm = confusion_matrix(pt_yval, predicted)
df_cm = pd.DataFrame(cm, index=classes, columns=classes)

# Annotated heatmap with integer cell labels; tick labels are re-applied with the
# desired rotation and alignment.
hmap = sns.heatmap(df_cm, annot=True, fmt="d")
hmap.set_yticklabels(hmap.get_yticklabels(), rotation=0, ha='right')
hmap.set_xticklabels(hmap.get_xticklabels(), rotation=30, ha='right')
hmap.set_ylabel('True label')
hmap.set_xlabel('Predicted label');

**Prediction on test set**

In [None]:
# Score the test set without tracking gradients; the predicted class is the column
# index of the larger output.
with torch.no_grad():
    y_pred = ann(pt_Xtest).cpu()
    predicted = y_pred.max(dim=1).indices

print(y_pred)
predicted

In [None]:
# Wrap the predicted class indices in a DataFrame and display how often each
# class was predicted.
df_predicted = pd.DataFrame(predicted.numpy())
df_predicted.value_counts()