In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

In [None]:
# Load, in this order: the training applications, the test applications and
# the sample submission file.
application_train, application_test, application_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
        '../input/home-credit-default-risk/sample_submission.csv',
    )
)

In [None]:
# Percentage of missing values per column, keeping only the columns that have
# at least one missing value.
#
# This is computed in one vectorised pass rather than by appending one row per
# column: DataFrame.append was removed in pandas 2.0, and growing a frame row
# by row copies it on every iteration.
missing_data_application_train = application_train.isnull()

# The mean of a boolean column is the fraction of True values, i.e. the
# fraction of rows in which that column is missing.
missing_percentage = missing_data_application_train.mean() * 100

# Two-column frame ('Column', 'Percentage') in the original column order,
# with a fresh 0..k-1 index.
most_missing = (
    missing_percentage[missing_percentage > 0]
    .rename_axis('Column')
    .reset_index(name='Percentage')
)

In [None]:
# Columns with more than 20% missing values, worst first, rounded for display.
# The filter is evaluated on the already-sorted frame so the boolean mask
# shares its index; a mask built from the unsorted frame makes pandas warn
# that the key "will be reindexed to match DataFrame index".
(
    most_missing
    .sort_values(by='Percentage', ascending=False)
    .loc[lambda frame: frame['Percentage'] > 20]
    .round(1)
)

In [None]:
#Delete non important columns that we won't take into account in our model
# (those with more than 20% missing values in the training set).
# The names are read directly from the 'Column' field instead of being parsed
# out of the printed representation of each row, which breaks as soon as a
# name contains whitespace.
columns_to_drop = most_missing.loc[most_missing['Percentage'] > 20, 'Column'].tolist()

# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and there is nothing left to drop.
application_train.drop(columns=columns_to_drop, errors='ignore', inplace=True)
application_test.drop(columns=columns_to_drop, errors='ignore', inplace=True)

def _one_hot_encode_object_columns(frame):
    """Replace every object-dtype column of ``frame`` with dummy/indicator columns.

    Each object column is expanded with ``pd.get_dummies(..., drop_first=True)``.
    The dummy columns keep the bare category value as their name (no prefix),
    so identical category values coming from different source columns produce
    duplicate column names.

    Args:
        frame: DataFrame to encode; it is not modified.

    Returns:
        A new DataFrame holding the non-object columns in their original order,
        followed by the dummy columns of each object column in turn.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    object_columns = [name for name, dtype in frame.dtypes.items() if dtype == object]
    dummy_frames = [pd.get_dummies(frame[name], drop_first=True) for name in object_columns]
    #Drop the old columns with categorical variables no longer useful
    return pd.concat([frame.drop(columns=object_columns)] + dummy_frames, axis=1)


#TRAIN: Convert categorical variable into dummy/indicator variables with get_dummies and concat them to the df
application_train = _one_hot_encode_object_columns(application_train)

#TEST: Convert categorical variable into dummy/indicator variables with get_dummies and concat them to the df
application_test = _one_hot_encode_object_columns(application_test)

In [None]:
# Display the TARGET column of the training frame
application_train.loc[:, 'TARGET']

In [None]:
def getDuplicatesWithCount(listOfElems):
    """Count how often each element occurs and keep only the repeated ones.

    Args:
        listOfElems: iterable of hashable elements.

    Returns:
        dict mapping every element that occurs more than once to its number
        of occurrences, ordered by first appearance.
    """
    # First pass: tally every element.
    occurrences = {}
    for item in listOfElems:
        occurrences[item] = occurrences.get(item, 0) + 1

    # Second pass: keep only the elements seen at least twice.
    duplicates = {}
    for item, count in occurrences.items():
        if count > 1:
            duplicates[item] = count
    return duplicates

In [None]:
# Print every column name that occurs more than once in the training frame,
# together with how many times it occurs.
dictOfElems = getDuplicatesWithCount(application_train.columns)
for column_name in dictOfElems:
    print(column_name, ' :: ', dictOfElems[column_name])

In [None]:
# Remove these dummy columns from each frame by label, in place
application_train.drop(columns=['Maternity leave', 'Unknown', 'XNA', 'Y'], inplace=True)
application_test.drop(columns=['Y', 'XNA'], inplace=True)

In [None]:
# Number of distinct column names in the test frame
len(set(application_test.columns))

In [None]:
# Columns that exist in the training frame but not in the test frame
missing = [
    column
    for column in application_train.columns
    if column not in application_test.columns
]
missing

In [None]:
#Save features and target names in separate variables.
# Features are chosen BY NAME: every column present in both frames except the
# row identifier and the label. Selecting by position (iloc[:, 2:] on train,
# iloc[:, 1:] on test) is unsafe here because TARGET is not at a fixed position
# once the frame has been rebuilt, so the label ended up inside X_train and the
# train/test columns no longer lined up.
features = [
    column
    for column in application_train.columns
    if column in application_test.columns and column not in ('SK_ID_CURR', 'TARGET')
]
target = 'TARGET'

#Put aside TARGET column in app_train
application_train_target = pd.DataFrame({'TARGET': application_train['TARGET']})

#Clean data from NA values, fill them with the mean() of the training data.
# The same training means are used for the test set so both sets are imputed
# identically.
train_means = application_train[features].mean()
train_features = application_train[features].fillna(train_means)
test_features = application_test[features].fillna(train_means)

#Standardise the features: the scaler is fitted on the training data only and
# the SAME transform is applied to the test data, so a given raw value maps to
# the same scaled value in both sets.
Scaler1 = StandardScaler()
application_train = pd.DataFrame(
    Scaler1.fit_transform(train_features),
    columns=features,
    index=train_features.index,
)
application_test = pd.DataFrame(
    Scaler1.transform(test_features),
    columns=features,
    index=test_features.index,
)

#Reassign TARGET in app_train (left unscaled)
application_train['TARGET'] = application_train_target['TARGET']

#Create n dimensional arrays with features ('X_train') and the targets for each ('y_train')
X_train = application_train[features].values
y_train = application_train['TARGET'].values

#Create n dimensional arrays with features ('X_test')
X_test = application_test[features].values

# Train and test must expose exactly the same feature columns; this count is
# also the input width the network has to be built with.
assert X_train.shape[1] == X_test.shape[1] == len(features)


In [None]:
# Display the training frame as rebuilt by the preprocessing cell above
# (scaled columns with TARGET re-attached).
application_train

We import everything we need from PyTorch:

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

We define the dataset classes that will feed our data loaders:

In [None]:
## train data
class trainData(Dataset):
    """Map-style dataset that pairs each feature row with its target."""

    def __init__(self, X_data, y_data):
        # Both containers are stored as given and indexed in parallel.
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The size is defined by the feature container alone.
        return len(self.X_data)

    def __getitem__(self, index):
        # Return the (features, target) pair stored at ``index``.
        sample = self.X_data[index]
        label = self.y_data[index]
        return sample, label


# Convert the training arrays to float tensors and wrap them in the dataset
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)
train_data = trainData(X_train_tensor, y_train_tensor)
## test data
class testData(Dataset):
    """Map-style dataset over feature rows only (no targets)."""

    def __init__(self, X_data):
        # The container is stored as given.
        self.X_data = X_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        # Return the feature row stored at ``index``.
        sample = self.X_data[index]
        return sample
    

# Convert the test array to a float tensor and wrap it in the dataset
X_test_tensor = torch.FloatTensor(X_test)
test_data = testData(X_test_tensor)

In [None]:
# Training hyper-parameters
batch_size, n_epochs = 200, 300
# Number of complete batches contained in the training set
batch_no = len(X_train) // batch_size

# Training batches are reshuffled every epoch; the test loader yields one row
# at a time, in order.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

Let's check how many features we have after preprocessing, so that this number matches the input size of the neural network:

In [None]:
# Number of feature columns. Net's first Linear layer below is built with an
# input width of 148, so this value has to equal 148 for the model to accept
# the data.
len(features)

In [None]:
class Net(nn.Module):
    """Feed-forward binary classifier producing one raw logit per sample.

    Architecture: three hidden blocks of Linear -> ReLU -> BatchNorm1d with
    widths 296, 148 and 74, then dropout (p=0.1) and a final Linear layer with
    a single output. No sigmoid is applied in ``forward``; the output is a
    logit.

    Args:
        n_features: number of input features per sample. Defaults to 148, the
            width this network was originally hard-coded to.
    """

    def __init__(self, n_features=148):
        super().__init__()
        self.layer1 = nn.Linear(n_features, 296)
        self.layer2 = nn.Linear(296, 148)
        self.layer3 = nn.Linear(148, 74)
        self.layer_out = nn.Linear(74, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(296)
        self.batchnorm2 = nn.BatchNorm1d(148)
        self.batchnorm3 = nn.BatchNorm1d(74)

    def forward(self, inputs):
        """Map a ``(batch, n_features)`` float tensor to ``(batch, 1)`` logits."""
        x = self.batchnorm1(self.relu(self.layer1(inputs)))
        x = self.batchnorm2(self.relu(self.layer2(x)))
        x = self.batchnorm3(self.relu(self.layer3(x)))
        # Dropout is applied only once, just before the output layer.
        x = self.dropout(x)
        return self.layer_out(x)



In [None]:
# Number of GPUs to use; 0 forces the CPU
ngpu = 1

# Pick the first CUDA device when one is available and GPU use is enabled
if torch.cuda.is_available() and ngpu > 0:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Build the network, move its parameters to the chosen device and show it
model = Net().to(device)
print(model)

In [None]:
# Binary cross-entropy computed on raw logits: this loss applies the sigmoid
# itself, which matches Net.forward returning the output of its last Linear
# layer without any activation.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Adam over all model parameters.
# NOTE(review): lr=0.1 is far above Adam's documented default of 1e-3 —
# confirm this learning rate is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

Besides the loss, we also want to track the accuracy of the model during training, so let's write a function for it:

In [None]:
def binary_acc(y_pred, y_test):
    """Accuracy of binary predictions, as a whole-number percentage.

    Args:
        y_pred: tensor of raw logits.
        y_test: tensor of 0/1 labels, compared element-wise with the
            predictions.

    Returns:
        A scalar tensor: the number of matching entries divided by
        ``y_test.shape[0]``, times 100, rounded to the nearest integer value.
    """
    # Logit -> probability -> hard 0/1 label.
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)

In [None]:
# Train for n_epochs passes over the training loader and report, after each
# epoch, the loss and accuracy averaged over its batches.
# The unfinished statement that used to end this cell
# ("if epoch_acc < epoch_acc += acc.item()") was a syntax error that stopped
# the whole cell from running; it has been removed.
model.train()
for e in range(1, n_epochs+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        # Targets arrive with shape (batch,) while the model outputs
        # (batch, 1); add the trailing dimension so the shapes match.
        y_batch = y_batch.unsqueeze(1)

        optimizer.zero_grad()

        output = model(X_batch)
        loss = criterion(output, y_batch)
        acc = binary_acc(output, y_batch)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# Predict a hard 0/1 label for every test row.
# eval() switches the model to inference behaviour and no_grad() disables
# gradient tracking. The test loader yields one row per batch, so squeezing
# each prediction leaves a single Python float per row.
model.eval()
y_pred_list = []
with torch.no_grad():
    for X_batch in test_loader:
        logits = model(X_batch.to(device))
        labels = torch.round(torch.sigmoid(logits))
        y_pred_list.append(labels.cpu().numpy().squeeze().tolist())

In [None]:
# Predicted defaults (label 1) as a percentage of predicted non-defaults
# (label 0). This is computed from y_pred_list directly because `submission`
# is only created in a later cell, so referring to it here fails when the
# notebook is run top to bottom.
predicted_labels = pd.Series(y_pred_list)
perc = (predicted_labels == 1).sum() / (predicted_labels == 0).sum() * 100
perc

In [None]:
# Number of test rows predicted as label 1. It is printed explicitly because
# a bare expression that is not the last line of a cell is never displayed.
print(sum(y_pred_list))

# Count of each predicted label. The values are passed as x= because the first
# positional argument of countplot meant `x` before seaborn 0.12 and means
# `data` from 0.12 on.
sns.countplot(x=y_pred_list)

In [None]:
# Pair each SK_ID_CURR from the sample submission with its predicted label
# and write the result, without the index, to submission.csv
submission = application_sub[['SK_ID_CURR']].assign(TARGET=y_pred_list)
submission.to_csv('submission.csv', index=False)