In [None]:
import pandas as pd
import numpy as np
import os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn below
plt.style.use('fivethirtyeight')

# Ignore every warning for the rest of the session. NOTE(review): this also hides
# deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
# List the contents of the ../input directory to see which data files are available
os.listdir('../input')

Application train and application test are the two main files here! Train has the *TARGET* variable which has the defaulted (1) or not defaulted (0) outcome. Each row in these files is a loan.

We will focus on these files first in this kernel!

In [None]:
# Load the two main application tables (train and test) into dataframes
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/application_train.csv',
                     '../input/application_test.csv')
)

In [None]:
# Rows x columns of each table; train carries the extra TARGET column
for frame in (app_train, app_test):
    print(frame.shape)

In [None]:
# What are the column types? Count the columns per dtype once and reuse the
# result for both the bars and their labels.
dtype_counts = app_train.dtypes.value_counts()

plt.figure(figsize = (10,6))
ax = dtype_counts.plot(kind = 'bar', rot = 0)
ax.set_title('Count of Column Data Types')
ax.set_xlabel('Data Type')
ax.set_yticks([])
ax.grid(False)

# every patch on the axes is one bar; write its count just below the top edge
for bar, n_columns in zip(ax.patches, dtype_counts.tolist()):
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(bar_centre, bar.get_height() - 5, n_columns,
            ha='center', va='bottom')

In [None]:
# define a function to get the missing values
def missing_check(df):
    '''Summarise the columns of a dataframe that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or str
        When at least one column has nulls: one row per such column, sorted by
        descending percentage missing, with the columns ``variable`` (column
        name), ``missing_values`` (null count), ``Perc_Missing`` (null count as
        a percentage of ``len(df)``) and ``Perc_total`` (constant 100, handy as
        the full-length bar when plotting). The row labels are the positions of
        the columns in ``df``.
        When no column has nulls: the string "No columns with missing values".
    '''
    missing_df = df.isnull().sum().reset_index()
    missing_df.columns = ['variable', 'missing_values']
    missing_df['Perc_Missing'] = missing_df['missing_values']*100/len(df)

    # Sort first, then filter, and take an explicit copy so that adding the
    # Perc_total column below never writes into a slice of another frame.
    ranked = missing_df.sort_values('Perc_Missing', ascending = False)
    with_missing = ranked.loc[ranked['Perc_Missing'] > 0, :].copy()

    if len(with_missing) == 0:
        return "No columns with missing values"

    with_missing['Perc_total'] = 100
    return with_missing

In [None]:
# Visualise the columns with the highest share of missing entries
abc = missing_check(app_train).reset_index(drop = True)

# label-based slice on the fresh 0..n index: rows 0 through 15 inclusive
top_missing = abc.loc[:15]

plt.figure(figsize = (10, 10))
# draw the full-length bar first, then the missing share on top of it
for value_col, legend_text in (('Perc_total', "Total rows"),
                               ('Perc_Missing', "Missing rows")):
    plt.barh(top_missing['variable'], top_missing[value_col], label = legend_text)
plt.legend()

We see many columns with greater than 60% missing values. We will have to figure out a strategy to impute these or remove these columns!

In [None]:
# Look up the dtype of each column that has missing values; app_train.dtypes is
# indexed by column name, so it can serve directly as the mapping
abc['Column_Type'] = abc['variable'].map(app_train.dtypes)
abc.head()

So what type of columns have the most missing values?

In [None]:
# Bar chart: how many of the columns with missing values are of each dtype
plt.figure(figsize = (10,6))

# matplotlib 3.6 renamed 'seaborn-darkgrid' to 'seaborn-v0_8-darkgrid' and later
# releases dropped the old name, so use whichever one this installation offers
for style_name in ('seaborn-darkgrid', 'seaborn-v0_8-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# count once; the same series feeds both the bars and their labels
missing_type_counts = abc['Column_Type'].value_counts()

ax = missing_type_counts.plot(kind = 'bar')
ax.grid(False)
ax.set_title('Column Type count with missing values')
ax.set_xlabel('Column Type')
ax.set_ylabel('Count of columns')
ax.set_yticks([])

rects = ax.patches
labels = missing_type_counts.tolist()

# write each count just below the top of its bar, centred like the dtype chart above
for rect, label in zip(rects, labels):
    height = rect.get_height()
    width = rect.get_width()
    ax.text(rect.get_x() + width/2, height - 3, label, ha = 'center')

The 'NA' values in the *object* column type would get their own column with 0,1 when encoded. The *float* type columns can be easily imputed using a median/mean strategy!!

### Categorical Variable Encoding

There are around 16 categorical variables in our test and train dataframes, let's encode these! 
1. Label encoding assigns a numerical value to each level of the variable.
2. One-hot encoding creates a new column with 1,0 values for each level!

In [None]:
# Collect the names of all object-dtype (string) columns in app_train
obj_cols = list(app_train.select_dtypes('object').columns)
print(obj_cols)

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
count_le = 0

# Label-encode only the object columns with at most two distinct values
# (NaN counts as a value here); the rest are left for one-hot encoding.
for col in obj_cols:
    if app_train[col].nunique(dropna = False) > 2:
        continue

    # learn the mapping from train, then apply the same mapping to both tables
    le.fit(app_train[col])
    for frame in (app_train, app_test):
        frame[col] = le.transform(frame[col])

    count_le += 1

print("Columns with label encoding:", count_le)

Looking at the **app_train** and **app_test** datasets again, we find that 3 object type variables in each have been converted to a 0-1 encoding. Lets do one hot encoding for other variables!

In [None]:
# One-hot encode: get_dummies replaces every categorical column with one
# indicator column per level, for train and test alike
app_train_encoded, app_test_encoded = map(pd.get_dummies, (app_train, app_test))

In [None]:
# Compare the shapes of the two encoded tables
for encoded in (app_train_encoded, app_test_encoded):
    print(encoded.shape)

Once we do the encoding, we see that the number of columns has increased significantly and that the number of columns in the test and train datasets is not equal! **This is because the levels in the categorical variables are not the same!**

In [None]:
# The inner join on columns keeps only the columns present in both tables. That
# drops TARGET from train as well, so set the labels aside first.
train_labels = app_train_encoded['TARGET']

app_train_encoded, app_test_encoded = app_train_encoded.align(
    app_test_encoded, join = 'inner', axis = 1
)

for aligned in (app_train_encoded, app_test_encoded):
    print(aligned.shape)

Aligning the two dataframes has made the columns in the train dataframe the same as those of the test dataframe! The shape output tells us as much! Note that the **app_train_encoded** dataframe now doesn't have the TARGET variable, which is stored in the **train_labels** variable instead.

One thing to note is that when we encode a categorical variable, **the NA values go into a column of their own**. This can help us if all NA values secretly mean something that we would otherwise not capture if we impute these values! Anyway, we have no robust method to impute categorical variables!

## Variable Correlations
Once we have modified our dataframe into a suitable shape, next we need to look into variable correlations to make sense of what variables are most important to us from the prediction aspect!

In [None]:
# Put the labels back so that every feature can be correlated with them
app_train_encoded['TARGET'] = train_labels

# Correlation of each column with TARGET (TARGET itself is part of the result)
correlations = app_train_encoded.corr()['TARGET']

# five largest and five smallest correlation values
top_5_positive = correlations.sort_values(ascending = False).head(5)
top_5_negative = correlations.sort_values(ascending = True).head(5)

for ranking in (top_5_positive, top_5_negative):
    print(ranking)

*EXT_SOURCE_3, EXT_SOURCE_2* and *EXT_SOURCE_1* are the most negatively correlated with *TARGET* and *DAYS_BIRTH* is positively correlated! 

**Since *DAYS_BIRTH* is all negative values, it tells us that as people get younger, they probably default more than older people!**

In [None]:
# Distribution of DAYS_BIRTH: the difference between the loan application date
# and the birthdate of the applicant
plt.figure(figsize = (10,6))
days_birth_ax = app_train_encoded['DAYS_BIRTH'].plot.hist(bins = 25,
                                                          edgecolor = 'k',
                                                          rot = 0)
days_birth_ax.set_title('DAYS_BIRTH Histogram')
days_birth_ax.set_xlabel('DAYS_BIRTH')

In [None]:
# Flip DAYS_BIRTH to positive values in both tables so that it reads as an age in days
for encoded in (app_train_encoded, app_test_encoded):
    encoded['DAYS_BIRTH'] = encoded['DAYS_BIRTH'].abs()

# same histogram as before, now on the positive scale
plt.figure(figsize = (10,6))
app_train_encoded['DAYS_BIRTH'].plot.hist(rot = 0, edgecolor = 'k', bins = 25)

And now we see that the correlation has become negative! **As age increases -> default rate goes down!**

In [None]:
# Correlation between DAYS_BIRTH (made positive in the previous cell) and TARGET
app_train_encoded['DAYS_BIRTH'].corr(app_train_encoded['TARGET'])

In [None]:
# Age versus the outcome variable: one density curve of DAYS_BIRTH per TARGET class
plt.figure(figsize = (10, 8))

for outcome in (0, 1):
    sns.kdeplot(app_train_encoded.loc[app_train_encoded['TARGET'] == outcome, 'DAYS_BIRTH'],
                label = "TARGET = {}".format(outcome))

# recent seaborn releases no longer draw a legend just because `label` was
# passed, so request it explicitly; otherwise the two curves are indistinguishable
plt.legend()

plt.title('DAYS_BIRTH vs TARGET')
plt.xlabel('DAYS_BIRTH')
plt.ylabel('Density')

On to the **EXT_SOURCE** variables!

In [None]:
# Pairwise correlations among the three EXT_SOURCE features and TARGET
heatmap_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET']
correl = app_train_encoded[heatmap_cols].corr()

plt.figure(figsize = (10,6))
sns.heatmap(correl,
            annot = True,
            cmap = plt.cm.RdYlBu_r,
            vmin = -0.6,
            vmax = 0.8)
plt.yticks(rotation = 'horizontal')
plt.title('Correlation Heatmap')

We see that the *EXT_SOURCE* variables are more correlated with each other than with the *TARGET* variable in an absolute sense!

### Missing value imputations
Now that we have seen what variables are most correlated with our TARGET, we move on to imputing the missing values! Lets use the median strategy and mean strategy and compare the results!

A critical element to note here is that while imputing, **we have to make sure that we treat the test data as unseen, which means that for all columns with missing data we need to calculate the mean/median on the training data and use the same mean/median while imputing values in the test data!**

In [None]:
# scikit-learn 0.20 introduced sklearn.impute.SimpleImputer as the replacement
# for sklearn.preprocessing.Imputer, which was removed in 0.22. Prefer the new
# class and fall back to the old one on installations that predate it.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
imputer = Imputer(strategy = 'median')

# TARGET was re-attached for the correlation analysis; it is not a feature
if 'TARGET' in app_train_encoded:
    app_train_encoded.drop(columns = ['TARGET'], inplace = True)

# SK_ID_CURR is an identifier, not a feature: take it out of both tables before
# imputing/scaling and keep it aside. The guards make the cell safe to re-run;
# on a second run the ids captured the first time are kept.
if 'SK_ID_CURR' in app_train_encoded:
    train_id = app_train_encoded.pop('SK_ID_CURR')
if 'SK_ID_CURR' in app_test_encoded:
    test_id = app_test_encoded.pop('SK_ID_CURR')

# get column names from the dataframe since the imputer returns plain matrices
features = app_train_encoded.columns.tolist()

# impute the missing values: the medians are learned from train only and the
# same medians are then applied to test
app_train_enc_imput_med = imputer.fit_transform(app_train_encoded)
app_test_enc_imput_med = imputer.transform(app_test_encoded)

### Min max scaling of variables
Before we fit our models, it is critical to scale our variable ranges! This makes it easier for us to interpret the coefficients of models such as **Logistic Regression** to be proxies for variable importance! Generally, I have also seen model performance improvements with this!

MinMaxScaling for a `feature_range = (0,1)` essentially does $x_{scaled} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$ for each column -

[MinMaxScaling](https://drive.google.com/open?id=1l53oXLJFybxqK8HuRjwUADKPYQJjyf3S)

In our train and test data, although we do not expect it, the min and max values for the same column can be very different. For this reason, we take the (min, max) values from the train data and use the same values for scaling the test data. This ensures that columns across dataframes are scaled consistently!

For instance, suppose a column in the training data has the values `[4,5,6]`. Then, min max scaling would change these values to `[0, 0.5, 1]`. Now, let's suppose the same column in the test data has the values `[9,10,15]`; min max scaling fitted separately on the test data would change these to `[0, 1/6 , 1]`. **For a model that uses these values to predict the outcome, a value of 6 essentially becomes the same as 15, which is wrong.**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min and max from the training matrix only ...
scaler = MinMaxScaler(feature_range = (0,1)).fit(app_train_enc_imput_med)

# ... and apply that same scaling to both train and test
app_train_enc_imput_med = scaler.transform(app_train_enc_imput_med)
app_test_enc_imput_med = scaler.transform(app_test_enc_imput_med)

for caption, matrix in (("Test data:", app_test_enc_imput_med),
                        ("Train data shape:", app_train_enc_imput_med)):
    print(caption, matrix.shape)

### Logistic Regression Model

In [None]:
# Hold out 30% of the training rows for validation; the remaining 70% are used for fitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    app_train_enc_imput_med, train_labels, test_size = 0.3, random_state = 2
)

for caption, part in (("Train data shape:", X_train),
                      ("Test data shape:", X_test),
                      ("train labels shape:", y_train),
                      ("test labels shape:", y_test)):
    print(caption, part.shape)

In [None]:
# Fit a logistic regression on the 70% training split
from sklearn.linear_model import LogisticRegression

# C = 0.0001, written in scientific notation
log_reg = LogisticRegression(C = 1e-4)
log_reg.fit(X = X_train, y = y_train)

### Get variable importance using the test data

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the fitted model, computed on the held-out split
perm = PermutationImportance(log_reg, random_state = 2)
perm.fit(X_test, y_test)

eli5.show_weights(perm, feature_names = features)

### What are the coefficients like?
Correlations told us that *EXT_SOURCE* variables and *DAYS_BIRTH* should be the most important variables! Lets see if the coefficients match this hypothesis! **Since our numerical variables are in the 0,1 range, the coefficients can be directly compared to assess impact of a variable on the *TARGET***

In [None]:
# coef_ has a single row for this model; label each coefficient with its feature
# name and sort ascending
log_reg_coeff = pd.Series(log_reg.coef_[0], index = features).sort_values()

print("Top 5 negative:\n", log_reg_coeff.head(5))
print("\nTop 5 positive:\n", log_reg_coeff.tail(5))

**What leads to less default** - 
1. Higher values on the *EXT_SOURCE* variables- Are these credit ratings?
2. Female loan applicants - Nice!
3. Older people in general!
4. Whether applicant provided cell phone number!

**What leads to more default** -
1. People with *OCCUPATION* as Laborer are more likely to default.
2. Male loan applicants!
3. Working people (Salaried) are more likely to default!
4. If applicant's permanent address does not match contact address
5. If applicant's permanent address does not match work address.

Let's now predict default values in the test set that we created. We will use the `predict_proba` function and get the probabilities of getting a 1 (Default)

In [None]:
# predict_proba returns one column per class; column 1 is kept as the
# predicted probability of class = 1
class_probabilities = log_reg.predict_proba(X_test)
log_predictions = class_probabilities[:, 1]
log_predictions[:5]

The **log_predictions** variable holds the predicted probabilities for default = 1 for the test data!

In [None]:
# Baseline: share of each class among the validation labels
y_test.value_counts().div(len(y_test))

We have an imbalanced class problem here since the proportion of defaulters is just 8.07% while non-defaulters are the majority with a proportion of 91.93%. **For this reason, any model that just predicts no-default for all the test rows would get an accuracy of 91.93%** - however, this is not an informative model since we want to identify with some accuracy, the people who will default.

For such problems the **Area Under the Receiver Operating Characteristic curve (AUC ROC)** is the metric of choice! For intuition, a model with an AUC ROC score of 0.5 is no better than random guessing, and its effectiveness increases the further the area goes above 0.5!

In [None]:
# Area under the ROC curve for the logistic regression on the validation split
from sklearn import metrics

logit_accuracy = metrics.roc_auc_score(y_test, log_predictions)

# The value is a ROC AUC, not an accuracy, so the printed label says so
# (the variable name is kept for compatibility with other cells).
print("Logistic Regression ROC AUC: {0:.2f}".format(logit_accuracy))

### Cross Validation
The above *roc_auc_score* could be due to a chance selection of a good test set. Let's use cross validation to check if model performs in a stable manner.

In [None]:
# 10 fold cross validation setup
from sklearn.model_selection import StratifiedKFold

# No shuffling is requested, so a random_state would have no effect on the
# folds; current scikit-learn raises a ValueError when random_state is set
# while shuffle is False, so it is left out.
kfold = StratifiedKFold(n_splits = 10).split(X = app_train_enc_imput_med,
                                             y = train_labels)

accuracies = []

# train the logistic regression model on each split and collect the auc-roc
# score on the held-out fold
for train, holdout in kfold:
    # the splitter yields row positions, hence .iloc on the label Series
    log_reg.fit(app_train_enc_imput_med[train,:], train_labels.iloc[train])
    predictions = log_reg.predict_proba(app_train_enc_imput_med[holdout,:])[:,1]
    accuracy = metrics.roc_auc_score(train_labels.iloc[holdout], predictions)
    accuracies.append(accuracy)

In [None]:
# Scatter plot of the ROC AUC scores achieved through cross validation
n_folds = len(accuracies)          # follow the number of folds actually run
mean_auc = np.mean(accuracies)

plt.figure(figsize = (6, 6))
plt.scatter(range(1, n_folds + 1), accuracies)
plt.title('ROC AUC score with Cross Validation')
plt.xlabel('Fold Number')
plt.ylabel('Area under the ROC curve')

# draw a line for the mean auc-roc score and label it right on the line, so the
# annotation stays inside the plot whatever the scores turn out to be
plt.axhline(y = mean_auc, color = 'red', linewidth = 1)
plt.text(x = n_folds, y = mean_auc,
         s = "Average ROC AUC: {0:.3f}".format(mean_auc),
         color = 'red', ha = 'right', va = 'bottom')

### Random Forest Classifier
Let's also try the Random Forest Classifier here. **A random forest classifier would make a lot of decision trees on different variables of the dataset and try to aggregate the predictions from all those trees into a single prediction.**

This works because with a lot of different trees that are usually uncorrelated with each other, we get a reduction in the variance of the prediction - which is to say that all the trees won't get the same prediction wrong, and this usually leads to improved performance!

In [None]:
# train the random Forest Model
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 100,
                                 n_jobs = 1,
                                 random_state = 2)

# 5 fold cross validation. No shuffling is requested, so random_state would
# have no effect on the folds; current scikit-learn raises a ValueError when it
# is set while shuffle is False, so it is left out.
kfold = StratifiedKFold(n_splits = 5).split(X = app_train_enc_imput_med,
                                            y = train_labels)

# Calculate the auc-roc score on the held-out fold of each split
accuracies = []
for train, holdout in kfold:
    # the splitter yields row positions, hence .iloc on the label Series
    rf_model.fit(app_train_enc_imput_med[train, :], train_labels.iloc[train])
    predictions = rf_model.predict_proba(app_train_enc_imput_med[holdout, :])[:,1]
    accuracies.append(metrics.roc_auc_score(train_labels.iloc[holdout], predictions))

In [None]:
# Scatter plot of the AUC-ROC score of each cross validation fold
n_folds = len(accuracies)          # follow the number of folds actually run
mean_auc = np.mean(accuracies)

plt.figure(figsize = (6,6))
plt.scatter(range(1, n_folds + 1), accuracies)
plt.title('Cross Validation AUC-ROC with Random Forest')
plt.xlabel('Fold Number')
plt.ylabel('ROC-AUC Score')

# mean score as a horizontal line, labelled right on the line so the annotation
# stays inside the plot whatever the scores turn out to be
plt.axhline(y = mean_auc, color = 'red', linewidth = 1)
plt.text(x = n_folds, y = mean_auc,
         s = "Avg AUC-ROC score:{0:.2f}".format(mean_auc),
         color = 'red', ha = 'right', va = 'bottom')
plt.show()

**Using a randomForest Model we get an auc-roc score of 0.710 on average**. This is an improvement over the previous logistic regression model which gave a score of 0.68!

### Auxiliary Data - 
There is a host of auxiliary data provided in this competition. Let's try to merge that with our `app_train_enc_imput_med` to see whether it brings improvements in accuracy! We will do this in the next notebook. 

Let's write out our outputs from this notebook to a csv file and we can pick it up from there in the next notebook!

In [None]:
# Turn the imputed + scaled matrices back into dataframes with the feature names
# as columns, then put the SK_ID_CURR identifier back as the last column
app_train_part1, app_test_part1 = (
    pd.concat([pd.DataFrame(matrix, columns = features), ids], axis = 1)
    for matrix, ids in ((app_train_enc_imput_med, train_id),
                        (app_test_enc_imput_med, test_id))
)

In [None]:
# Persist the processed tables and the labels as csv files, without the index
for output, csv_name in ((app_train_part1, "app_train_part1.csv"),
                         (app_test_part1, "app_test_part1.csv"),
                         (train_labels, "train_labels.csv")):
    output.to_csv(csv_name, index = False)