In [None]:
#TESTING
#Once more

#numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# List the files in the same directory the CSVs below are read from,
# so the listing reflects what read_csv will actually find
print(os.listdir("../Jupyter/input/"))

In [None]:
# Load the training applications and preview them
train_csv_path = '../Jupyter/input/application_train.csv'
app_train = pd.read_csv(train_csv_path)
print('Training data rows and columns: ', app_train.shape)
app_train.head()

In [None]:
# Load the testing applications (features only) and preview them
test_csv_path = '../Jupyter/input/application_test.csv'
app_test = pd.read_csv(test_csv_path)
print('Testing data rows and columns: ', app_test.shape)
app_test.head()

## Exploratory Data Analysis
Here we try to find trends, anomalies, patterns, or relationships within the data.
In short, we learn what our data can tell us.

In [None]:
# Class balance: loans paid off (0) versus loans with difficulties (1)
target_counts = app_train["TARGET"].value_counts()
target_counts

In [None]:
# Histogram of the target classes (the trailing semicolon hides the Axes repr)
target_as_int = app_train["TARGET"].astype(int)
target_as_int.plot.hist();

In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of each column of a dataframe.

    Prints how many columns `df` has and how many of them contain missing
    values, then returns the per-column breakdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns 'Missing Values' (number of
        nulls) and '% of Total Values' (share of rows, rounded to one
        decimal). Only columns with at least one missing value are included,
        sorted by percentage in descending order.
    """
    # Scan for nulls once and reuse the counts for the percentage
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Build the two-column table with its final column names directly
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'])

    # Filter on the count, not the percentage: for a zero-row frame the
    # percentage is NaN, and `NaN != 0` would wrongly keep every column
    has_missing = mis_val_table['Missing Values'] != 0
    mis_val_table = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1))

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table

In [None]:
# Missing-value statistics for the training data; show the first 20 rows
missing_values = missing_values_table(app_train)
missing_values.iloc[:20]

In [None]:
# Count how many columns there are of each dtype
column_dtypes = app_train.dtypes
column_dtypes.value_counts()

In [None]:
# Number of unique classes in each object (categorical) column
object_columns_frame = app_train.select_dtypes('object')
object_columns_frame.nunique(axis = 0)

In [None]:
# Label-encode the object columns that have at most two distinct values
le = LabelEncoder()
le_count = 0

for col in app_train.columns:
    # Only object (string) columns are candidates
    if app_train[col].dtype != 'object':
        continue
    # More than 2 unique categories: skip this column here
    if len(app_train[col].unique()) > 2:
        continue

    # Learn the mapping from the training data only ...
    le.fit(app_train[col])
    # ... then apply that same mapping to both training and testing data
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [None]:
# One-hot encoding of the remaining categorical variables
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Report both shapes; the second line previously mislabelled the test set as "Training"
print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [None]:
# Set the labels aside so they can be re-attached after the alignment
train_labels = app_train['TARGET']

# Align the training and testing data, keeping only columns present in both dataframes
aligned_train, aligned_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train, app_test = aligned_train, aligned_test

# Add the target back in
app_train["TARGET"] = train_labels

for shape_label, frame in (("Training Features shape: ", app_train),
                           ("Testing Features shape: ", app_test)):
    print(shape_label, frame.shape)

In [None]:
# Dividing by -365 flips the sign and converts days to years before summarising
age_in_years = app_train["DAYS_BIRTH"] / -365
age_in_years.describe()


In [None]:
# Summary statistics of the employment-days column
days_employed = app_train['DAYS_EMPLOYED']
days_employed.describe()


In [None]:
# Histogram of the raw employment-days column
days_employed = app_train['DAYS_EMPLOYED']
days_employed.plot.hist(title="Days Employment Histogram")
plt.xlabel("Days Employment");


In [None]:
# Split the rows on the DAYS_EMPLOYED value 365243 and compare the TARGET means
is_anomalous = app_train["DAYS_EMPLOYED"] == 365243
anom = app_train[is_anomalous]
non_anom = app_train[~is_anomalous]

non_anom_rate = 100 * non_anom["TARGET"].mean()
anom_rate = 100 * anom["TARGET"].mean()
print('The non-anomalies default on %0.2f%% of loans' % non_anom_rate)
print('The anomalies default on %0.02f%% of loans' % anom_rate)
print('There are %d anomalous days of employment' % len(anom))

In [None]:
# Peek at a few of the rows whose DAYS_EMPLOYED equals 365243
anomalous_rows = app_train["DAYS_EMPLOYED"] == 365243
tescik = app_train[anomalous_rows]
tescik.head()

In [None]:
# Create an anomalous flag column (computed before the values are replaced)
app_train["DAYS_EMPLOYED_ANOM"] = app_train["DAYS_EMPLOYED"] == 365243

# Replace the anomalous values with nan.
# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: that chained in-place call acts on a temporary object under
# pandas Copy-on-Write and would leave app_train unchanged (silently, since
# warnings are suppressed in this notebook).
app_train["DAYS_EMPLOYED"] = app_train["DAYS_EMPLOYED"].replace({365243: np.nan})

app_train["DAYS_EMPLOYED"].plot.hist(title = "Days Employment Histogram");
plt.xlabel("Days Employment");

In [None]:
# Flag the anomalous rows in the test data (computed before the values are replaced)
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243

# Assign the result back rather than using a chained replace(..., inplace=True),
# which does not update app_test under pandas Copy-on-Write
app_test["DAYS_EMPLOYED"] = app_test["DAYS_EMPLOYED"].replace({365243: np.nan})

print('There are %d anomalies in the test data out of %d entries' % (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

## Correlations
We look for correlations between the features and the target.
To do so we calculate the Pearson correlation coefficient, whose absolute value is categorised as follows:
- .00-.19 “very weak”
- .20-.39 “weak”
- .40-.59 “moderate”
- .60-.79 “strong”
- .80-1.0 “very strong”

In [None]:
# Pearson correlation of every column with the target, sorted ascending
correlation_matrix = app_train.corr(method="pearson")
correlations = correlation_matrix['TARGET'].sort_values()

# Because the series is sorted ascending, the tail holds the most positive
# values and the head the most negative ones
most_positive = correlations.tail(15)
most_negative = correlations.head(15)
print('Most Positive Correlations:\n', most_positive)
print('\nMost Negative Correlations:\n', most_negative)

In [None]:
# Make DAYS_BIRTH positive in BOTH frames. Converting only the training column
# would leave the test column with the opposite sign when the train-fitted
# imputer and polynomial transformer are later applied to it.
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])

# Find the correlation of the positive days since birth and target
app_train['DAYS_BIRTH'].corr(app_train['TARGET'])


In [None]:
# Set the style of plots (applies to every later figure as well)
plt.style.use('fivethirtyeight')

# Plot the distribution of ages in years
client_age_years = app_train['DAYS_BIRTH'] / 365
plt.hist(client_age_years, bins = 25, edgecolor = 'k')
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count');

## Kernel density estimation plot

In [None]:
plt.figure(figsize = (8, 6))

# One KDE curve of age per target value: first TARGET == 0, then TARGET == 1
for target_value, legend_label in ((0, 'target == 0'), (1, 'target == 1')):
    ages_in_years = app_train.loc[app_train['TARGET'] == target_value, 'DAYS_BIRTH'] / 365
    sns.kdeplot(ages_in_years, label = legend_label)

# Labeling of plot
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.legend();

## average failure to repay loans by age bracket.

In [None]:
# Age information into a separate dataframe.
# Take an explicit copy: adding columns to a bare slice of app_train is a
# chained assignment (SettingWithCopyWarning, hidden here because warnings
# are suppressed) and must not be able to write through to app_train.
age_data = app_train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# Bin the age data: 11 edges from 20 to 70 give ten 5-year brackets
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins = np.linspace(20, 70, num = 11))
age_data.head(10)

In [None]:
# Average every column within each age bracket
rows_by_bracket = age_data.groupby('YEARS_BINNED')
age_groups = rows_by_bracket.mean()
age_groups


In [None]:
plt.figure(figsize = (8, 8))

# Bar plot of the average target (as a percentage) for each age bin
bracket_labels = age_groups.index.astype(str)
failure_rate_pct = 100 * age_groups['TARGET']
plt.bar(bracket_labels, failure_rate_pct)

# Plot labeling
plt.xticks(rotation = 75)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay (%)')
plt.title('Failure to Repay by Age Group');

## TREND:
Younger applicants are more likely to not repay the loan! The rate of failure to repay is above 10% for the youngest three age groups and below 5% for the oldest age group.

## INFO

Now we want to investigate the correlation between the credit rating features (`EXT_SOURCE_1`, `EXT_SOURCE_2`, `EXT_SOURCE_3`), age, and repaying the loan on time.


In [None]:
# Pull out the target, the three EXT_SOURCE variables and DAYS_BIRTH,
# then compute their pairwise correlations
ext_columns = ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
ext_data = app_train[ext_columns]
ext_data_corrs = ext_data.corr()
ext_data_corrs

## Heatmap 

to show correlations properly


In [None]:
plt.figure(figsize = (8, 6))

# Annotated heatmap of the correlation matrix with a fixed colour range
heatmap_options = dict(cmap = plt.cm.RdYlBu_r, vmin = -0.25, vmax = 0.6, annot = True)
sns.heatmap(ext_data_corrs, **heatmap_options)
plt.title('Correlation Heatmap');

## Reflection:
We can see that DAYS_BIRTH is positively correlated with EXT_SOURCE_1 (strong correlation) and EXT_SOURCE_3 (weak correlation), indicating that the client's age may be one of the factors in these scores.

In [None]:
plt.figure(figsize=(10,12))

# One stacked panel per external source
ext_sources = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']
for i, source in enumerate(ext_sources):
    plt.subplot(3, 1, i+1)

    # Density of the source for TARGET == 0, then for TARGET == 1
    repaid_values = app_train.loc[app_train["TARGET"]==0, source]
    not_repaid_values = app_train.loc[app_train["TARGET"]==1, source]
    sns.kdeplot(repaid_values, label ='target == 0')
    sns.kdeplot(not_repaid_values, label = 'target == 1')

    # Label the panel
    plt.title("Distribution of %s by Target Value" %source)
    plt.xlabel('%s'% source)
    plt.ylabel('Density')
    plt.legend();

# Extra vertical padding between the stacked panels
plt.tight_layout(h_pad=2.5)

EXT_SOURCE_3 displays the greatest difference between the values of the target. We can clearly see that this feature has some relationship with repaying the loan. The relationship is very weak, but it still exists.

In [None]:
# Work on a separate frame for plotting: drop DAYS_BIRTH and add the age in years
plot_data = (
    ext_data
    .drop(columns = ['DAYS_BIRTH'])
    .assign(YEARS_BIRTH = age_data['YEARS_BIRTH'])
)

# Drop rows with missing values, then slice with .loc, which is label-based and
# inclusive: this keeps the complete rows whose index label is at most 100000
complete_rows = plot_data.dropna()
plot_data = complete_rows.loc[:100000]

# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    """Write the correlation coefficient of `x` and `y` onto the current axes.

    The coefficient is taken from `np.corrcoef` and drawn as "r = 0.12"-style
    text at axes-fraction position (0.2, 0.8). Extra keyword arguments are
    accepted and ignored.

    NOTE(review): this helper is defined but not mapped onto the grid in the
    visible part of this notebook.
    """
    coefficient = np.corrcoef(x, y)[0, 1]
    current_axes = plt.gca()
    current_axes.annotate(
        "r = {:.2f}".format(coefficient),
        xy=(.2, .8),
        xycoords=current_axes.transAxes,
        size = 20,
    )

# Create the pairgrid object.
# `height` replaces the old `size` keyword, which seaborn renamed and which
# current releases no longer accept.
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'TARGET',
                    vars = [x for x in list(plot_data.columns) if x != 'TARGET'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a kernel density estimate of each variable
grid.map_diag(sns.kdeplot)

# Bottom is density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Ext Source and Age Features Pairs Plot', size = 32, y = 1.05);

# Feature engineering
1. Polynomial features
2. Domain knowledge features

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

# Columns that feed the polynomial expansion
poly_input_columns = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']

# Keep the target separately; only the input columns go through the imputer
poly_target = app_train['TARGET'].copy()
poly_features = app_train[poly_input_columns]
poly_features_test = app_test[poly_input_columns]

# Fill missing values with the median: fitted on the training columns, then
# applied unchanged to the test columns (both become numpy arrays)
imputer = SimpleImputer(strategy = 'median')
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

# Create the polynomial object with specified degree
poly_transformer = PolynomialFeatures(degree = 3)

In [None]:
# Fit the polynomial expansion on the training matrix and transform it in one step
poly_features = poly_transformer.fit_transform(poly_features)

# Reuse the fitted transformer for the test matrix
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial Features shape: ', poly_features.shape)


In [None]:
# PolynomialFeatures.get_feature_names was removed in scikit-learn 1.2; use
# get_feature_names_out when the installed version has it, else fall back
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)

# Show the first 15 generated feature names
list(poly_name_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))[:15]


In [None]:
# PolynomialFeatures.get_feature_names was removed in scikit-learn 1.2; use
# get_feature_names_out when the installed version has it, else fall back
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)
poly_column_names = list(poly_name_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2',
                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Create a dataframe of the features
poly_features = pd.DataFrame(poly_features, columns = poly_column_names)

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))

In [None]:
# PolynomialFeatures.get_feature_names was removed in scikit-learn 1.2; use
# get_feature_names_out when the installed version has it, else fall back
poly_name_getter = (getattr(poly_transformer, 'get_feature_names_out', None)
                    or poly_transformer.get_feature_names)
poly_column_names = list(poly_name_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2',
                                           'EXT_SOURCE_3', 'DAYS_BIRTH']))

# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test, columns = poly_column_names)

# Merge polynomial features into training dataframe
# (SK_ID_CURR is attached by row position so it can serve as the merge key)
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
app_train_poly = app_train.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Merge polynomial features into testing dataframe
poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
app_test_poly = app_test.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align the dataframes, keeping only the columns present in both
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape:  ', app_test_poly.shape)