In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Load the application training table of the Home Credit Default Risk dataset
# from the notebook's input directory.
train_df = pd.read_csv("../input/home-credit-default-risk/application_train.csv")

In [None]:
# Preview the first rows to check that the file loaded as expected.
train_df.head()

In [None]:


def missing_values_table(train_df):
    """
    Summarise the missing values of a data frame, column by column.

    Prints a short summary (total number of columns and how many of them
    contain missing values) and returns the per-column details.

    params:
    -------
    train_df : pandas.DataFrame
        The data frame to analyze.

    output :
    -------
    mis_val_table_ren_columns : pandas.DataFrame
        Indexed by column name and restricted to the columns that have at
        least one missing value. 'Missing Values' holds the number of nulls
        and '% of Total Values' the share of rows that are null (in percent,
        rounded to one decimal). Rows are sorted by that percentage,
        highest first.
    """
    # Count the nulls once; the percentage is derived from the same counts
    mis_val = train_df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(train_df)

    # Assemble the table directly with its final column names
    mis_val_table = pd.DataFrame({
        'Missing Values': mis_val,
        '% of Total Values': mis_val_percent,
    })

    # Keep only the columns that actually have missing values. Filtering on
    # the count (rather than on the percentage) stays well-defined for a
    # frame with zero rows, where the percentage is NaN. Then sort by the
    # percentage of missing values, descending.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information (with spaces around the numbers)
    print(f"Your selected dataframe has {train_df.shape[1]} columns.\n"
          f"There are {mis_val_table_ren_columns.shape[0]} "
          "columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns
     

In [None]:
# Summarise the missing values of the training data and show the 20 columns
# with the highest percentage of missing values.
missing_values = missing_values_table(train_df)
missing_values.head(20)

In [None]:
# Class balance: number of rows for each value of TARGET.
train_df['TARGET'].value_counts()

In [None]:
# Histogram of the TARGET values (cast to int first) as a visual check of the class balance.
train_df['TARGET'].astype(int).plot.hist()

In [None]:

import plotly.express as px

# Interactive histogram of the TARGET column
fig = px.histogram(train_df, x="TARGET")
fig.show()

In [None]:
# Inspect the raw DAYS_BIRTH values
train_df['DAYS_BIRTH'].value_counts()

In [None]:
# Replace DAYS_BIRTH by its absolute value (re-running this cell is harmless)
train_df['DAYS_BIRTH'] = train_df['DAYS_BIRTH'].abs()

In [None]:

import matplotlib.pyplot as plt

# Distribution of client age in years (DAYS_BIRTH / 365)
plt.hist(train_df['DAYS_BIRTH'] / 365, bins=25, edgecolor='k')
plt.title('Age of Client')
plt.xlabel('Age (years)')
plt.ylabel('Count');

In [None]:
# Age information into a separate dataframe.
# `.copy()` makes age_data an independent frame, so adding columns below
# neither triggers SettingWithCopyWarning nor depends on train_df's internals.
age_data = train_df[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / 365

# Bin the age data into 10 equal-width intervals between 20 and 70 years
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'], bins=np.linspace(20, 70, num=11))
age_data.head(10)

In [None]:
train_df['DAYS_BIRTH'].corr(train_df['TARGET'])
# Interpretation: age (DAYS_BIRTH, made positive earlier in the notebook) has a
# negative linear relationship with the target, meaning that as clients get
# older they tend to repay their loans on time more often.



In [None]:
import seaborn as sns
plt.figure(figsize=(10, 8))

# KDE of age (in years) for loans that were repaid on time
sns.kdeplot(train_df.loc[train_df['TARGET'] == 0, 'DAYS_BIRTH'] / 365, label='target == 0')

# KDE of age (in years) for loans that were not repaid
sns.kdeplot(train_df.loc[train_df['TARGET'] == 1, 'DAYS_BIRTH'] / 365, label='target == 1')

# `label=` only names the curves; draw the legend explicitly so the two
# distributions can be told apart regardless of the seaborn version.
plt.legend()

# Labeling of plot
plt.xlabel('Age (years)'); plt.ylabel('Density'); plt.title('Distribution of Ages');

In [None]:
# Group by the age bin and calculate the average of every column.
# YEARS_BINNED is categorical, so `observed` is passed explicitly: newer pandas
# versions warn when it is omitted, and `False` keeps every bin in the result.
age_groups = age_data.groupby('YEARS_BINNED', observed=False).mean()
age_groups

In [None]:
plt.figure(figsize=(8, 8))

# Bar plot of the average TARGET per age bin, expressed as a percentage
plt.bar(age_groups.index.astype(str), 100 * age_groups['TARGET'])

# Plot labeling
plt.xticks(rotation=75)
plt.xlabel('Age Group (years)')
plt.ylabel('Failure to Repay(%)')
plt.title('Failure to Repay by Age Group');

In [None]:
# Correlations between TARGET, the three EXT_SOURCE variables and DAYS_BIRTH
ext_data = train_df[[
    'TARGET',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
]]
ext_data_corrs = ext_data.corr()
ext_data_corrs

In [None]:
plt.figure(figsize=(8, 6))

# Annotated heatmap of the correlation matrix; colour scale limited to [-0.25, 0.6]
sns.heatmap(
    ext_data_corrs,
    cmap=plt.cm.RdYlBu_r,
    vmin=-0.25,
    vmax=0.6,
    annot=True,
)
plt.title('Correlation Heatmap');