In [None]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook as tqdm
# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def separate_cat_num(df, id_cols=('SK_ID_PREV', 'SK_ID_CURR')):
    """Split a dataframe into its object-dtype columns and its other columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    id_cols : iterable of str, optional
        Columns removed from the non-object part. Defaults to the two ID
        columns that were previously hard-coded.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(categorical_cols, num_cols)``: the object-dtype columns, and the
        remaining columns without ``id_cols``.

    Raises
    ------
    KeyError
        If any of ``id_cols`` is absent from the non-object columns.
    """
    categorical_cols = df.select_dtypes(include=['object'])
    # drop() returns a new frame; no in-place mutation of a derived frame.
    num_cols = df.select_dtypes(exclude=['object']).drop(columns=list(id_cols))
    return categorical_cols, num_cols

In [None]:
def one_hot_encoder(categorical_cols):
    """Return the one-hot encoding of ``categorical_cols``.

    ``dummy_na=True`` adds an extra indicator column per encoded column
    that marks missing values.
    """
    return pd.get_dummies(categorical_cols, dummy_na=True)

In [None]:
def create_aggregations_dictionary(categorical_cols,num_cols):
    """Build the aggregation recipe for the given column sets.

    For each aggregation in ('mean', 'min', 'max', 'sum', 'var'), a
    ``(column, aggregation)`` pair is emitted for every column of
    ``categorical_cols`` followed by every column of ``num_cols``.

    Returns a one-element list ``[(['PREV'], pairs)]``.
    """
    # NOTE(review): merge_upper_table groups by and merges on this key list,
    # so 'PREV' must be an actual column there -- confirm whether
    # 'SK_ID_CURR' was intended.
    aggregations = ('mean', 'min', 'max', 'sum', 'var')
    columns = list(categorical_cols.columns) + list(num_cols.columns)
    pairs = [(column, agg) for agg in aggregations for column in columns]
    return [(['PREV'], pairs)]

In [None]:
def merge_upper_table(prev_agg_dict, application, prev_df=None):
    """Left-merge grouped aggregates onto ``application``.

    Parameters
    ----------
    prev_agg_dict : list of (list of str, list of (str, str))
        Recipes of the form ``(groupby_cols, [(column, aggregation), ...])``.
    application : pandas.DataFrame
        Frame that receives one new column per ``(column, aggregation)``
        pair. It must contain ``groupby_cols``, which are the merge keys.
    prev_df : pandas.DataFrame, optional
        Frame to aggregate. When omitted, the notebook-level ``prev`` frame
        is used, which is what this function always read before.

    Returns
    -------
    pandas.DataFrame
        ``application`` with the aggregate columns added. Each new column is
        named ``'<groupby cols joined by _>_<aggregation>_<column>'``.
    """
    if prev_df is None:
        # Backward-compatible fallback to the global frame.
        prev_df = prev

    for groupby_cols, specs in tqdm(prev_agg_dict):
        group_object = prev_df.groupby(groupby_cols)
        for select, agg in tqdm(specs):
            groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
            # The right frame's index is ignored by merge(on=...), so no
            # index relabelling is needed before merging.
            aggregated = (group_object[select]
                          .agg(agg)
                          .reset_index()
                          .rename(columns={select: groupby_aggregate_name}))
            application = application.merge(
                aggregated[groupby_cols + [groupby_aggregate_name]],
                on=groupby_cols,
                how='left')
    return application

In [None]:
# Function to calculate missing values by column
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns ``df`` has and how many of them contain missing
    values. Returns a dataframe indexed by column name with the columns
    'Missing Values' (count) and '% of Total Values' (percentage, rounded
    to one decimal). Only columns whose percentage is not 0 are kept, and
    they are sorted by percentage in descending order.
    """
    missing_count = df.isnull().sum()
    missing_percent = 100 * missing_count / len(df)

    # Label the two series directly while placing them side by side.
    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    has_missing = summary.iloc[:, 1] != 0
    summary = (
        summary[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) +
          " columns that have missing values.")

    return summary

In [None]:
# Load previous_application.csv into `prev`, report its shape and preview the first rows.
prev = pd.read_csv('../input/home-credit/previous_application.csv')
print('previous_application data shape: ', prev.shape)
prev.head()

In [None]:
# Split prev into its object-dtype columns and its remaining (non-ID) columns.
# NOTE: this snapshot is taken before the NEW_* columns are added to prev
# further down, so neither frame contains them.
categorical_cols,num_cols = separate_cat_num(prev)

In [None]:
# Preview the first rows of the object-dtype columns.
categorical_cols.head()

In [None]:
# Preview the first rows of the non-object columns (ID columns removed).
num_cols.head()

In [None]:
# Missing-value counts and percentages for the object-dtype columns.
missing_values_table(categorical_cols)

In [None]:
# Missing-value counts and percentages for the non-object columns.
missing_values_table(num_cols)

In [None]:
# NOTE(review): same preview as the earlier num_cols.head() cell; num_cols
# has not been reassigned in between.
num_cols.head()

In [None]:
# Feature engineering: one difference and three ratios built from the AMT_* columns
prev['NEW_APPLICATION_CREDIT_DIFF'] = prev['AMT_APPLICATION'].sub(prev['AMT_CREDIT'])
prev['NEW_APPLICATION_CREDIT_RATIO'] = prev['AMT_APPLICATION'].div(prev['AMT_CREDIT'])
prev['NEW_CREDIT_TO_ANNUITY_RATIO'] = prev['AMT_CREDIT'].div(prev['AMT_ANNUITY'])
prev['NEW_DOWN_PAYMENT_TO_CREDIT'] = prev['AMT_DOWN_PAYMENT'].div(prev['AMT_CREDIT'])

In [None]:
# Interest ratio of the previous application (simplified)
# AMT_ANNUITY * CNT_PAYMENT is taken here as the total amount repaid.
prev['NEW_TOTAL_PAYMENT'] = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']

prev['NEW_TOTAL_PAYMENT_TO_AMT_CREDIT'] = prev['NEW_TOTAL_PAYMENT'] / prev['AMT_CREDIT']

# Simple interest: A = P * (1 + r * t)  =>  r = (A / P - 1) / t,
# with A = NEW_TOTAL_PAYMENT, P = AMT_CREDIT and t = CNT_PAYMENT.
# The original line had a mismatched bracket and did not parse.
prev['NEW_SIMPLE_INTERESTS'] = (prev['NEW_TOTAL_PAYMENT'] / prev['AMT_CREDIT'] - 1) / prev['CNT_PAYMENT']

Reference for the simple-interest formula used above: https://www.calculatorsoup.com/calculators/financial/simple-interest-plus-principal-calculator.php

In [None]:
# Inspect the first values of the AMT_CREDIT / AMT_ANNUITY ratio.
prev['NEW_CREDIT_TO_ANNUITY_RATIO'].head()

In [None]:
# Inspect the first values of CNT_PAYMENT, the divisor used for NEW_SIMPLE_INTERESTS.
prev['CNT_PAYMENT'].head()

In [None]:
# Percentage of rows in which NEW_SIMPLE_INTERESTS is missing
100 * prev['NEW_SIMPLE_INTERESTS'].isnull().sum() / prev['NEW_SIMPLE_INTERESTS'].shape[0]

In [None]:
# Missing-value summary for the whole prev frame, including the NEW_* columns added above.
missing_values_table(prev)

In [None]:
# (rows, columns) of prev at this point in the notebook.
prev.shape

In [None]:
# Rows of prev where AMT_ANNUITY is missing
prev.loc[prev['AMT_ANNUITY'].isnull()]

In [None]:
# Rows of prev where NEW_SIMPLE_INTERESTS is missing
prev.loc[prev['NEW_SIMPLE_INTERESTS'].isnull()]

In [None]:
# Distribution of NAME_PORTFOLIO values before 'XNA' is replaced with NaN.
prev['NAME_PORTFOLIO'].value_counts()

In [None]:
# Replace the literal 'XNA' with NaN so it is counted as missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on prev['NAME_PORTFOLIO'] is chained assignment, which leaves prev unchanged
# when pandas Copy-on-Write is active.
prev['NAME_PORTFOLIO'] = prev['NAME_PORTFOLIO'].replace('XNA', np.nan)

In [None]:
# Re-check the distribution after the 'XNA' replacement (value_counts omits NaN by default).
prev['NAME_PORTFOLIO'].value_counts()

In [None]:
# Number of missing NAME_PORTFOLIO entries after the 'XNA' replacement.
prev['NAME_PORTFOLIO'].isnull().sum()

In [None]:
# Distribution of NAME_PRODUCT_TYPE values before 'XNA' is replaced with NaN.
prev['NAME_PRODUCT_TYPE'].value_counts() 

https://www.kaggle.com/c/home-credit-default-risk/discussion/61353

Background reading on cross-selling (article in Turkish): https://pazarlamaturkiye.com/capraz-satis-nedir-nasil-yapilir-teknikleri-nelerdir/

In [None]:
# Replace the literal 'XNA' with NaN so it is counted as missing.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on prev['NAME_PRODUCT_TYPE'] is chained assignment, which leaves prev
# unchanged when pandas Copy-on-Write is active.
prev['NAME_PRODUCT_TYPE'] = prev['NAME_PRODUCT_TYPE'].replace('XNA', np.nan)