In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
def fillna_by_mean_of_columns(df):
    """Return ``df`` with missing values in numeric columns replaced by column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; a new frame is returned, so the
        result must be used via the return value (``X = fillna_by_mean_of_columns(X)``).

    Returns
    -------
    pandas.DataFrame
        Same columns and index as ``df``. Every NaN in a numeric column is
        replaced by that column's mean (computed ignoring NaN). Non-numeric
        columns, and numeric columns that are entirely NaN, are left unchanged.
    """
    # Filling through `df[column].fillna(..., inplace=True)` writes to a
    # temporary column object: it warns on sliced frames and is a no-op under
    # pandas Copy-on-Write. One frame-level fillna avoids the chained write.
    # dropna() removes the undefined mean of all-NaN columns.
    column_means = df.mean(numeric_only=True).dropna()
    return df.fillna(column_means)

In [None]:
# Read the training applications table and preview its first rows as a
# quick sanity check of the load.
train_csv_path = '../input/home-credit-default-risk/application_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Check how well each original (pre-engineering) feature separates users who
# repaid in time (TARGET == 0) from users who did not (TARGET == 1).
original_features = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3',
        'DAYS_BIRTH', 'DAYS_EMPLOYED','AMT_GOODS_PRICE','DAYS_ID_PUBLISH',
        'AMT_ANNUITY','DAYS_LAST_PHONE_CHANGE','AMT_CREDIT','DAYS_REGISTRATION']

# One stacked panel per feature. The row count is taken from the list so the
# grid cannot get out of sync when features are added or removed.
fig, axes = plt.subplots(len(original_features), 1, figsize=(12, 20), squeeze=False)

for ax, feature in zip(axes[:, 0], original_features):
    for target_value in (0, 1):
        # Estimate the density on observed values only.
        values = df.loc[df['TARGET'] == target_value, feature].dropna()
        sns.kdeplot(values, label='target == %d' % target_value, ax=ax)

    ax.set_title('Distribution of %s by Target Value' % feature)
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('Density')
    # Draw the legend explicitly; without it the two curves cannot be told
    # apart, because `label=` alone does not put a legend on the axes.
    ax.legend()

fig.tight_layout(h_pad=2.5)

In [None]:
# Engineered features, added to the training frame in place:
#   CREDIT_TERM             annuity as a fraction of the credit amount
#   ANNUITY_INCOME_PERCENT  annuity as a fraction of total income
#   DAYS_EMPLOYED_PERCENT   days employed relative to days since birth
#   HAS_HOUSE_INFORMATION   1 when COMMONAREA_MEDI is positive, else 0 (NaN -> 0)
annuity = df['AMT_ANNUITY']
df['CREDIT_TERM'] = annuity.div(df['AMT_CREDIT'])
df['ANNUITY_INCOME_PERCENT'] = annuity.div(df['AMT_INCOME_TOTAL'])
df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'].div(df['DAYS_BIRTH'])
df['HAS_HOUSE_INFORMATION'] = df['COMMONAREA_MEDI'].gt(0).astype('int64')

In [None]:
# Check how well each of the 4 engineered features separates users who repaid
# in time (TARGET == 0) from users who did not (TARGET == 1).
engineered_features = ['HAS_HOUSE_INFORMATION', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']

# One stacked panel per feature. The row count is taken from the list so the
# grid cannot get out of sync when features are added or removed.
fig, axes = plt.subplots(len(engineered_features), 1, figsize=(12, 20), squeeze=False)

for ax, feature in zip(axes[:, 0], engineered_features):
    for target_value in (0, 1):
        # Estimate the density on observed values only.
        values = df.loc[df['TARGET'] == target_value, feature].dropna()
        sns.kdeplot(values, label='target == %d' % target_value, ax=ax)

    ax.set_title('Distribution of %s by Target Value' % feature)
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('Density')
    # Draw the legend explicitly; without it the two curves cannot be told
    # apart, because `label=` alone does not put a legend on the axes.
    ax.legend()

fig.tight_layout(h_pad=2.5)

In [None]:
# Model inputs: original application columns plus the engineered features.
# The column order matters because the scaler and model are fitted positionally.
feature_columns = [
    'CREDIT_TERM',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'DAYS_EMPLOYED',
    'AMT_GOODS_PRICE',
    'DAYS_ID_PUBLISH',
    'AMT_ANNUITY',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_CREDIT',
    'DAYS_REGISTRATION',
    'ANNUITY_INCOME_PERCENT',
    'DAYS_EMPLOYED_PERCENT',
    'HAS_HOUSE_INFORMATION',
]
X = df[feature_columns]

# The target is kept as a one-column DataFrame rather than a Series.
target_columns = ['TARGET']
y = df[target_columns]

In [None]:
# Neutralise infinities BEFORE imputing. The ratio features can be +/-inf when
# a denominator is 0, and one inf makes the column mean inf. Imputing first
# would turn every NaN in that column into inf and then into 0.
X = X.replace([np.inf, -np.inf], 0)
X = fillna_by_mean_of_columns(X)

# A class label must not be mean-imputed, since the mean of 0/1 labels is not
# a valid class. Rows without a label are dropped from both X and y instead.
has_label = y['TARGET'].notna()
X = X.loc[has_label]
y = y.loc[has_label]

In [None]:
from sklearn.model_selection import train_test_split

# No hold-out split is made in this cell (train_test_split is imported but not
# called): the training set is the full feature matrix and target.
X_train, y_train = X, y

In [None]:
# Standardise the training features. The fitted scaler is kept so the same
# transformation can be applied to other data later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C = 0.0001 is the estimator used for the results.
lg = LogisticRegression(C=0.0001)

# y_train is a one-column DataFrame, but the estimator expects a 1-D label
# vector. Flattening it avoids the column-vector conversion warning.
lg.fit(X_train_scaled, y_train.values.ravel())

In [None]:
# Score the test applications with the already-fitted scaler and model, then
# write the submission file.
# NOTE: `df` and `X` are rebound to the TEST data from here on. Re-run the
# notebook from the top before executing any of the training cells again.
df = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

# Same engineered features as were built for the training frame.
df['CREDIT_TERM'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
df['ANNUITY_INCOME_PERCENT'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
df['DAYS_EMPLOYED_PERCENT'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
# Vectorised flag: 1 when COMMONAREA_MEDI is positive, 0 otherwise (NaN -> 0).
df['HAS_HOUSE_INFORMATION'] = (df['COMMONAREA_MEDI'] > 0).astype('int64')

# Column order must match the order the scaler and model were fitted with.
X = df[['CREDIT_TERM', 'EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3',
        'DAYS_BIRTH', 'DAYS_EMPLOYED','AMT_GOODS_PRICE','DAYS_ID_PUBLISH',
        'AMT_ANNUITY','DAYS_LAST_PHONE_CHANGE','AMT_CREDIT','DAYS_REGISTRATION',
        'ANNUITY_INCOME_PERCENT','DAYS_EMPLOYED_PERCENT','HAS_HOUSE_INFORMATION']]

# Replace +/-inf (division by zero in the ratio features) before imputing, so
# an infinity can never be used as a fill value.
X = X.replace([np.inf, -np.inf], 0)

# Impute with TRAINING statistics rather than test-set means, so the test rows
# are preprocessed like the data the model was fitted on. `scaler.mean_` holds
# the per-column means of the matrix the scaler was fitted on, in this order.
train_means = pd.Series(scaler.mean_, index=X.columns)
X = X.fillna(train_means)

X_test_file_scaled = scaler.transform(X)

lg_predict = lg.predict_proba(X_test_file_scaled)

# The last predict_proba column belongs to the last entry of `lg.classes_`,
# which is TARGET == 1 when the labels are 0/1.
# Reusing df's index keeps IDs and predictions aligned row by row in concat.
pred = pd.Series(lg_predict[:, -1], index=df.index, name='TARGET')
result = pd.concat([df['SK_ID_CURR'], pred], axis=1)
result.to_csv('result.csv',index=False)