In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import sklearn

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

# Any results you write to the current directory are saved as output.

In [None]:
# Read the train and test application tables from the "../input/" directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/application_train.csv', '../input/application_test.csv')
)

In [None]:
# Show the (rows, columns) shape of the test frame.
test.shape

In [None]:
def missing_values_table(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``total`` (number of
        null entries) and ``percentage`` (null entries as a percentage of
        ``len(df)``). Rows are sorted by ``percentage`` in descending order and
        values are rounded to one decimal place.
    """
    null_counts = df.isnull().sum()
    summary = pd.DataFrame({
        'total': null_counts,
        'percentage': null_counts * 100 / len(df),
    })
    return summary.sort_values('percentage', ascending=False).round(1)

In [None]:
# Number of distinct values in each object-dtype column of the training frame.
train.select_dtypes('object').nunique(axis=0)

In [None]:
# Label-encode object columns that have fewer than three distinct values
# (as counted by `unique()`). The encoder is fitted on the training column and
# the same mapping is then applied to the matching test column.
le = LabelEncoder()

# Number of columns that were label encoded (previously initialised but never
# updated, so it always stayed at 0).
le_count = 0

for col in train:
    if train[col].dtype == 'object' and len(train[col].unique()) < 3:
        le.fit(train[col])
        print(col)
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
        le_count += 1

print('%d columns were label encoded.' % le_count)

In [None]:
# One-hot encode both frames with pandas' default dummy encoding.
train_dummied, test_dummied = (pd.get_dummies(frame) for frame in (train, test))

In [None]:
# Keep the TARGET column of the training frame as the label vector.
train_labels = train['TARGET']

In [None]:
# Inner join on the column axis: only columns present in both encoded frames
# are kept, in both outputs.
app_train, app_test = train_dummied.align(other=test_dummied, axis=1, join='inner')

In [None]:
# Histogram of DAYS_EMPLOYED in the training frame.
app_train['DAYS_EMPLOYED'].plot.hist()

In [None]:
# Flag very large DAYS_EMPLOYED values in a boolean column, then replace the
# value 365243 with NaN.
#
# The replacement is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `frame[col]`: that chained form operates on a
# temporary Series and does not update the parent frame under pandas
# copy-on-write.
#
# NOTE(review): the flag uses `>= 300000` while only the exact value 365243 is
# replaced -- confirm no other values fall in that range. Re-running this cell
# after the replacement recomputes the flag from the already-cleaned column.
app_train['DAYS_EMPLOYED_ANOMALOUS'] = app_train['DAYS_EMPLOYED'] >= 300000
app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

app_test['DAYS_EMPLOYED_ANOMALOUS'] = app_test['DAYS_EMPLOYED'] >= 300000
app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

In [None]:
# Histogram of DAYS_EMPLOYED in the training frame after the replacement step.
app_train['DAYS_EMPLOYED'].plot.hist()

In [None]:
# Histogram of DAYS_EMPLOYED in the test frame after the replacement step.
app_test['DAYS_EMPLOYED'].plot.hist()

In [None]:
# Histogram of client age in years (DAYS_BIRTH divided by -365), 25 bins.
plt.style.use('fivethirtyeight')

plt.hist(app_train['DAYS_BIRTH'] / -365, bins=25, edgecolor='k')
plt.title('Age of Client')
plt.xlabel('Age(years)')
plt.ylabel('Count')

In [None]:
# Attach TARGET from the training frame to the aligned frame, then draw one
# KDE curve of age in years per TARGET value.
app_train['TARGET'] = train['TARGET']
plt.figure(figsize=(10, 8))

sns.kdeplot(app_train.loc[app_train['TARGET'].eq(0), 'DAYS_BIRTH'].div(-365), label='Target = 0')
sns.kdeplot(app_train.loc[app_train['TARGET'].eq(1), 'DAYS_BIRTH'].div(-365), label='Target = 1')

In [None]:

# Later cells assign columns onto frame slices; the warning stays disabled so
# their behaviour is unchanged.
pd.options.mode.chained_assignment = None  # default='warn'

# Take an explicit copy so the new columns are added to an independent frame
# rather than to a slice of `train`.
age_data = train[['TARGET', 'DAYS_BIRTH']].copy()
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH'] / -365

In [None]:
# Bucket ages using 11 evenly spaced edges from 20 to 70 (ten bins).
age_data['YEARS_BINNED'] = pd.cut(
    x=age_data['YEARS_BIRTH'],
    bins=np.linspace(start=20, stop=70, num=11),
)

In [None]:
# Mean of every column within each age bin.
age_groups = age_data.groupby('YEARS_BINNED').agg('mean')
age_groups

In [None]:
# Bar chart of mean TARGET per age bin, scaled by 100.
plt.figure(figsize=(8, 8))

plt.bar(age_groups.index.astype(str), age_groups['TARGET'] * 100)

In [None]:
# Subset used for the correlation table: TARGET, the EXT_SOURCE columns and
# DAYS_BIRTH.
ext_data = app_train.loc[:, ['TARGET', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

In [None]:
# Pairwise Pearson correlations (the pandas default) between the selected columns.
ext_data_corrs = ext_data.corr(method='pearson')
ext_data_corrs

In [None]:
# One stacked subplot per EXT_SOURCE column, each with a KDE curve for
# TARGET == 0 and another for TARGET == 1.
plt.figure(figsize=(10, 12))

for panel, source in enumerate(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'], start=1):
    plt.subplot(3, 1, panel)

    is_target_1 = app_train['TARGET'] == 1
    sns.kdeplot(app_train.loc[app_train['TARGET'] == 0, source], label="target=0")
    sns.kdeplot(app_train.loc[is_target_1, source], label="target=1")

    plt.title('Distribution of %s by Target Value' % source)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22. Later cells
# still use the legacy name, so keep it bound to the replacement class.
Imputer = SimpleImputer

# Columns used to build polynomial features; TARGET is split off right below.
poly_features = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'TARGET', 'DAYS_BIRTH']]
poly_features_test = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns='TARGET')

# Fit the median imputer on the training data only and reuse those medians for
# the test data. The previous code re-fitted on the test set, so test rows were
# imputed with statistics different from the ones applied to train.
simple_imputer = SimpleImputer(strategy='median')
poly_features = simple_imputer.fit_transform(poly_features)
poly_features_test = simple_imputer.transform(poly_features_test)

# Transformer that generates all polynomial terms up to degree 3.
poly_transformer = PolynomialFeatures(degree=3)

In [None]:
# Fit the polynomial transformer on the training matrix, transform it, and
# apply the fitted transformer to the test matrix.
poly_features = poly_transformer.fit_transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)

print('Polynomial features shape %s' % str(poly_features.shape))

In [None]:
# Names of the first 15 generated polynomial terms.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
poly_names_getter = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
list(poly_names_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))[:15]

In [None]:
# Wrap the polynomial training matrix in a DataFrame with readable column
# names, add TARGET back, and list the terms with the most negative and most
# positive correlation with TARGET.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
poly_names_getter = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
poly_column_names = list(poly_names_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))

poly_features = pd.DataFrame(poly_features, columns=poly_column_names)
poly_features['TARGET'] = poly_target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

print(poly_corrs.head(10))
print("tail")
print(poly_corrs.tail(5))

In [None]:
# Wrap the polynomial test matrix in a DataFrame, attach SK_ID_CURR to both
# polynomial frames, merge them into the application frames and keep only the
# columns the merged train and test frames share.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever this installation provides.
poly_names_getter = getattr(poly_transformer, 'get_feature_names_out', None) or poly_transformer.get_feature_names
poly_column_names = list(poly_names_getter(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))

poly_features_test = pd.DataFrame(poly_features_test, columns=poly_column_names)

# NOTE(review): the polynomial frames appear to repeat column names that already
# exist in the application frames (the degree-1 terms), which would make the
# merge emit suffixed duplicates -- confirm whether that is intended.
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
app_train_poly = app_train.merge(poly_features, on='SK_ID_CURR', how='left')

poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
app_test_poly = app_test.merge(poly_features_test, on='SK_ID_CURR', how='left')

app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join='inner', axis=1)

print('Training data with polynomials %s and testing shape with polynomials is %s'%(str(app_train_poly.shape), str(app_test_poly.shape)))

In [None]:
# Work on copies so the aligned frames stay untouched.
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

# Ratio features for the training copy.
app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']

In [None]:
# The same four ratio features, computed for the test copy.
app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain['AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
app_test_domain['CREDIT_TERM'] = app_test_domain['AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

In [None]:
# One stacked subplot per ratio feature, each with a KDE curve for
# TARGET == 0 and another for TARGET == 1.
plt.figure(figsize=(12, 20))

for panel, source in enumerate(
    ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT'],
    start=1,
):
    plt.subplot(4, 1, panel)

    is_target_1 = app_train_domain['TARGET'] == 1
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, source], label="target=0")
    sns.kdeplot(app_train_domain.loc[is_target_1, source], label="target=1")

    plt.title('Distribution of %s by Target Value' % source)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement.
imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler()

# Feature matrix without the label; `errors='ignore'` covers the case where
# TARGET is already absent. Note that `train` / `test` are rebound here from
# DataFrames to the transformed arrays returned by the imputer and scaler.
train = app_train.drop(columns='TARGET', errors='ignore')

features = list(train.columns)

test = app_test.copy()

# Fit on the training data only, then apply the same medians to both sets.
imputer.fit(train)

train = imputer.transform(train)
test = imputer.transform(test)

# Same pattern for scaling: fit on train, apply to train and test.
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print ('shape of training is %s and shape of testing is %s'%(str(train.shape), str(test.shape)))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C = 0.0001 (in scikit-learn, C is the inverse of the
# regularisation strength), fitted on the preprocessed training matrix.
log_reg = LogisticRegression(C=1e-4)
log_reg.fit(X=train, y=train_labels)

In [None]:
# Keep column 1 of the predicted probabilities, i.e. the probability of the
# second class in `log_reg.classes_`.
log_reg_pred = log_reg.predict_proba(test)[:,1]

In [None]:
# Build the submission frame from an explicit copy of the ID column, so the
# TARGET column is added to an independent frame rather than to a slice of
# `app_test`.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = log_reg_pred

submit.to_csv('log_reg_baseline.csv', index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the preprocessed training matrix.
random_forest = RandomForestClassifier(n_estimators=100, random_state=50, verbose=1, n_jobs=-1)

random_forest.fit(train, train_labels)

# Pair each feature name with its importance from the fitted forest.
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Column 1 of predict_proba: probability of the second class in `classes_`.
predictions = random_forest.predict_proba(test)[:,1]

# Explicit copy so TARGET is added to an independent frame rather than to a
# slice of `app_test`.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

submit.to_csv('random_forest_baseline.csv', index=False)

In [None]:

from sklearn.impute import SimpleImputer

# Column names of the merged polynomial training frame, captured before the
# frame is turned into an array by the imputer.
poly_features_names = app_train_poly.columns.tolist()

# Median imputation: fit on the training frame, then reuse for the test frame.
imputer = SimpleImputer(strategy='median')

print('Starting imputer fit...')
imputer.fit(app_train_poly)
poly_features = imputer.transform(app_train_poly)
poly_features_test = imputer.transform(app_test_poly)

In [None]:
# Scale to the (0, 1) range: fit on the training matrix, apply to both.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(poly_features)
poly_features = scaler.transform(poly_features)
poly_features_test = scaler.transform(poly_features_test)

In [None]:
# Random forest on the polynomial feature matrix.
random_forest_poly = RandomForestClassifier(
    n_estimators=100,
    random_state=50,
    verbose=1,
    n_jobs=-1,
)

print('Fitting random forest model ...')
random_forest_poly.fit(poly_features, train_labels)

print('Predicting ..')
# Column 1 of predict_proba: probability of the second class in `classes_`.
poly_test_proba = random_forest_poly.predict_proba(poly_features_test)
predictions = poly_test_proba[:, 1]

In [None]:
# Display the full list of column names captured from the polynomial training frame.
poly_features_names

In [None]:
# Explicit copy so TARGET is added to an independent frame rather than to a
# slice of `app_test`.
submit = app_test[['SK_ID_CURR']].copy()

submit['TARGET'] = predictions

submit.to_csv('random_forest_baseline_engineered.csv', index=False)

In [None]:
# Sanity check: one captured column name per column of the polynomial matrix.
# (The list is named `poly_features_names`; the previous spelling
# `poly_feature_names` was undefined and raised NameError.)
assert poly_features.shape[1] == len(poly_features_names), (
    'polynomial matrix has %d columns but %d names were captured'
    % (poly_features.shape[1], len(poly_features_names))
)

In [None]:
# Show the shape of the preprocessed polynomial test matrix.
poly_features_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Drop the label before building the feature matrix. `errors='ignore'` keeps
# the cell re-runnable: once TARGET is gone, a second run no longer raises.
app_train_domain = app_train_domain.drop(columns='TARGET', errors='ignore')
domain_features_names = list(app_train_domain.columns)

# Impute the domain features with the training medians.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `SimpleImputer` is its replacement.
imputer = SimpleImputer(strategy='median')
print('fit and transform train')
domain_features = imputer.fit_transform(app_train_domain)
print('transform test')
domain_features_test = imputer.transform(app_test_domain)

In [None]:
# Scale the domain features to the (0, 1) range: fit on train, apply to both.
scaler = MinMaxScaler(feature_range=(0, 1))

print('fit transform scaler')
domain_features = scaler.fit(domain_features).transform(domain_features)

print('transform scaler test')
domain_features_test = scaler.transform(domain_features_test)

In [None]:
# Random forest on the domain feature matrix.
random_forest_domain = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=50,
    verbose=1,
)

print('fit random forest')
random_forest_domain.fit(domain_features, train_labels)

# Pair each domain feature name with its importance from the fitted forest.
feature_importance_values_domain = random_forest_domain.feature_importances_
feature_importances_domain = pd.DataFrame(
    dict(feature=domain_features_names, importance=feature_importance_values_domain)
)

In [None]:
# Make predictions on the test data: keep column 1 of predict_proba, i.e. the
# probability of the second class in `random_forest_domain.classes_`.
print('predict random forest')
predictions = random_forest_domain.predict_proba(domain_features_test)[:, 1]

In [None]:
# Explicit copy so TARGET is added to an independent frame rather than to a
# slice of `app_test`.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions
submit.to_csv('random_forest_baseline_domain.csv', index=False)