In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show every row and every column when a DataFrame is displayed (no truncation).
# The options are set through the public top-level `pd.set_option`; `pd.pandas`
# only resolves because of how the package imports itself and is not part of
# the pandas API.
pd.set_option("display.max_rows", None)
pd.set_option('display.max_columns', None)

In [None]:
# Load the train and test application tables of the home-credit-default-risk
# data set from the Kaggle input directory.
application_train = pd.read_csv('/kaggle/input/home-credit-default-risk/application_train.csv')
application_test = pd.read_csv('/kaggle/input/home-credit-default-risk/application_test.csv')


In [None]:
application_train.shape

In [None]:
application_test.shape

# **Feature Engineering**

### Change days to absolute

In [None]:
# Replace each day-count column of the training frame with its absolute value.
for day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    application_train[day_col] = application_train[day_col].abs()

In [None]:
# Replace each day-count column of the test frame with its absolute value.
for day_col in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    application_test[day_col] = application_test[day_col].abs()

### Handling anomalous data in train/test set


In [None]:
# Treat the anomalous value 365243 in DAYS_EMPLOYED as missing.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: mutating the selected
# Series is not guaranteed to update the parent frame (it has no effect under
# pandas Copy-on-Write).
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].replace({365243: np.nan})

In [None]:

# Treat the anomalous value 365243 in DAYS_EMPLOYED as missing.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: mutating the selected
# Series is not guaranteed to update the parent frame (it has no effect under
# pandas Copy-on-Write).
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].replace({365243: np.nan})

### Handling missing values

In [None]:
def missing_data(data):
    """Summarise the missing values of each column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns: ``Total`` (number of
        null entries) and ``Percent`` (null entries as a percentage of the
        number of rows). Both are built from series sorted in descending order.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    summary_parts = [
        null_counts.sort_values(ascending=False),
        (null_counts / null_mask.count() * 100).sort_values(ascending=False),
    ]
    return pd.concat(summary_parts, axis=1, keys=['Total', 'Percent'])

In [None]:
missing_data(application_train).head(30)

Features with more than 60% missing values

In [None]:
missing_above_60 = ['OWN_CAR_AGE','YEARS_BUILD_AVG','COMMONAREA_AVG','FLOORSMIN_AVG','LIVINGAPARTMENTS_AVG',
                 'NONLIVINGAPARTMENTS_AVG','YEARS_BUILD_MODE','COMMONAREA_MODE','FLOORSMIN_MODE','LIVINGAPARTMENTS_MODE',
                 'NONLIVINGAPARTMENTS_MODE','YEARS_BUILD_MEDI','COMMONAREA_MEDI','FLOORSMIN_MEDI','LIVINGAPARTMENTS_MEDI',
                 'NONLIVINGAPARTMENTS_MEDI','FONDKAPREMONT_MODE']

Dropping the features with more than 60% missing values

In [None]:
# Remove the high-missingness columns from the training frame.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
application_train = application_train.drop(columns=missing_above_60)
application_train.shape

In [None]:
# Remove the high-missingness columns from the test frame.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
application_test = application_test.drop(columns=missing_above_60)
application_test.shape

Features with some missing values, but less than 60% missing

In [None]:
missing_below_60 = ['OCCUPATION_TYPE', 'EXT_SOURCE_1', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 
                 'YEARS_BEGINEXPLUATATION_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'LANDAREA_AVG', 
                 'LIVINGAREA_AVG', 'NONLIVINGAREA_AVG', 'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE', 
                 'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'LANDAREA_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAREA_MODE',
                 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 
                 'FLOORSMAX_MEDI', 'LANDAREA_MEDI', 'LIVINGAREA_MEDI', 'NONLIVINGAREA_MEDI', 'HOUSETYPE_MODE', 'TOTALAREA_MODE',
                 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 
                 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 
                 'AMT_REQ_CREDIT_BUREAU_YEAR','NAME_TYPE_SUITE','DAYS_LAST_PHONE_CHANGE','EXT_SOURCE_2']

In [None]:
# Restrict the training frame to the partially-missing columns for inspection.
train_miss = application_train.reindex(columns=missing_below_60)
train_miss.head()

In [None]:
# Restrict the test frame to the partially-missing columns for inspection.
test_miss = application_test.reindex(columns=missing_below_60)
test_miss.head()

In [None]:
train_miss.describe()

In [None]:
train_miss.shape

In [None]:
test_miss.describe()

In [None]:
test_miss.shape

In [None]:
train_miss.select_dtypes('number').columns

In [None]:
train_miss.select_dtypes('number').skew().abs()<1

In [None]:
test_miss.select_dtypes('number').skew().abs()<1

handling missing values with mean

In [None]:
miss_mean = ['EXT_SOURCE_1','EXT_SOURCE_3','DAYS_LAST_PHONE_CHANGE','EXT_SOURCE_2']

In [None]:
 for feature in miss_mean:
    application_train[feature].fillna(application_train[feature].mean(),inplace=True)
    application_test[feature].fillna(application_test[feature].mean(),inplace=True)

handling missing values with median

In [None]:
miss_median = ['APARTMENTS_AVG', 'BASEMENTAREA_AVG','YEARS_BEGINEXPLUATATION_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG',
       'FLOORSMAX_AVG', 'LANDAREA_AVG', 'LIVINGAREA_AVG', 'NONLIVINGAREA_AVG',
       'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
       'ELEVATORS_MODE', 'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'LANDAREA_MODE',
       'LIVINGAREA_MODE', 'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI',
       'BASEMENTAREA_MEDI', 'YEARS_BEGINEXPLUATATION_MEDI', 'ELEVATORS_MEDI',
       'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'LANDAREA_MEDI', 'LIVINGAREA_MEDI',
       'NONLIVINGAREA_MEDI', 'TOTALAREA_MODE', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR','DEF_60_CNT_SOCIAL_CIRCLE','OBS_30_CNT_SOCIAL_CIRCLE','DAYS_EMPLOYED']

In [None]:
# Fill the missing values of every `miss_median` feature with the column
# median, computed separately for the train and the test frame.
# The filled column is assigned back rather than using fillna(inplace=True) on
# a column selection, which is not guaranteed to modify the parent frame (it
# has no effect under pandas Copy-on-Write).
for feature in miss_median:
    application_train[feature] = application_train[feature].fillna(application_train[feature].median())
    application_test[feature] = application_test[feature].fillna(application_test[feature].median())

In [None]:
train_miss.select_dtypes('object').columns

In [None]:
test_miss.select_dtypes('object').columns

handling missing values with mode

In [None]:
miss_mode = ['OCCUPATION_TYPE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE',
       'EMERGENCYSTATE_MODE', 'NAME_TYPE_SUITE']

In [None]:
# Fill the missing values of every `miss_mode` feature with the most frequent
# value of that column (first entry of `mode()`), computed separately for the
# train and the test frame.
# The filled column is assigned back rather than using fillna(inplace=True) on
# a column selection, which is not guaranteed to modify the parent frame (it
# has no effect under pandas Copy-on-Write).
for feature in miss_mode:
    application_train[feature] = application_train[feature].fillna(application_train[feature].mode()[0])
    application_test[feature] = application_test[feature].fillna(application_test[feature].mode()[0])

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# The categorical columns are still strings at this point (they are encoded
# further down), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0. Older versions silently skipped them, so the matrix is
# unchanged there.
application_train_corr = application_train.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Draw the correlation matrix as a heatmap with square cells on a 30x30 figure.
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(application_train_corr, square=True, ax=ax)
plt.show()

- 'CNT_CHILDREN' and 'CNT_FAM_MEMBERS' have a high correlation.

- 'AMT_GOODS_PRICE' and 'AMT_CREDIT' are perfectly correlated.

- 'AMT_GOODS_PRICE' and 'AMT_ANNUITY' have a high correlation.

- 'AMT_ANNUITY' and 'AMT_CREDIT' have a high correlation.

- 'OBS_30_CNT_SOCIAL_CIRCLE' and 'OBS_60_CNT_SOCIAL_CIRCLE' are perfectly correlated.

- 'DEF_30_CNT_SOCIAL_CIRCLE' and 'DEF_60_CNT_SOCIAL_CIRCLE' have a high correlation.


We need to remove one feature from each of these pairs; from the three mutually correlated features 'AMT_GOODS_PRICE', 'AMT_CREDIT' and 'AMT_ANNUITY' we keep only one.

In [None]:
# One feature from each highly correlated group is dropped from both frames.
remove_features = ['CNT_FAM_MEMBERS', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE']
# `columns=` is used because the positional axis argument of DataFrame.drop
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
application_train = application_train.drop(columns=remove_features)
application_test = application_test.drop(columns=remove_features)

In [None]:
missing_data(application_train).head()

In [None]:
missing_data(application_test).head()

In [None]:
application_train.shape

In [None]:
application_test.shape

### Feature Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
cat_features = application_train.select_dtypes('object')
cat_features.columns

In [None]:
# Label-encode every string-typed (object) column, applying one shared mapping
# per column to both the train and the test frame.
for col in application_train.columns:
    # `np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype.
    if application_train[col].dtype != object:
        continue
    le = LabelEncoder()
    # Fit on the categories of both frames so that a value occurring only in the
    # test set does not make transform() raise. When the test categories are a
    # subset of the training ones the resulting codes are unchanged.
    le.fit(pd.concat([application_train[col], application_test[col]]))
    application_train[col] = le.transform(application_train[col])
    application_test[col] = le.transform(application_test[col])

In [None]:
application_train.sample(10)

In [None]:
application_test.sample(10)

In [None]:
# Separate the model inputs from the target and the row identifier.
# `columns=` is used because the positional axis argument of DataFrame.drop
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
Xtrn = application_train.drop(columns=['TARGET','SK_ID_CURR'])
Xtst = application_test.drop(columns=['SK_ID_CURR'])
y = application_train['TARGET']

In [None]:
y.shape

In [None]:
y.value_counts()

# **Feature Selection using ANOVA**

In [None]:
Xtrn.columns

In [None]:
feats = Xtrn.columns

In [None]:
from sklearn.feature_selection import f_classif

anova = pd.DataFrame(f_classif(Xtrn,y)).transpose()

In [None]:
anova.columns = ['f-score','p-value']
anova['columns_name'] = feats

In [None]:
anova.shape

Choosing features having fscore above 100

In [None]:
anova_selected = anova[anova['f-score'] > 100]
anova_selected.sort_values('f-score',ascending=False)

In [None]:
anova_selected.shape

In [None]:
chosen_feats = list(anova_selected['columns_name'])
chosen_feats

In [None]:
# Keep only the ANOVA-selected columns, in the order given by `chosen_feats`.
Xtrn = application_train.reindex(columns=chosen_feats)
Xtst = application_test.reindex(columns=chosen_feats)

Xtrn.head()


### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(Xtrn,y,stratify = y,test_size = 0.3,random_state = 123)

# Feature Scaling

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X_transform = scaler.fit_transform(X_under)
# X_test = scaler.transform(X_test)
# X_transform = pd.DataFrame(X_transform,columns = Xtrn.columns)
# X_transform.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to the training split, the hold-out split and the
# competition test set. Each result is wrapped in a DataFrame to restore the
# column names.
scaler = StandardScaler().fit(X_train)

X_train_transform = pd.DataFrame(scaler.transform(X_train), columns=Xtrn.columns)
X_test_transform = pd.DataFrame(scaler.transform(X_test), columns=Xtrn.columns)
Xtst_transform = pd.DataFrame(scaler.transform(Xtst), columns=Xtst.columns)

X_train_transform.head()

# Over Sampling

In [None]:
from imblearn.over_sampling import SMOTE
oversampling = SMOTE(random_state = 123)

In [None]:
X_over, y_over = oversampling.fit_resample(X_train_transform, y_train)
X_over = pd.DataFrame(X_over, columns=Xtrn.columns)
y_over = pd.DataFrame(y_over)
y_over.value_counts()

In [None]:
sns.barplot(x=[0,1], y=y_over.value_counts(normalize=True))

# Baseline Models

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_over,y_over.values.ravel())

In [None]:
lr.score(X_over, y_over), lr.score(X_test_transform, y_test)

In [None]:
lr_predict = lr.predict(Xtst_transform)
lr_predict = pd.DataFrame(lr_predict)
lr_predict.value_counts()

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline fitted on the over-sampled training data.
# The seed is fixed so the fitted tree and the scores below are reproducible,
# matching the random_state=123 already used for the split, SMOTE and the CV
# folds in this notebook.
dtc = DecisionTreeClassifier(random_state=123)
dtc.fit(X_over,y_over.values.ravel())
# Accuracy on the over-sampled training data and on the hold-out split.
dtc.score(X_over, y_over), dtc.score(X_test_transform, y_test)

In [None]:
dtc_predict = dtc.predict(Xtst_transform)
dtc_predict = pd.DataFrame(dtc_predict)
dtc_predict.value_counts()

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD (hinge loss, elastic-net penalty) on the
# over-sampled training data.
# The seed is fixed so the data shuffling, and therefore the fitted model and
# the scores below, are reproducible. It matches the random_state=123 already
# used for the split, SMOTE and the CV folds in this notebook.
sgdc = SGDClassifier(loss='hinge', penalty='elasticnet',fit_intercept=True, random_state=123)
sgdc.fit(X_over,y_over.values.ravel())
# Accuracy on the over-sampled training data and on the hold-out split.
sgdc.score(X_over, y_over), sgdc.score(X_test_transform, y_test)

In [None]:
sgdc_predict = sgdc.predict(Xtst_transform)
sgdc_predict = pd.DataFrame(sgdc_predict)
sgdc_predict.value_counts()

### LGBM Classifier

In [None]:
from lightgbm import LGBMClassifier

lgbmc = LGBMClassifier()
lgbmc.fit(X_over, y_over.values.ravel())


In [None]:
lgbmc.score(X_over, y_over), lgbmc.score(X_test_transform, y_test)

In [None]:
lgbmc_predict = lgbmc.predict(Xtst_transform)
lgbmc_predict = pd.DataFrame(lgbmc_predict)
lgbmc_predict.value_counts()

# Model Validation

In [None]:
from sklearn import model_selection

In [None]:
# Model logistic regression
from sklearn.pipeline import make_pipeline

# Stratified 3-fold splitter, shuffled with a fixed seed; reused by the cells below.
kfold = model_selection.StratifiedKFold(n_splits=3,shuffle = True, random_state=123)
# `Xtrn` holds the unscaled features, whereas `lr` was trained on standardised
# data. The scaler is therefore fitted inside each fold via a pipeline, so the
# cross-validation evaluates the same preprocessing without leaking
# validation-fold statistics.
lr_validation = model_selection.cross_val_score(make_pipeline(StandardScaler(), lr), Xtrn, y, cv=kfold)
# Mean fold accuracy as a percentage.
print(lr_validation.mean()*100.0)

In [None]:
# Model decision tree
dtc_validation = model_selection.cross_val_score(dtc, Xtrn, y, cv=kfold)
print(dtc_validation.mean()*100.0)

In [None]:
# Model sgd
from sklearn.pipeline import make_pipeline

# `Xtrn` holds the unscaled features, whereas `sgdc` was trained on standardised
# data. The scaler is therefore fitted inside each fold via a pipeline, so the
# cross-validation evaluates the same preprocessing without leaking
# validation-fold statistics.
sgdc_validation = model_selection.cross_val_score(make_pipeline(StandardScaler(), sgdc), Xtrn, y, cv=kfold)
# Mean fold accuracy as a percentage.
print(sgdc_validation.mean()*100.0)

# Hyperparameter Tuning


### GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the logistic regression: L2 penalty only, eight values
# of C, two solvers and two iteration caps (1 * 8 * 2 * 2 = 32 combinations).
param_grid = [{'penalty' : ['l2'],
                'C' : [0.001,.009,0.01,.09,1,5,10,25],
                'solver' : ['lbfgs','saga'],
               'max_iter': [1000,10000]
            }]
# Exhaustive search over `param_grid` around the existing `lr` estimator, scored
# by accuracy on the `kfold` splitter, running on all available cores (n_jobs=-1).
# NOTE(review): a later cell fits this search on the SMOTE-resampled `X_over`, so
# the validation folds would contain synthetic samples — consider resampling
# inside a pipeline instead; confirm whether this is intended.
grid_clf = GridSearchCV(lr, param_grid = param_grid,scoring = 'accuracy',cv = kfold, verbose=True, n_jobs=-1)


In [None]:
grid_clf.fit(X_over,y_over.values.ravel())

In [None]:
grid_clf.score(X_over, y_over), grid_clf.score(X_test_transform, y_test)

In [None]:
grid_clf_predict = grid_clf.predict(Xtst_transform)
grid_clf_predict = pd.DataFrame(grid_clf_predict)
grid_clf_predict.value_counts()

### Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameters for the logistic regression: L2 penalty only, eight
# values of C, two solvers and two iteration caps.
param_grid = [{'penalty' : ['l2'],
                'C' : [0.001,.009,0.01,.09,1,5,10,25],
                'solver' : ['lbfgs','saga'],
               'max_iter': [1000,10000]
            }]
# Randomised search around the existing `lr` estimator, scored by recall on the
# `kfold` splitter, running on all available cores (n_jobs=-1).
# random_state is fixed so the sampled candidates, and therefore the selected
# model, are reproducible; it matches the seed used elsewhere in this notebook.
random_clf = RandomizedSearchCV(lr, param_distributions = param_grid,scoring = 'recall',cv = kfold, verbose=True, n_jobs=-1, random_state=123)


In [None]:
random_clf.fit(X_over,y_over.values.ravel())


In [None]:
random_clf.score(X_over, y_over), random_clf.score(X_test_transform, y_test)

In [None]:
random_clf_predict = random_clf.predict(Xtst_transform)
random_clf_predict = pd.DataFrame(random_clf_predict)
random_clf_predict.value_counts()

### submission

In [None]:
# Submission dataframe
# One row per test application, keyed by SK_ID_CURR.
submit = application_test.loc[:,['SK_ID_CURR']]
# Submit the predicted probability of the positive class (column 1 of
# predict_proba) from the tuned logistic regression, not hard 0/1 labels.
# NOTE(review): the competition is understood to be scored on ROC AUC over
# predicted probabilities — confirm against its evaluation page.
# Assigning a plain 1-D array also avoids index alignment against a
# one-column DataFrame.
submit['TARGET'] = grid_clf.predict_proba(Xtst_transform)[:, 1]

submit.shape

In [None]:
submit.to_csv('log_reg_baseline.csv', index = False)