In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Importing necessary modules first -> Numpy, Pandas and LabelEncoder
import numpy as np
import pandas as pd
# Label Encoder for dealing with categorical labels
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
#Matplot and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# List the files in the data directory that the CSVs in the next cell are read from.
# (os.listdir() without an argument lists the current working directory instead.)
print(os.listdir('../input/home-credit-default-risk'))

In [None]:
# The work is based on application_train.csv and application_test.csv; load each into its own DataFrame.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
    )
)

In [None]:
# Visualize train data
train_data.head(20)

In [None]:
# There are some NaN in data which indicate missing values. Lets see the test data now.
test_data.head(20)

In [None]:
#Test data too contains some missing values which we will have to deal later.
#Lets see shape of both data.
print ("Shape of Training data is ",train_data.shape)
print ("Shape of Testing data is ",test_data.shape)

In [None]:
# The target variable is missing from the testing dataset: it is what the system needs to predict.
# The task is therefore a supervised classification problem, which establishes the scope of the work.
# For this task, I will use logistic regression, naive Bayes and a neural network classifier.

In [None]:
#Since this is a supervised classification task.We need to see distribution of class with respect to the label for our training data.
train_data['TARGET'].value_counts()

In [None]:
# Most training examples belong to class 0 (no) and far fewer to class 1 (yes). Let's visualize it too.
train_data['TARGET'].astype(int).plot.hist();

In [None]:
# The classes are highly imbalanced, so accuracy alone would be a misleading metric for our trained classifier; ROC AUC (used later in this notebook) is more informative.
# Lets see datatypes of our features.
train_data.dtypes.value_counts()

In [None]:
#Lets see which of features are object
print(train_data.dtypes == 'object')

In [None]:
# Lets now start dealing with missing values.
train_data.isnull()

In [None]:
# Can we do better to find count of null values .
train_data.isnull().sum().sort_values(ascending=False)

In [None]:
# We can do even better by presenting the counts as a table. For that I define a function like the one in the Kaggle kernel followed
# by the professor.
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one null, indexed
        by column name, with the columns ``'Missing Values'`` (null count) and
        ``'% of Missing Values'`` (share of rows that are null, in percent).
        Rows are sorted by percentage, highest first, and the values are
        rounded to one decimal place.
    """
    # Scan the frame for nulls once and derive the percentage from the counts.
    missing_count = df.isnull().sum()
    missing_pct = 100 * missing_count / len(df)

    summary = pd.concat(
        [missing_count, missing_pct],
        axis=1,
        keys=['Missing Values', '% of Missing Values'],
    )

    # Filter on the count rather than the percentage: for a frame with zero
    # rows the percentage is NaN (0 / 0), and NaN != 0 would keep every column.
    has_missing = summary['Missing Values'] != 0
    return (
        summary[has_missing]
        .sort_values('% of Missing Values', ascending=False)
        .round(1)
    )
    

In [None]:
# Lets see the missing values visualization for our train_data.
missing_values_table(train_data)

In [None]:
# Lets check if I drop all Nan rows. How much data are we left to deal with ? 
train_data_modify = train_data

In [None]:
train_data_modify.shape

In [None]:
train_data_modify=train_data_modify.dropna()

In [None]:
train_data_modify.shape

In [None]:
# This shows its not better to drop all Nan values. This will significantly decrease our data. Other solution is to drop columns with 
# more missing data.

In [None]:
missing_values= missing_values_table(train_data)

In [None]:
missing_values.head(40)

In [None]:
(missing_values_table(train_data)).head(30)

In [None]:
# First batch of training columns removed, chosen after inspecting the missing-values table above.
train_data.drop(
    columns=[
        "COMMONAREA_AVG", "COMMONAREA_MODE",
        "NONLIVINGAPARTMENTS_MODE", "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAPARTMENTS_AVG",
        "FONDKAPREMONT_MODE",
        "LIVINGAPARTMENTS_MODE", "LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_AVG",
        "FLOORSMIN_AVG", "FLOORSMIN_MEDI", "FLOORSMIN_MODE",
        "YEARS_BUILD_AVG", "YEARS_BUILD_MEDI",
        "LANDAREA_MODE", "LANDAREA_AVG", "LANDAREA_MEDI",
        "BASEMENTAREA_MEDI", "BASEMENTAREA_MODE", "BASEMENTAREA_AVG",
        "EXT_SOURCE_1",
        "NONLIVINGAREA_MODE", "NONLIVINGAREA_AVG", "NONLIVINGAREA_MEDI",
    ],
    inplace=True,
)

In [None]:
(missing_values_table(train_data)).head(30)

In [None]:
# Second batch of training columns removed after re-checking the missing-values table.
train_data.drop(
    columns=[
        "COMMONAREA_MEDI", "YEARS_BUILD_MODE", "OWN_CAR_AGE",
        "ELEVATORS_MEDI", "ELEVATORS_MODE", "ELEVATORS_AVG",
        "WALLSMATERIAL_MODE",
        "APARTMENTS_MODE", "APARTMENTS_MEDI", "APARTMENTS_AVG",
        "ENTRANCES_AVG", "ENTRANCES_MEDI", "ENTRANCES_MODE",
        "LIVINGAREA_MODE", "LIVINGAREA_AVG", "LIVINGAREA_MEDI",
        "HOUSETYPE_MODE",
        "FLOORSMAX_AVG", "FLOORSMAX_MEDI", "FLOORSMAX_MODE",
        "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BEGINEXPLUATATION_AVG", "YEARS_BEGINEXPLUATATION_MODE",
        "TOTALAREA_MODE", "EMERGENCYSTATE_MODE", "OCCUPATION_TYPE",
    ],
    inplace=True,
)

In [None]:
train_data['EXT_SOURCE_3'].describe()

In [None]:
train_data['AMT_REQ_CREDIT_BUREAU_YEAR'].describe()

In [None]:
train_data['AMT_REQ_CREDIT_BUREAU_QRT'].describe()

In [None]:
train_data['NAME_TYPE_SUITE'].describe()

In [None]:
train_data['DEF_60_CNT_SOCIAL_CIRCLE'].describe()

In [None]:
train_data['EXT_SOURCE_2'].describe()

In [None]:
train_data['AMT_ANNUITY'].describe()

In [None]:
train_data['CNT_FAM_MEMBERS'].describe()

In [None]:
# in my opinion we have only one object dtype which has missing values and which can be very hard to replace in a dataset. Therefore
# I removed that feature.

In [None]:
train_data.drop(["NAME_TYPE_SUITE"],axis=1,inplace=True)

In [None]:
missing_values_test= missing_values_table(test_data)

In [None]:
missing_values_test.head(30)

In [None]:
# First batch of test columns removed, chosen after inspecting the test missing-values table.
# Each label is listed once (the earlier list repeated "NONLIVINGAPARTMENTS_MEDI").
test_data.drop(
    columns=[
        "COMMONAREA_MODE", "COMMONAREA_MEDI", "COMMONAREA_AVG",
        "NONLIVINGAPARTMENTS_MEDI", "NONLIVINGAPARTMENTS_AVG", "NONLIVINGAPARTMENTS_MODE",
        "FONDKAPREMONT_MODE",
        "LIVINGAPARTMENTS_MODE", "LIVINGAPARTMENTS_MEDI", "LIVINGAPARTMENTS_AVG",
        "FLOORSMIN_MEDI", "FLOORSMIN_MODE", "FLOORSMIN_AVG",
        "OWN_CAR_AGE",
        "YEARS_BUILD_AVG", "YEARS_BUILD_MEDI", "YEARS_BUILD_MODE",
        "LANDAREA_MODE", "LANDAREA_AVG", "LANDAREA_MEDI",
        "BASEMENTAREA_MEDI", "BASEMENTAREA_AVG", "BASEMENTAREA_MODE",
        "NONLIVINGAREA_AVG", "NONLIVINGAREA_MODE", "NONLIVINGAREA_MEDI",
        "ELEVATORS_MEDI", "ELEVATORS_MODE", "ELEVATORS_AVG",
        "WALLSMATERIAL_MODE",
    ],
    inplace=True,
)

In [None]:
missing_values_test= missing_values_table(test_data)

In [None]:
missing_values_test.head(40)

In [None]:
# Second batch of test columns removed after re-checking the test missing-values table.
test_data.drop(
    columns=[
        "APARTMENTS_AVG", "APARTMENTS_MEDI", "APARTMENTS_MODE",
        "HOUSETYPE_MODE",
        "ENTRANCES_MODE", "ENTRANCES_MEDI", "ENTRANCES_AVG",
        "LIVINGAREA_MEDI", "LIVINGAREA_AVG", "LIVINGAREA_MODE",
        "FLOORSMAX_MODE", "FLOORSMAX_MEDI", "FLOORSMAX_AVG",
        "YEARS_BEGINEXPLUATATION_MEDI", "YEARS_BEGINEXPLUATATION_MODE",
        "TOTALAREA_MODE", "EMERGENCYSTATE_MODE", "EXT_SOURCE_1",
    ],
    inplace=True,
)

In [None]:
missing_values_test= missing_values_table(test_data)

In [None]:
missing_values_test.head(20)

In [None]:
test_data.drop(["YEARS_BEGINEXPLUATATION_AVG"],axis=1,inplace=True)

In [None]:
missing_values_test= missing_values_table(test_data)

In [None]:
# Lets now check dimensions of our test and train data

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
test_data.head(20)

In [None]:
train_data.head(20)

In [None]:
# okay perfect . Now its time to do some encoding of our categorical labels.

In [None]:
list(train_data.select_dtypes(['object']).columns)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
train_data_encode = train_data
test_data_encode = test_data

In [None]:
encoder=LabelEncoder()

In [None]:
train_data_encode['NAME_CONTRACT_TYPE'] = encoder.fit_transform(train_data_encode['NAME_CONTRACT_TYPE'])
# Now lets replicate it for all object types.
train_data_encode['CODE_GENDER'] = encoder.fit_transform(train_data_encode['CODE_GENDER'])
train_data_encode['FLAG_OWN_CAR'] = encoder.fit_transform(train_data_encode['FLAG_OWN_CAR'])
train_data_encode['FLAG_OWN_REALTY'] = encoder.fit_transform(train_data_encode['FLAG_OWN_REALTY'])
train_data_encode['NAME_INCOME_TYPE'] = encoder.fit_transform(train_data_encode['NAME_INCOME_TYPE'])
train_data_encode['NAME_EDUCATION_TYPE'] = encoder.fit_transform(train_data_encode['NAME_EDUCATION_TYPE'])
train_data_encode['NAME_FAMILY_STATUS'] = encoder.fit_transform(train_data_encode['NAME_FAMILY_STATUS'])
train_data_encode['NAME_HOUSING_TYPE'] = encoder.fit_transform(train_data_encode['NAME_HOUSING_TYPE'])
train_data_encode['WEEKDAY_APPR_PROCESS_START'] = encoder.fit_transform(train_data_encode['WEEKDAY_APPR_PROCESS_START'])
train_data_encode['ORGANIZATION_TYPE'] = encoder.fit_transform(train_data_encode['ORGANIZATION_TYPE'])

In [None]:
train_data_encode['NAME_CONTRACT_TYPE'].head()

In [None]:
train_data_encode['ORGANIZATION_TYPE'].head()

In [None]:
# now over to our test set
list(test_data.select_dtypes(['object']).columns)

In [None]:
# I see some differences in categorical columns in both dataset.
missing_values_test.head(20)

In [None]:
# I see I can remove OCCUPATION TYPE AND NAME_TYPE_SUITE FROM MY TEST DATA.
test_data.drop(["OCCUPATION_TYPE","NAME_TYPE_SUITE"],axis=1,inplace=True)

In [None]:
test_data_encode = test_data

In [None]:
list(test_data_encode.select_dtypes(['object']).columns)

In [None]:
# Encoding test data
test_data_encode['NAME_CONTRACT_TYPE'] = encoder.fit_transform(test_data_encode['NAME_CONTRACT_TYPE'])
test_data_encode['CODE_GENDER'] = encoder.fit_transform(test_data_encode['CODE_GENDER'])
test_data_encode['FLAG_OWN_CAR'] = encoder.fit_transform(test_data_encode['FLAG_OWN_CAR'])
test_data_encode['FLAG_OWN_REALTY'] = encoder.fit_transform(test_data_encode['FLAG_OWN_REALTY'])
test_data_encode['NAME_INCOME_TYPE'] = encoder.fit_transform(test_data_encode['NAME_INCOME_TYPE'])
test_data_encode['NAME_EDUCATION_TYPE'] = encoder.fit_transform(test_data_encode['NAME_EDUCATION_TYPE'])
test_data_encode['NAME_FAMILY_STATUS'] = encoder.fit_transform(test_data_encode['NAME_FAMILY_STATUS'])
test_data_encode['NAME_HOUSING_TYPE'] = encoder.fit_transform(test_data_encode['NAME_HOUSING_TYPE'])
test_data_encode['WEEKDAY_APPR_PROCESS_START'] = encoder.fit_transform(test_data_encode['WEEKDAY_APPR_PROCESS_START'])
test_data_encode['ORGANIZATION_TYPE'] = encoder.fit_transform(test_data_encode['ORGANIZATION_TYPE'])

In [None]:
train_data.head(20)

In [None]:
test_data.head(20)

In [None]:
train_data = train_data_encode
test_data = test_data_encode

In [None]:
train_data.head(20)

In [None]:
test_data.head(20)

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
# Back to EDA , lets do some feature visualization and engineering
correlations=train_data.corr()['TARGET'].sort_values()

In [None]:
correlations.tail(20)

In [None]:
# We see that it has very high correlations with DAYS_BIRTH
train_data['DAYS_BIRTH'].describe()

In [None]:
# The values should not be negative 
train_data['DAYS_BIRTH'] = train_data['DAYS_BIRTH']/-365

In [None]:
# Check similar for the test data
test_data['DAYS_BIRTH'].describe()

In [None]:
# Again same issue
test_data['DAYS_BIRTH'] = test_data['DAYS_BIRTH']/-365

In [None]:
test_data['DAYS_BIRTH'].head()

In [None]:
test_data['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
# As its highly correlated feature, we must also transform it from being negative

train_data['DAYS_LAST_PHONE_CHANGE']=train_data['DAYS_LAST_PHONE_CHANGE']/-1
test_data['DAYS_LAST_PHONE_CHANGE']=test_data['DAYS_LAST_PHONE_CHANGE']/-1

In [None]:
train_data['DAYS_ID_PUBLISH'].describe()

In [None]:
train_data['DAYS_ID_PUBLISH']=train_data['DAYS_ID_PUBLISH']/-1
test_data['DAYS_ID_PUBLISH']=test_data['DAYS_ID_PUBLISH']/-1

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'].plot.hist()

In [None]:
train_data['DAYS_BIRTH'].plot.hist()

In [None]:
# lets now again see correlations
correlations=train_data.corr()['TARGET'].sort_values()
correlations.tail(10)

In [None]:
# Surprising ! :D 
# Now lets see our both datasets after transformations.

print ("Shape of training data is : ", train_data.shape)
print ("Shape of testing data is :",test_data.shape)




In [None]:
# Now let's begin model development and tuning. I will use logistic regression, naive Bayes and a neural network.
# SK_ID_CURR is used as the row identifier in the submission files, so it is kept out of the
# feature matrices; the submission cells read it from test_data directly.
ID_COLUMN = 'SK_ID_CURR'
# Label to predict
y_train = train_data.TARGET
# Training features: everything except the label and the identifier
X_train = train_data.drop(columns=['TARGET', ID_COLUMN])
# Test features: a new frame (not an alias of test_data) without the identifier
X_test = test_data.drop(columns=[ID_COLUMN])

In [None]:
X_train,X_test=X_train.align(X_test, join= 'inner', axis=1)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
#Check for null-values
print (X_test.isnull().values.any())

In [None]:
print (X_train.isnull().values.any())

In [None]:
# Use imputer for null-values ! 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")

In [None]:
imputer.fit(X_train)

In [None]:
# Fill the nulls of both sets using the imputer fitted on the training data,
# writing the values back into the existing frames so the column labels are kept.
train_imputed = imputer.transform(X_train)
X_train.loc[:] = train_imputed
test_imputed = imputer.transform(X_test)
X_test.loc[:] = test_imputed

In [None]:
# Before fitting the models, rank the features with an ExtraTreesClassifier so that the most
# important ones can be picked out and reused later.
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

# Seed the forest so the ranking is reproducible from one run to the next.
model = ExtraTreesClassifier(random_state=13)
model.fit(X_train, y_train)
importances = model.feature_importances_
# Spread of each feature's importance across the individual trees (drawn as error bars below).
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
feature_names = X_train.columns

# Print the feature ranking with both the column position and the column name,
# so the positions shown on the plot can be mapped back to names without guessing.
print("Feature ranking:")

for rank, idx in enumerate(indices, start=1):
    print("%d. feature %d %s (%f)" % (rank, idx, feature_names[idx], importances[idx]))

n_features = X_train.shape[1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), indices)
plt.xlim([-1, n_features])
plt.show()


In [None]:
# Give the data that will be scaled and fed to the regression model its own names
X_train_reg, X_test_reg, y_train_reg = X_train, X_test, y_train

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler(feature_range = (0, 1))

In [None]:
# Fit the scaler on the training data only, then apply that same scaling to both sets.
X_train_reg = scaler.fit_transform(X_train_reg)
X_test_reg = scaler.transform(X_test_reg)

In [None]:
print('Training data shape: ', X_train_reg.shape)
print('Testing data shape: ', X_test_reg.shape)

In [None]:
np.isnan(X_test_reg).any()

In [None]:
np.isnan(X_train_reg).any()

In [None]:
# Lets apply our first model - Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
logreg = LogisticRegression()

In [None]:
logreg.fit(X_train_reg,y_train_reg)

In [None]:
pred = logreg.predict_proba(X_test_reg)[:,1]

In [None]:
pred

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
result = test_data[['SK_ID_CURR']].copy()

In [None]:
result['TARGET']=pred

In [None]:
result.head(40)

In [None]:
(result['TARGET']).describe()

In [None]:
result.to_csv('logisticRegression1.csv',index=False)

In [None]:
predictions = logreg.predict_proba(X_train_reg)[:,1]

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# NOTE(review): `predictions` was computed on X_train_reg, the same data the model was fitted on,
# so this is an in-sample ROC AUC and an optimistic estimate of how the model generalises.
train_auc = roc_auc_score(y_train_reg, predictions)
print(train_auc)

In [None]:
fpr, tpr, thr = roc_curve(y_train_reg,predictions)

In [None]:
plt.figure()
plt.plot(fpr, tpr)

In [None]:
# Some hyperparameter tuning to try to improve the results
logreg = LogisticRegression(C=0.1,max_iter=1000)

In [None]:
logreg.fit(X_train_reg,y_train_reg)

In [None]:
pred = logreg.predict_proba(X_test_reg)[:,1]

In [None]:
pred

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
result = test_data[['SK_ID_CURR']].copy()

In [None]:
result['TARGET']=pred

In [None]:
(result['TARGET']).describe()

In [None]:
result.to_csv('logisticRegression2.csv',index=False)

In [None]:
predictions = logreg.predict_proba(X_train_reg)[:,1]

In [None]:
accuracy = logreg.score(X_train_reg,y_train_reg)

In [None]:
print("Accuracy is : ",accuracy)

In [None]:
# NOTE(review): `predictions` was computed on X_train_reg, the same data the model was fitted on,
# so this is an in-sample ROC AUC and an optimistic estimate of how the model generalises.
train_auc = roc_auc_score(y_train_reg, predictions)
print(train_auc)

In [None]:
fpr, tpr, thr = roc_curve(y_train_reg,predictions)

In [None]:
plt.figure()
plt.plot(fpr, tpr)

In [None]:
# Let's now work with Gaussian Naive Bayes and see the results

X_train_nb, X_test_nb, y_train_nb = X_train, X_test, y_train

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
model = GaussianNB()

In [None]:
X_train_nb

In [None]:
X_test_nb.head()

In [None]:
X_train_nb.shape

In [None]:
X_test_nb.shape

In [None]:
print (X_test_nb.isnull().values.any())

In [None]:
# Good ! :D
model.fit(X_train_nb,y_train_nb)

In [None]:
accuracy = model.score(X_train_nb,y_train_nb)

In [None]:
print("Accuracy of model is : " , accuracy)

In [None]:
predictions = model.predict_proba(X_test_nb)[:,1]

In [None]:
predictions

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
submission = test_data[['SK_ID_CURR']].copy()

In [None]:
submission['TARGET']=predictions

In [None]:
submission.head()

In [None]:
(submission['TARGET']).describe()

In [None]:
submission.to_csv('NaiveBayes1.csv',index=False)

In [None]:
predict = model.predict_proba(X_train_nb)[:,1]

In [None]:
# NOTE(review): `predict` was computed on X_train_nb, the same data the model was fitted on,
# so this is an in-sample ROC AUC and an optimistic estimate of how the model generalises.
train_auc = roc_auc_score(y_train_nb, predict)
print(train_auc)

In [None]:
# Let me try a random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Build the seeded forest first, then fit it on the unscaled training data.
rfc_rain = RandomForestClassifier(n_estimators=100, random_state=13)
rfc_rain.fit(X_train_nb, y_train_nb)
# Probability of the positive class for every test row.
rfc_predict = rfc_rain.predict_proba(X_test_nb)[:, 1]

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
submission1 = test_data[['SK_ID_CURR']].copy()

In [None]:
submission1['TARGET']=rfc_predict

In [None]:
# Show the random-forest submission that was just filled in (submission1, not the earlier naive Bayes frame).
submission1.head()

In [None]:
# NOTE(review): this file holds the random-forest probabilities (submission1 is filled from rfc_predict),
# despite the "NaiveBayes2" file name — consider renaming once nothing depends on the path.
submission1.to_csv('NaiveBayes2.csv',index=False)

In [None]:
predict = rfc_rain.predict_proba(X_train_nb)[:, 1]

In [None]:
# NOTE(review): `predict` was computed on X_train_nb, the same data the forest was fitted on,
# so this is an in-sample ROC AUC and an optimistic estimate of how the model generalises.
train_auc = roc_auc_score(y_train_nb, predict)
print(train_auc)

In [None]:
# Thats great .
# Now I move onto my final algorithm MLPClassifier.


In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Three hidden layers of 10 units each; seeded so the fitted network and its predictions are reproducible.
mlp=MLPClassifier(hidden_layer_sizes=(10,10,10),max_iter=1000,random_state=13)

In [None]:
# The network is trained on the same scaled data as the logistic regression.
X_train_mlp, X_test_mlp, y_train_mlp = X_train_reg, X_test_reg, y_train_reg

In [None]:
mlp.fit(X_train_mlp,y_train_mlp)

In [None]:
pred=mlp.predict_proba(X_test_mlp)[:,1]

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
res = test_data[['SK_ID_CURR']].copy()

In [None]:
res['TARGET']=pred

In [None]:
res.head()

In [None]:
res.to_csv('NeuralNetwork.csv',index=False)

In [None]:
(res['TARGET']).describe()

In [None]:
list(X_train.columns) 

In [None]:
# According to the feature-importance plot, features 37, 38, 15, 18, 0, 17, 43, 6, 29 and 8
# are the 10 most important features.
# NOTE(review): the list below names 9 columns, not 10 — confirm which feature is missing and
# that each position was mapped to the right column name.
feature_columns=['ORGANIZATION_TYPE','EXT_SOURCE_2','REGION_POPULATION_RELATIVE','DAYS_REGISTRATION','DAYS_EMPLOYED','DEF_60_CNT_SOCIAL_CIRCLE','CNT_CHILDREN','WEEKDAY_APPR_PROCESS_START','AMT_CREDIT']

In [None]:
X_train_f = X_train[feature_columns]

In [None]:
X_test_f=X_test[feature_columns]

In [None]:
y_train_f = y_train

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Use a dedicated scaler for the reduced feature set instead of re-fitting `scaler`,
# which was fitted on the full feature matrix and would no longer match X_train_reg / X_test_reg.
scaler_f = MinMaxScaler(feature_range = (0, 1))
X_train_f = scaler_f.fit_transform(X_train_f)
X_test_f = scaler_f.transform(X_test_f)

In [None]:
print('Training data shape: ', X_train_f.shape)
print('Testing data shape: ', X_test_f.shape)

In [None]:
logreg = LogisticRegression()

In [None]:
logreg.fit(X_train_f,y_train_f)

In [None]:
pred = logreg.predict_proba(X_test_f)[:,1]

In [None]:
# Take an explicit copy so that adding the TARGET column later is not an assignment on a slice of test_data.
result = test_data[['SK_ID_CURR']].copy()

In [None]:
result['TARGET']=pred

In [None]:
result.head()

In [None]:
result.to_csv('LogisticRegressionImp.csv',index=False)