# Data Preparation

In [1]:
# loading the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the credit history and the application table from CSV files that are
# expected to sit next to the notebook.
# NOTE(review): the saved output of this cell is a ParserError ("Expected 1
# fields in line 9, saw 2") -- confirm both files are the real CSV datasets.

credit_data = pd.read_csv(filepath_or_buffer='credit_record.csv')
app_data = pd.read_csv(filepath_or_buffer='application_record.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 9, saw 2


In [None]:
# Inner-join the application table with the credit records on the shared ID
# column, so only IDs present in both tables survive.

data = pd.merge(app_data, credit_data, on='ID', how='inner')
data.head()

In [None]:
# Keep only the first row found for every ID.
# NOTE(review): after this step each ID carries a single credit-record row, so
# the STATUS used for the label comes from that one row only -- confirm this
# is the intended definition of the target.

data = data.drop_duplicates(subset='ID', keep='first')

In [None]:
# Derive AGE and WORKEXP from DAYS_BIRTH and DAYS_EMPLOYED: each day count is
# negated and floor-divided by 365 to obtain whole years.

data['AGE'] = -(data['DAYS_BIRTH']) // 365
data['WORKEXP'] = -(data['DAYS_EMPLOYED']) // 365

# Drop DAYS_BIRTH and DAYS_EMPLOYED now that the derived columns exist.
data = data.drop(columns=['DAYS_BIRTH', 'DAYS_EMPLOYED'])

# Encode the STATUS codes 'C' and 'X' as 6 and 7, then cast the column to int.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection acts on a temporary object under pandas Copy-on-Write
# and leaves the frame unchanged, which would make the int cast fail.
data['STATUS'] = data['STATUS'].replace({'C': 6, 'X': 7}).astype(int)

In [None]:
# Cast the coded / flag columns listed below to the pandas 'category' dtype,
# which can reduce memory usage and marks them as categorical for later steps.

categorical_columns = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'STATUS',
    'FLAG_MOBIL',
    'FLAG_WORK_PHONE',
    'FLAG_PHONE',
    'FLAG_EMAIL',
]
data = data.astype({column: 'category' for column in categorical_columns})


In [None]:
# generating target variable from status

def get_label(status):
    """Map a STATUS code to the binary target label.

    Returns 0 when ``status`` equals 3, 4 or 5, and 1 for any other value.
    """
    return 0 if status in (3, 4, 5) else 1
    
# Build the LABEL column by applying get_label to every STATUS value, then
# remove STATUS itself so it cannot be used as a feature.
data['LABEL'] = data['STATUS'].apply(get_label)
data = data.drop(columns='STATUS')
data.head()

In [None]:
# Print the distinct values of every column, one column after another.

for column_name, column_values in data.items():
    print(column_name, "--->")
    print(column_values.unique(), "\n")

In [None]:
# Recode the WORKEXP value -1001 as -1.
# NOTE(review): -1001 presumably comes from a placeholder DAYS_EMPLOYED value
# rather than a real employment length -- confirm against the data dictionary.
data['WORKEXP'] = data['WORKEXP'].replace({-1001: -1})

# Data Visualization

## univariate analysis

In [None]:
plt1=data['CODE_GENDER'].value_counts()
l1 = data['CODE_GENDER'].unique()
plt.pie(plt1,autopct='%.2f',labels=l1)

In [None]:
plt2=data['FLAG_OWN_CAR'].value_counts()
l2 = data['FLAG_OWN_CAR'].unique()
plt.pie(plt2,autopct='%.2f',labels=l2)

In [None]:
plt3=data['FLAG_OWN_REALTY'].value_counts()
l3 = data['FLAG_OWN_REALTY'].unique()
plt.pie(plt3,autopct='%.2f',labels=l3)

In [None]:
sns.countplot(data['CNT_CHILDREN'])

In [None]:
sns.countplot(data['NAME_INCOME_TYPE'])

In [None]:
sns.countplot(data['NAME_EDUCATION_TYPE'])

In [None]:
sns.countplot(data['NAME_FAMILY_STATUS'])

In [None]:
sns.countplot(data['NAME_HOUSING_TYPE'])

In [None]:
sns.countplot(data['FLAG_MOBIL'])

In [None]:
sns.countplot(data['FLAG_WORK_PHONE'])

In [None]:
sns.countplot(data['FLAG_PHONE'])

In [None]:
sns.countplot(data['FLAG_EMAIL'])

In [None]:
sns.countplot(data['OCCUPATION_TYPE'])

In [None]:
# Histogram of the CNT_FAM_MEMBERS column (default binning).
plt.hist(x=data['CNT_FAM_MEMBERS'])

In [None]:
# Share of each LABEL value. Labels come from the index of the counts so every
# wedge is annotated with the value it actually represents (unique() follows
# first-appearance order, value_counts() follows frequency order).
plt4 = data['LABEL'].value_counts()
l4 = plt4.index
plt.pie(plt4, autopct='%.2f', labels=l4)

## bivariate analysis

In [None]:
# Preview the first rows of the prepared frame before the pairwise plots.
data.head()

In [None]:
plt.scatter(data['AGE'],data['WORKEXP'])
plt.xlabel('age')
plt.ylabel('WORKEXP')
plt.show()

In [None]:
plt.scatter(data['AGE'],data['AMT_INCOME_TOTAL'])
plt.xlabel('AGE')
plt.ylabel('total income')
plt.show()

In [None]:
plt.scatter(data['WORKEXP'],data['AMT_INCOME_TOTAL'])
plt.xlabel('WORKEXP')
plt.ylabel('total income')
plt.show()

## Multivariate analysis

In [None]:
# Grid of pairwise plots over the columns of the frame.
sns.pairplot(data=data)

In [None]:
# Correlation matrix of the numeric columns, shown as a heatmap.
# The numeric columns are selected explicitly because the frame still holds
# 'category' columns with string values, on which DataFrame.corr() raises in
# recent pandas versions.
com = data.select_dtypes(include='number').corr()
sns.heatmap(com)

# Descriptive Statistics

In [None]:
# Summary statistics as computed by DataFrame.describe() with default options.
data.describe()

In [None]:
# selecting numeric columns

numeric_df = data.select_dtypes(include='number')
numeric_df.head()


In [None]:
# selecting categorical columns

categorical_df = data.select_dtypes(include=['category'])
categorical_df.head()


In [None]:
# mode for categorical variables

categorical_df.mode()


# Data preprocessing

In [None]:
# (rows, columns) of the frame at this point.
data.shape

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Number of null values in each column.

data.isnull().sum()

In [None]:
# Remove OCCUPATION_TYPE, the phone / e-mail / mobile flags and the ID column;
# none of them is used as a model feature below.

data = data.drop(
    columns=['OCCUPATION_TYPE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'FLAG_MOBIL', 'ID']
)

In [None]:
# One-hot encode the remaining categorical feature columns; all other columns
# are carried over unchanged.

one_hot_columns = [
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
]
encoded_data = pd.get_dummies(data=data, columns=one_hot_columns)
encoded_data.head()

In [None]:
# Drop one dummy column per encoded feature, so the dropped level is
# represented implicitly by all remaining dummies of that feature being 0.

baseline_dummies = [
    'CODE_GENDER_M',
    'FLAG_OWN_CAR_Y',
    'FLAG_OWN_REALTY_Y',
    'NAME_INCOME_TYPE_Working',
    'NAME_EDUCATION_TYPE_Secondary / secondary special',
    'NAME_FAMILY_STATUS_Widow',
    'NAME_HOUSING_TYPE_With parents',
]
encoded_data = encoded_data.drop(columns=baseline_dummies)

In [None]:
# Feature matrix: every encoded column except LABEL. Target vector: LABEL.
y = encoded_data['LABEL']
x = encoded_data.drop(columns='LABEL')

In [None]:
# Handling imbalanced dataset

from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=0)
x_res,y_res = smote.fit_resample(x,y)
sns.countplot(y_res)

In [None]:
# splitting the data into train and test

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x_res,y_res,test_size=0.3,random_state=42)
x_train.isna().sum()

In [None]:
# Shapes of the training features and training target.
x_train.shape, y_train.shape

In [None]:
# Shapes of the test features and test target.
x_test.shape,y_test.shape

In [None]:
# scaling the data
# The scaler learns its mean and standard deviation from the training split
# only; the test split is transformed with those same statistics. Refitting
# on the test split would scale the two splits differently and leak test
# information into preprocessing.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

# Model Training and evaluation

## Decision tree

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier(criterion='entropy',random_state=0)
dtree = dtree.fit(x_train, y_train)

In [None]:
d_pred = dtree.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

print(accuracy_score(y_test,d_pred))
print(f1_score(y_test,d_pred))
print(recall_score(y_test,d_pred))
print(precision_score(y_test,d_pred))

In [None]:
confusion_matrix(y_test,d_pred)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=29,criterion='entropy',random_state=0)
rf.fit(x_train,y_train)

In [None]:
y_pred = rf.predict(x_test)

In [None]:

print(accuracy_score(y_test,y_pred))
print(f1_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))

In [None]:
confusion_matrix(y_test,y_pred)

## XGBoost classifier

In [None]:
# xgboost

import xgboost as xgb
xg= xgb.XGBClassifier(n_estimators=150,random_state=0)
xg.fit(x_train,y_train)

In [None]:
xg_pred = xg.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

print(accuracy_score(y_test,xg_pred))
print(f1_score(y_test,xg_pred))
print(recall_score(y_test,xg_pred))
print(precision_score(y_test,xg_pred))

In [None]:
confusion_matrix(y_test,xg_pred)

## CatBoost classifier

In [None]:
# catboost

from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    iterations=150, 
    learning_rate=0.1, 
)

clf.fit(x_train,y_train)

In [None]:
cat_pred = clf.predict(x_test)

In [None]:
print(accuracy_score(y_test,cat_pred))
print(f1_score(y_test,cat_pred))
print(recall_score(y_test,cat_pred))
print(precision_score(y_test,cat_pred))

In [None]:
confusion_matrix(y_test,cat_pred)

## Logistic Regression

In [None]:
# logistic regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train_scaled,y_train)

In [None]:
lr_pred = lr.predict(x_test_scaled)

In [None]:
print(accuracy_score(y_test,lr_pred))
print(f1_score(y_test,lr_pred))
print(recall_score(y_test,lr_pred))
print(precision_score(y_test,lr_pred))

In [None]:
confusion_matrix(y_test,lr_pred)

## MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier
clf2 = MLPClassifier(random_state=1, max_iter=10).fit(x_train_scaled, y_train)

In [None]:
mlp_pred = clf2.predict(x_test_scaled)

In [None]:
print(accuracy_score(y_test,mlp_pred))
print(f1_score(y_test,mlp_pred))
print(recall_score(y_test,mlp_pred))
print(precision_score(y_test,mlp_pred))

In [None]:
confusion_matrix(y_test,mlp_pred)

## AdaBoost classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada_clf = AdaBoostClassifier(n_estimators=175, random_state=0)
ada_clf.fit(x_train, y_train)

In [None]:
ada_pred = ada_clf.predict(x_test)

In [None]:
print(accuracy_score(y_test,ada_pred))
print(f1_score(y_test,ada_pred))
print(recall_score(y_test,ada_pred))
print(precision_score(y_test,ada_pred))

In [None]:
confusion_matrix(y_test,ada_pred)

## SVM

In [None]:
from sklearn.svm import SVC
linear_svc=SVC(kernel="rbf")

In [None]:
msvc=linear_svc.fit(x_train_scaled,y_train)
svc_pred = msvc.predict(x_test_scaled)

In [None]:
print(accuracy_score(y_test,svc_pred))
print(f1_score(y_test,svc_pred))
print(recall_score(y_test,svc_pred))
print(precision_score(y_test,svc_pred))

# Pickle file

In [None]:
import pickle

# Persist the fitted random forest to model.pkl. The `with` block guarantees
# the file is flushed and closed before any later cell reads it back.
with open("model.pkl", "wb") as model_file:
    pickle.dump(rf, model_file)

In [None]:
# Reload the pickled model; the `with` block closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code, so only load files created by
# this notebook or another trusted source.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Column names of the encoded frame, for reference when building inputs by hand.
encoded_data.columns

In [None]:
print(model.predict([[0,90000,2,0,49,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]))

In [None]:
print(model.predict([[0,112500,2,0,60,-1,1,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0]]))