In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Load csv file
df = pd.read_csv('/kaggle/input/credit-approval-data-set/dataset/crxdata.csv',header=None)
df.head()

In [None]:
#convert column names into meaningful names
df.columns=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'ZipCode', 'Income','ApprovalStatus']

In [None]:
df.head()

In [None]:
#inspect the column and its data type
df.info()

In [None]:
#The following columns are incorrectly set as object, these need to be investigated
#Age
df['Age'].value_counts()

In [None]:
# It is observed that Age has '?' hence it is treated as object. This must be replaced with Null and then convert data type to Integer
df['Age'] = df['Age'].replace('?',np.nan)

In [None]:
df['Age'] = df['Age'].astype(float)

In [None]:
#inspect numerical column description
df.describe()

In [None]:
#Exploratory Data Analysis
df.isnull().sum()

In [None]:
#Age feature has 12 NUll values

In [None]:
# Analyze categorical variables
categorical = [var for var in df.columns if df[var].dtype == 'O']
categorical

In [None]:
print(" There are {} categorical variables \n".format(len(categorical)))
print(" The Categorical variables are :",categorical)

In [None]:
# print unique values in each categorical variables
for var in categorical:
    print(df[var].value_counts())

In [None]:
# Gender, Married, BankCustomer, EducationLevel and Ethnicity contain '?' values;
# replace them with NaN so they are treated as missing.
placeholder_columns = ['Gender', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity']
for column in placeholder_columns:
    df[column] = df[column].replace('?', np.nan)

In [None]:
# Verify the '?' replacement: print, per categorical column, how many values
# still contain '?'. A non-zero count means that column still needs cleaning.
# The loop variable is intentionally not named `str`: that would shadow the
# built-in `str` type for every later cell in the session.
for col in categorical:
    # na=False makes missing values count as "does not contain '?'"
    remaining = df[col].str.contains('?', regex=False, na=False).sum()
    print(col, remaining)

In [None]:
# Analyze the numerical values
numerical = [var for var in df.columns if df[var].dtype != 'O']

In [None]:
print(" There are {} numerical variables \n".format(len(numerical)))
print(" The numerical variables are :",numerical)

In [None]:
# print unique values in each categorical variables
for var in numerical:
    print(df[var].value_counts())

In [None]:
# Visual Analysis
# Univariate Analysis: one figure per predictor, bars split by approval outcome
predictors = df.columns.drop('ApprovalStatus')
for fig_num, column in enumerate(predictors):
    plt.figure(fig_num)
    sns.countplot(x=column, hue='ApprovalStatus', data=df)

In [None]:
#Quantitative features
#Histograms
features = numerical
df[features].hist(figsize=(10,4))

In [None]:
#Density plot
df[features].plot(
    kind="density", subplots=True, layout=(3, 4), sharex=False, figsize=(15, 10)
);

In [None]:
# distribution plot
# sns.distplot is deprecated in seaborn; histplot on a density scale with a KDE
# overlay is the documented replacement and draws the equivalent figure.
sns.histplot(df["Age"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
#Boxplot
sns.boxplot(x="YearsEmployed",data=df)

In [None]:
sns.boxplot(x="Debt",data=df)

In [None]:
#Categorical features
#Barplot/Countplot
# One panel per feature. Drawing several countplots on the same axes makes the
# bars overlap and leaves other panels empty, so each feature is paired with
# its own axes.
count_features = ["ApprovalStatus", "EducationLevel", "Ethnicity", "PriorDefault"]
fig, axes = plt.subplots(nrows=1, ncols=len(count_features), figsize=(16, 4))

for ax, feature in zip(axes, count_features):
    sns.countplot(x=feature, data=df, ax=ax)
fig.tight_layout();

In [None]:
# EducationLevel counts on a single full-width panel; a 1x2 grid with only the
# second panel used would render an empty plot next to it.
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(x="EducationLevel", data=df, ax=ax);

In [None]:
#Multivariate visualization
#Quantitative vs Quantitative
#Correlation matrix
corr_matrix = df[numerical].corr()
sns.heatmap(corr_matrix)

In [None]:
#Scatter Plot
plt.scatter(df['YearsEmployed'],df['Age'])

In [None]:
# Seaborn joinplot
sns.jointplot(x="YearsEmployed",y="Age",data=df,kind="scatter")

In [None]:
#scatterplot matrix - pairplot
%config InlineBackend.figure_format = 'png'
sns.pairplot(df[numerical]);

In [None]:
%config InlineBackend.figure_format = 'retina'

In [None]:
#Quantitative vs categorical
#lmplot
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so positional column names raise a TypeError there.
# fit_reg=False keeps this a plain scatter coloured by approval outcome.
sns.lmplot(
    data=df, x="Age", y="YearsEmployed", hue="ApprovalStatus", fit_reg=False
);

In [None]:
#Boxplot
# One panel per numerical feature, split by approval outcome. The grid is sized
# from the number of features, so indexing can never run past the grid, and any
# panel without a feature is removed instead of being left blank.
n_cols = 3
n_rows = max(1, -(-len(numerical) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 5 * n_rows), squeeze=False
)
flat_axes = axes.ravel()
for ax, feat in zip(flat_axes, numerical):
    sns.boxplot(x="ApprovalStatus", y=feat, data=df, ax=ax)
    ax.set_xlabel("")
    ax.set_ylabel(feat)
for unused_ax in flat_axes[len(numerical):]:
    fig.delaxes(unused_ax)
fig.tight_layout();

In [None]:
#categorical vs categorical
sns.countplot(x="Gender",hue="ApprovalStatus",data=df)

In [None]:
sns.countplot(x="EducationLevel",hue="ApprovalStatus",data=df)

In [None]:
#Feature Engineering
df.isnull().sum()

In [None]:
#Gender and Age features have null values, they need to be processed
df[df['Gender'].isnull() == True]

In [None]:
df.head()

In [None]:
#check the categorical features for NULL values
for var in categorical:
    print(var,df[var].isnull().sum())

In [None]:
#We have null values in Gender, Married, BankCustomer, EducationLevel, Ethnicity
#replace null values with mode value
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: the in-place form acts on an intermediate object, is deprecated in
# recent pandas and does not update df when copy-on-write is enabled.
for col in categorical:
    most_frequent = df[col].mode()[0]
    df[col] = df[col].fillna(most_frequent)

In [None]:
df.isnull().values.any()

In [None]:
for var in categorical:
    print(var,df[var].isnull().sum())

In [None]:
df.isnull().sum()

In [None]:
#Age still has null values, we can replace with mode
# Assign the result back rather than using fillna(inplace=True) on df['Age']:
# the chained in-place call is deprecated in recent pandas and leaves df
# unchanged when copy-on-write is enabled.
# NOTE(review): Age is continuous; the median may be a better fill value than
# the mode - confirm before changing, since it alters the data.
df['Age'] = df['Age'].fillna(df['Age'].mode()[0])

In [None]:
df['Age'].describe()

In [None]:
df.isnull().sum()

In [None]:
#All null values are replaced. 
df.head()

In [None]:
#convert Target variable to numeric
df['ApprovalStatus'] = np.where(df.ApprovalStatus == '+',1,0)

In [None]:
df.tail()

In [None]:
df_get_dummies = pd.get_dummies(df)
df_get_dummies.head()

In [None]:
X = df_get_dummies.drop('ApprovalStatus',axis = 1)

In [None]:
y = df_get_dummies['ApprovalStatus']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state= 42)

In [None]:
#Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import (DecisionTreeClassifier,ExtraTreeClassifier)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, 
GradientBoostingClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10,shuffle=True,random_state=0)

In [None]:
clf = [KNeighborsClassifier(n_neighbors=13),DecisionTreeClassifier(),
      RandomForestClassifier(n_estimators=13),GaussianNB(),SVC(),
      ExtraTreeClassifier(),
      GradientBoostingClassifier(n_estimators=10,learning_rate=1,max_features=3,max_depth=3,random_state=10),
                                 AdaBoostClassifier(),ExtraTreesClassifier()]
def model_fit():
    scoring = 'accuracy'
    for i in range(len(clf)):
        score = cross_val_score(clf[i],X_train,y_train,cv=k_fold,n_jobs=1,scoring=scoring)
        print("Score of Model",i,":",round(np.mean(score)*100,2))
model_fit()

In [None]:
# Random Forest is picked as the best model in this case
clf1 = RandomForestClassifier()
clf1.fit(X_train,y_train)
prediction = clf1.predict(X_test)

In [None]:
X_test['ApprovalStatus'] = prediction
X_test.head()

In [None]:
#Confusion Matrix
from sklearn.metrics import confusion_matrix
# Rows are actual classes, columns are predicted classes. labels=[0, 1] pins
# the order, so with 1 (approved) as the positive class the matrix reads
# [[TN, FP],
#  [FN, TP]].
cm = confusion_matrix(y_test, prediction, labels=[0, 1])
print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[1,1])

print('\nTrue Negatives(TN) = ', cm[0,0])

print('\nFalse Positives(FP) = ', cm[0,1])

print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and f1-score on the held-out test set
report = classification_report(y_test, prediction)
print(report)

In [None]:
# scikit-learn orders a binary confusion matrix as [[TN, FP], [FN, TP]]
# (rows = actual, columns = predicted, classes sorted ascending), so with 1 as
# the positive class the true positives are cm[1,1] and the true negatives cm[0,0].
TN = cm[0,0]
FP = cm[0,1]
FN = cm[1,0]
TP = cm[1,1]

In [None]:
# print classification accuracy

classification_accuracy = (TP + TN) / float(TP + TN + FP + FN)

print('Classification accuracy : {0:0.4f}'.format(classification_accuracy))

In [None]:
# print classification error

classification_error = (FP + FN) / float(TP + TN + FP + FN)

print('Classification error : {0:0.4f}'.format(classification_error))

In [None]:
# print precision score

precision = TP / float(TP + FP)


print('Precision : {0:0.4f}'.format(precision))

In [None]:
recall = TP / float(TP + FN)

print('Recall or Sensitivity : {0:0.4f}'.format(recall))

In [None]:
true_positive_rate = TP / float(TP + FN)


print('True Positive Rate : {0:0.4f}'.format(true_positive_rate))

In [None]:
false_positive_rate = FP / float(FP + TN)


print('False Positive Rate : {0:0.4f}'.format(false_positive_rate))

In [None]:
specificity = TN / (TN + FP)

print('Specificity : {0:0.4f}'.format(specificity))

In [None]:
# plot ROC Curve

from sklearn.metrics import roc_curve

# A ROC curve needs a continuous score: hard 0/1 predictions give a single
# operating point instead of a curve. Use the predicted probability of the
# positive class (column 1 of predict_proba). Selecting X_train's columns
# passes the model exactly the features it was fitted on, even if extra
# columns were added to X_test afterwards.
y_score = clf1.predict_proba(X_test[X_train.columns])[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)

# Set the font size before drawing so it applies to the whole figure
plt.rcParams['font.size'] = 12

plt.figure(figsize=(6,4))

plt.plot(fpr, tpr, linewidth=2)

# Diagonal = performance of a random classifier
plt.plot([0,1], [0,1], 'k--' )

plt.title('ROC curve for Credit Approval classifier')

plt.xlabel('False Positive Rate (1 - Specificity)')

plt.ylabel('True Positive Rate (Sensitivity)')

plt.show()


In [None]:
# compute ROC AUC

from sklearn.metrics import roc_auc_score

# AUC is defined over ranking scores, so feed it the predicted probability of
# the positive class rather than hard 0/1 labels. Selecting X_train's columns
# passes the model exactly the features it was fitted on.
y_score = clf1.predict_proba(X_test[X_train.columns])[:, 1]
ROC_AUC = roc_auc_score(y_test, y_score)

print('ROC AUC : {:.4f}'.format(ROC_AUC))