In [None]:
import numpy as np 
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
#Modules for EDA
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

#Modules of ML
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
%matplotlib inline

In [None]:
# Load the three CSV files of the loan-prediction dataset: the training rows,
# the test rows, and the file whose 'risk_flag' column is later attached to
# the test rows.
train_df = pd.read_csv('../input/loan-prediction-based-on-customer-behavior/Training Data.csv')
test_df = pd.read_csv('../input/loan-prediction-based-on-customer-behavior/Test Data.csv')
target_test = pd.read_csv('../input/loan-prediction-based-on-customer-behavior/Sample Prediction Dataset.csv')

In [None]:
# Summarise the training frame (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Summarise the test frame (columns, non-null counts, dtypes).
test_df.info()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Remove the 'Id' column so it is not used as a feature downstream.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run (the in-place version raised KeyError the second time).
train_df = train_df.drop(columns=['Id'], errors='ignore')
train_df.shape

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Preview the first rows of the frame that holds the 'risk_flag' column.
target_test.head()

In [None]:
# Attach the 'risk_flag' column of target_test to the test rows as 'Risk_Flag'.
# Series assignment aligns on the index, so both frames must share the same row index.
# NOTE(review): these values come from 'Sample Prediction Dataset.csv' — confirm they
# are true labels and not placeholder predictions before treating them as ground truth.
test_df['Risk_Flag'] = target_test['risk_flag']
test_df.head()

In [None]:
# Remove the 'ID' column so it is not used as a feature downstream.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run (the in-place version raised KeyError the second time).
test_df = test_df.drop(columns=['ID'], errors='ignore')
test_df.shape

# **Combining Train and Test Data for Data Preprocessing <br>later will split the data at testing time**

In [None]:
# Stack the train and test rows into a single frame so that the encoding and
# scaling steps see the same columns for both.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each half
# keeps its own row labels and the combined index contains duplicates, which
# makes label-based selection and alignment ambiguous.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

# **Data Analysis**

In [None]:
# Share of each marital status in the training data, drawn as a pie chart,
# followed by the raw counts as the cell output.
marital_counts = train_df['Married/Single'].value_counts()

plt.figure(figsize=(7, 7))
marital_counts.plot.pie(autopct="%.2f%%")
plt.title("Marital Status Ratio")
plt.ylabel("")
plt.show()

marital_counts

In [None]:
def plotGraph(method, a, b, title=None, data=None, **kwargs):
    """Draw a 7x7-inch figure by calling a plotting function on two columns.

    Parameters
    ----------
    method : callable
        Plotting function that accepts a ``data=`` keyword (the notebook
        passes ``sns.histplot``).
    a, b : str
        Names of the two columns handed to ``method``.
    title : str, optional
        Figure title.
    data : pandas.DataFrame, optional
        Frame to take the columns from. Defaults to the notebook-level ``df``,
        which is what the function always used before this parameter existed.
    **kwargs
        Forwarded unchanged to ``method`` (for example ``x=``, ``hue=``).
    """
    # Resolve the frame at call time so the default follows the current `df`.
    source = df if data is None else data
    plt.figure(figsize=(7,7))
    method(data=source[[a, b]], **kwargs)
    plt.title(title)
    plt.show()

# **Histogram of Age Groups**

In [None]:
# Age distribution as a polygon-style histogram, one outline per marital status.
plotGraph(sns.histplot, 'Age', 'Married/Single', title="Histogram of Age Groups",element='poly',x='Age',hue='Married/Single')

# **Married/Single and Income**

In [None]:
# Income distribution as a polygon-style histogram, one outline per marital status.
plotGraph(sns.histplot, 'Married/Single', 'Income', title="Married/Single and Income", x='Income', hue='Married/Single',element='poly')

# **Different Professions**

In [None]:
# Horizontal bar chart with the number of rows per profession in the combined frame.
plt.figure(figsize=(20, 30))
profession_ax = sns.countplot(y=df['Profession'])
profession_ax.set_title("Frequency of each Profession.")
profession_ax.set_ylabel("Profession")
profession_ax.set_xlabel("Count")
plt.show()

In [None]:
# Group the combined frame by profession and collect the distinct profession
# names (in order of first appearance) for the per-profession statistics below.
professions = df.groupby('Profession')
professions_list = df['Profession'].unique()
professions_list

# **Average Income, Age and Experience of each profession**

In [None]:
# Accumulators for the per-profession averages; filled by the loop in the next cell.
avg_incomes = []
avg_age = []
avg_exp = []

In [None]:
# Start from empty lists on every run, so re-executing this cell does not
# append a second copy of the averages (which would make the lists longer than
# professions_list and break the DataFrame built from them).
avg_incomes, avg_age, avg_exp = [], [], []

for profession in professions_list:
    # Fetch the group once and reuse it for all three statistics.
    group = professions.get_group(profession)

    avg_incomes.append(group['Income'].mean())
    avg_age.append(group['Age'].mean())
    avg_exp.append(group['Experience'].mean())

In [None]:
# One row per profession with its average age, income and experience.
profession_summary = {
    'Profession': professions_list,
    'Average Age': avg_age,
    'Average Income': avg_incomes,
    'Average Experience': avg_exp,
}
pdf = pd.DataFrame(profession_summary)

pdf.head()

In [None]:
# (number of professions, number of summary columns)
pdf.shape

In [None]:
# One horizontal bar chart per summary statistic (every column after 'Profession').
for metric in pdf.columns[1:]:
    plt.figure(figsize=(20, 20))
    metric_ax = sns.barplot(x=pdf[metric], y=pdf['Profession'])
    metric_ax.set_title(f"{metric} of each profession.")
    plt.show()


# **States count**

In [None]:
# Rows per state as horizontal bars; the count axis is logarithmic.
state_counts = df['STATE'].value_counts()

plt.figure(figsize=(17, 7))
state_ax = state_counts.plot.barh()
state_ax.set_xscale('log')
plt.show()

# **Dropping CITY**

In [None]:
# Discard the 'CITY' column; it is not encoded or used by any later step.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run (the in-place version raised KeyError the second time).
df = df.drop(columns=['CITY'], errors='ignore')
df.head()

# **One Hot Encoding**

In [None]:
# Categorical columns to expand into indicator columns.
cols_to_encode = ['Married/Single','House_Ownership', 'Car_Ownership', 'Profession', 'STATE']

# drop_first=True removes one level per column to avoid redundant indicators.
# The dtype is pinned to uint8 because the default differs between pandas
# versions (newer releases return booleans); boolean columns mixed with the
# float features yield an object-dtype array when the frame is converted for
# model training, whereas uint8 converts cleanly to a numeric array.
dummies = pd.get_dummies(df[cols_to_encode], drop_first=True, dtype=np.uint8)
dummies.shape

In [None]:
# Preview the first rows of the indicator columns.
dummies.head()

# **Feature Scaling**

In [None]:
# The categorical originals now live in `dummies`; keep only the numeric
# columns and the target in `df`.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run (the in-place version raised KeyError the second time).
df = df.drop(columns=cols_to_encode, errors='ignore')
df.head()

In [None]:
# Rescale every remaining column except the target with a min-max scaler.
# The result is a NumPy array whose columns follow the order of
# df.drop('Risk_Flag', axis=1).
# NOTE(review): the scaler is fitted on all rows of `df`, which contains both the
# train and the test rows, so held-out rows influence the scaling parameters.
scale = MinMaxScaler()
scalled = scale.fit_transform(df.drop('Risk_Flag',axis=1))

In [None]:
# Write the scaled values back into `df`, one column at a time.
# The feature columns are selected by name (everything except the target), which
# matches the column order the scaler was given, so this no longer depends on
# 'Risk_Flag' happening to be the last column of the frame.
feature_cols = df.columns.drop('Risk_Flag')
for i, col in enumerate(feature_cols):
    df[col] = scalled[:, i]

In [None]:
# Preview the frame after scaling.
df.head()

In [None]:
# Summary statistics of the scaled columns and the target.
df.describe()

# **Combining the data**

In [None]:
# Place the scaled numeric columns (plus the target) and the indicator columns
# side by side; rows are matched on the index.
ndf = pd.concat([df,dummies], axis=1)
ndf.shape

# **Risk Flag Ratio**

In [None]:
# Share of each Risk_Flag value in the full data set.
ndf['Risk_Flag'].value_counts().plot(kind='pie', autopct="%.2f%%")
plt.show()

# **Imbalanced Data :(**

In [None]:
# Absolute number of rows per Risk_Flag value.
ndf['Risk_Flag'].value_counts()

# **Under Sampling**

In [None]:
# Undersample the Risk_Flag == 0 rows down to the number of Risk_Flag == 1 rows.
# The target size is derived from the data instead of the hard-coded 34589, so the
# classes stay balanced if the input changes, and it is capped at the number of
# available class-0 rows so sampling without replacement cannot fail.
# random_state makes the draw reproducible across runs.
class0_all = ndf[ndf['Risk_Flag'] == 0]
n_class1 = int((ndf['Risk_Flag'] == 1).sum())
class0 = class0_all.sample(n=min(n_class1, len(class0_all)), random_state=1)
class0.shape

In [None]:
# Keep every row with Risk_Flag == 1.
class1 = ndf[ndf['Risk_Flag'] == 1]
class1.shape

# **Combining class0 and class1**

In [None]:
# Stack the sampled class-0 rows on top of the class-1 rows.
ndf2 = pd.concat([class0,class1], axis=0) 
ndf2.shape

In [None]:
# Share of each Risk_Flag value after undersampling.
ndf2['Risk_Flag'].value_counts().plot(kind='pie', autopct="%.2f%%")
plt.show()

In [None]:
# Feature matrix: every column except the target. Target vector: Risk_Flag.
x = ndf2.drop(columns='Risk_Flag')
y = ndf2['Risk_Flag']
x.shape, y.shape

# **Splitting Train and Test data**

In [None]:
# Random train/test split of the balanced data using the library's default test
# size; random_state=1 fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x,y,random_state=1)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

# **Model Building**

In [None]:
# Candidate scikit-learn classifier classes (not instantiated).
# NOTE(review): nothing in the visible notebook reads this list — only the Keras
# model is trained.
models = [LogisticRegression, DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, RandomForestClassifier]

In [None]:
# Size the input layer from the training matrix instead of hard-coding 118, so the
# network still builds if the number of encoded columns changes.
n_features = X_train.shape[1]

# One hidden ReLU layer of 118 units followed by a single sigmoid output for the
# binary Risk_Flag target. Deeper variants (extra 60- and 30-unit ReLU layers)
# were tried previously and are left out.
model = keras.Sequential([
    keras.layers.Dense(118, input_shape=(n_features,), activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Binary cross-entropy matches the sigmoid output; accuracy is tracked as a metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train on the training split for 150 epochs with mini-batches of 1024 rows.
model.fit(X_train, Y_train, epochs=150,batch_size=1024)

In [None]:
# Loss and accuracy on the held-out split.
model.evaluate(X_test, Y_test)

In [None]:
def predict(model, X, threshold=0.5):
    """Turn a model's scores into hard 0/1 class labels.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method returning an array of scores.
    X : array-like
        Inputs passed straight to ``model.predict``.
    threshold : float, optional
        Scores greater than or equal to this value become 1, scores below it
        become 0. Defaults to 0.5, the cut-off used previously.

    Returns
    -------
    numpy.ndarray
        1-D array with the same dtype as the scores, containing 1 and 0.
        NaN scores match neither comparison and are left as NaN.
    """
    pred = np.asarray(model.predict(X)).flatten()
    # Compute both masks before writing anything: assigning 1 first and then
    # testing `< threshold` would flip the fresh 1s back to 0 for any threshold
    # above 1.
    positive = pred >= threshold
    negative = pred < threshold
    pred[positive] = 1
    pred[negative] = 0
    return pred

def plot_actual_vs_predicted(y_true,y_pred,title=None):
    """Draw the confusion matrix of true vs. predicted labels as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    title : str, optional
        Figure title.
    """
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(7,7))
    sns.heatmap(cm, annot=True, fmt='g')

    # confusion_matrix puts the true labels on the rows and the predicted labels
    # on the columns; the heatmap draws rows along y and columns along x, so the
    # x-axis is "Predicted" and the y-axis is "Actual" (these were swapped before).
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(title)
    plt.show()

In [None]:
# Hard predictions on the held-out split, shown as a confusion-matrix heatmap
# and as a per-class precision/recall/F1 report.
y_test_pred = predict(model, X_test)
plot_actual_vs_predicted(Y_test, y_test_pred, 'Test Data Predictions')
print(classification_report(Y_test, y_test_pred))

In [None]:
# Same evaluation on the training split, for comparison with the held-out
# results (a large gap would point to overfitting).
y_train_pred = predict(model, X_train)
plot_actual_vs_predicted(Y_train, y_train_pred, 'Train Data Predictions')
print(classification_report(Y_train, y_train_pred))