In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
credit_card=pd.read_csv("/kaggle/input/credit-card/Credit_card.csv")
credit_card.head()

In [None]:
credit_card.shape

In [None]:
credit_card_label=pd.read_csv("/kaggle/input/credit-card/Credit_card_label.csv")
credit_card_label.head()

In [None]:
credit_card_label.shape

* Label 0 : Approved
* Label 1 : Rejected

#### We have two different datasets: one contains the independent variables and the other contains the dependent variable, so we merge the two datasets on Ind_ID

In [None]:
# Join features and labels on the applicant identifier explicitly, instead of
# relying on whichever column names the two files happen to share.
credit_data_raw=credit_card.merge(credit_card_label, on='Ind_ID', how='inner')

credit_data_raw.head()

In [None]:
credit_data=credit_data_raw.copy()

In [None]:
credit_data.sample(3)

In [None]:
credit_data.shape

* The dataset contains 1548 observations with 19 features

In [None]:
credit_data.nunique()

In [None]:
credit_data.columns

#### Renaming the features

In [None]:
credit_data.rename(columns={'GENDER':'Gender','Propert_Owner':'Property_owner','CHILDREN':'Children','Type_Income':'Income_type',
                           'EDUCATION':'Education','EMAIL_ID':'Email_ID','Type_Occupation':'Occupation_type'}, inplace=True)

In [None]:
credit_data.info()

In [None]:
# Checking if there are any duplicates

credit_data.duplicated().sum()

* There are no duplicates in the dataset

In [None]:
credit_data.isna().sum()

* There are missing values in the below features:
    * Gender
    * Annual_income
    * Occupation_type
    * Birthday_count

In [None]:
credit_data.head(5)

* Birthday_count: Use backward count from current day (0), -1 means yesterday.

* Employed_days: Start date of employment. Use backward count from current day (0). Positive value means, individual is currently unemployed.

So changing Birthday_count feature to Age and Employed_days to Experience

In [None]:
# Checking if there are any observations of Birthday_count >0 

credit_data[credit_data['Birthday_count']>0]

* There are no observations with positive birthday count

In [None]:
# Creating the Age column by dividing the birthday_count with 365, because birthday_count is given as no of days 
# and we are converting this into age in years by dividing it with 365

credit_data['Age']=np.abs((credit_data['Birthday_count'])/365)

In [None]:
# Checking if there are any employed_days > 0

credit_data[credit_data['Employed_days']>0].shape

* There are 261 observations with a positive Employed_days value (so these applicants are not currently working)

In [None]:
# Similar to age, we get experience in years by dividing employed_days with 365


credit_data['Experience']=np.abs(credit_data['Employed_days'])/365

### Removing the below features:
    > Ind_ID
    > Birthday_count : because using this feature, we have created age 
    > Employed_days  : using this feature, we have created experience

In [None]:
credit_data.drop(columns=['Ind_ID','Birthday_count','Employed_days'], inplace=True)

In [None]:
credit_data[credit_data['Experience']>=1000]

In [None]:
credit_data.describe()

* If we observe the above data:
    * age varies from 21 years to 68 years
    * but experience varies from 0.2 years to 1000 years (experience with 1000 years is not possible)

* Dropping the observations which have experience greater than or equal to 1000 years

In [None]:
credit_data=credit_data[credit_data['Experience']<1000]

In [None]:
credit_data.shape

In [None]:
credit_data.info()

In [None]:
credit_data.describe()

* Now the experience varies from 0.2 years to 40 years

### EDA

In [None]:
credit_data.sample(3)

In [None]:
sns.set_style(style='whitegrid')

In [None]:
cols=['Gender','Income_type','Education','Marital_status','Housing_type','Occupation_type']
fig,ax = plt.subplots(3,2, figsize=(19,24))
fig.tight_layout(pad=7)

# One count plot per categorical feature, filling the 3x2 grid row by row.
for col, axis in zip(cols, ax.flat):
    sns.countplot(data=credit_data, x=col, ax=axis)
    # Rotate the labels through this panel's own axis: plt.xticks() acts on the
    # current pyplot axes, not on the panel that was just drawn.
    axis.tick_params(axis='x', rotation=74)
    axis.set_title(f"Distribution of {col}")

* Female applicants outnumber male applicants
* Most applicants have the income_type "Working"
* The majority of applicants completed Secondary / secondary special education
* Most applicants are married
* The majority of applicants live in a House / apartment
* The most common occupation is Laborers

In [None]:
cols=['Car_Owner', 'Property_owner', 'Children', 'Mobile_phone', 'Work_Phone', 'Phone', 'Email_ID','Family_Members']

fig, ax = plt.subplots(2,4, figsize=(18,12))
fig.tight_layout(pad=5)
# Walk the 2x4 grid in row-major order, pairing each panel with one feature.
for panel, feature in zip(ax.ravel(), cols):
    sns.countplot(data=credit_data, x=feature, ax=panel)
    panel.set_title(f"Distribution of {feature}")
            

* Everyone has a mobile phone (Either a personal phone or work_phone or even both)
* Most applicants do not own a car
* Most applicants own a property
* Majority of applicants do not have Email_ID
* Most applicants do not have children and also majority of family members in the households are 2


In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_data, x='Annual_income',kde=True)
plt.axvline(x=credit_data['Annual_income'].mean(),color="red",label="Mean")
plt.axvline(x=credit_data['Annual_income'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_data, x='Annual_income')

plt.subplots_adjust(right=2.0)
plt.show()


In [None]:
print("Mean : ", credit_data['Annual_income'].mean())
print("Median : ", credit_data['Annual_income'].median())

* The Annual_income feature has some outliers and it is right skewed

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_data, x='Age',kde=True)
plt.axvline(x=credit_data['Age'].mean(),color="red",label="Mean")
plt.axvline(x=credit_data['Age'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_data, x='Age')

plt.subplots_adjust(right=2.0)

plt.show()


In [None]:
print("Mean : ", credit_data['Age'].mean())
print("Median : ", credit_data['Age'].median())

* Age feature has no outliers

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_data, x='Experience',kde=True)
plt.axvline(x=credit_data['Experience'].mean(),color="red",label="Mean")
plt.axvline(x=credit_data['Experience'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_data, x='Experience')

plt.subplots_adjust(right=2.0)

plt.show()


In [None]:
print("Mean : ", credit_data['Experience'].mean())
print("Median : ", credit_data['Experience'].median())

* The feature Experience is right skewed and has some outliers

In [None]:
sns.countplot(data=credit_data, x='label')
plt.title("Distribution of Label(target variable)")
plt.show()

* We have an imbalanced dataset as there is a huge difference between approved (0) and rejected (1) applications

In [None]:
credit_data.sample(3)

In [None]:
cat_fea=['Gender','Car_Owner','Property_owner','Income_type','Education','Marital_status','Housing_type','Occupation_type']

fig, ax = plt.subplots(4,2,figsize=(17,25))
fig.tight_layout(pad=5)
# One box plot of Age per categorical feature, filling the 4x2 grid row by row.
for feature, axis in zip(cat_fea, ax.flat):
    sns.boxplot(data=credit_data, x=feature, y='Age', ax=axis)
    axis.set_title(feature +' '+ "vs Age")
    # Rotate the labels through this panel's own axis: plt.xticks() acts on the
    # current pyplot axes, not on the panel that was just drawn.
    axis.tick_params(axis='x', rotation=74)

* Female applicants tend to be older than male applicants
* Younger applicants are more likely to own a car
* Applicants who own property tend to be older than those who do not
* Pensioners are older than applicants with other income types
* Widowed applicants are older than applicants with other marital statuses
* Security staff are older than applicants in other occupations


In [None]:
cat_fea=['Gender','Car_Owner','Property_owner','Income_type','Education','Marital_status','Housing_type','Occupation_type']

fig, ax = plt.subplots(4,2,figsize=(17,25))
fig.tight_layout(pad=5)
# One box plot of Annual_income per categorical feature, filling the 4x2 grid row by row.
for feature, axis in zip(cat_fea, ax.flat):
    sns.boxplot(data=credit_data, x=feature, y='Annual_income', ax=axis)
    axis.set_title(feature +' '+ "vs Annual_income")
    # Rotate the labels through this panel's own axis: plt.xticks() acts on the
    # current pyplot axes, not on the panel that was just drawn.
    axis.tick_params(axis='x', rotation=74)
            
    

* Although there are more female applicants, male applicants have a higher income than female applicants
* Applicants with a higher income tend to own a car
* Applicants who own property and those who do not have almost the same average income
* Applicants with an academic degree have a high annual income
* Annual income is similar across the different marital statuses
* Applicants with a higher annual income tend to live in rented apartments
* Managers and drivers earn a higher annual income


In [None]:
cat_fea=['Gender','Car_Owner','Property_owner','Income_type','Education','Marital_status','Housing_type','Occupation_type']

fig, ax = plt.subplots(4,2,figsize=(17,25))
fig.tight_layout(pad=5)
# One box plot of Experience per categorical feature, filling the 4x2 grid row by row.
for feature, axis in zip(cat_fea, ax.flat):
    sns.boxplot(data=credit_data, x=feature, y='Experience', ax=axis)
    axis.set_title(feature +' '+ "vs Experience")
    # Rotate the labels through this panel's own axis: plt.xticks() acts on the
    # current pyplot axes, not on the panel that was just drawn.
    axis.tick_params(axis='x', rotation=74)
            
    

* The experience of male and female is almost similar
* People who are working as state servants have more experience
* Applicants who completed their academic degree have more experience
* Medicine staff has more experience


In [None]:
features=['Annual_income','Children','Family_Members','Age', 'Experience']

credit_data[features].corr()

In [None]:
sns.pairplot(data=credit_data[features])
plt.show()

In [None]:
sns.heatmap(credit_data[features].corr(), annot=True)
plt.show()

* Children and Family_Members are linearly correlated: more children means more members in the family, so one of these features should be dropped
* Age and Experience also show some correlation

### Handling missing values

In [None]:
credit_data.isna().sum()

* Gender, Annual_income, age and occupation_type have missing values

In [None]:
# Percentage of missing values

credit_data.isna().mean()*100

In [None]:
import missingno as msno

msno.bar(credit_data)
plt.show()

In [None]:
credit_missing=credit_data.copy()

#### Handling missing values of Gender

In [None]:
gender_missing=credit_missing[credit_missing['Gender'].isna()==True]
gender_missing

In [None]:
# Gender_wise mean age before treating missing values of gender

age_by_gender=credit_missing.groupby("Gender")['Age'].mean()
age_by_gender

From the EDA we found that female applicants are older than male applicants on average, so we impute the missing Gender values by comparing each applicant's age with the mean age of female applicants.

In [None]:
# If age is greater than mean age of female then imputing that missing value of that gender as F and similarly for male also


# Vectorised form of the rule: rows older than the female mean age get 'F';
# every other row (including rows whose Age is missing) gets 'M'.
# assign() builds a new frame, so nothing is written into a filtered slice.
is_above_female_mean = gender_missing['Age'] > age_by_gender['F']
gender_missing = gender_missing.assign(Gender=np.where(is_above_female_mean, 'F', 'M'))

In [None]:
gender_missing

In [None]:
credit_missing[credit_missing["Gender"].isna()==True]=gender_missing

In [None]:
# After imputing missing values, gender-wise mean age

credit_missing.groupby("Gender")['Age'].mean()

In [None]:
credit_missing['Gender'].isna().sum()

#### Handling missing values of Annual_income

In [None]:
credit_missing['Annual_income'].isna().sum()

In [None]:
annual_income_missing=credit_missing[credit_missing['Annual_income'].isna()==True]
annual_income_missing

In [None]:
mean_inc_byincome_type=credit_missing.groupby(['Income_type'])['Annual_income'].mean()
mean_inc_byincome_type

* Imputing the missing values of Annual_income based on the mean of Income_type

In [None]:
# Look up each row's group mean by its Income_type in one vectorised step.
# Rows whose Income_type has no entry in the lookup stay NaN, exactly as the
# nested loop left them.
annual_income_missing = annual_income_missing.assign(
    Annual_income=annual_income_missing['Income_type'].map(mean_inc_byincome_type)
)

In [None]:
annual_income_missing

In [None]:
credit_missing[credit_missing['Annual_income'].isna()==True]=annual_income_missing

In [None]:
credit_missing.groupby(["Income_type"])['Annual_income'].mean()

In [None]:
credit_missing['Annual_income'].isna().sum()

#### Handling missing values of Age

In [None]:
age_missing=credit_missing[credit_missing['Age'].isna()==True] 
age_missing

In [None]:
meanage_income_type=credit_missing.groupby("Income_type")['Age'].mean()
meanage_income_type

In [None]:
# Fill each missing Age with the mean Age of the row's Income_type group.
# Rows whose Income_type has no entry in the lookup stay NaN, exactly as the
# nested loop left them.
age_missing = age_missing.assign(
    Age=age_missing['Income_type'].map(meanage_income_type)
)

In [None]:
age_missing

In [None]:
credit_missing[credit_missing['Age'].isna()==True]=age_missing

In [None]:
credit_missing.groupby(['Income_type'])["Age"].mean()

In [None]:
credit_missing['Age'].isna().sum()

#### Handling missing values of Occupation_type

In [None]:
credit_missing['Occupation_type'].isna().sum()

In [None]:
credit_missing['Occupation_type'].unique()

In [None]:
ocuupation_type_missing=credit_missing[credit_missing['Occupation_type'].isna()==True]

ocuupation_type_missing.head(10)

In [None]:
contigency_table=pd.crosstab(credit_missing['Income_type'],credit_missing['Occupation_type'])

In [None]:
contigency_table

In [None]:
contigency_table.sum(axis=1)

In [None]:
cond_prob=contigency_table.div(contigency_table.sum(axis=1), axis=0)

In [None]:
cond_prob

* If the applicant has income_type as Commercial associate, then the probability of that person being Accountants is 0.059028

In [None]:
# Seeded generator so the sampled occupations (and every downstream result)
# are the same on each Restart & Run All.
rng = np.random.default_rng(42)

# Work on an explicit copy so the writes below never target a filtered view.
ocuupation_type_missing = ocuupation_type_missing.copy()

# Draw an occupation for each row from P(Occupation_type | Income_type).
# Rows whose Income_type has no row in cond_prob are left missing.
for row_label, income_type in ocuupation_type_missing['Income_type'].items():
    if income_type in cond_prob.index:
        ocuupation_type_missing.loc[row_label,'Occupation_type']=rng.choice(
            cond_prob.columns, p=cond_prob.loc[income_type].to_numpy()
        )
            

In [None]:
ocuupation_type_missing[ocuupation_type_missing['Income_type']=='Pensioner']

In [None]:
credit_missing[credit_missing['Occupation_type'].isna()==True]=ocuupation_type_missing

In [None]:
credit_missing['Occupation_type'].isna().sum()

In [None]:
credit_missing.isna().sum()

### Handling Outliers

In [None]:
credit_outliers=credit_missing.copy()

In [None]:
credit_outliers.shape

In [None]:
credit_outliers.columns

#### Handling outliers Annual_income

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_outliers, x='Annual_income',kde=True)
plt.axvline(x=credit_outliers['Annual_income'].mean(),color="red",label="Mean")
plt.axvline(x=credit_outliers['Annual_income'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_outliers, x='Annual_income')

plt.subplots_adjust(right=2.0)
plt.show()


In [None]:
Q1=np.percentile(credit_outliers['Annual_income'],25)
Q3=np.percentile(credit_outliers['Annual_income'],75)

IQR=Q3-Q1

lower_limit=Q1-1.5*IQR
upper_limit=Q3+1.5*IQR

In [None]:
upper_limit

In [None]:
credit_outliers=credit_outliers[credit_outliers['Annual_income']<upper_limit]

#### After handling outliers of Annual_income

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_outliers, x='Annual_income',kde=True)
plt.axvline(x=credit_outliers['Annual_income'].mean(),color="red",label="Mean")
plt.axvline(x=credit_outliers['Annual_income'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_outliers, x='Annual_income')

plt.subplots_adjust(right=2.0)
plt.show()


#### Handling outliers Experience

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_outliers, x='Experience',kde=True)
plt.axvline(x=credit_outliers['Experience'].mean(),color="red",label="Mean")
plt.axvline(x=credit_outliers['Experience'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_outliers, x='Experience')

plt.subplots_adjust(right=2.0)

plt.show()


In [None]:
# `method=` replaces the deprecated `interpolation=` keyword of np.percentile;
# 'midpoint' keeps the same quartile definition as before.
Q1=np.percentile(credit_outliers['Experience'],25, method='midpoint')
Q3=np.percentile(credit_outliers['Experience'],75, method='midpoint')

IQR=Q3-Q1

# Tukey fences: values beyond 1.5 * IQR from the quartiles are treated as outliers.
lower_limit=Q1-1.5*IQR
upper_limit=Q3+1.5*IQR

In [None]:
upper_limit

In [None]:
credit_outliers=credit_outliers[credit_outliers['Experience']<upper_limit]

In [None]:
credit_outliers.shape

##### After handling outliers of experience

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_outliers, x='Experience',kde=True)
plt.axvline(x=credit_outliers['Experience'].mean(),color="red",label="Mean")
plt.axvline(x=credit_outliers['Experience'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_outliers, x='Experience')

plt.subplots_adjust(right=2.0)

plt.show()


In [None]:
credit_outliers1=credit_outliers.copy()

In [None]:
credit_outliers1['Experience']=np.sqrt(credit_outliers1['Experience'])

In [None]:
plt.subplot(1,2,1)
sns.histplot(data=credit_outliers1, x='Experience',kde=True)
plt.axvline(x=credit_outliers1['Experience'].mean(),color="red",label="Mean")
plt.axvline(x=credit_outliers1['Experience'].median(),color="blue",label="Median", ls="--")
plt.legend()

plt.subplot(1,2,2)
sns.boxplot(data=credit_outliers1, x='Experience')

plt.subplots_adjust(right=2.0)

plt.show()


#### Handling outliers of Family_Members

In [None]:
sns.boxplot(data=credit_outliers1, x='Family_Members')

plt.show()


In [None]:
credit_outliers1=credit_outliers1[credit_outliers1['Family_Members']<14]

In [None]:
sns.boxplot(data=credit_outliers1, x='Family_Members')

plt.show()

In [None]:
credit_outliers1.columns

* The columns below are dropped from the dataset:
    * Children ---> because Children and Family_Members are correlated with each other, so one of them is dropped
    * Mobile_phone ---> because it shows no variation: the entire column holds a single value

In [None]:
# Build on credit_outliers1 (not credit_outliers) so the sqrt-transformed
# Experience and the Family_Members outlier filter are carried into the model data.
credit_clean=credit_outliers1.drop(columns=['Children','Mobile_phone'])

### Feature Encoding

In [None]:
credit_encoding=credit_clean.copy()

In [None]:
credit_encoding.info()

In [None]:
credit_encoding['Gender']=credit_encoding['Gender'].map({'M':1,'F':0})
credit_encoding['Car_Owner']=credit_encoding['Car_Owner'].map({'Y':1,'N':0})
credit_encoding['Property_owner']=credit_encoding['Property_owner'].map({'Y':1,'N':0})
    

#### Performing Ordinal Encoding for Education

In [None]:
credit_encoding['Education'].unique()

The order of Education is:
* lower secondary
* secondary special
* incomplete higher
* higher
* academic

In [None]:
from sklearn.preprocessing import OrdinalEncoder
columns_order=['Lower secondary','Secondary / secondary special','Incomplete higher','Higher education', 'Academic degree']
encoder_oe=OrdinalEncoder(categories=[columns_order])
credit_encoding['Education']=encoder_oe.fit_transform(credit_encoding[['Education']])

In [None]:
credit_encoding.sample(3)

Performing One hot Encoder for the below features:
* Income_type
* Marital_status
* Housing_type
* Occupation_type

In [None]:
credit_encoding_ohe=credit_encoding[['Income_type','Marital_status','Housing_type','Occupation_type']]

In [None]:
credit_encoding['Occupation_type'].unique()

In [None]:
credit_encoding=pd.get_dummies(credit_encoding, columns=['Income_type','Marital_status','Housing_type','Occupation_type'],
                              drop_first=True)

In [None]:
pd.set_option('display.max_columns', 100)
credit_encoding.sample(5)

In [None]:
credit_encoding.shape

### Treating imbalanced dataset

In [None]:
sns.countplot(data=credit_encoding, x='label')
plt.title("Distribution of Label(target variable)")
plt.show()

In [None]:
credit_encoding['label'].value_counts()

In [None]:
X=credit_encoding.drop(columns=['label'])
y=credit_encoding['label']


In [None]:
from imblearn.over_sampling import SMOTE

oversample=SMOTE()
X, y = oversample.fit_resample(X, y)

* After performing SMOTE

In [None]:
y.value_counts()

#### Splitting the dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.20, random_state=10)

In [None]:
X_train.shape

In [None]:
X_test.shape

#### Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression()
log_model.fit(X_train_scaled, y_train)

In [None]:
y_train_predict=log_model.predict(X_train_scaled)
y_train_predict

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,ConfusionMatrixDisplay


fig, ax=plt.subplots(figsize=(3,5))
conf_matrix_log=confusion_matrix(y_train, y_train_predict)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict,cmap="Blues", ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_train, y_train_predict))

In [None]:
from sklearn.metrics import accuracy_score

accu_score_train_log=accuracy_score(y_train, y_train_predict)
accu_score_train_log

In [None]:
y_test_predict=log_model.predict(X_test_scaled)
y_test_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
conf_matrix_log=confusion_matrix(y_test, y_test_predict)
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict,cmap="Blues", ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_test, y_test_predict))

In [None]:
accu_score_test_log=accuracy_score(y_test, y_test_predict)
accu_score_test_log

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_model=KNeighborsClassifier(n_neighbors=5,metric='minkowski', p=2)
knn_model.fit(X_train_scaled, y_train)

In [None]:
y_train_predict=knn_model.predict(X_train_scaled)
y_train_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
knn_conf_matrix=confusion_matrix(y_train, y_train_predict)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_train, y_train_predict))

In [None]:
accu_score_train_knn=accuracy_score(y_train, y_train_predict)
accu_score_train_knn

In [None]:
y_test_predict=knn_model.predict(X_test_scaled)
y_test_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
knn_conf_matrix=confusion_matrix(y_test, y_test_predict)
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_test, y_test_predict))

In [None]:
accu_score_test_knn=accuracy_score(y_test, y_test_predict)
accu_score_test_knn

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt_model=DecisionTreeClassifier(criterion='entropy',max_depth=5)
dt_model.fit(X_train_scaled, y_train)

In [None]:
y_train_predict=dt_model.predict(X_train_scaled)
y_train_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
dt_conf_matrix=confusion_matrix(y_train, y_train_predict)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_train, y_train_predict))

In [None]:
accu_score_train_dt=accuracy_score(y_train, y_train_predict)
accu_score_train_dt

In [None]:
y_test_predict=dt_model.predict(X_test_scaled)
y_test_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
dt_conf_matrix=confusion_matrix(y_test, y_test_predict)
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_test, y_test_predict))

In [None]:
accu_score_test_dt=accuracy_score(y_test, y_test_predict)
accu_score_test_dt

### SVM

In [None]:
from sklearn import svm

svm_model=svm.SVC(kernel='rbf')
svm_model.fit(X_train_scaled, y_train)

In [None]:
y_train_predict=svm_model.predict(X_train_scaled)
y_train_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
svm_conf_matrix=confusion_matrix(y_train, y_train_predict)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_train, y_train_predict))

In [None]:
accu_score_train_svm=accuracy_score(y_train, y_train_predict)
accu_score_train_svm

In [None]:
y_test_predict=svm_model.predict(X_test_scaled)
y_test_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
svm_conf_matrix=confusion_matrix(y_test, y_test_predict)
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_test, y_test_predict))

In [None]:
accu_score_test_svm=accuracy_score(y_test, y_test_predict)
accu_score_test_svm

In [None]:
models=['Logistic_Regression', 'KNN', 'Decision_Tree', 'SVM']

# Train/test accuracy per model, in the same order as `models`.
data={'train':[accu_score_train_log,accu_score_train_knn,accu_score_train_dt,accu_score_train_svm],
'test':[accu_score_test_log,accu_score_test_knn,accu_score_test_dt,accu_score_test_svm]}

# Pass the list itself as the index: wrapping it as [models] builds a one-level
# MultiIndex instead of a plain index of model names.
accuracy_scores=pd.DataFrame(data, index=models)
accuracy_scores

* Based on accuracy, choosing the SVM model for prediction.

#### Hyperparameter tuning for SVM

In [None]:
from sklearn.model_selection import GridSearchCV
grid_parameters={'C':[0.1,1,10,100],'gamma':[1,0.1,0.01,0.001],'kernel':['rbf']}
grid=GridSearchCV(svm.SVC(),grid_parameters)
grid.fit(X_train_scaled, y_train)
print(grid.best_params_)

##### For these parameters again fitting the model

In [None]:
# Use the parameters the grid search actually selected rather than hard-coded
# values, so this model always matches the search result for the current run.
svm_model_hyp_para=svm.SVC(**grid.best_params_)
svm_model_hyp_para.fit(X_train_scaled, y_train)

In [None]:
y_train_predict=svm_model_hyp_para.predict(X_train_scaled)
y_train_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
svm_conf_matrix=confusion_matrix(y_train, y_train_predict)
ConfusionMatrixDisplay.from_predictions(y_train, y_train_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_train, y_train_predict))

#### Testing the model

In [None]:
y_test_predict=svm_model_hyp_para.predict(X_test_scaled)
y_test_predict

In [None]:
fig, ax=plt.subplots(figsize=(3,5))
svm_conf_matrix=confusion_matrix(y_test, y_test_predict)
ConfusionMatrixDisplay.from_predictions(y_test, y_test_predict,cmap="Blues",ax=ax, colorbar=False)
plt.grid(visible=None)
plt.show()

In [None]:
print(classification_report(y_test, y_test_predict))