**Importing Libraries and Loading the Dataset**

In [1]:
#Importing all the necesary libraries 

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')


In [2]:
#loading the heart disease dataset
heart_df = pd.read_csv('heart.csv')
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [3]:
#Displaying Random 5 samples from the dataset
heart_df.sample(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
907,44,M,ASY,120,169,0,Normal,144,Y,2.8,Down,1
551,62,M,NAP,120,220,0,LVH,86,N,0.0,Up,0
479,56,M,NAP,170,0,0,LVH,123,Y,2.5,Flat,1
768,64,F,ASY,130,303,0,Normal,122,N,2.0,Flat,0
911,59,M,ASY,164,176,1,LVH,90,N,1.0,Flat,1


In [4]:
#Getting dataset information summary
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
#getting the statistical measures of the dataset
heart_df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [6]:
#getting the statistical measures of the dataset including categorical columns
heart_df.describe(include='all')

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


__Data Preprocessing__

In [7]:
#checking for missing values
heart_df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [8]:
#checking for the duplicate values
heart_df.duplicated().sum()

np.int64(0)

In [9]:
#checking number of unique values in each feature
heart_df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [10]:

#we will now separate the numerical and categorical features
cat_col =heart_df.select_dtypes(include=['object']).columns

In [11]:
heart_df.select_dtypes(include=['object']).columns

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

In [12]:
heart_df['ChestPainType'].value_counts()

ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: int64

In [13]:
heart_df['ChestPainType'].unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [14]:
#so we will replace the values with numerical values like ATA = 0 , NAP = 1 , ASY = 2 , TA = 3
range(heart_df['ChestPainType'].nunique())
# we are getting the range of unique values in the chestpaintype column

# after  getting the range we will replace the values with numerical values aslo 
# we will use inplace = True to make the changes in the original dataset 

range(0, 4)

**Converting Categorical Variables to Numeric**


Sex : M = 0 , F =1

ChestPainType : ATA = 0 , NAP = 1 , ASY = 2 , TA = 3

RestingECG : Normal = 0,ST = 1,LVH = 2

ExcerciseAngina : N = 0 ,Y = 1

ST_Slope : Up = 0 ,Flat = 1,Down = 2

In [15]:
#Replacing categorical columns with numerical values
for col in cat_col:
    #we will print the column name first
    print(f' {col}' )
    #we will print the unique values and the range of unique values
    print((heart_df[col].unique()) , list(range(heart_df[col].nunique())))
    #now we will replace the values with numerical values in the original dataset
    heart_df[col].replace(heart_df[col].unique() , range(heart_df[col].nunique()) , inplace = True)
    print('*' *90)
    print()

 Sex
['M' 'F'] [0, 1]
******************************************************************************************

 ChestPainType
['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]
******************************************************************************************

 RestingECG
['Normal' 'ST' 'LVH'] [0, 1, 2]
******************************************************************************************

 ExerciseAngina
['N' 'Y'] [0, 1]
******************************************************************************************

 ST_Slope
['Up' 'Flat' 'Down'] [0, 1, 2]
******************************************************************************************



In [16]:
#Final dataset after replacing categorical values with numerical values

heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0,3,110,264,0,0,132,0,1.2,1,1
914,68,0,2,144,193,1,0,141,0,3.4,1,1
915,57,0,2,130,131,0,0,115,1,1.2,1,1
916,57,1,0,130,236,0,2,174,0,0.0,1,1


In [17]:
#here i will check the value counts of the Cholesterol column
heart_df['Cholesterol'].value_counts()

Cholesterol
0      172
254     11
220     10
223     10
204      9
      ... 
353      1
278      1
157      1
176      1
131      1
Name: count, Length: 222, dtype: int64

In [18]:
#after checking the value counts of the cholesterol column
# i found that there are some 0 values which is not possible in real life scenario


__cholesterol cannot be 0 , it states that reading was not properly noted .__


__Imputing 0 values in cholesterol column with KNN Imputer__

In [19]:
# I will replace those  0 values with NAN values by using numpy library  in the original dataset
heart_df['Cholesterol'].replace(0 , np.nan , inplace = True)

In [20]:
# now i will use KNN imputer to fill the missing values in the Cholesterol column

#Importing KNN imputer from sklearn library

from sklearn.impute import KNNImputer

#Creating KNN imputer object  

#here i am using 3 nearest neighbors to impute the missing values

imputer = KNNImputer(n_neighbors=3)

#Fitting the imputer object to the dataset and transforming the dataset

after_impute=imputer.fit_transform(heart_df)

#now i will convert the after_impute array back to dataframe
heart_df = pd.DataFrame(after_impute , columns= heart_df.columns)



In [21]:
#now i will check for missing values again in the Cholesterol column
heart_df['Cholesterol'].isna().sum()

np.int64(0)

In [22]:
# to confirm that there are no 0 values in the Cholesterol column
count  = 0 
for i in heart_df['Cholesterol']:
    if i == 0:
        count += 1
print(count)

0


**Doing the same for Resting Blood Pressure**

In [23]:
heart_df['RestingBP'][heart_df['RestingBP']  == 0]


449    0.0
Name: RestingBP, dtype: float64

In [24]:
from sklearn.impute import KNNImputer

#Replacing 0 values in RestingBP column with NaN values
heart_df['RestingBP'].replace(0 , np.nan , inplace = True)

#Creating KNN imputer object for RestingBP column
imputer = KNNImputer(n_neighbors=3)

#Fitting the imputer object to the dataset and transforming the dataset
after_impute=imputer.fit_transform(heart_df)

#now i will convert the after_impute array back to dataframe
heart_df = pd.DataFrame(after_impute , columns= heart_df.columns)

In [25]:
heart_df['RestingBP'].unique()

array([140., 160., 130., 138., 150., 120., 110., 136., 115., 100., 124.,
       113., 125., 145., 112., 132., 118., 170., 142., 190., 135., 180.,
       108., 155., 128., 106.,  92., 200., 122.,  98., 105., 133.,  95.,
        80., 137., 185., 165., 126., 152., 116., 144., 154., 134., 104.,
       139., 131., 141., 178., 146., 158., 123., 102.,  96., 143., 172.,
       156., 114., 127., 101., 174.,  94., 148., 117., 192., 129., 164.])

In [26]:
heart_df['RestingBP'].isnull().sum()

np.int64(0)

**Change columns type to int** 

In [27]:
#Converting all the columns except Oldpeak to int32 datatype

#here i will get all the column names except Oldpeak column
withoutOldPeak = heart_df.columns

#hear i will drop the Oldpeak column from the list
withoutOldPeak = withoutOldPeak.drop('Oldpeak')

#here i will convert the columns to int32 datatype
heart_df[withoutOldPeak] = heart_df[withoutOldPeak].astype('int32')

In [28]:
#let's check the info of the final dataset
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int32  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int32  
 4   Cholesterol     918 non-null    int32  
 5   FastingBS       918 non-null    int32  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int32  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    int32  
dtypes: float64(1), int32(11)
memory usage: 46.7 KB


In [29]:
heart_df
# now the dataset is clean and ready for further analysis and model building


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0,3,110,264,0,0,132,0,1.2,1,1
914,68,0,2,144,193,1,0,141,0,3.4,1,1
915,57,0,2,130,131,0,0,115,1,1.2,1,1
916,57,1,0,130,236,0,2,174,0,0.0,1,1


In [30]:
heart_df.sample()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
359,53,0,1,105,217,0,0,115,0,0.0,1,1


In [31]:

# here i will use the corr() function to find the correlation between different features in the dataset
heart_df.corr()

#here the value with -ve sign indicates negative correlation between each other 
#while +ve sign indicates positive correlation between each other
#while the value close to 0 indicates no correlation between each other



Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,-0.05575,0.214164,0.263128,0.057108,0.198039,0.213152,-0.382045,0.215793,0.258612,0.268264,0.282039
Sex,-0.05575,1.0,-0.179766,-0.009658,0.080395,-0.120076,0.018343,0.189186,-0.190664,-0.105734,-0.150693,-0.305445
ChestPainType,0.214164,-0.179766,1.0,0.073016,0.038855,0.155111,0.112067,-0.273076,0.324049,0.300846,0.352364,0.459017
RestingBP,0.263128,-0.009658,0.073016,1.0,0.113194,0.067556,0.094144,-0.109329,0.152621,0.174464,0.082401,0.118333
Cholesterol,0.057108,0.080395,0.038855,0.113194,1.0,0.058798,0.074925,-0.028855,0.083761,0.065988,0.071345,0.101263
FastingBS,0.198039,-0.120076,0.155111,0.067556,0.058798,1.0,0.050707,-0.131438,0.060451,0.052698,0.175774,0.267291
RestingECG,0.213152,0.018343,0.112067,0.094144,0.074925,0.050707,1.0,0.048552,0.036119,0.114428,0.078807,0.061011
MaxHR,-0.382045,0.189186,-0.273076,-0.109329,-0.028855,-0.131438,0.048552,1.0,-0.370425,-0.160691,-0.343419,-0.400421
ExerciseAngina,0.215793,-0.190664,0.324049,0.152621,0.083761,0.060451,0.036119,-0.370425,1.0,0.408752,0.428706,0.494282
Oldpeak,0.258612,-0.105734,0.300846,0.174464,0.065988,0.052698,0.114428,-0.160691,0.408752,1.0,0.501921,0.403951


In [32]:


heart_df.corr()['HeartDisease'][:-1]

Age               0.282039
Sex              -0.305445
ChestPainType     0.459017
RestingBP         0.118333
Cholesterol       0.101263
FastingBS         0.267291
RestingECG        0.061011
MaxHR            -0.400421
ExerciseAngina    0.494282
Oldpeak           0.403951
ST_Slope          0.558771
Name: HeartDisease, dtype: float64

**Data Visualization**

In [33]:
# here i will use plotly express to create some visualizations
import plotly.express as px

# here i will plot the correlation values of different features with respect to HeartDisease feature
px.line(heart_df.corr()['HeartDisease'][:-1].sort_values() , title='Correlation of different features with Heart Disease')


**Age and HeartDisease Distribution**

In [34]:
#here i will create a sunburst plot to visualize the relationship between HeartDisease and Age features

px.sunburst(heart_df, path = ['HeartDisease','Age'] ,title='Age vs Heart Disease' )



In [35]:
#here i will create a histogram to visualize the relationship between Cholesterol and HeartDisease features
px.histogram(heart_df,x= 'Age', color='HeartDisease',  title='Age vs Heart Disease')

In [36]:

# here i will create a histogram to visualize the relationship between Cholesterol and HeartDisease features
px.histogram(heart_df, x='Cholesterol', color='HeartDisease', barmode='group', nbins=50, title='Cholesterol Levels vs Heart Disease')

**Percentage of HeartDisease data distribution** 


In [37]:

# here i will create a pie chart to visualize the percentage of heart disease in the dataset
px.pie(heart_df, names='HeartDisease', title='Percentage of Heart Disease in the Dataset')

**Sex Vs Heart Disease**

In [38]:
#here i will create a histogram to visualize the relationship between
px.histogram(heart_df,x= 'Sex',color='HeartDisease', title='Sex vs Heart Disease')

# SEX : 0 = Male , 1 = Female

**ChestPainType Vs HeartDisease**


In [39]:
# Here i will create a histogram to visualize the relationship between ChestPainType and HeartDisease features
px.histogram(heart_df , x = 'ChestPainType' ,color='HeartDisease' , title='Chest Pain Type vs Heart Disease' )

# ChestPainType  :  ATA = 0 , NAP = 1 , ASY = 2 , TA = 3

**RestingBP Vs HeartDisease**

In [40]:
heart_df['RestingBP'].unique()

array([140, 160, 130, 138, 150, 120, 110, 136, 115, 100, 124, 113, 125,
       145, 112, 132, 118, 170, 142, 190, 135, 180, 108, 155, 128, 106,
        92, 200, 122,  98, 105, 133,  95,  80, 137, 185, 165, 126, 152,
       116, 144, 154, 134, 104, 139, 131, 141, 178, 146, 158, 123, 102,
        96, 143, 172, 156, 114, 127, 101, 174,  94, 148, 117, 192, 129,
       164], dtype=int32)

In [41]:
#here i will create a sunburst plot to visualize the relationship between HeartDisease and RestingBP features
px.sunburst(heart_df, path=['HeartDisease','RestingBP'], title='RestingBP vs Heart Disease')

**FastingBS Vs HeartDisease**


In [42]:
#Here i will create a histogram to visualize the relationship between
px.histogram(heart_df , x = 'FastingBS' , color='HeartDisease' , title='FastingBS vs Heart Disease' )

**MaxHR vs Heart Disease**

In [43]:
#here i will create a sunburst plot to visualize the relationship between HeartDisease and MaxHR features
px.sunburst(heart_df, path=['HeartDisease','MaxHR'] , title='MaxHR vs Heart Disease' )
#this plot will help us to understand the distribution of MaxHR for both classes of HeartDisease

In [44]:
# here i will create a violin plot to visualize the distribution of MaxHR with respect to HeartDisease feature
#this plot will help us to understand the distribution of MaxHR for both classes of HeartDisease
px.violin(heart_df,x = 'HeartDisease' ,  y =  'MaxHR' , color='HeartDisease' , title='MaxHR vs Heart Disease' )

**Oldpeak vs Heart Disease**

In [45]:
#here i will create a violin plot to visualize the distribution of Oldpeak with respect to HeartDisease feature
#this plot will help us to understand the distribution of Oldpeak for both classes of HeartDisease

px.violin(heart_df, y='Oldpeak', x='HeartDisease', color= 'HeartDisease',title='Oldpeak vs Heart Disease')

**ST_Slope vs Heart Disease**

In [46]:
#here i will create a histogram to visualize the relationship between ST_Slope and HeartDisease features
#this plot will help us to understand the distribution of ST_Slope for both classes of HeartDisease

px.histogram(heart_df, x = 'ST_Slope', color='HeartDisease', title='St_Slope vs Heart Disease')

**ExerciseAngina Vs Heart Disease**

In [47]:
#here i will create a histogram to visualize the relationship between ExerciseAngina and HeartDisease features
#this plot will help us to understand the distribution of ExerciseAngina for both classes of Heart
px.histogram(heart_df, x='ExerciseAngina', color='HeartDisease', title='Exercise Angina vs Heart Disease')

****Train Test Split****

In [48]:

#here i will split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

#Here i will split the dataset into training and testing sets
X_train , X_test, y_train, y_test = train_test_split(
    
    #here i will set the feature variables by dropping the target variable HeartDisease column
    heart_df.drop('HeartDisease', axis=1) ,
    
    #here i will set the target variable as HeartDisease column
    heart_df['HeartDisease'] ,
    
    #here i will set the test size to 20% of the total dataset
    test_size=0.2 , 
    
    #here i will set random state to 42 to ensure that the results are reproducible
    random_state=42,
    
    #here i will use stratified sampling to ensure that the distribution of the target variable is similar in both training and testing sets
    
    #in simple tearms stratify is used for maintaining the same proportion of classes in both training and testing sets
    stratify=heart_df['HeartDisease']
    )

**MODEL TRAINING**

**Logistic Regression**

In [49]:
#here i will import LogisticRegression model from sklearn library
#also import accuracy_score metric to evaluate the model performance
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score 

#here i will create a list of different solvers to test 
#and find the best solver for our dataset

solvers = ['lbfgs', 'newton-cholesky', 'newton-cg', 'liblinear', 'sag', 'saga']

#here i create a variables to track the best solver and best accuracy
best_solver = ''
best_accuracy = 0

#here i will loop through each solver in the solvers list
for solver in solvers:
    #here i will create a LogisticRegression model with the current solver
    # Create and train model
    log_reg = LogisticRegression(solver=solver, max_iter=1000)
    #fit the model to the training data
    log_reg.fit(X_train, y_train)
    
    #here i will use the trained model to make predictions on the test data
    # Predict and evaluate
    y_pred = log_reg.predict(X_test)
    #here i will calculate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    
    #here i will print the accuracy for the current solver
    print(f'Accuracy for {solver}: {accuracy}')
    
    
    #here i will check if the current accuracy is better than the best accuracy
    # Track best solver
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_solver = solver
#here i will print the best solver and best accuracy
print(f'\nBest solver: {best_solver} with accuracy: {best_accuracy:.4f}')

Accuracy for lbfgs: 0.8586956521739131
Accuracy for newton-cholesky: 0.8586956521739131
Accuracy for newton-cg: 0.8586956521739131
Accuracy for liblinear: 0.8586956521739131
Accuracy for sag: 0.8315217391304348
Accuracy for saga: 0.8206521739130435

Best solver: lbfgs with accuracy: 0.8587


In [50]:
#Here i am importing LogisticRegression model from sklearn library
#also import accuracy_score metric to evaluate the model performance
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

#here i will create a list of different solvers to test
#and find the best solver for our dataset
solvers =['lbfgs', 'newton-cholesky', 'newton-cg', 'liblinear', 'sag', 'saga']

#here i create a variable to track the best solver
best_solver = ''

#here i will create an array to store the test scores for each solver
test_score  =  np.zeros(len(solvers))

#here i will loop through each solver in the solvers list
for i , n in enumerate(solvers):
    #here i will create a LogisticRegression model with the current solver
   lr = LogisticRegression(solver=n ).fit(X_train , y_train)
    #here i used test_score method to evaluate the model performance on the test data
   test_score[i] = lr.score(X_test , y_test)
   
   #test_score array will store the accuracy score for each solver
   
   #here i will check if the current solver has the best accuracy score
   if test_score[i] == max(test_score):
       
       best_solver = n

#here i will print the best solver
print(f'best solver : {best_solver}')       
#here i will create a LogisticRegression model with the best solver
lr =  LogisticRegression(solver=best_solver)
#here i will fit the model to the training data
lr.fit(X_train , y_train)
#here i will use the trained model to make predictions on the test data
lr_pred = lr.predict(X_test)

#here i will calculate the accuracy of the model
accuracy = accuracy_score(y_test , lr_pred)
#here i will print the accuracy of the model
print(f'Best solver is {best_solver} with accuracy of {accuracy}')

best solver : liblinear
Best solver is liblinear with accuracy of 0.8586956521739131


In [51]:
#here i will import pickle library
import pickle
#here i will save the trained model to a file using pickle library
file =  open('LogisticRegression.pkl' , 'wb')

pickle.dump(lr , file)
#here i will close the file

**Support Vector Machine (SVM)**

In [52]:
#here i will import SVC model from sklearn library
#also import f1_score metric to evaluate the model performance for SVM model
from sklearn.svm import SVC
from sklearn.metrics import f1_score

#here i will create a list of different kernels to test
#and find the best kernel for our dataset
kernels = {'linear' : 0, 'poly' : 0, 'rbf' : 0, 'sigmoid' : 0}

best = ''

#here i will loop through each kernel in the kernels list

for i in kernels:
    #here i will create a SVC model with the current kernel
    svm = SVC(kernel=i)
    #here i will fit the model to the training data
    svm.fit(X_train , y_train)
    #yhat will be the predicted values for the test data
    yhat = svm.predict(X_test)
    
    #here i will calculate the f1 score of the model
    kernels[i] = f1_score(y_test , yhat , average='weighted')
    
    #here we are checking for the best kernel
    if  kernels[i] == max(kernels.values()):
        best = i

#here i will print the best kernel and its f1 score       
print(f'Best kernel is {best} with f1 score of {kernels[best]}')  

#here i will create a SVC model with the best kernel     
svm   = SVC(kernel=best)
#here i will fit the model to the training data
svm.fit(X_train , y_train)
#here i will use the trained model to make predictions on the test data
svm_pred = svm.predict(X_test)


#here i will calculate the f1 score of the model
print(f' SVM F1 score  Kernel  : {f1_score(y_test , svm_pred , average="weighted")}')


Best kernel is linear with f1 score of 0.8422922535440344
 SVM F1 score  Kernel  : 0.8422922535440344


In [53]:
import pickle
#here i will save the trained model to a file using pickle library
file =  open('SVM.pkl' , 'wb')
pickle.dump(svm , file)

**Decision Tree Classifier**

In [54]:
#here i will import DecisionTreeClassifier from sklearn library
#also import GridSearchCV for  hyperparameter tuning
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

#here i will create a DecisionTreeClassifier model

dtree =  DecisionTreeClassifier(class_weight='balanced')

#here i will create a parameter grid for hyperparameter tuning

param_grid = {
    
    'max_depth' : [3,5,7,9,11,13,15],
   
    'min_samples_split' : [2,4,6,8,10],
    
    'min_samples_leaf' : [1,2,3,4,5],
    
    'random_state' : [0,42]
}

#here i will create a GridSearchCV object with the DecisionTreeClassifier model and parameter grid
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5,)

#here i will fit the GridSearchCV object to the training data
grid_search.fit(X_train , y_train)

#here i will create a DecisionTreeClassifier model with the best parameters from the grid search
Ctree = DecisionTreeClassifier(**grid_search.best_params_, class_weight='balanced')

#here i will fit the model to the training data
Ctree.fit(X_train , y_train)

#here i will use the trained model to make predictions on the test data
dtc_pred = Ctree.predict(X_test)

#here i will calculate the accuracy of the model
print(f'Decision Tree Classifier Accuracy : {accuracy_score(y_test , dtc_pred)}')



Decision Tree Classifier Accuracy : 0.8097826086956522


In [55]:
import pickle
#here i will save the trained model to a file using pickle library
file =  open('tree.pkl' , 'wb')
pickle.dump(Ctree , file)

**Random Forest Classifier**

In [56]:
# ================== IMPORTS ==================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# ================== MODEL ==================
rfc = RandomForestClassifier(random_state=42)

# ================== PARAM GRID ==================
param_grid = {
    'n_estimators': [50, 100, 150, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9, 19],
    'max_leaf_nodes': [3, 6, 9]
}

# ================== GRID SEARCH ==================
grid_search = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

# ================== BEST MODEL ==================
rf_model = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=42
)

rf_model.fit(X_train, y_train)

# ================== EVALUATION ==================
rfc_pred = rf_model.predict(X_test)
print(f"✅ Random Forest Accuracy: {accuracy_score(y_test, rfc_pred):.4f}")

# ================== SAVE MODEL (IMPORTANT) ==================
with open("RandomForest.pkl", "wb") as file:
    pickle.dump(rf_model, file)

print("✅ RandomForest.pkl saved successfully")


✅ Random Forest Accuracy: 0.8424
✅ RandomForest.pkl saved successfully
