<a href="https://colab.research.google.com/github/plk456/strokeprediction/blob/main/strokeprediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import  accuracy_score, f1_score,recall_score
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [3]:
n=pd.read_csv('hearthstoke.csv')
print(n.head())

      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [4]:
le_gender=LabelEncoder()
le_ever_married=LabelEncoder()
le_work_type=LabelEncoder()
le_Residence=LabelEncoder()
le_smoking=LabelEncoder()
n['gender']=le_gender.fit_transform(n['gender'])
n['ever_married']=le_ever_married.fit_transform(n['ever_married'])
n['work_type']=le_work_type.fit_transform(n['work_type'])
n['Residence_type']=le_Residence.fit_transform(n['Residence_type'])
n['smoking_status']=le_smoking.fit_transform(n['smoking_status'])
print(n)

         id  gender   age  hypertension  heart_disease  ever_married  \
0      9046       1  67.0             0              1             1   
1     51676       0  61.0             0              0             1   
2     31112       1  80.0             0              1             1   
3     60182       0  49.0             0              0             1   
4      1665       0  79.0             1              0             1   
...     ...     ...   ...           ...            ...           ...   
5105  18234       0  80.0             1              0             1   
5106  44873       0  81.0             0              0             1   
5107  19723       0  35.0             0              0             1   
5108  37544       1  51.0             0              0             1   
5109  44679       0  44.0             0              0             1   

      work_type  Residence_type  avg_glucose_level   bmi  smoking_status  \
0             2               1             228.69  36.6   

# i am deleting "id" column because it is not useful for me in this dataset

In [5]:
n=n.drop(['id'],axis=1)

In [6]:
print(n)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
1          0  61.0             0              0             1          3   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5105       0  80.0             1              0             1          2   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level   bmi  smoking_status  stroke  
0              

# Here you can see Nan values

In [7]:
print(n.iloc[:,8:9])

       bmi
0     36.6
1      NaN
2     32.5
3     34.4
4     24.0
...    ...
5105   NaN
5106  40.0
5107  30.6
5108  25.6
5109  26.2

[5110 rows x 1 columns]


In [8]:
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(n.iloc[:,8:9])
n.iloc[:,8:9]=imputer.transform(n.iloc[:,8:9])


In [9]:
print(n)

      gender   age  hypertension  heart_disease  ever_married  work_type  \
0          1  67.0             0              1             1          2   
1          0  61.0             0              0             1          3   
2          1  80.0             0              1             1          2   
3          0  49.0             0              0             1          2   
4          0  79.0             1              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5105       0  80.0             1              0             1          2   
5106       0  81.0             0              0             1          3   
5107       0  35.0             0              0             1          3   
5108       1  51.0             0              0             1          2   
5109       0  44.0             0              0             1          0   

      Residence_type  avg_glucose_level        bmi  smoking_status  stroke  
0         

In [10]:
x=n.drop(['stroke'],axis=1)
X=x.head(500)
y=n['stroke']
Y=y.head(500)

x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [None]:
result={}

models={
    'linear':LogisticRegression(max_iter=500),
    'random':RandomForestClassifier(),
    'tree':DecisionTreeClassifier(),
    'svm':SVC(kernel='rbf',C=1,gamma='scale'),
    'neighbors':KNeighborsClassifier(),   

}



param_grids = {
        'linear': {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear']
        },
    'random': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    'tree': {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'svm': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto']
    },
    'neighbors': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    }
}

smote=SMOTE(random_state=42)  
x_train, y_train = smote.fit_resample(x_train, y_train) 

CV=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

for model_name,model in models.items():
    pipeline=Pipeline([
    ('imputer',SimpleImputer()),
    ('scaler',StandardScaler()),
    ('classification',model)
    ])

    param_grid = {f'classification__{key}': value for key, value in param_grids[model_name].items()}
    print(f'training the current model is {model_name} is running...')

    grid_params=GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=CV,
        scoring='accuracy'
    )
    grid_params.fit(x_train, y_train)
    result[model_name] = grid_params

for model_name, grid_params in result.items():
        best_params = grid_params.best_params_
        best_score = grid_params.best_score_
        print(f"Model: {model_name}")
        print(f"Best Parameters: {best_params}")
        print(f"Best Score: {best_score:.4f}")
        print("-" * 40)

training the current model is linear is running...
training the current model is random is running...
training the current model is tree is running...
training the current model is svm is running...
training the current model is neighbors is running...
Model: linear
Best Parameters: {'classification__C': 0.1, 'classification__penalty': 'l1', 'classification__solver': 'liblinear'}
Best Score: 0.7612
----------------------------------------
Model: random
Best Parameters: {'classification__max_depth': None, 'classification__min_samples_split': 5, 'classification__n_estimators': 200}
Best Score: 0.7760
----------------------------------------
Model: tree
Best Parameters: {'classification__max_depth': 20, 'classification__min_samples_split': 10}
Best Score: 0.6949
----------------------------------------
Model: svm
Best Parameters: {'classification__C': 0.1, 'classification__gamma': 'scale'}
Best Score: 0.7366
----------------------------------------
Model: neighbors
Best Parameters: {'clas