Importing requirements

In [None]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides pandas / scikit-learn warnings that can point
# at real problems (e.g. chained-assignment warnings) — consider narrowing it.
warnings.filterwarnings("ignore")

Read data from csv

In [None]:
# Location of the stroke dataset, relative to the notebook's working directory.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Show the frame's size and a few rows instead of rendering the whole frame.
print(f"{data.shape[0]} rows x {data.shape[1]} columns")
data.head()

Drop the unnecessary `id` column

In [None]:
# Remove the row identifier. Rebinding (rather than inplace=True) together with
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
data = data.drop(columns="id", errors="ignore")

In [None]:
# Preview the first four rows after the drop above.
data.head(4)

Visualize the data

In [None]:
# data plot function
def plot_bar(temp1, df=None):
    """Draw one bar chart per column showing the mean of 'stroke' per category.

    Parameters
    ----------
    temp1 : iterable of str
        Column names to group by; one figure is produced for each.
    df : pandas.DataFrame, optional
        Frame holding the listed columns and a 'stroke' column. Defaults to
        the notebook-level ``data`` frame when omitted.
    """
    if df is None:
        df = data

    for column in temp1:
        stroke_rate = (
            df[[column, 'stroke']]
            .groupby([column], as_index=False)
            .mean()
            .sort_values(by='stroke', ascending=False)
        )

        # A fresh, explicit Axes per column (replaces the implicit pyplot state
        # and the redundant plt.subplot() call).
        fig, ax = plt.subplots()
        # Plot categories as strings so numeric 0/1 columns are drawn as
        # discrete bars in the sorted order instead of on a continuous axis.
        ax.bar(stroke_rate[column].astype(str), stroke_rate['stroke'])
        ax.set_xlabel(column)
        ax.set_ylabel('stroke')
        ax.set_title(f'Mean stroke by {column}')
        plt.show()

In [None]:
# Columns whose per-category stroke rate is charted.
stroke_rate_columns = [
    "gender",
    'work_type',
    'Residence_type',
    'smoking_status',
    'ever_married',
    'heart_disease',
    'hypertension',
]
plot_bar(stroke_rate_columns)

Find all the categorical data

In [None]:
# Boolean mask over the columns: True where the dtype is object.
categorical = data.dtypes.eq("object")
# Names of the columns selected by the mask.
categorical_list = categorical.index[categorical.to_numpy()].tolist()
print(categorical_list)

In [None]:
# One LabelEncoder per categorical column, kept under its own name so the
# fitted encoder stays available after this cell.
gender_encode = LabelEncoder()
married_encode = LabelEncoder()
work_type_encode = LabelEncoder()
Residence_type_encode = LabelEncoder()
smoking_status_encode = LabelEncoder()

column_encoders = {
    'gender': gender_encode,
    'ever_married': married_encode,
    'work_type': work_type_encode,
    'Residence_type': Residence_type_encode,
    'smoking_status': smoking_status_encode,
}

# Replace each column with its integer codes.
for column_name, encoder in column_encoders.items():
    data[column_name] = encoder.fit_transform(data[column_name])

data.head(5)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(19, 12))
sns.heatmap(data.corr(), annot=True, cmap='Dark2_r', linewidths=2, ax=ax)

FIND NULL VALUES

In [None]:
# Number of missing values in each column.
data.isna().sum()

REPLACE NULL VALUES

In [None]:
# Impute missing BMI values with a random-forest regressor trained on the rows
# where BMI is known.
#
# A boolean mask identifies the missing rows directly, so no 0 sentinel is
# written into the column and the cell can be re-run safely once nothing is
# missing any more.
bmi_missing = data['bmi'].isna()

if bmi_missing.any():
    # Exclude the prediction target ('stroke') from the imputer's inputs so the
    # label does not leak into a feature that is later used to predict it.
    bmi_features = data.drop(columns=['bmi', 'stroke'])

    known_X = bmi_features.loc[~bmi_missing]
    known_y = data.loc[~bmi_missing, 'bmi']
    missing_X = bmi_features.loc[bmi_missing]

    # Fit the scaler on the known rows only, then apply it to the missing rows.
    bmi_scaler = StandardScaler()
    known_X = bmi_scaler.fit_transform(known_X)
    missing_X = bmi_scaler.transform(missing_X)

    # Fixed seed so the imputed values are reproducible across runs.
    bmi_model = RandomForestRegressor(random_state=3)
    bmi_model.fit(known_X, known_y)

    # Single .loc assignment: predictions come back in the row order of the
    # mask, and this avoids chained indexing (data['bmi'][i] = ...), which may
    # write to a temporary copy instead of the frame.
    data.loc[bmi_missing, 'bmi'] = bmi_model.predict(missing_X)

In [None]:
# Re-check the per-column count of missing values after the BMI imputation.
data.isna().sum()

ROUND OFF THE AGE

In [None]:
# Round every age to the nearest whole number and store it as an integer.
data['age'] = data['age'].round().astype('int64')

**FIND BEST MODEL**

In [None]:
# Features are every column except the target; the target is 'stroke'.
X = data.drop(columns=['stroke'])
Y = data["stroke"]

In [None]:
class evaluate_all_model:
    """Fit a fixed set of scikit-learn classifiers on one split and compare them.

    Creating an instance runs the whole pipeline: the data is split and
    standardised, every model in ``self.models`` is fitted and scored, the
    per-model metrics are printed, and finally the entry with the highest test
    accuracy is printed.

    Parameters
    ----------
    x : feature matrix accepted by ``train_test_split`` and ``StandardScaler``.
    y : target vector. The confusion-matrix metrics assume exactly two classes.
    random_state : int, default 3
        Seed used for the train/test split and for the tree-based estimators,
        so repeated runs give the same scores.
    """

    def __init__(self, x, y, random_state=3):
        self.x = x
        self.y = y
        self.random_state = random_state
        self.train_test_split()
        self.define_models()
        self.evaluate_model()
        print("best model base on Accuracy")
        print(self.best_model)

    def train_test_split(self):
        """Split into train/test (33% test) and standardise the features."""
        # Inside a method this name resolves to the module-level scikit-learn
        # function, not to this method.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.x, self.y, test_size=0.33, random_state=self.random_state
        )
        # Fit the scaler on the training part only, then apply it to the test part.
        sc = StandardScaler()
        self.X_train = sc.fit_transform(self.X_train)
        self.X_test = sc.transform(self.X_test)

    def define_models(self):
        """Create the candidate models and reset all result containers."""
        self.models = {
            'LogisticRegression': LogisticRegression(),
            'RandomForestClassifier': RandomForestClassifier(random_state=self.random_state),
            'KNeighborsClassifier': KNeighborsClassifier(),
            'DecisionTreeClassifier': DecisionTreeClassifier(random_state=self.random_state),
            'SupportVectorMachine': SVC(),
            'GaussianNB': GaussianNB(),
            'BernoulliNB': BernoulliNB(),
            'GradientBoostingClassifier': GradientBoostingClassifier(random_state=self.random_state),
        }

        # Derived from the dict so the names can never drift out of sync with
        # the order in which the scores are appended.
        self.modelNames = list(self.models)
        self.trainScores = []
        self.testScores = []
        self.Time_taken = []
        self.best_model_score = 0
        self.best_model = {}
        # Shortest fit-and-score time seen so far; starts at +inf so the first
        # model always sets it.
        self.less_time = float("inf")

    @staticmethod
    def _binary_metrics(tn, fp, fn, tp):
        """Return accuracy, precision, recall, F1 and specificity as a dict.

        Any ratio whose denominator is zero (for example precision when the
        model never predicts the positive class) is reported as 0.0 instead of
        raising or producing NaN.
        """
        def ratio(numerator, denominator):
            return float(numerator) / float(denominator) if denominator else 0.0

        precision = ratio(tp, tp + fp)
        recall = ratio(tp, tp + fn)
        return {
            "accuracy": ratio(tp + tn, tp + fp + tn + fn),
            "precision": precision,
            "recall": recall,
            "f1score": ratio(2 * precision * recall, precision + recall),
            "specificity": ratio(tn, tn + fp),
        }

    def evaluate_model(self):
        """Fit and score every model, print its metrics and track the best one."""
        for name, model in self.models.items():
            start = time.perf_counter()

            model.fit(self.X_train, self.y_train)
            train_score = model.score(self.X_train, self.y_train)
            self.trainScores.append(train_score)
            print(f'Model:- {name}')
            print(f'training score:- {train_score}')
            test_score = model.score(self.X_test, self.y_test)
            self.testScores.append(test_score)
            print(f'test Score:- {test_score}')

            y_predictions = model.predict(self.X_test)
            # scikit-learn expects (y_true, y_pred): rows are the true classes
            # and columns the predicted ones. With the arguments the other way
            # round, the false-positive and false-negative cells are swapped
            # and precision/recall are reported as each other.
            conf_matrix = confusion_matrix(self.y_test, y_predictions)
            print(f'Confussion Matrix: \n{conf_matrix}\n')

            # For a 2x2 matrix ravel() yields tn, fp, fn, tp in that order.
            tn, fp, fn, tp = (int(value) for value in conf_matrix.ravel())
            metrics = self._binary_metrics(tn, fp, fn, tp)
            accuracy = metrics["accuracy"]
            print(f'Accuracy : {accuracy}')
            print(f'Precision: {metrics["precision"]}')
            print(f'Recall   : {metrics["recall"]}')
            print(f'F1 score : {metrics["f1score"]}')
            print(f'Specificity : {metrics["specificity"]}')

            end = time.perf_counter()
            time_taken = end - start
            self.Time_taken.append(time_taken)
            print(f'Time required {time_taken}')
            print("***************************************************************************")
            print("____________________________________________________________________________")
            print("\n\n\n")

            if float(test_score) > self.best_model_score:
                self.best_model["model Name"] = name
                self.best_model["Time Required on train and test"] = time_taken
                self.best_model["Accuracy on train data"] = train_score
                self.best_model["Accuracy on test data"] = accuracy
                self.best_model_score = test_score

            if time_taken < self.less_time:
                self.less_time = time_taken

    def plot_bar(self):
        """Plot train and test scores side by side for every model."""
        bar_width = 0.25
        positions = np.arange(len(self.trainScores))

        fig, ax = plt.subplots()
        ax.bar(positions, self.trainScores, color='blue', width=bar_width,
               edgecolor='white', label='train')
        ax.bar(positions + bar_width, self.testScores, color='red', width=bar_width,
               edgecolor='white', label='Test')
        ax.set_xlabel('Models', fontweight='bold', size=24)
        ax.set_ylabel('Scores', fontweight='bold', size=24)
        # Put each tick midway between a model's two bars so the label sits
        # under the pair it names.
        ax.set_xticks(positions + bar_width / 2)
        ax.set_xticklabels(self.modelNames, rotation=60, ha='right')
        ax.legend()
        plt.show()

    def get_data(self):
        """Return model names, train/test scores and timings as a dict of lists."""
        self.temp_dict = {
            "Model": self.modelNames,
            "Training Score": self.trainScores,
            "Accuracy on Test": self.testScores,
            "Time Taken": self.Time_taken,
        }
        return self.temp_dict

    def get_dataframe(self):
        """Return the same summary as ``get_data`` as a pandas DataFrame."""
        return pd.DataFrame.from_dict(self.get_data())
            

In [None]:
# Constructing the evaluator runs the whole compare-all-models pipeline.
atul = evaluate_all_model(x=X, y=Y)

In [None]:
# Per-model summary table: name, training score, test accuracy and time taken.
atul.get_dataframe()

In [None]:
# Bar chart of train and test scores for every model.
atul.plot_bar()