# Importing Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import warnings
import pickle

# Turning off the warnings

In [2]:
warnings.filterwarnings("ignore")


# Reading the Dataset

In [3]:
df=pd.read_csv("Stroke prediction.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# Preprocessing the dataset

In [4]:
df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [5]:
tmp = pd.get_dummies(df.gender)
df = pd.concat([df,tmp], axis='columns')
df = df.drop(['gender', 'Other'], axis='columns')
df.head()

Unnamed: 0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Female,Male
0,9046,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,0,1
1,51676,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,1,0
2,31112,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,0,1
3,60182,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,1,0
4,1665,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,1,0


In [6]:
df['smoking_status'].unique()
df = df.drop(df[df['smoking_status']=='Unknown'].index, axis=0)
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes'], dtype=object)

In [7]:

# Dropping id column
df.drop(columns=["id", "ever_married", "work_type", "Residence_type", "bmi"],inplace=True)
d1 = pd.get_dummies(df.smoking_status)
d1


Unnamed: 0,formerly smoked,never smoked,smokes
0,1,0,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,1,0
...,...,...,...
5102,0,1,0
5105,0,1,0
5106,0,1,0
5107,0,1,0


In [8]:
df = pd.concat([df, d1], axis='columns')
df = df.drop(['never smoked', 'smoking_status'], axis='columns')
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,Female,Male,formerly smoked,smokes
0,67.0,0,1,228.69,1,0,1,1,0
1,61.0,0,0,202.21,1,1,0,0,0
2,80.0,0,1,105.92,1,0,1,0,0
3,49.0,0,0,171.23,1,1,0,0,1
4,79.0,1,0,174.12,1,1,0,0,0


In [9]:
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,stroke,Female,Male,formerly smoked,smokes
0,67.0,0,1,228.69,1,0,1,1,0
1,61.0,0,0,202.21,1,1,0,0,0
2,80.0,0,1,105.92,1,0,1,0,0
3,49.0,0,0,171.23,1,1,0,0,1
4,79.0,1,0,174.12,1,1,0,0,0


# Training the model

In [10]:
X=df[["Male","Female","age","hypertension","heart_disease","avg_glucose_level","formerly smoked", "smokes"]]
y=df["stroke"]
X=X.astype("int")
y=y.astype("int")
sc=StandardScaler()
X=sc.fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=40)
#rf=LogisticRegression()
rf = RandomForestClassifier()
#rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
#rf_random.fit(X_train, y_train)
rf.fit(X_train,y_train)
predict_rf=rf.predict(X_test)
#predict = rf_random.predict(X_test)
print(predict_rf)  

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [11]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
confusion_matrix(y_test, predict_rf)
# Display accuracy score
accuracy_score(y_test, predict_rf)
# Display F1 score
f1_score(y_test,predict_rf)

0.03225806451612903

# Saving the model for using it in Flask

In [12]:
# Save model
pickle_out= open("Stroke_model.pkl", "wb")
pickle.dump(rf,pickle_out )
pickle_out.close()

In [13]:
X_test.head(10)

AttributeError: 'numpy.ndarray' object has no attribute 'head'