# **Setup**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# **Get the data**

In [None]:
# Load the stroke training set from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/heart-stroke/train_strokes.csv")
data  # bare expression: the notebook renders the frame as the cell output

In [None]:
data.info()

In [None]:
data.work_type.value_counts()

In [None]:
data.smoking_status.value_counts()

# **Discover and visualize the data to gain insights**

In [None]:
data = data.drop('id', axis=1)  # drop the id column

# Assign the filled columns back instead of calling fillna(inplace=True) on an
# attribute-accessed column. That form is a chained in-place assignment, which
# pandas deprecates and which leaves `data` unchanged under Copy-on-Write.
data["bmi"] = data["bmi"].fillna(data["bmi"].mean())  # replace NaN values with the mean of bmi
data["smoking_status"] = data["smoking_status"].fillna("Unknown")  # replace NaN values with the 'Unknown' string

data.gender.value_counts()

In [None]:
# Fold the "Other" gender category into "Female", then re-check the counts.
data["gender"] = data["gender"].replace({"Other": "Female"})
data.gender.value_counts()

In [None]:
data.info() #no missing (null) values

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
data.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
import seaborn as sns
sns.relplot(data=data, x = "avg_glucose_level", y="bmi", hue = "stroke" )

In [None]:
# Correlate the numeric columns only. The frame still holds string
# categoricals (gender, work_type, ...), and DataFrame.corr() raises on those
# in pandas >= 2.0. select_dtypes keeps this working on older pandas too.
corr_matrix = data.select_dtypes(include="number").corr()
print(corr_matrix["stroke"].sort_values(ascending=False))

plt.figure(figsize=(15,10))
sns.heatmap(corr_matrix, annot=True)  # annot=True prints the correlation value in each square

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["stroke","avg_glucose_level",'hypertension','bmi']
scatter_matrix(data[attributes], figsize=(15, 10))

# **Prepare the data for Machine Learning algorithms**

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle that holds out 25% of the rows and keeps the stroke
# ratio the same in both parts. test_size is set explicitly here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# split() yields positional row numbers, so select with .iloc. Using .loc
# would treat them as index labels, which is only equivalent while the frame
# still has its default 0..n-1 index.
train_index, test_index = next(split.split(data, data.stroke))
strat_train_set = data.iloc[train_index]  # (32550, 11) in the original run
strat_test_set = data.iloc[test_index]    # (10850, 11) in the original run

In [None]:
print('shape  =', strat_train_set.shape)
print(strat_train_set.stroke.value_counts())

In [None]:
print('shape  =', strat_test_set.shape)
print(strat_test_set.stroke.value_counts())

In [None]:
# Separate the predictors from the `stroke` target for each split.
X_train, Y_train = strat_train_set.drop(columns="stroke"), strat_train_set["stroke"].copy()
X_test, Y_test = strat_test_set.drop(columns="stroke"), strat_test_set["stroke"].copy()

In [None]:
X_train.info() #no null or missing values.

In [None]:
X_train

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric view of the training features: everything except the five
# categorical columns.
X_train_num = X_train.drop(
    columns=["gender", "ever_married", "Residence_type", "work_type", "smoking_status"]
)

# Median imputation followed by standardisation.
median_imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()
num_pipeline = Pipeline(steps=[('imputer', median_imputer), ('std_scaler', scaler)])

X_train_num_tr = num_pipeline.fit_transform(X_train_num)

In [None]:
print(X_train_num) #5 columns (pandas.core.frame.DataFrame)

In [None]:
print(X_train_num_tr) #numpy.ndarray

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

num_attribs = list(X_train_num)                              # numeric column names
cat_attribs1 = ["gender", "ever_married", "Residence_type"]  # two-valued categoricals -> ordinal codes
cat_attribs2 = ["work_type", "smoking_status"]               # multi-valued categoricals -> one-hot

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat1", OrdinalEncoder(), cat_attribs1),
        # handle_unknown="ignore": a category absent from the training split
        # is encoded as all zeros instead of raising at transform time.
        ("cat2", OneHotEncoder(handle_unknown="ignore"), cat_attribs2)
    ])

# Fit on the training split only. The test split must be pushed through the
# already-fitted transformer: calling fit_transform on it would re-learn the
# medians, scaling and category sets from test data, so the two matrices
# would no longer share one encoding.
X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.transform(X_test)

In [None]:
X_train

In [None]:
X_train.shape # columns = (num_attribs + binary_cat + multiple_cat) = (5 + 3 + (5+4)) = 17

# **Select a model and train it**

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
# (available since 1.0) is its replacement.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import mean_squared_error


def Model(model,X_train,Y_train,X_test,Y_test):
    """Fit ``model`` on the training data and report how it performs.

    Prints the estimator's ``score`` on the train and test sets, then the
    accuracy, precision, recall, F1 and RMSE of its test-set predictions,
    and draws one confusion matrix for each set.

    Args:
        model: Unfitted estimator exposing ``fit``, ``score`` and ``predict``.
        X_train: Training feature matrix.
        Y_train: Training labels.
        X_test: Test feature matrix.
        Y_test: Test labels.

    Returns:
        None. Results are printed and plotted; ``model`` is fitted in place.
    """
    model.fit(X_train, Y_train)

    # One confusion matrix per split: train first, then test.
    ConfusionMatrixDisplay.from_estimator(model, X_train, Y_train)
    ConfusionMatrixDisplay.from_estimator(model, X_test, Y_test)

    print('Training Score \n', model.score(X_train, Y_train))
    print('Testing Score \n', model.score(X_test, Y_test))

    # Predict once and reuse the result for every metric below.
    Y_pred = model.predict(X_test)
    print('Accuracy  =', accuracy_score(Y_test, Y_pred))              #calculating accuracy
    print('Precision =', precision_score(Y_test,Y_pred))              #calculating precision
    print('Recall    =', recall_score(Y_test, Y_pred))                #calculating recall
    print('F1-score  =', f1_score(Y_test, Y_pred))                    #calculating f1_score
    print('RMSE      =', np.sqrt(mean_squared_error(Y_test, Y_pred))) #calculating RMSE

In [None]:
from sklearn.linear_model import LogisticRegression

lg_reg = LogisticRegression(random_state=42)
Model(lg_reg,X_train,Y_train,X_test,Y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(random_state=42)
Model(DTC,X_train,Y_train,X_test,Y_test)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

sgd_clf = SGDClassifier(random_state=42)
Model(sgd_clf,X_train,Y_train,X_test,Y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier(random_state=42)
Model(GBC,X_train,Y_train,X_test,Y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf1 = RandomForestClassifier(random_state=42)
Model(rf1,X_train,Y_train,X_test,Y_test) #overfitting

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
Model(knn_clf,X_train,Y_train,X_test,Y_test)

In [None]:
# try balancing data
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Oversample the training split only. Resampling the whole dataset and
# splitting afterwards puts copies of the same minority rows on both sides of
# the split, so the "test" score largely measures how well duplicated rows
# were memorised rather than how the model generalises.
rs = RandomOverSampler(random_state=42)  # seeded so the resample is reproducible
x_train, y_train = rs.fit_resample(X_train, Y_train)

# The held-out rows are left untouched and keep their original class balance.
x_test, y_test = X_test, Y_test

x = pd.DataFrame(x_train)
sns.relplot(data=x, x=x[3], y=x[4], hue=y_train)  # x[3] = avg_glucose_level, x[4] = bmi

In [None]:
Model(rf1,x_train,y_train,x_test,y_test) #by balancing the data got a very good result :).

In [None]:
Model(DTC,x_train,y_train,x_test,y_test)

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)
# Actually apply SMOTE. Previously the sampler was created but never used, so
# this cell just repeated the random-oversampling run. Only the training split
# is resampled; evaluation uses the untouched test split.
# NOTE(review): SMOTE interpolates between rows, so the ordinal and one-hot
# columns receive fractional values. SMOTENC on the raw frame may fit better.
x_train_smote, y_train_smote = smote.fit_resample(X_train, Y_train)

rf2 = RandomForestClassifier(random_state=42)
Model(rf2, x_train_smote, y_train_smote, X_test, Y_test)

In [None]:
Model(knn_clf,x_train,y_train,x_test,y_test)

# **Fine-tune your model**

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids for the random forest; every combination inside each sub-grid is tried.
param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [4, 6, 8, 10]},               # try 12 (3×4) combinations of hyperparameters
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}] # try 6 (2×3) combinations with bootstrap = False

rf = RandomForestClassifier(random_state = 42)
# train across 5 folds (cv=5), that's a total of (12+6)*5=90 rounds of training
# NOTE(review): 'neg_mean_squared_error' is a regression scorer. Assuming the
# labels are 0/1, it equals minus the misclassification rate, so it ranks
# candidates the same way accuracy does -- consider 'f1' or 'recall' if the
# stroke class is what matters.
# NOTE(review): x_train comes from the oversampling cell, so copies of the same
# row can fall in both the fitting and the validation folds -- confirm the CV
# scores are not inflated by that.
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True, verbose=3)
grid_search.fit(x_train,y_train)

In [None]:
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
final_model = grid_search.best_estimator_
Model(final_model,x_train,y_train,x_test,y_test)

In [None]:
some_data = x_test[:10]
some_labels = y_test[:10]

print("Predictions: ", final_model.predict(some_data))
print("Actual value:", list(some_labels))

# **Neural Networks**

In [None]:
#try using neural networks.
import tensorflow as tf
from tensorflow import keras
from keras import models, layers

# binary_crossentropy (with its default from_logits=False) expects the network
# to output probabilities in [0, 1], so the output unit needs a sigmoid. With
# the default linear activation the loss is computed on unbounded values.
# The hidden layer gets a ReLU; two stacked linear layers would collapse into
# a single linear map.
model = keras.models.Sequential([
    keras.layers.Dense(5, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", metrics=["accuracy"])
keras.utils.plot_model(model,"my_model.png",show_shapes=True)

In [None]:
history = model.fit(X_train, Y_train, epochs=20)
history

In [None]:
# Despite the name this is not an MSE: the model is compiled with
# loss="binary_crossentropy" and metrics=["accuracy"], and evaluate() returns
# the loss followed by the compiled metrics.
mse_test = model.evaluate(X_test, Y_test)

In [None]:
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1)
plt.show()

# **Conclusion**

In [None]:
# Hard-coded (label, loss %) pairs, one per experiment, in plotting order.
loss_by_model = [
    ('LG1', 13.4), ('DTC1', 20.2), ('SGD1', 13.4), ('GBC1', 13.6),
    ('RFC1', 13.5), ('KNN1', 13.6), ('RFC2', 4.22), ('DTC2', 10.2),
    ('SMOTE2', 4.22), ('KNN2', 18.2), ('Final_model', 3.81), ('Neural_Nets', 27.8),
]
bar_categories = [label for label, _ in loss_by_model]
bars = [loss for _, loss in loss_by_model]

plt.figure(figsize=(15, 10))
plt.bar(bar_categories, bars)
plt.xlabel('Models')
plt.ylabel('Loss (%)')
plt.show()

# **Save Your Best Model for the Production System**

In [None]:
import joblib

# Bundle the fitted preprocessing with the fitted, tuned forest. The previous
# version pickled a brand-new RandomForestClassifier() that had never been
# trained, so the saved pipeline could not predict. Both steps are already
# fitted, so the pipeline is deliberately not re-fitted here.
my_model  = Pipeline([
        ("preparation", full_pipeline),
        ("RFC", final_model)
    ])


joblib.dump(my_model, "my_model.pkl") #Saving my model
#Later...
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
my_model_loaded = joblib.load("my_model.pkl")