### Stroke Prediction using Logistic Regression

Try the app at [stroke-prediction heroku](https://stroke-prediction.herokuapp.com/)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print the paths one per line in walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the raw stroke dataset and report its (rows, columns) shape.
df = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
print(df.shape)

# dropna() discards every row that has a null in any column (the original
# note attributes the nulls to bmi); the 'id' column is then removed.
df = df.dropna().drop(columns=['id'])
print(df.shape)

In [None]:
# Display the column labels of the cleaned frame (the 'id' column was dropped in the loading cell).
df.columns

In [None]:
# Display the number of distinct values in each column.
df.nunique()

In [None]:
# Display the first rows of the cleaned frame as a quick sanity check.
df.head()

In [None]:
# one hot encoding 
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(feature, dataf):
    """One-hot encode one column and append the indicator columns to the frame.

    Parameters
    ----------
    feature : str
        Name of the column in ``dataf`` to encode.
    dataf : pandas.DataFrame
        Frame that contains ``feature``. It is read but not modified.

    Returns
    -------
    new_df : pandas.DataFrame
        ``dataf`` with its index reset to 0..n-1, followed by one indicator
        column per category, named ``"<feature>_<category>"``. The original
        ``feature`` column is still present; dropping it is left to the caller.
    enc_feature : OneHotEncoder
        The encoder fitted on the column, so the same mapping can be applied
        to new values later.
    """
    # The encoder expects a 2-D input: one row per sample, one column.
    feature_array = dataf[feature].to_numpy().reshape(-1, 1)

    # Fit once and encode in the same pass (previously the encoder was fitted
    # twice on identical data before transforming).
    enc_feature = OneHotEncoder(handle_unknown='ignore')
    encoded_array = enc_feature.fit_transform(feature_array).toarray()

    # f-string formatting also copes with non-string categories, where plain
    # string concatenation would raise TypeError.
    col_names = [f"{feature}_{v}" for v in enc_feature.categories_[0].tolist()]
    df_feature = pd.DataFrame(encoded_array, columns=col_names)

    # df_feature has a fresh 0..n-1 index, so dataf's index is reset to match;
    # otherwise concat would align on mismatched labels and introduce NaNs.
    new_df = pd.concat([dataf.reset_index(drop=True), df_feature], axis=1)
    return new_df, enc_feature

#########################################################################################################

# Categorical columns that are expanded into one indicator column per category.
one_hot_features = ['gender', 'ever_married','work_type', 'Residence_type']

# Fitted encoders, stored in the same order as one_hot_features so they can be
# looked up by position later.
encoders = []

for feature in one_hot_features:
    # one_hot_encode builds a new frame and leaves its input untouched, so df
    # can be passed directly and rebound to the expanded result.
    expanded, fitted_encoder = one_hot_encode(feature, df)
    encoders.append(fitted_encoder)
    # The raw categorical column is replaced by its indicator columns.
    df = expanded.drop(columns=[feature])
    
# Replace the smoking_status strings with integer codes. The fitted encoder is
# kept so that new inputs can be mapped to the same codes later.
from sklearn.preprocessing import LabelEncoder
smoking_label_encoder = LabelEncoder().fit(df['smoking_status'])
df['smoking_status'] = smoking_label_encoder.transform(df['smoking_status'])

In [None]:
# Separate the target column ('stroke') from the predictor columns.
y = df['stroke']
X = df.drop(columns=['stroke'])

from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic-regression classifier with scikit-learn's default settings.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict the held-out rows; the bare final expression makes the cell display
# the fraction of test rows classified correctly.
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# example
# Raw attribute values for one illustrative patient.
age = 87.0
hypertension = 0
heart_disease = 1
avg_glucose_level = 668.69
bmi = 36.6
smoking_status = 'formerly smoked'
gender = 'Male'
ever_married = 'Yes'
work_type = 'Private'
Residence_type = 'Urban'

# Encode the smoking label with the encoder fitted during preprocessing. The
# code gets its own name so smoking_status keeps holding the original string.
smoking_code = smoking_label_encoder.transform([smoking_status])[0]

# Numeric / label-encoded features come first.
# NOTE(review): this order presumably mirrors the leading columns of X —
# verify against X.columns before relying on the prediction.
input_list = [age, hypertension, heart_disease, avg_glucose_level, bmi, smoking_code]

# Listed in the same order as one_hot_features, so each value is paired with
# the encoder that was fitted on its column.
input_cat_features = [gender, ever_married, work_type, Residence_type]

for enc, raw_value in zip(encoders, input_cat_features):
    # The encoder expects a 2-D array; row 0 of the dense result is the
    # indicator vector for this single value.
    val = enc.transform(np.array(raw_value).reshape(-1,1)).toarray()[0].tolist()
    print(val)
    # Append directly instead of collecting sub-lists and flattening them
    # with sum(..., []), which re-copies the accumulated list on every step.
    input_list.extend(val)

print(input_list)

In [None]:
# Wrap the feature vector in a one-row DataFrame labelled with the training
# columns. The classifier was fitted on a DataFrame, so passing a bare ndarray
# makes recent scikit-learn versions warn about missing feature names; building
# the frame also fails loudly if the number of values does not match X.
input_frame = pd.DataFrame([input_list], columns=X.columns)

# Predicted class label for the example.
y_pred = clf.predict(input_frame)
print(y_pred[0])

# Per-class probabilities for the same row.
print(clf.predict_proba(input_frame))

In [None]:
# Wrap the feature vector in a one-row DataFrame labelled with the training
# columns. The classifier was fitted on a DataFrame, so passing a bare ndarray
# makes recent scikit-learn versions warn about missing feature names; building
# the frame also fails loudly if the number of values does not match X.
input_frame = pd.DataFrame([input_list], columns=X.columns)

y_pred = clf.predict(input_frame)
print("Prediction for stroke = ", y_pred[0])

# predict_proba's columns follow clf.classes_, so the column is looked up by
# label instead of assuming the label value equals its column position.
class_pos = list(clf.classes_).index(y_pred[0])
# The printed value is a probability between 0 and 1, not a percentage.
print("Percent of prediction = ", clf.predict_proba(input_frame)[0, class_pos])

In [None]:
# Save model and encoders
from joblib import dump, load

# Persist each fitted one-hot encoder to '<feature>.joblib' in the current
# working directory, pairing features and encoders by position.
for position, feature_name in enumerate(one_hot_features):
    dump(encoders[position], f"{feature_name}.joblib")

# The label encoder and the trained classifier are saved under fixed filenames.
dump(smoking_label_encoder,'smoking_label_encoder.joblib')
dump(clf,'classifier.joblib')