In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
#from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/projects/ml/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
df=pd.read_csv("dataset.csv")
df.drop(['id'],axis=1,inplace=True)
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


# About the dataset



According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths. This dataset is used to predict whether a patient is likely to get stroke based on the input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relavant information about the patient.


### Attribute Information

    1) id: unique identifier
    2) gender: "Male", "Female" or "Other"
    3) age: age of the patient
    4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
    5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
    6) ever_married: "No" or "Yes"
    7) work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"
    8) Residence_type: "Rural" or "Urban"
    9) avg_glucose_level: average glucose level in blood
    10) bmi: body mass index
    11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*-
    12) stroke: 1 if the patient had a stroke or 0 if not *Note: "Unknown" in smoking_status means that the information is unavailable for this patient



## Processing string values to float
Using Label Binarizer and Label Encoder, we convert string cells to float for training the data.


In [3]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

lb = LabelBinarizer()
label_encoder = LabelEncoder()
df['gender'] = lb.fit_transform(df['gender']) # male = 0 female = 1
df['ever_married'] = lb.fit_transform(df['ever_married']) # no = 0 yes = 1
df['work_type'] = label_encoder.fit_transform(df['work_type']) # private = 2 self = 3 gov = 0 child = 1
df['Residence_type'] = label_encoder.fit_transform(df['Residence_type']) # urban = 1 rural = 0 
df['smoking_status'] = label_encoder.fit_transform(df['smoking_status']) # formerly = 1 never = 2 regular = 3 unknown = 0
df.dropna(inplace=True)

df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,2,1,228.69,36.6,1,1
2,0,80.0,0,1,1,2,0,105.92,32.5,2,1
3,1,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1,79.0,1,0,1,3,0,174.12,24.0,2,1
5,0,81.0,0,0,1,2,1,186.21,29.0,1,1
6,0,74.0,1,1,1,2,0,70.09,27.4,2,1
7,1,69.0,0,0,0,2,1,94.39,22.8,2,1
9,1,78.0,0,0,1,2,1,58.57,24.2,0,1
10,1,81.0,1,0,1,2,0,80.43,29.7,2,1
11,1,61.0,0,1,1,0,0,120.46,36.8,3,1


# Using RandomForestClassifer

## 95% Precision and Accuracy

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

X=df.drop(['stroke'],axis=1)
y=df['stroke']

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.1, random_state = 0)

# RFC
# model = RandomForestClassifier(random_state = 1)
# model.fit(train_X, train_y)

# RFC with RandomizedSearchCV
param_grid = {
'n_estimators': [50, 75,100, 150, 200,300],
}
rcv=RandomizedSearchCV(RandomForestClassifier(random_state=42),param_grid,cv=5)
rcv.fit(train_X,train_y)


y_pred_rcv=rcv.predict(val_X)
mae = mean_absolute_error(val_y, y_pred_rcv)
print(classification_report(val_y, y_pred_rcv))




              precision    recall  f1-score   support

           0       0.95      1.00      0.97       467
           1       0.00      0.00      0.00        24

    accuracy                           0.95       491
   macro avg       0.48      0.50      0.49       491
weighted avg       0.90      0.95      0.93       491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Trying out Logistic Regression

## 95% Accuracy model

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

reg = LogisticRegression()
reg.fit(x_train, y_train)
pre = reg.predict(x_test)
acc = accuracy_score(pre, y_test)
print(acc)


0.9562118126272913


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Code below for custom user input

In [6]:
gen = float(input("gender (enter 0 for male and 1 for female): "))
age = float(input("age: "))
hyp = float(input("ever had hypertension? (enter 0 for no 1 for yes): "))
hear = float(input("ever had heart disease? (enter 0 for no 1 for yes): "))
mar = float(input("ever married? (enter 0 for no 1 for yes): "))
work = float(input("what is your work type? (enter 0 for government 1 if u r child 2 for private 3 for self employed):"))
res = float(input("enter your residance (enter 0 for rural 1 for urban): "))
gl = float(input("enter your glucose level: "))
bm = float(input("enter your bmi: "))
sm = float(input("do u smoke? (enter 1 for formerly 2 for never 3 for regular): "))

data_input = [[gen, age, hyp, hear, mar, work, res, gl, bm, sm]]

new =reg.predict(data_input)
print(new)

[0]


In [7]:
import pickle
pickle.dump(reg, open('model.pkl','wb'))