In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Preprocessing

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import classification_report # for accuracy and F1 score

In [None]:
# Load the stroke dataset CSV into a DataFrame and preview the first rows.
stroke_csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
health_df = pd.read_csv(stroke_csv_path)
health_df.head()

In [None]:
# Drop the `id` column since it is not needed for the analysis.
# `errors="ignore"` makes the cell safe to re-run: once `id` is gone, a second
# execution is a no-op instead of raising KeyError. The result is re-assigned
# rather than mutated in place so the cell always yields the same frame.
health_df = health_df.drop(columns="id", errors="ignore")
health_df.head()

In [None]:
# Count the missing values in each column.
health_df.isna().sum()

In [None]:
# Report what percentage of rows have no BMI value; if the share is small,
# those rows can simply be deleted.
total_rows = len(health_df)
missing_bmi_rows = health_df['bmi'].isna().sum()
print(f"% BMI missing  {(missing_bmi_rows / total_rows) * 100:0.2f}")

In [None]:
# Delete every row that has a missing value in any column (modifies health_df in place).
health_df.dropna(axis="index", how="any", inplace=True)

In [None]:
# Show how many rows carry each gender label.
health_df["gender"].value_counts()

In [None]:
# Delete the rows whose gender is "Other" (modifies health_df in place).
other_gender_rows = health_df.index[health_df["gender"] == "Other"]
health_df.drop(index=other_gender_rows, inplace=True)

In [None]:
# Classify BMI into a new `bmi_group` column:
#   bmi < 18.5          -> 'Underweight'
#   18.5 <= bmi < 25.0  -> 'Normal weight'
#   25.0 <= bmi < 30.0  -> 'Overweight'
#   anything else       -> 'Obese'
#
# The column is looked up by name instead of by hard-coded position, so the
# result no longer depends on the exact column order of the frame. The whole
# column is computed in one vectorised call instead of a per-row Python loop,
# and it is created directly with string labels rather than being initialised
# with the integer 0 and overwritten cell by cell.
bmi_values = health_df['bmi'].to_numpy()
health_df['bmi_group'] = np.select(
    # Conditions are evaluated in order and the first match wins, which
    # reproduces the if/elif chain without repeating the lower bounds.
    [bmi_values < 18.5, bmi_values < 25.0, bmi_values < 30.0],
    ['Underweight', 'Normal weight', 'Overweight'],
    default='Obese',
)

health_df.head()

In [None]:
# Show how many rows landed in each BMI group.
health_df["bmi_group"].value_counts()

In [None]:
# Classify the average glucose level into a new `glucose_group` column:
#   level < 100.0           -> 'Normal'
#   100.0 <= level < 125.0  -> 'Prediabetes'
#   anything else           -> 'Diabetes'
#
# The column is looked up by name instead of by hard-coded position, so the
# result no longer depends on the exact column order of the frame. The whole
# column is computed in one vectorised call instead of a per-row Python loop,
# and it is created directly with string labels rather than being initialised
# with the integer 0 and overwritten cell by cell.
glucose_values = health_df['avg_glucose_level'].to_numpy()
health_df['glucose_group'] = np.select(
    # Conditions are evaluated in order and the first match wins.
    [glucose_values < 100.0, glucose_values < 125.0],
    ['Normal', 'Prediabetes'],
    default='Diabetes',
)

health_df.head()

In [None]:
# Show how many rows landed in each glucose group.
health_df["glucose_group"].value_counts()

In [None]:
# Transform string labels into numeric ones, one categorical column at a time.
from sklearn.preprocessing import LabelEncoder

columns_to_encode = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "bmi_group",
    "glucose_group",
]

# The same encoder object is re-fitted for every column, so once the loop is
# done it only holds the classes of the last column it saw.
le = LabelEncoder()
for column_name in columns_to_encode:
    health_df[column_name] = le.fit_transform(health_df[column_name])

health_df.head()

In [None]:
# Build the feature matrix and the target, then split them into train and test sets.
# The raw `avg_glucose_level` and `bmi` readings are left out of X; their
# bucketed counterparts (`glucose_group`, `bmi_group`) stay in as features.
X = health_df.drop(columns=['stroke', 'avg_glucose_level', 'bmi'])
y = health_df['stroke']

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of `stroke` identical in both splits, so a rare class cannot end
# up under- or over-represented in the test set purely by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

# SVC

In [None]:
# Train a support vector classifier with a linear kernel and report
# its metrics on the test split.
from sklearn.svm import SVC

svc = SVC(kernel='linear').fit(X_train, y_train)  # fit() returns the estimator itself
outcome_SVC = svc.predict(X_test)

svc_report = classification_report(y_test, outcome_SVC)
print(svc_report)

# Decision Trees

In [None]:
# Train a depth-3 decision tree (entropy criterion) and report its metrics
# on the test split.
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed because the tree shuffles the candidate features at
# every split; without a seed, ties between equally good splits can be broken
# differently from one run to the next.
clf = DecisionTreeClassifier(criterion = "entropy", max_depth = 3, random_state = 42)
clf = clf.fit(X_train, y_train)
outcome_DTC = clf.predict(X_test)

print(classification_report(y_test, outcome_DTC))

# Random Forest

In [None]:
# Train a 100-tree random forest and report its metrics on the test split.
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed because the forest draws bootstrap samples and feature
# subsets at random; without a seed the reported scores change on every run.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)
forest_fit = forest.fit(X_train, y_train)
outcome_forest = forest.predict(X_test)

print(classification_report(y_test, outcome_forest))

# XGBoost

In [None]:
# Train a gradient-boosted tree classifier with XGBoost and report its
# metrics on the test split.
from xgboost.sklearn import XGBClassifier

# All hyperparameters are gathered in one place so they are easy to scan and tune.
xgb_params = dict(
    n_estimators = 100,
    learning_rate = 0.1,
    max_depth = 5,
    subsample = 1,
    gamma = 0,
    reg_lambda = 1,
    max_delta_step = 0,
    colsample_bytree = 1,
    min_child_weight = 1,
    seed = 1000,
)
xgb = XGBClassifier(**xgb_params)
xgb_fit = xgb.fit(X_train, y_train)
outcome_xgb = xgb.predict(X_test)

xgb_report = classification_report(y_test, outcome_xgb)
print(xgb_report)

# Export file

In [None]:
# Write the predictions to 'submission_1.csv' (without the index column).
# Nothing earlier in the notebook defines `submission_df_1`, so it is built
# here; otherwise this cell fails with NameError on a fresh run.
# NOTE(review): the intended contents of the export are not shown anywhere in
# the notebook. This assumes it should hold the XGBoost predictions for the
# test split next to the true labels -- confirm and adjust the columns or the
# model as needed.
submission_df_1 = pd.DataFrame({
    'stroke_actual': y_test.to_numpy(),
    'stroke_predicted': outcome_xgb,
})
submission_df_1.to_csv('submission_1.csv', index=False)