In [None]:
import numpy as np 
import pandas as pd
import pandas_profiling as pp
import seaborn as sns 
import matplotlib.pyplot as plt 
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score 

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)



In [None]:
# Read the stroke-prediction CSV into the working DataFrame.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Histogram of the target column, showing how the two classes are balanced.
stroke_column = df["stroke"]
stroke_column.hist()

In [None]:
# Build the automated profiling report; the trailing expression renders it.
profile_report = pp.ProfileReport(df)
profile_report

<h2>Treating missing values</h2>

In [None]:
# Impute missing BMI values with the BMI median.
# Only the "bmi" column is filled: calling fillna on the whole frame with this
# value would also write the BMI median into missing cells of unrelated columns.
bmi_median = df["bmi"].median()
df["bmi"] = df["bmi"].fillna(bmi_median)

# Null count per column after imputation.
df.isnull().sum()

In [None]:
# Remove the "id" column, then discard every row whose gender is "Other".
df.drop(columns="id", inplace=True)
other_gender_rows = df.index[df["gender"] == "Other"]
df.drop(index=other_gender_rows, inplace=True)

<h2>Seaborn Visualizations</h2>

In [None]:
# Show the first two rows as a quick sanity check before plotting.
df.iloc[:2]

In [None]:
# Grid of pairwise plots for the frame; the trailing expression echoes the grid object.
pair_grid = sns.pairplot(data=df)
pair_grid

In [None]:
# Age vs. BMI scatter, one panel per gender, coloured by marital status.
# The size is set through FacetGrid's own height/aspect arguments: calling
# plt.figure(...) after the grid exists only opens a second, empty figure and
# leaves the grid's size untouched.
g = sns.FacetGrid(df, col="gender", hue="ever_married", height=6, aspect=1.2)
g.map(sns.scatterplot, "age", "bmi", alpha=.7)
g.add_legend()

In [None]:
# Distribution (histogram + KDE) of the three continuous features.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its
# documented replacement and draws the same kind of plot.
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
for panel, column in zip(ax.flat, ["age", "avg_glucose_level", "bmi"]):
    sns.histplot(df[column], kde=True, stat="density", color='b', ax=panel)
    panel.set_title(f"Distribution of {column}")
# Only three features are plotted, so hide the unused fourth panel instead of
# leaving an empty set of axes in the figure.
ax[1, 1].set_visible(False)
plt.show()

<h2>Feature Engineering</h2>

In [None]:
# Look at the first five rows before encoding the columns.
df.head(n=5)

In [None]:
# Encode every categorical column as numbers.
# The later StandardScaler step needs numeric input, so leaving ever_married,
# work_type, Residence_type and smoking_status as strings makes it raise.
# dtype=int keeps the dummy columns as 0/1 integers rather than booleans.
df = pd.get_dummies(df, columns=["ever_married"], drop_first=True, dtype=int)
df = pd.get_dummies(df, columns=["work_type"], dtype=int)
df = pd.get_dummies(df, columns=["Residence_type"], drop_first=True, dtype=int)

# Integer codes for smoking status and a binary flag for gender.
# .map() yields NaN for any label missing from the mapping, so the following
# astype(int) fails loudly instead of letting a stray string reach the models.
smoking_codes = {"never smoked": 0, "Unknown": 1, "formerly smoked": 2, "smokes": 3}
df["smoking_status"] = df["smoking_status"].map(smoking_codes).astype(int)
df["gender"] = df["gender"].map({"Male": 1, "Female": 0}).astype(int)

# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# Separate the target column ("stroke") from the feature matrix.
y = df["stroke"]
X = df.drop(columns="stroke")

In [None]:
# Hold out the test set BEFORE balancing the classes.
# Random oversampling duplicates minority rows; if it runs before the split,
# copies of the same row land in both train and test and the test scores are
# inflated. stratify=y keeps the stroke ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, shuffle=True, stratify=y
)

# Balance only the training partition; the test set keeps the real class ratio.
# Named `ros` rather than `os` so the standard-library `os` module imported at
# the top of the notebook is not shadowed. random_state makes the draw repeatable.
ros = RandomOverSampler(sampling_strategy=1, random_state=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

# The original names are kept as aliases; they now refer to the resampled
# training data only.
X_ros, y_ros = X_train, y_train

In [None]:
# Standardise the features using statistics learned from the training set only.
data_scaler = StandardScaler()
X_train = pd.DataFrame(
    data=data_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
# transform(), not fit_transform(): re-fitting on the test set would scale it
# with its own mean/std, which leaks test information and puts train and test
# on different scales.
X_test = pd.DataFrame(
    data=data_scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

<h2>Machine Learning Models</h2>

In [None]:
# Logistic regression: fit on the training split, then predict the test split.
LR_model = LogisticRegression()
LR_model.fit(X_train, y_train)
predictions = LR_model.predict(X_test)

In [None]:
# Labelled confusion matrix (rows: true class, columns: predicted class),
# followed by the per-class classification report.
class_labels = ["Not Stroke", "Stroke"]
df1 = pd.DataFrame(
    confusion_matrix(y_test, predictions),
    index=class_labels,
    columns=class_labels,
)
print(f"{df1}\n\n")
print(classification_report(y_test, predictions))

In [None]:
# Applying K-NEAREST NEIGHBORS: try k = 2..9 and record the test accuracy of each.
# `accuracy` was created but never filled; it now holds one score per k, in the
# same order as range(2, 10).
# NOTE(review): choosing k from test-set accuracy is optimistic; a validation
# split or cross-validation would give a fairer choice.
accuracy = []

for k in range(2, 10):
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    accuracy.append(score)
    print(f'Accuracy at {k}: {score}')



In [None]:
# The author reports the highest accuracy at n_neighbors=2, so the final
# KNN model is refit with that value.
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Labelled confusion matrix (rows: true class, columns: predicted class),
# followed by the per-class classification report.
class_labels = ["Not Stroke", "Stroke"]
df1 = pd.DataFrame(
    confusion_matrix(y_test, predictions),
    index=class_labels,
    columns=class_labels,
)
print(f"{df1}\n\n")
print(classification_report(y_test, predictions))

In [None]:
# Applying Decision Tree Classifier.
# random_state is fixed (same seed as the train/test split) so the fitted tree
# and its scores are the same on every run.
model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Labelled confusion matrix (rows: true class, columns: predicted class),
# followed by the per-class classification report.
class_labels = ["Not Stroke", "Stroke"]
df1 = pd.DataFrame(
    confusion_matrix(y_test, predictions),
    index=class_labels,
    columns=class_labels,
)
print(f"{df1}\n\n")
print(classification_report(y_test, predictions))

In [None]:
# Applying RANDOM FOREST CLASSIFIER: compare test accuracy across forest sizes.
tree_numbers = [100, 150, 200, 250, 300, 350, 400, 450, 500]

for i in tree_numbers:
    # A fixed random_state means differences between rows of the printout come
    # from the number of trees, not from a different random draw per fit.
    model = RandomForestClassifier(n_estimators=i, random_state=1).fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f'Accuracy at {i}: {accuracy_score(y_test, predictions)}')

In [None]:
# Final random forest with 350 trees (the size hard-coded by the author).
# random_state is fixed so the reported metrics are reproducible.
model = RandomForestClassifier(n_estimators=350, random_state=1).fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Labelled confusion matrix (rows: true class, columns: predicted class),
# followed by the per-class classification report.
class_labels = ["Not Stroke", "Stroke"]
df1 = pd.DataFrame(
    confusion_matrix(y_test, predictions),
    index=class_labels,
    columns=class_labels,
)
print(f"{df1}\n\n")
print(classification_report(y_test, predictions))