In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Load the stroke-prediction CSV into a DataFrame and preview the first rows.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

**Getting to Know the Data**

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# (number of rows, number of columns).
df.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

In [None]:
# Summary of the object-dtype (string) columns, transposed.
df.describe(include="object").T

**Data Visualization and Analysis**

In [None]:
# Two clean-up steps:
#   1. remove every row that contains at least one missing value;
#   2. drop the "id" column, which the analysis treats as unnecessary.
# errors="ignore" lets the cell be re-run after "id" is already gone
# (previously a second run raised KeyError).
df.dropna(inplace=True)
df.drop(columns=["id"], inplace=True, errors="ignore")

In [None]:
# Frequency of each gender category.
df["gender"].value_counts()

In [None]:
# Show the rows whose gender is recorded as "Other".
df.loc[df["gender"]=="Other"]

In [None]:
# Remove the rows whose gender is "Other".
# The rows are selected by value instead of by the hard-coded index label 3116,
# so the cell keeps working if the index changes and can be re-run without
# raising KeyError (dropping an empty selection is a no-op).
df.drop(index=df.index[df["gender"] == "Other"], inplace=True)

In [None]:
# Stroke outcome by marital status, split by gender.
# Top panel: violin plot; bottom panel: point plot.
fig, axis = plt.subplots(2, figsize=(15, 13))
sns.set_palette("pastel")

plot_kwargs = dict(data=df, x="ever_married", y="stroke", hue="gender")
sns.violinplot(ax=axis[0], **plot_kwargs)
sns.pointplot(ax=axis[1], **plot_kwargs)

In [None]:
# Stroke outcome by work type, split by gender.
# Top panel: violin plot; bottom panel: point plot.
fig, axis = plt.subplots(2, figsize=(15, 13))
sns.set_palette("pastel")

plot_kwargs = dict(data=df, x="work_type", y="stroke", hue="gender")
sns.violinplot(ax=axis[0], **plot_kwargs)
sns.pointplot(ax=axis[1], **plot_kwargs)

In [None]:
# Inspect the rows whose work_type is "Never_worked".
# Author's observation: this group consists of children / very young people,
# which would explain why it contains no stroke cases (check the rows shown).
df.loc[df["work_type"]=="Never_worked"]

In [None]:
# Stroke outcome by residence type, split by gender.
# Top panel: violin plot; bottom panel: point plot.
fig, axis = plt.subplots(2, figsize=(15, 13))
sns.set_palette("pastel")

plot_kwargs = dict(data=df, x="Residence_type", y="stroke", hue="gender")
sns.violinplot(ax=axis[0], **plot_kwargs)
sns.pointplot(ax=axis[1], **plot_kwargs)

In [None]:
# Stroke outcome by smoking status, split by gender.
# Top panel: violin plot; bottom panel: point plot.
fig, axis = plt.subplots(2, figsize=(15, 13))
sns.set_palette("pastel")

plot_kwargs = dict(data=df, x="smoking_status", y="stroke", hue="gender")
sns.violinplot(ax=axis[0], **plot_kwargs)
sns.pointplot(ax=axis[1], **plot_kwargs)

In [None]:
# Categorical features to inspect; print the frequency table of each one.
df_categorical = [
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
]
for column in df_categorical:
    print(df[column].value_counts())

In [None]:
# One bar chart of category frequencies per categorical feature.
for column in df_categorical:
    plt.figure(figsize=(12, 5))
    sns.countplot(data=df, x=column)
    plt.show()

**Creating Some New Columns**

In [None]:
def Age(age):
    """Map a numeric age to a coarse age-group label.

    Groups:
        [0, 18)   -> "Child"
        [18, 44)  -> "Adult"
        [44, 60)  -> "Late Adult"
        60 and up -> "Old"

    The last group is open-ended. It used to stop at ``age < 82``, so an age of
    exactly 82 (or more) matched no branch and silently became None.

    Returns:
        The label as a string, or None when the value fits no group
        (negative numbers or NaN).
    """
    if 0 <= age < 18:
        return "Child"
    if 18 <= age < 44:
        return "Adult"
    if 44 <= age < 60:
        return "Late Adult"
    if age >= 60:
        return "Old"
    return None

# Derive the categorical age group from the numeric age, then discard the raw column.
df["age_gap"] = df["age"].apply(Age)
df.drop(columns="age", inplace=True)

In [None]:
# Number of rows in each age group.
df["age_gap"].value_counts()

In [None]:
def BMI(bmi):
    """Map a numeric body-mass index to a weight-category label.

    Categories:
        [0, 18)   -> "Underweight"
        [18, 25)  -> "Normal_weight"
        [25, 30)  -> "Overweight"
        30 and up -> "Obesity"

    The ranges are now contiguous. Previously "Overweight" ended at ``bmi < 29``
    and "Obesity" started at ``bmi > 30``, so every value from 29 to 30
    inclusive matched no branch and silently became None.

    Returns:
        The label as a string, or None when the value fits no category
        (negative numbers or NaN).
    """
    if 0 <= bmi < 18:
        return "Underweight"
    if 18 <= bmi < 25:
        return "Normal_weight"
    if 25 <= bmi < 30:
        return "Overweight"
    if bmi >= 30:
        return "Obesity"
    return None

# Derive the categorical BMI group from the numeric BMI, then discard the raw column.
df["bmi_gap"] = df["bmi"].apply(BMI)
df.drop(columns="bmi", inplace=True)

In [None]:
def Glucose(avg_glucose_level):
    """Map an average glucose level to a coarse quality label.

    [0, 145) -> "Good", [145, 200) -> "Okay", 200 and above -> "Bad".
    Values that fit no range (negative numbers or NaN) yield None.
    """
    if 0 <= avg_glucose_level < 145:
        return "Good"
    if 145 <= avg_glucose_level < 200:
        return "Okay"
    if avg_glucose_level >= 200:
        return "Bad"
    return None

# Derive the categorical glucose label, then discard the raw measurement column.
df["Glucose"] = df["avg_glucose_level"].apply(Glucose)
df.drop(columns="avg_glucose_level", inplace=True)

In [None]:
# Point plot of the stroke outcome for each glucose category.
sns.pointplot(data=df, x="Glucose", y="stroke")

In [None]:
# One-hot encode every categorical feature (one indicator column per category).
categorical_columns = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "age_gap",
    "bmi_gap",
    "Glucose",
]
one_hot_encoded_data = pd.get_dummies(df, columns=categorical_columns)
one_hot_encoded_data

In [None]:
# Append only the indicator columns that df does not already contain.
# get_dummies() passes the non-encoded columns (e.g. the "stroke" target)
# through unchanged, so concatenating the whole encoded frame would add a
# second copy of those columns under the same name. Filtering by name also
# makes the cell safe to re-run: a second run finds nothing new to append.
new_columns = [col for col in one_hot_encoded_data.columns if col not in df.columns]
df = pd.concat([df, one_hot_encoded_data[new_columns]], axis=1)
df.head()

In [None]:
# The original categorical columns are now represented by their indicator
# columns, so remove them from the frame.
encoded_source_columns = [
    "gender",
    "ever_married",
    "work_type",
    "Residence_type",
    "smoking_status",
    "age_gap",
    "bmi_gap",
    "Glucose",
]
df.drop(columns=encoded_source_columns, inplace=True)

In [None]:
# Inspect the columns and dtypes after encoding and dropping the original categoricals.
df.info()

In [None]:
# Some columns appear more than once under the same name; keep only the first
# occurrence of each column name.
# This replaces a transpose -> drop_duplicates -> transpose round-trip, which
# (a) can upcast every column to object dtype when the frame mixes dtypes, and
# (b) also removes differently named columns that merely hold identical values.
df = df.loc[:, ~df.columns.duplicated()]
df.head()

In [None]:
# Re-check the columns and dtypes after removing the duplicated columns.
df.info()

In [None]:
# stroke == 0 rows greatly outnumber stroke == 1 rows, so under-sample the
# majority class down to the size of the minority class to get a balanced set.
df_class_0 = df[df['stroke'] == 0]
df_class_1 = df[df['stroke'] == 1]

# Read the class sizes from the subsets themselves instead of unpacking
# value_counts(), whose result is ordered by frequency rather than by label.
count_class_0, count_class_1 = len(df_class_0), len(df_class_1)

# A fixed seed keeps the sampled rows, and every metric computed from them,
# reproducible across runs.
df_class_0_under = df_class_0.sample(count_class_1, random_state=42)
df_new = pd.concat([df_class_0_under, df_class_1], axis=0)

sns.countplot(data=df_new, y="stroke")

In [None]:
# Target vector and feature matrix taken from the balanced sample.
# y stays a 1-D Series: wrapping it in a single-column DataFrame makes
# scikit-learn warn about a column-vector y on every fit() call.
# The integer cast guarantees a numeric label dtype for the classifiers.
y = df_new["stroke"].astype(int)
x = df_new.drop(columns=["stroke"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in the train and test splits.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Print the shape of each split: train target, train features, test target, test features.
for split in (y_train, x_train, y_test, x_test):
    print(split.shape)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Fit a logistic regression on the scaled training data.
logr = LogisticRegression(random_state=0)
logr.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = logr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy (in percent) on both splits.
train_accuracy = logr.score(X_train, y_train) * 100
test_accuracy = logr.score(X_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
from sklearn.svm import SVC

# Fit a support-vector classifier with a polynomial kernel.
svc = SVC(kernel="poly")
svc.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy (in percent) on both splits.
train_accuracy = svc.score(X_train, y_train) * 100
test_accuracy = svc.score(X_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit an entropy-based decision tree.
# random_state is fixed so the fitted tree, and therefore the reported
# accuracies, are the same on every run.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=0)
dtc.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print("Training Accuracy :", dtc.score(X_train, y_train) * 100)
print("Testing Accuracy :", dtc.score(X_test, y_test) * 100)

**PCA**

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto 3 principal components.
# The input is recomputed from the unprojected splits (x_train / x_test) with
# the already-fitted scaler `sc`, rather than read from X_train / X_test, so
# re-running this cell does not apply PCA on top of its own earlier output.
# NOTE: X_train / X_test are still overwritten with the projected data; re-run
# the scaling cell to get the full-dimensional matrices back.
pca = PCA(n_components=3)
X_train = pca.fit_transform(sc.transform(x_train))
X_test = pca.transform(sc.transform(x_test))

In [None]:
# Logistic regression refitted on the current X_train / X_test matrices.
logr = LogisticRegression(random_state=0)
logr.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = logr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy (in percent) on both splits.
train_accuracy = logr.score(X_train, y_train) * 100
test_accuracy = logr.score(X_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
from sklearn.svm import SVC

# Polynomial-kernel SVC refitted on the current X_train / X_test matrices.
svc = SVC(kernel="poly")
svc.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy (in percent) on both splits.
train_accuracy = svc.score(X_train, y_train) * 100
test_accuracy = svc.score(X_test, y_test) * 100
print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree refitted on the current X_train / X_test matrices.
# random_state is fixed so the fitted tree, and therefore the reported
# accuracies, are the same on every run.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=0)
dtc.fit(X_train, y_train)

# Confusion matrix on the held-out split.
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

print(cm)
print("Training Accuracy :", dtc.score(X_train, y_train) * 100)
print("Testing Accuracy :", dtc.score(X_test, y_test) * 100)