In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings 
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by the plotting and modelling libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the raw CSV into a DataFrame and preview the first rows.
DATA_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Frequency of each value in the target column.
stroke_counts = df['stroke'].value_counts()
stroke_counts

In [None]:
# Bar chart of how many rows fall into each work type.
work_type_counts = df['work_type'].value_counts()
work_type_counts.plot.bar(color='cyan')

In [None]:
# Pie chart of the share of each work type.
# The labels are read from the value_counts() index so that every wedge is
# paired with its own category; a hand-written list is matched by position only
# and silently mislabels wedges whenever the frequency order is different.
work_type_counts = df['work_type'].value_counts()
labels = work_type_counts.index.tolist()

fig1, ax1 = plt.subplots(figsize=(10, 5))
colors = ['green', 'brown', 'yellow', 'maroon', 'blue']
ax1.pie(work_type_counts, labels=labels, autopct='%1.1f%%', shadow=True, startangle=30, colors=colors)
# Equal aspect ratio keeps the pie circular.
ax1.axis('equal')
plt.show()

In [None]:
# Select the grid style *before* drawing so it applies to this figure as well.
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in Matplotlib 3.6
# and the old name was later removed, so use whichever name this install offers.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Bar chart of the number of rows per residence type.
df['Residence_type'].value_counts().plot(kind='bar', color=['grey', 'orange'])
plt.xlabel('Residence')
plt.ylabel('Count');


In [None]:
# Pie chart of the share of each work type (drawn again after the style change).
# The labels are read from the value_counts() index so that every wedge is
# paired with its own category; a hand-written list is matched by position only
# and silently mislabels wedges whenever the frequency order is different.
work_type_counts = df['work_type'].value_counts()
labels = work_type_counts.index.tolist()

fig1, ax1 = plt.subplots(figsize=(10, 5))
colors = ['green', 'brown', 'yellow', 'maroon', 'blue']
ax1.pie(work_type_counts, labels=labels, autopct='%1.1f%%', shadow=True, startangle=30, colors=colors)
# Equal aspect ratio keeps the pie circular.
ax1.axis('equal')
plt.show()

In [None]:
# Number of rows recorded for each age value.
# value_counts() orders by frequency; sort_index() re-orders the bars by age so
# the x-axis labelled "Age" actually runs from the lowest to the highest age.
plt.figure(figsize=(30, 10))
df['age'].value_counts().sort_index().plot(kind='bar', color='chocolate')
plt.xlabel('Age', fontsize=40)
plt.ylabel('Counts', fontsize=50)

In [None]:
# Frequency of each smoking-status category.
smoking_counts = df['smoking_status'].value_counts()
smoking_counts


In [None]:
# Pie chart of the share of each smoking status.
# The labels are read from the value_counts() index so that every wedge is
# paired with its own category instead of relying on a hand-written order.
smoking_counts = df['smoking_status'].value_counts()
labels = smoking_counts.index.tolist()

fig1, ax1 = plt.subplots(figsize=(10, 10))
colors = ['lightblue', 'salmon', 'gold', 'lightgreen']
ax1.pie(smoking_counts, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=30, colors=colors)
# Equal aspect ratio keeps the pie circular.
ax1.axis('equal')
plt.show()

In [None]:
# Display the DataFrame's column labels.
df.columns

In [None]:
## Age vs. average glucose level, built with Matplotlib's object-oriented API
fig, ax = plt.subplots(figsize=(10, 6))

# One point per row, coloured by the value of the stroke column
scatter = ax.scatter(x=df["age"],
                     y=df["avg_glucose_level"],
                     c=df["stroke"], cmap='YlGn')

# Axis labels
ax.set(xlabel="Age", ylabel="Glucose level")

# Legend with one entry per distinct colour value in the scatter
ax.legend(*scatter.legend_elements(), title="Stroke")

# Dashed horizontal reference line at the mean glucose level
ax.axhline(df["avg_glucose_level"].mean(), linestyle='--');


In [None]:
# Bar plot relating the age and gender columns, drawn on an explicit Axes.
gender_fig, gender_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='age', y='gender', data=df, ax=gender_ax)


In [None]:
# Bar plot of the stroke column against each age value, drawn on an explicit Axes.
stroke_fig, stroke_ax = plt.subplots(figsize=(30, 20))
sns.barplot(x='age', y='stroke', data=df, ax=stroke_ax)
# Rotate the age tick labels so they do not overlap
plt.xticks(rotation=90)
stroke_ax.set_xlabel('Age', fontsize=20)
stroke_ax.set_ylabel('Stroke', fontsize=20)

In [None]:
# Box plot of the hypertension column for each age value, drawn on an explicit Axes.
hyper_fig, hyper_ax = plt.subplots(figsize=(30, 10))
sns.boxplot(x='age', y='hypertension', data=df, ax=hyper_ax)
# Rotate the age tick labels so they do not overlap
plt.xticks(rotation=90)
hyper_ax.set_xlabel('Age', fontsize=20)
hyper_ax.set_ylabel('Hypertension', fontsize=20)

In [None]:
# Display the DataFrame's column labels.
df.columns

In [None]:
# Violin plot of the heart_disease column for each age value.
plt.figure(figsize=(20,10))
sns.violinplot(data=df, x='age', y='heart_disease')
# Rotate the age tick labels so they do not overlap
plt.xticks(rotation=90)
plt.xlabel('Age', fontsize=20)
plt.ylabel('Heart Disease', fontsize=20)

## Data cleaning

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Display the dtype of every column.
df.dtypes

In [None]:
def preprocess_image(df):
    """Return a fully numeric copy of ``df``; the input frame is left untouched.

    Despite its name, this function works on a tabular DataFrame.

    * Numeric columns that contain missing values have them replaced by the
      column median; numeric columns without missing values are copied as-is.
    * Every non-numeric column is converted to integer category codes shifted
      by one, so categories map to 1, 2, ... and missing values map to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    processed = df.copy()
    for label, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            if content.isna().any():
                processed[label] = content.fillna(content.median())
        else:
            # Categorical codes use -1 for missing values; +1 moves them to 0.
            processed[label] = pd.Categorical(content).codes + 1
    return processed

In [None]:
# Encode the frame with preprocess_image and preview the first five rows.
df = preprocess_image(df)
df.head(5)

In [None]:
# Re-count the missing values per column after preprocessing.
df.isnull().sum()

In [None]:
# Display the dtype of every column.
df.dtypes

# Split the dataset


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

# Features and target.
# An 'id' column, if present, is dropped together with the target: presumably it
# is just a record identifier (verify against the dataset) and would otherwise be
# fed to the models as if it were a real feature. errors='ignore' keeps the cell
# working when the frame has no such column.
X = df.drop(columns=['stroke', 'id'], errors='ignore')
y = df['stroke']

# 80/20 split; stratify=y keeps the class proportions of the target identical
# in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Modelling

### Import required libraries

In [None]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score, precision_score, recall_score, f1_score

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# An explicit random_state makes the model reproducible on its own, without
# depending on the state of NumPy's global random generator.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_preds = clf.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf.score(X_test, y_test)
clf_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

np.random.seed(42)

# An explicit random_state makes the model reproducible on its own, without
# depending on the state of NumPy's global random generator.
clf2 = GradientBoostingClassifier(random_state=42)
clf2.fit(X_train, y_train)
y_preds = clf2.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf2.score(X_test, y_test)
clf2_score = test_accuracy*100

print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf2, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

np.random.seed(42)
# An explicit random_state makes the model reproducible on its own, without
# depending on the state of NumPy's global random generator.
clf3 = AdaBoostClassifier(random_state=42)
clf3.fit(X_train, y_train)
y_preds = clf3.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf3.score(X_test, y_test)
clf3_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf3, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
np.random.seed(42)
# Logistic regression is sensitive to feature scale and its iterative solver can
# stop before converging on unscaled inputs (a warning this notebook suppresses).
# Standardising inside a pipeline also ensures that cross_val_score fits the
# scaler on the training folds only.
clf4 = make_pipeline(StandardScaler(), LogisticRegression(random_state=42, max_iter=1000))
clf4.fit(X_train, y_train)
y_preds = clf4.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf4.score(X_test, y_test)
clf4_score = test_accuracy*100

print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf4, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

np.random.seed(42)
# An explicit random_state makes the model reproducible on its own, without
# depending on the state of NumPy's global random generator.
clf5 = DecisionTreeClassifier(random_state=42)
clf5.fit(X_train, y_train)

y_preds = clf5.predict(X_test)
# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf5.score(X_test, y_test)
clf5_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf5, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### LinearSVC

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
np.random.seed(42)
# A linear SVM is sensitive to feature scale and its solver can stop before
# converging on unscaled inputs (a warning this notebook suppresses).
# Standardising inside a pipeline also ensures that cross_val_score fits the
# scaler on the training folds only.
clf6 = make_pipeline(StandardScaler(), LinearSVC(random_state=42))
clf6.fit(X_train, y_train)
y_preds = clf6.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf6.score(X_test, y_test)
clf6_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf6, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
np.random.seed(42)
# k-nearest-neighbours is distance based, so columns with large numeric ranges
# would dominate the neighbour search on unscaled data. Standardising inside a
# pipeline also ensures that cross_val_score fits the scaler on the training
# folds only.
clf7 = make_pipeline(StandardScaler(), KNeighborsClassifier())
clf7.fit(X_train, y_train)
y_preds = clf7.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf7.score(X_test, y_test)
clf7_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf7, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### XGBoost

In [None]:
from xgboost import XGBClassifier
np.random.seed(42)
# The seed is passed to the estimator explicitly rather than relying on
# NumPy's global random generator.
clf8 = XGBClassifier(random_state=42)
clf8.fit(X_train, y_train)

y_preds = clf8.predict(X_test)
# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf8.score(X_test, y_test)
clf8_score = test_accuracy*100

print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf8, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### Catboost

In [None]:
from catboost import CatBoostClassifier
np.random.seed(42)
# The seed is passed to the estimator explicitly rather than relying on
# NumPy's global random generator; verbose=0 silences the training log.
clf9 = CatBoostClassifier(verbose=0, random_seed=42)
clf9.fit(X_train, y_train)
y_preds = clf9.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf9.score(X_test, y_test)
clf9_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf9, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

### LGBMClassifier

In [None]:
from lightgbm import LGBMClassifier
np.random.seed(42)
# The seed is passed to the estimator explicitly rather than relying on
# NumPy's global random generator.
clf10 = LGBMClassifier(random_state=42)
clf10.fit(X_train, y_train)
y_preds = clf10.predict(X_test)

# Hold-out accuracy is computed once and reused (fraction and percentage).
test_accuracy = clf10.score(X_test, y_test)
clf10_score = test_accuracy*100
print(f"Score : {test_accuracy}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf10, X, y, cv=5))}")
print("*"* 50)
print(f"Accuracy_score : {accuracy_score(y_test, y_preds)*100:.2f}%")
# zero_division=0: report 0 explicitly when the model predicts no positive samples.
print(f"Precision_score : {precision_score(y_test, y_preds, zero_division=0)}")
print(f"Recall_score : {recall_score(y_test, y_preds, zero_division=0)}")
print(f"F1_score : {f1_score(y_test, y_preds, zero_division=0)}")
print("*"*50)
# fmt='d' prints the cell counts as plain integers instead of e.g. 9.6e+02.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='d')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
print("Classification Report")
print(classification_report(y_test, y_preds, zero_division=0))

In [None]:
# Summary table of the mean 5-fold cross-validation accuracy of every model.
# The scores are computed from the estimators defined in the cells above rather
# than pasted in by hand, so the table always reflects the current run.
fitted_models = {
    'KNN': clf7, 'LGBM': clf10, 'CatBoost': clf9, 'XGBoost': clf8, 'RandomForest': clf,
    'Decision Tree': clf5, 'LogisticRegression': clf4, 'LinearSVC': clf6,
    'GradientBoost': clf2, 'AdaBoost': clf3,
}

scores = pd.DataFrame({
    "Model": list(fitted_models),
    # Accuracy in percent, rounded to two decimals
    'Score': [round(np.mean(cross_val_score(model, X, y, cv=5)) * 100, 2)
              for model in fitted_models.values()],
})

print('-----Cross-validation Accuracy Scores-----')
scores.sort_values(by='Score', ascending=False)