In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the stroke dataset CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)

In [None]:
# Preview the first few rows of the loaded frame.
df.head()

## Exploring Data

In [None]:
# Column names, dtypes and non-null counts (shows which columns have missing values).
df.info()

In [None]:
# Dropping the id column since it is not relevant to the analysis.
# errors='ignore' keeps this cell safe to re-run: without it, a second
# execution raises KeyError because 'id' has already been removed from df.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Partition the column labels by dtype: object (string-valued) columns are
# treated as categorical, int64/float columns as numeric.
cat_cols = df.select_dtypes(include='object').columns
num_cols = df.select_dtypes(include=['int64', 'float']).columns

In [None]:
# Imputing null values: categorical columns get their most frequent level,
# numeric columns get their column mean.
# NOTE(review): num_cols holds every int64/float column, so any numeric
# target column is passed through the mean imputer too, and the imputer
# statistics are computed on the full frame before the train/test split.
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imp2 = SimpleImputer(strategy='mean', missing_values=np.nan)

categorical_part = df[cat_cols]
numeric_part = df[num_cols]
df[cat_cols] = imp.fit(categorical_part).transform(categorical_part)
df[num_cols] = imp2.fit(numeric_part).transform(numeric_part)

In [None]:
# Checking for nulls: per-column count of missing values remaining after imputation.
df.isnull().sum()

In [None]:
# Looking for duplicates: number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Univariate analysis: one density histogram with a KDE overlay per numeric column.
# sns.distplot is deprecated and scheduled for removal from seaborn;
# sns.histplot(kde=True, stat='density') is its documented replacement.
sns.set(rc={'figure.figsize': (8, 6)})  # loop-invariant, so configure once
for i in num_cols:
    sns.histplot(x=df[i], kde=True, stat='density')
    plt.title(i)
    plt.show()

In [None]:
# Bar chart of level frequencies for every categorical column.
# The Series is passed as x= explicitly: positional use is deprecated in
# seaborn 0.11 and is bound to `data` instead of `x` from 0.12 onwards.
for i in cat_cols:
    sns.countplot(x=df[i], palette='mako')
    plt.title(i)  # same titling as the numeric plots
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Print the distinct values of every object-dtype column so they can be
# mapped to integer codes.
object_columns = df.columns[df.dtypes == 'object']
for col in object_columns:
    levels = df[col].unique()
    print(col, "\t:", levels)


In [None]:
# Integer code for every categorical level, applied frame-wide via
# DataFrame.replace. Codes restart at 0 within each group of levels.
Replacem = dict([
    # sex levels
    ('Male', 0), ('Female', 1), ('Other', 2),
    # yes / no levels
    ('Yes', 0), ('No', 1),
    # employment levels
    ('Private', 0), ('Self-employed', 1), ('Govt_job', 2),
    ('children', 3), ('Never_worked', 4),
    # residence levels
    ('Urban', 0), ('Rural', 1),
    # smoking levels (code 3 is not used)
    ('formerly smoked', 0), ('never smoked', 1), ('smokes', 2), ('Unknown', 4),
])

In [None]:
# Encode the categorical levels as integers; df itself is left unchanged.
df2 = df.replace(to_replace=Replacem)

In [None]:
# Display the encoded frame to check the replacement.
df2

In [None]:
# Separate the target ('stroke') from the feature columns.
target_name = 'stroke'
y = df2[target_name]
X = df2.drop(columns=target_name)

In [None]:
# Splitting data: hold out 30% of the rows as a test set (seeded for
# reproducibility).
# NOTE(review): the split is not stratified on y — consider stratify=y if
# the target classes are imbalanced.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Scaling the data: learn mean/variance on the training features only, then
# apply the same transform to both splits so no test information leaks in.
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

# Shuffled, seeded 10-fold splitter.
k_fold = KFold(shuffle=True, n_splits=10, random_state=0)

In [None]:
#Random Forest
# Exhaustive grid search over the number of trees and the maximum depth,
# followed by prediction on the held-out test set with the refit best model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

ntree_range = list(range(1, 55))
depth_range = [4,8,12,16,25,66,78,90,76,554]
param_dist = dict(n_estimators=ntree_range, max_depth=depth_range)
c_values = [100, 10,1.0,15,45,75]  # not used by this search; kept in case a later cell reads it

# random_state makes the forests (and therefore the selected parameters and
# test predictions) reproducible across runs. n_estimators is not set here
# because the grid supplies it for every candidate and for the final refit.
clf_rf = RandomForestClassifier(random_state=0)

# Create grid search object.
# 54 tree counts x 10 depths x 20 folds = 10,800 fits, so use every available
# core (n_jobs=-1) instead of a hard-coded worker count.
# error_score=0 scores a failed fit as 0 instead of raising.
# NOTE(review): 'accuracy' can be misleading if the target classes are
# imbalanced — confirm it is the intended selection metric.
Grid = GridSearchCV(
    clf_rf,
    param_dist,
    cv=20,
    scoring='accuracy',
    verbose=True,
    n_jobs=-1,
    error_score=0,
)

# Fit on data; fit() returns the search object itself, so best_grid is Grid.
best_grid = Grid.fit(X_train, y_train)

# Predictions come from the best estimator refit on the whole training set.
rfcpred = best_grid.predict(X_test)

In [None]:
#Importing libraries
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score,accuracy_score,precision_score, recall_score,roc_curve
from sklearn.preprocessing import StandardScaler

In [None]:
# Accuracy check: fraction of test rows the tuned random forest labels correctly.
rf_test_accuracy = accuracy_score(y_true=y_test, y_pred=rfcpred)
print(rf_test_accuracy)

In [None]:
# Per-class precision, recall and F1 for the random-forest test predictions.
from sklearn.metrics import accuracy_score, classification_report

rf_report = classification_report(y_test, rfcpred)
print(rf_report)

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit k-means for k = 1..20 and record the within-cluster sum
# of squares (inertia) for each k.
# NOTE(review): train_data is df2 as-is — the features are not scaled and the
# 'stroke' target column is still included. Confirm this is intended, since
# k-means distances are dominated by the columns with the largest range.
train_data=df2
error_rate = []

for i in range(1,21):
    # Seeded so the curve is reproducible and consistent with the final
    # model, which also uses random_state=0.
    KM = KMeans(n_clusters=i, random_state=0)
    KM.fit(train_data)
    error_rate.append(KM.inertia_)

# Plotting Elbow (Error) curve
plt.figure(figsize=(8,4))
plt.plot(range(1,21),error_rate,marker='o')
plt.xticks(range(1,21))
plt.title('Elbow Curve')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WSS')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette analysis: mean silhouette score for k = 2..20 (the score is
# undefined for a single cluster, hence the range starts at 2).
sil_score = []

for i in range(2,21):
    # Seeded so the scores are reproducible and consistent with the final
    # model, which also uses random_state=0.
    KM = KMeans(n_clusters=i, random_state=0)
    labels = KM.fit_predict(train_data)
    sil_score.append(silhouette_score(train_data, labels, metric = 'euclidean'))

# Plotting Silhouette Score Vs k
plt.figure(figsize=(8,4))
plt.plot(range(2,21),sil_score,marker='o')
plt.xticks(range(2,21))
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.show()

In [None]:
# Creating the final cluster solution with the chosen number of clusters (3).
# The model is only configured here (seeded for reproducibility); it is fitted in a later cell.

km = KMeans(n_clusters = 3, random_state = 0)

In [None]:
# Fit the final k-means model and attach each row's cluster label to the
# original (un-encoded) frame.
y_pp = km.fit(train_data).labels_
df['cluster'] = y_pp

In [None]:
# Number of rows assigned to each cluster.
res = pd.DataFrame({'cluster': km.labels_})
res.groupby('cluster').size()

In [None]:
# Split the labelled frame into one sub-frame per cluster label (0, 1, 2).
cluster1, cluster2, cluster3 = (
    df[df['cluster'] == label] for label in (0, 1, 2)
)

In [None]:
#Cluster1
# Value-count bar chart of every column for the rows in the first cluster.
# The Series is passed as x= explicitly: positional use is deprecated in
# seaborn 0.11 and is bound to `data` instead of `x` from 0.12 onwards.
sns.set(rc={'figure.figsize':(10,5)})  # loop-invariant, so configure once
for i in cluster1.columns:
    ax = sns.countplot(x=cluster1[i])
    plt.xticks(rotation=90)
    plt.title(i)
    plt.show()