 # Dataset Information.
##### 1) id: unique identifier
##### 2) gender: "Male", "Female" or "Other"
##### 3) age: age of the patient
##### 4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
##### 5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
##### 6) ever_married: "No" or "Yes"
##### 7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
##### 8) Residence_type: "Rural" or "Urban"
##### 9) avg_glucose_level: average glucose level in blood
##### 10) bmi: body mass index
##### 11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown" ("Unknown" means that the information is unavailable for this patient)
##### 12) stroke: 1 if the patient had a stroke or 0 if not

In [None]:
import warnings 
# Silence every warning category for the rest of the session so library
# notices do not clutter cell output. This also hides deprecation warnings.
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke-prediction CSV into a DataFrame.
import pandas as pd

DATA_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# (number of records, number of features)
df.shape

In [None]:
# Class distribution of the target: 1 if the patient had a stroke, 0 if not.
df['stroke'].value_counts()

In [None]:
# Drop the identifier column; it carries no predictive information.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once 'id' is gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Column names after removing 'id'.
df.columns


In [None]:
# Summary statistics for all numerical features.
df.describe()

In [None]:
# Round ages to whole years and store them as integers.
df['age'] = df['age'].round().astype('int64')

In [None]:
# First five rounded ages.
df['age'].head(5)

In [None]:
# How many BMI values are missing before imputation.
df['bmi'].isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [None]:
# A boxplot is a standardized way of displaying the distribution of data;
# it shows where the outliers lie and what their values are.
# The Series is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data`, not as the x variable.
sns.boxplot(x=df['bmi'])
plt.show()

In [None]:
# Impute missing BMI values with the column mean.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(bmi_mean)

In [None]:
# Re-check the number of missing BMI values after imputation.
df['bmi'].isnull().sum()

In [None]:
# Distribution and outliers of the average glucose level.
# Passed as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.boxplot(x=df['avg_glucose_level'])
plt.show()


In [None]:
# Concise summary of the DataFrame: column dtypes and non-null counts.
df.info()

In [None]:
# Show the distinct values of every categorical feature.
categorical_columns = ['gender','ever_married','work_type','Residence_type','smoking_status']
col = df[categorical_columns]
for i in categorical_columns:
    distinct_values = col[i].unique()
    print(f'{i}: unique elements | {distinct_values}')

In [None]:
# Age against BMI, split by stroke outcome.
plt.figure(figsize=(10,5))
# Explicit labels are required: with `data=` and no label, matplotlib names
# both series after the y column, so the legend cannot tell the classes apart.
plt.scatter(x='bmi', y='age', data=df[df['stroke']==0], label='No stroke')
plt.scatter(x='bmi', y='age', data=df[df['stroke']==1], marker='*', label='Stroke')

plt.xlabel('BMI')
plt.ylabel('Age')
plt.title('Age Vs BMI')
plt.legend()
plt.show()

# Observation
From the above plot, I can conclude that patients who are older than 40 and have a BMI between 20 and 50 are the most likely to have a stroke.

In [None]:
# Average glucose level against BMI, split by stroke outcome.
plt.figure(figsize=(10,5))
# Explicit labels are required: with `data=` and no label, matplotlib names
# both series after the y column, so the legend cannot tell the classes apart.
plt.scatter(x='bmi', y='avg_glucose_level', data=df[df['stroke']==0], label='No stroke')
plt.scatter(x='bmi', y='avg_glucose_level', data=df[df['stroke']==1], marker='+', label='Stroke')

plt.xlabel('BMI')
plt.ylabel('Average glucose level')
plt.title('Average glucose level Vs BMI')
plt.legend()
plt.show()

# Observation
From the above plot, I can conclude that patients whose BMI is between 20 and 50 are the most likely to have a stroke.

In [None]:
# Average glucose level against age, split by stroke outcome.
plt.figure(figsize=(10,5))
# Explicit labels are required: with `data=` and no label, matplotlib names
# both series after the y column, so the legend cannot tell the classes apart.
plt.scatter(x='age', y='avg_glucose_level', data=df[df['stroke']==0], label='No stroke')
plt.scatter(x='age', y='avg_glucose_level', data=df[df['stroke']==1], marker='d', label='Stroke')

plt.xlabel('Age')
plt.ylabel('Average glucose level')
plt.title('Average glucose level Vs Age')
plt.legend()
plt.show()

# Observation
From the above plot, I can conclude that patients aged between 40 and 80 are the most likely to have a stroke.

In [None]:
# Stroke counts per gender.
plt.figure(figsize=(10,5))
sns.set_style(style='darkgrid')
# Column names plus `data=` instead of a positional Series: recent seaborn
# releases accept only `data` positionally.
sns.countplot(x='gender', hue='stroke', data=df, palette="Accent")
plt.show()

In [None]:
# Stroke counts for each remaining categorical feature, one figure per feature.
# The four plots differ only in column and palette, so they are driven from a
# single list instead of four copy-pasted stanzas.
sns.set_style(style='darkgrid')
countplot_specs = [
    ('ever_married', "magma"),
    ('work_type', "viridis"),
    ('Residence_type', "rocket_r"),
    ('smoking_status', "cubehelix"),
]
for column_name, palette_name in countplot_specs:
    plt.figure(figsize=(10,5))
    # Keyword arguments: recent seaborn releases accept only `data` positionally.
    sns.countplot(x=column_name, hue='stroke', data=df, palette=palette_name)
    plt.show()

In [None]:
# Quartiles and inter-quartile range of BMI, used for the outlier fences.
Q3 = df['bmi'].quantile(0.75)
Q1 = df['bmi'].quantile(0.25)
IQR = Q3 - Q1
print(IQR, Q3, Q1, sep='\n')

In [None]:
# Keep only the BMI values that lie inside the 1.5 * IQR fences.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_bmi_outlier = (df['bmi'] < lower_fence) | (df['bmi'] > upper_fence)
df_numeric_remove_outlier = df.loc[~is_bmi_outlier, 'bmi']
df_numeric_remove_outlier.shape

In [None]:
# First five BMI values that survived the outlier filter.
df_numeric_remove_outlier.head(5)

In [None]:
# Wrap the filtered BMI Series in a single-column DataFrame.
df_new = df_numeric_remove_outlier.to_frame()

In [None]:
# Preview the outlier-filtered BMI frame.
df_new.head()

In [None]:
# BMI distribution after outlier removal.
# Passed as `x=` explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.boxplot(x=df_new['bmi'])
plt.show()

In [None]:
# Remove the 'Other' gender category.
# .copy() makes the filtered frame independent of the original, so the column
# assignments performed on it later do not act on a slice.
df = df[~df['gender'].isin(['Other'])].copy()

In [None]:
# Remaining gender categories after dropping 'Other'.
df['gender'].unique()

In [None]:
# Replace each categorical feature with integer codes.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# The single encoder is refit for every column, in this order.
for categorical_name in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    df[categorical_name] = encoder.fit_transform(df[categorical_name])

In [None]:
# Show the integer codes now present in each encoded categorical feature.
encoded_columns = ['gender','ever_married','work_type','Residence_type','smoking_status']
col = df[encoded_columns]
for i in encoded_columns:
    codes = col[i].unique()
    print(f'{i}: unique elements | {codes}')

In [None]:
# Column names held in the categorical sub-frame.
col.columns

In [None]:
# All column names currently in df.
df.columns

In [None]:
# Assemble the modelling frame. The inner join on the index keeps only rows
# present in every piece, i.e. rows whose BMI is in df_new.
remaining_columns = df[['stroke', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level']]
df1 = pd.concat([col, df_new, remaining_columns], axis=1, join='inner')

In [None]:
# Preview the assembled modelling frame.
df1.head()

In [None]:
# Missing values per column in the assembled frame.
df1.isnull().sum()

In [None]:
# Shape of df, which still contains the BMI outlier rows.
df.shape

In [None]:
# Shape of df1, which keeps only rows that survived the inner join.
df1.shape

In [None]:
# Column dtypes and non-null counts of df after label encoding.
df.info()

In [None]:
# Separate the target column from the feature matrix.
y = df1['stroke']
x = df1.drop(columns=['stroke'])

In [None]:
# Preview the feature matrix.
x.head()

In [None]:
# First five target values.
y.head(5)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit variance.
# Normalizer was used here before, but it rescales each *row* to unit norm, so
# the large-valued columns dominate every sample and the features are not put
# on a comparable scale.
# NOTE(review): the scaler is fit on the full feature matrix; fitting it on the
# training split only would avoid using test-set statistics.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing.
# random_state makes the split reproducible across kernel restarts;
# stratify=y keeps the stroke / no-stroke ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
from collections import Counter
from imblearn.over_sampling import SMOTE

# Report per-class counts rather than the total row count, so the effect of
# oversampling on the class balance is visible.
counter = Counter(y_train)
print('Before Over Sampling', counter)
# Keyword `x=`: recent seaborn releases accept only `data` positionally.
sns.countplot(x=y_train)
plt.title('Plotting of number count before sampling')
plt.show()

# Oversample the training split only, using SMOTE; the test split is untouched.
# A fixed random_state makes the synthetic samples reproducible.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
counter = Counter(y_train_sm)
print('After Over Sampling', counter)
sns.countplot(x=y_train_sm)
plt.title('Plotting of number count after sampling')
plt.show()

# Building the model using RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# plot_confusion_matrix is no longer imported: it was never used in this
# notebook and was removed from scikit-learn in 1.2, where importing it raises
# ImportError.
from sklearn.metrics import accuracy_score, classification_report

# Train on the SMOTE-balanced training data. A fixed random_state makes the
# forest, and every metric computed from it, reproducible.
model_RF = RandomForestClassifier(n_estimators=40, random_state=42)
model_RF.fit(X_train_sm, y_train_sm)

In [None]:
# Test-set score of the random forest, expressed as a percentage.
model_RF.score(X_test,y_test)*100

In [None]:
# Predict on the held-out test split and show the first five predictions.
pred_RF=model_RF.predict(X_test)
pred_RF[:5]

In [None]:
# Accuracy of the test-set predictions, rounded to two decimals.
print(f"Accuracy Score : {round(accuracy_score(y_test, pred_RF) * 100, 2)}%")

In [None]:
# Per-class precision, recall and F1 for the test-set predictions.
print(classification_report(y_test, pred_RF))

In [None]:
# Confusion matrix of true labels (y_test) against predictions (pred_RF).
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_RF)
cm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import  DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
# Candidate models and the hyper-parameter grids to search for each.
# Each entry maps a display name to an estimator ('model') and its grid
# ('params'). The stochastic estimators are seeded so that repeated runs of the
# search give the same scores.
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20,25,30,40],
            'kernel': ['rbf','linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [1,5,10,15,20,25,30]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1,5,10,15,20,25]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params':{
            'criterion':['gini','entropy']
        }
    }
}

In [None]:
# Grid-search every candidate with 5-fold cross-validation on the original
# (not oversampled) training split and collect the best result per model.
# NOTE(review): no `scoring` is passed, so each estimator's default score is
# used; confirm that metric is meaningful for this target's class balance.
scores = []
for candidate_name, candidate in model_params.items():
    search = GridSearchCV(
        candidate['model'],
        candidate['params'],
        cv=5,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    scores.append(
        dict(
            model=candidate_name,
            best_score=search.best_score_,
            best_params=search.best_params_,
        )
    )

df_score = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_score

**Hence, we can conclude that Logistic Regression and SVM perform best on this dataset, with accuracy scores of 0.95 and 0.95 respectively.**