# **Importing all the required libraries.**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE


1. **Reading the csv file with pandas**

In [None]:
# Load the stroke-prediction dataset into the notebook-wide DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv",
)

# Preview the first rows to check that the file parsed as expected.
df.head()

# Data Visualisation

1. **Plot -> Hypertension**

In [None]:
# Bar chart of the hypertension flag, followed by the exact class counts.
sns.countplot(data=df, x='hypertension')
print(df['hypertension'].value_counts())

**Plot -> Heart_disease**

In [None]:
# Bar chart of the heart_disease flag, followed by the exact class counts.
sns.countplot(data=df, x='heart_disease')
print(df['heart_disease'].value_counts())

**Plot -> gender**

In [None]:
# Bar chart of gender, followed by the exact per-category counts.
sns.countplot(data=df, x='gender')
print(df['gender'].value_counts())

**plot -> Married**

In [None]:
# Bar chart of marital status, followed by the exact per-category counts.
sns.countplot(data=df, x='ever_married')
print(df['ever_married'].value_counts())

**Plot -> WorkType**

In [None]:
# Bar chart of work type, followed by the exact per-category counts.
sns.countplot(data=df, x='work_type')
print(df['work_type'].value_counts())

***Plot -> Residence Type***

In [None]:
# Bar chart of residence type, followed by the exact per-category counts.
sns.countplot(data=df, x='Residence_type')
print(df['Residence_type'].value_counts())

**Plot -> Smoking Status**

In [None]:
# Bar chart of smoking status, followed by the exact per-category counts.
sns.countplot(data=df, x='smoking_status')
print(df['smoking_status'].value_counts())

**Plot -> Stroke**

In [None]:
# Bar chart of the target column, followed by the exact class counts.
sns.countplot(data=df, x='stroke')
print(df['stroke'].value_counts())

# **Correlation Plots**

**Plot b/w gender and stroke**

In [None]:
# Gender counts, with one bar per stroke outcome.
sns.countplot(data=df, x='gender', hue='stroke')

***Plot b/w hypertension and stroke***

In [None]:
# Hypertension counts, with one bar per stroke outcome.
sns.countplot(data=df, x='hypertension', hue='stroke')

***Plot b/w heart_disease and stroke***

In [None]:
# Heart-disease counts, with one bar per stroke outcome.
sns.countplot(data=df, x='heart_disease', hue='stroke')

***Plot b/w married and stroke***

In [None]:
# Marital-status counts, with one bar per stroke outcome.
sns.countplot(data=df, x='ever_married', hue='stroke')

***Plot b/w worktype and stroke***

In [None]:
# Work-type counts, with one bar per stroke outcome.
sns.countplot(data=df, x='work_type', hue='stroke')

***Plot b/w Residence type and stroke***

In [None]:
# Residence-type counts, with one bar per stroke outcome.
sns.countplot(data=df, x='Residence_type', hue='stroke')

***Plot b/w smoking status and stroke***

In [None]:
# Smoking-status counts, with one bar per stroke outcome.
sns.countplot(data=df, x='smoking_status', hue='stroke')

In [None]:
# Pairwise correlation of the numeric columns only.
# At this point df still holds string columns (gender, ever_married,
# work_type, Residence_type, smoking_status); without numeric_only=True,
# pandas >= 2.0 raises a ValueError when it tries to convert them to float.
df.corr(numeric_only=True)

# ***Data Preprocessing***

# 1. **Handling Missing Values**

***Checking for null values***

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

In [None]:
# Heatmap of the missing-value mask; cells that are missing stand out.
sns.heatmap(df.isna())

**We see that the bmi column has null values.**
**Let's check how many null values there are in the bmi column.**

In [None]:
# Number of missing bmi entries before imputation.
df['bmi'].isna().sum()

**There are 201 missing values in bmi that we have to fill. Here we fill them with the column mean.**

In [None]:
# Impute missing bmi values with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

**Now let's check whether there are still missing values**

In [None]:
# Number of missing bmi entries remaining after imputation.
df['bmi'].isna().sum()

In [None]:
# Redraw the missing-value heatmap to confirm the imputation.
sns.heatmap(df.isna())

 # 2. ***Outlier Removal***

***Outlier Visualization***

In [None]:
# Box plot of bmi; points beyond the whiskers are the outlier candidates.
sns.boxplot(data=df, x='bmi')

In [None]:
# Histogram of bmi with a kernel-density overlay.
sns.histplot(data=df, x='bmi', kde=True)

In [None]:
# Histogram of average glucose level with a kernel-density overlay.
sns.histplot(data=df, x='avg_glucose_level', kde=True)

* **outlier removal by iqr method**

In [None]:
# First and third quartiles of bmi, used for the IQR outlier rule.
q1, q3 = np.quantile(df['bmi'], [0.25, 0.75])

In [None]:
# Display the first quartile (25th percentile) of bmi.
q1

In [None]:
# Display the third quartile (75th percentile) of bmi.
q3

In [None]:
# Interquartile range: the spread of the middle 50% of bmi values.
iqr=q3-q1
iqr

In [None]:
# Outlier fences: 1.5 * IQR below the first quartile and above the third.
lower_limit, upper_limit = q1 - 1.5 * iqr, q3 + 1.5 * iqr

In [None]:
# Display the lower fence; bmi values below it are treated as outliers.
lower_limit

In [None]:
# Display the upper fence; bmi values above it are treated as outliers.
upper_limit

In [None]:
# Remove, in place, every row whose bmi lies outside [lower_limit, upper_limit].
df.drop(
    df.index[(df['bmi'] > upper_limit) | (df['bmi'] < lower_limit)],
    inplace=True,
)

**Outliers are removed; let's plot the graphs again**

In [None]:
# Histogram of bmi after outlier removal, with a kernel-density overlay.
sns.histplot(data=df, x='bmi', kde=True)

In [None]:
# Box plot of bmi after outlier removal.
sns.boxplot(data=df, x='bmi')

***gender has a single row with the value 'Other', so we can remove it***

In [None]:
# Show the gender counts, drop the rows labelled 'Other' in place,
# then show the counts again to confirm the removal.
print(df['gender'].value_counts())
df.drop(df.index[df['gender'] == 'Other'], inplace=True)
print(df['gender'].value_counts())

1. ***Categorical values - Handling***

One hot encoding 

In [None]:
# One-hot encode the three two-valued categorical columns. drop_first=True
# keeps a single indicator column per feature.
sex = pd.get_dummies(df['gender'], drop_first=True)
married = pd.get_dummies(df['ever_married'], drop_first=True)
residence = pd.get_dummies(df['Residence_type'], drop_first=True)

# Append the indicator columns to the right of the existing frame.
df = pd.concat([df, sex, married, residence], axis=1)

In [None]:
# Show one row to check the newly appended indicator columns.
df.head(1)

In [None]:
# List the distinct work_type categories and how often each occurs.
print(df['work_type'].value_counts())

In [None]:
# List the distinct smoking_status categories and how often each occurs.
print(df['smoking_status'].value_counts())

***Ordinal encoding (manual mapping of categories to integers)***


In [None]:
# Map each work_type category to an integer rank chosen by hand.
# Rationale for the order: children and people who never worked are assumed
# to be least affected by strokes, while private-sector workers are assumed
# to face the most pressure and so get the highest rank.
df["Work_Type"] = df["work_type"].map(
    {
        'children': 0,
        'Never_worked': 1,
        'Govt_job': 2,
        'Self-employed': 3,
        'Private': 4,
    }
)

# Map each smoking_status category to an integer in the same way.
# NOTE(review): 'Unknown' is ranked above 'smokes'; confirm that is intended.
df["Smoking_Status"] = df["smoking_status"].map(
    {
        'never smoked': 0,
        'formerly smoked': 1,
        'smokes': 2,
        'Unknown': 3,
    }
)

***Now drop our columns which are not encoded***

In [None]:
# Drop the row identifier and the original string columns in place;
# their encoded replacements were added earlier.
df.drop(
    columns=[
        'id',
        'gender',
        'ever_married',
        'work_type',
        'Residence_type',
        'smoking_status',
    ],
    inplace=True,
)

***Renaming to original column names***

In [None]:
# Rename the indicator columns (named after their category value) back to
# the names of the features they encode.
df.rename(
    columns={
        'Male': 'gender',
        'Yes': 'ever_married',
        'Urban': 'Residence_type',
    },
    inplace=True,
)

In [None]:
# Preview the frame after encoding, dropping and renaming columns.
df.head()

# ***Balancing the target field***

***We observed that the target field (stroke) is imbalanced:
stroke = 0 has around 4700 rows and stroke = 1 has around 270.
If we train a model on this data, the imbalance will bias it towards the majority class,
so it will almost always predict 0.
To overcome this we have to do undersampling or oversampling.***

***I'm using oversampling (the SMOTE method) to balance the data.
This brings the minority class up to the size of the majority class.***

In [None]:
# Separate the features from the target, then oversample the minority class
# with SMOTE so both classes end up with the same number of rows.
X = df.drop(['stroke'], axis=1)
y = df['stroke']

# Seed SMOTE so the synthetic samples (and every downstream score) are
# reproducible; 42 matches the seed used for the train/test split.
# NOTE(review): the resampling runs on the full dataset before the later
# train/test split, so synthetic points derived from test rows can end up in
# the training set and the reported accuracies are optimistic. Ideally split
# first and resample only the training portion.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [None]:
# Class counts of the target after resampling.
y.value_counts()

In [None]:
# Wrap the resampled target in a one-column frame and plot its class counts.
y = pd.DataFrame({'stroke': y})
sns.countplot(data=y, x='stroke')

***Now we see that our target classes are balanced.
The number of data points in X and y has roughly doubled, which should help us get a more accurate model.
Now let's combine them back into a DataFrame.***

In [None]:
# Peek at the first two rows of the resampled target frame.
y.head(2)

In [None]:
# Peek at the first two rows of the resampled feature frame.
X.head(2)

In [None]:
# Join the resampled features and target side by side into a single frame.
df = pd.concat([X, y], axis='columns')
df.head()

***Our Data set is clean now we can split dataset into x and y***

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=['stroke'])
# Target vector: the stroke label.
y = df['stroke']

***Using train_test_split, we will split the data into a training set and a testing set***

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

***Lets create models***

**> Logistic Regression Model**

In [None]:
# Logistic-regression baseline.
# The features are fed in unscaled, so the default max_iter=100 is easily
# exhausted before the solver converges; allow more iterations.
Logistic_model = LogisticRegression(max_iter=1000)
Logistic_model.fit(X_train, y_train)

***Lets Predict***

In [None]:
# Predict the stroke label for the held-out test rows.
Logistic_pred = Logistic_model.predict(X_test)

***Accuracy of the model***

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print("accuracy of Logistic model is : ",accuracy_score(y_test,Logistic_pred))

***Decision tree classification model***

In [None]:
# Decision-tree classifier.
# Seeded so that the fitted tree, and therefore the reported accuracy, is
# the same on every run; 42 matches the seed used for the train/test split.
Decision_model = DecisionTreeClassifier(random_state=42)
Decision_model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test rows and report the share of correct labels.
Decision_pred = Decision_model.predict(X_test)
print("accuracy of Decision_model is : ",accuracy_score(y_test,Decision_pred))

***Random Forest Classification Model***

In [None]:
# Random-forest classifier; this is the model that gets persisted to disk.
# Seeded so that retraining reproduces the same forest and the same
# accuracy; 42 matches the seed used for the train/test split.
Random_model = RandomForestClassifier(random_state=42)
Random_model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test rows and report the share of correct labels.
Random_pred=Random_model.predict(X_test)
print("accuracy of Random model is : ",accuracy_score(y_test,Random_pred))

**SVC MODEL**

In [None]:
# Support-vector classifier.
# SVC's default RBF kernel is distance-based, so features with large ranges
# would dominate the binary indicator columns. Standardise the features
# inside a pipeline so the scaler is fitted on the training data only and is
# reapplied automatically at predict time.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Svc_model = make_pipeline(StandardScaler(), SVC())
Svc_model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test rows and report the share of correct labels.
Svc_predict = Svc_model.predict(X_test)
print("accuracy of Svc model is : ",accuracy_score(y_test,Svc_predict))

In [None]:
# lets save the model 
import joblib
joblib_file = "BrainStroke_prediction_mlops_rf.h5"
joblib.dump(Random_model, joblib_file)