In [None]:
#pandas
import pandas as pd 

#numpy
import numpy as np 

#matplotlib
import matplotlib.pyplot as plt 

#seaborn
import seaborn as sns
sns.set_theme(style="darkgrid")

#sklearn
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


## Read Data From CSV.

In [None]:
# Location of the stroke-prediction CSV, relative to the notebook.
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke_df = pd.read_csv(csv_path)
# Preview the first rows to sanity-check the load.
stroke_df.head()

## **Identify Categorical and Numerical Features**

In [None]:
# Partition the column names by storage dtype: object-typed columns are
# treated as categorical, everything else as numerical.
categorical_vars = [col for col in stroke_df.columns if stroke_df[col].dtype == 'object']
numerical_vars = [col for col in stroke_df.columns if stroke_df[col].dtype != 'object']


### **Categorical Variables**

In [None]:
# Names of the object-dtype columns collected in the previous cell.
print(categorical_vars)

### **Numerical Variables**

In [None]:
# Names of the non-object columns collected earlier (still includes 'id' at this point).
print(numerical_vars)

## The id column is only a row identifier and is not required for prediction, so we drop it.

In [None]:
# Remove the 'id' column, which the notebook does not use for prediction.
# Reassigning (instead of inplace=True) and tolerating an already-missing
# column keeps this cell re-runnable: the in-place version raises KeyError
# the second time it is executed.
stroke_df = stroke_df.drop(columns='id', errors='ignore')
stroke_df.head()

## **Check for NULL Values**

In [None]:
# Count missing values in each column.
stroke_df.isnull().sum()

## **Inference :**
## In our dataset, no column contains null values except the bmi column.

In [None]:
# Report how many rows the bmi column has and how many of them are missing.
bmi_column = stroke_df.bmi
missing_bmi = bmi_column.isnull().sum()
print("Total Rows In BMI column :",len(bmi_column))
print("Total null values present in bmi column :",missing_bmi)

# **Handling Missing Values**
## There are many ways to handle missing values.
## One option is to delete the rows that contain null values.
## But by doing so we can lose a lot of information.
## Another option is to replace the null values with the mean/median.
## The second method is effective when the data is numeric and continuous, and the good news is that our bmi column fits this condition perfectly.
## So we use the second method.

In [None]:
# Impute missing bmi values with the column mean.
# fillna() targets missing values directly and avoids the np.NAN alias,
# which no longer exists in NumPy 2.0 (only np.nan remains).
stroke_df['bmi'] = stroke_df['bmi'].fillna(stroke_df['bmi'].mean())

In [None]:
# Re-count missing values per column to confirm the bmi imputation.
stroke_df.isnull().sum()

# **EDA**

In [None]:
# Preview the cleaned frame before plotting.
stroke_df.head()

## **Count Plot for Stroke Feature**

## **1**:Patient had stroke.
## **0**:Patient had no stroke.

In [None]:
# Bar chart of how many rows fall into each stroke class (0 / 1).
stroke_ax = sns.countplot(data=stroke_df, x='stroke')
stroke_ax.set_title("Countplot for Stroke", fontdict={'fontsize':20});

In [None]:
# Compute the class frequencies once and reuse them for every printed figure.
stroke_counts = stroke_df.stroke.value_counts()
total_rows = stroke_df.shape[0]

print("Total Observations :",total_rows)
print("Patients does not have stroke :",stroke_counts[0])
print("Patients have stroke :",stroke_counts[1])
# Share of positive (stroke == 1) rows, expressed as a percentage.
print("Event Rate :",(stroke_counts[1] / total_rows)*100)

## **Inference :**
## Based on the distribution of the stroke feature, we can say that the dataset is imbalanced.
## We have many more records of patients who had no stroke than of patients who had a stroke.
## We will handle the imbalanced data later.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharey=True)
fig.suptitle('Distribution CountPlot Some Features')

# One (feature, palette) pair per panel, filled row by row.
# The original grid drew 'work_type' in two panels; the duplicate is replaced
# by 'smoking_status' so every panel shows a different feature.
# NOTE(review): 'smoking_status' is a guess at the intended fourth feature — confirm.
panels = [
    ('hypertension', 'viridis'),
    ('work_type', 'rocket'),
    ('ever_married', 'husl'),
    ('smoking_status', 'husl'),
]
for panel_ax, (feature, palette) in zip(axes.flat, panels):
    sns.countplot(ax=panel_ax, x=stroke_df[feature], palette=palette);


## **Distribution based on Stroke Patients**

In [None]:
# Work-type frequencies among the rows where stroke == 1.
stroke_df.work_type[stroke_df.stroke == 1].value_counts()

In [None]:
# Distinct values present in the smoking_status column.
stroke_df.smoking_status.unique()

In [None]:
plt.figure(figsize=(7,8))
# Smoking-status frequencies among stroke patients. value_counts() orders the
# categories by frequency, so the legend labels are taken from its index
# instead of a hand-written list whose order may not match the wedges.
smoking_counts = stroke_df.smoking_status[stroke_df.stroke == 1].value_counts()
labels = smoking_counts.index.tolist()
plt.pie(x=smoking_counts,
        autopct='%1.1f%%',
        shadow=True, colors=['plum','lightpink','lawngreen','cyan']);
plt.legend(labels,bbox_to_anchor=(1.05,1.025), loc="upper left");
# The chart shows smoking status, not work type.
plt.title("Patients have stroke based on smoking status",{'fontsize':20});

In [None]:
plt.figure(figsize=(7,8))
# Work-type frequencies among stroke patients, ordered by frequency.
work_counts = stroke_df.work_type[stroke_df.stroke == 1].value_counts()
# Legend labels come from the counted categories so they always match the wedges.
labels = work_counts.index.tolist()
# Pull out only the last (least frequent) wedge; sizing the list from the data
# avoids a length mismatch if the number of categories is not exactly four.
explode = [0.0] * len(work_counts)
if explode:
    explode[-1] = 0.2
plt.pie(x=work_counts,
        explode=explode,
        autopct='%1.1f%%',
        shadow=True, colors=['royalblue','darkorange','springgreen','lightcyan','lavender']);
plt.legend(labels,bbox_to_anchor=(1.05,1.025), loc="upper left");
plt.title("Patients have stroke based on work type",{'fontsize':20});

## **Inference :**
## Based on the distribution, people whose work type is private account for more of the stroke cases than people with government jobs.

In [None]:
# Preview the frame once more before building the feature matrix.
stroke_df.head()

In [None]:
# Target vector: the stroke label; feature matrix: every remaining column.
y = stroke_df['stroke']
X = stroke_df.drop(columns='stroke')

In [None]:
# Round ages to whole numbers (zero decimals).
X['age'] = X['age'].round()

In [None]:
# Single encoder instance; it is re-fitted on each categorical column in the next cell.
encoder = LabelEncoder()

In [None]:
# Replace every object-typed column with integer codes. The shared encoder is
# re-fitted per column, so each column gets its own independent code mapping.
objList = X.select_dtypes(include = "object").columns
for column_name in objList:
    encoded_values = encoder.fit_transform(X[column_name])
    X[column_name] = encoded_values

# **Handling Imbalance Data**

## SMOTE algorithm works in 4 simple steps:
## <ul>
## <li>Choose a minority class as the input vector
## <li>Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function)
## <li>Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor
## <li> Repeat the steps until data is balanced
## </ul>

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with synthetic points.
# A fixed seed makes the synthetic samples (and every downstream metric)
# reproducible across Restart & Run All.
# NOTE(review): resampling is done before the train/test split, so synthetic
# points interpolated from rows that later land in the test set can leak into
# training and inflate the scores. Consider splitting first and resampling the
# training portion only.
smote = SMOTE(random_state=42)

x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# Compare the feature-matrix shape before and after oversampling.
for caption, frame in (('Original dataset shape', X), ('Resample dataset shape', x_smote)):
    print(caption, frame.shape)

# **Spliting Data**
## To get a reliable estimate of prediction quality, divide the data into training and testing sets: the model is fitted on the training points and evaluated on the held-out test points, and this is repeated until the results are satisfactory.

In [None]:
# Hold out 28% of the resampled data for evaluation.
# random_state pins the shuffle so results are reproducible; stratify keeps the
# class proportions identical in the train and test portions.
X_train,X_test,y_train,y_test = train_test_split(
    x_smote, y_smote, test_size=0.28, random_state=42, stratify=y_smote)

# **Models**

## Feature Scaling 

In [None]:
# Standardize features. The scaler learns its mean/variance from the training
# set only; the test set is transformed with those same statistics. Calling
# fit_transform on the test set would re-fit the scaler on test data, putting
# train and test on different scales and leaking test information.
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

## **Logistic Regression**

In [None]:
# Fit a default logistic regression on the scaled training data
# (fit() returns the estimator itself), then report its test-set accuracy.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_reg.score(X_test_scaled, y_test)

## **Random Forest**

In [None]:
# Fit a random forest on the scaled training data and report test-set accuracy.
# The forest is randomized (bootstrap samples, feature subsets), so a fixed
# seed is needed for the scores, plots and saved model to be reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled,y_train)
rf.score(X_test_scaled,y_test)

## **Classification Report**

In [None]:
# Hard class predictions on the scaled test set for both models; these are
# reused by the confusion-matrix and F1 cells.
rf_pred = rf.predict(X_test_scaled)
log_pred = log_reg.predict(X_test_scaled)

# Per-class precision / recall / F1 for each model.
# (Header typo "Classifiaction" corrected.)
print("Classification Report for Random Forest")
print(classification_report(y_test,rf_pred))
print("******************************************************")
print("Classification Report for Logistic Regression")
print(classification_report(y_test,log_pred))

## **Confusion Matrices**

## **Random Forest**

In [None]:
# Confusion matrix for the random forest (rows: actual, columns: predicted).
class_names = [0,1]
cnf_matrix = confusion_matrix(y_test, rf_pred, labels=class_names)

fig, ax = plt.subplots()
# Tick labels are supplied through the DataFrame index/columns; ticks set with
# plt.xticks/plt.yticks before the heatmap call are overwritten by seaborn.
sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
            annot=True, cmap="Blues", fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Heat Map for Random Forest', fontdict={'fontsize':20})
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are included in the fit.
fig.tight_layout()
plt.show()

## **Logistic Regression**

In [None]:
# Confusion matrix for logistic regression (rows: actual, columns: predicted).
class_names = [0,1]
cnf_matrix = confusion_matrix(y_test, log_pred, labels=class_names)

fig, ax = plt.subplots()
# Tick labels are supplied through the DataFrame index/columns; ticks set with
# plt.xticks/plt.yticks before the heatmap call are overwritten by seaborn.
sns.heatmap(pd.DataFrame(cnf_matrix, index=class_names, columns=class_names),
            annot=True, cmap='Blues', fmt='g', ax=ax)
ax.xaxis.set_label_position('top')
ax.set_title('Heat Map for Logistic Regression', fontdict={'fontsize':20})
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out after the title and labels exist so they are included in the fit.
fig.tight_layout()
plt.show()

## **Roc-Auc Curves**

In [None]:
from sklearn.metrics import roc_curve

# Class-probability estimates on the test set (column 1 = positive class).
pred_prob1 = log_reg.predict_proba(X_test_scaled)
pred_prob2 = rf.predict_proba(X_test_scaled)

# ROC points for logistic regression (1) and random forest (2).
fpr1, tpr1, thresh1 = roc_curve(y_test, pred_prob1[:,1], pos_label=1)
fpr2, tpr2, thresh2 = roc_curve(y_test, pred_prob2[:,1], pos_label=1)

# Reference curve: a constant score for every sample gives the tpr = fpr diagonal.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
# Area under the ROC curve for logistic regression (1) and random forest (2),
# computed from the positive-class probabilities.
auc_score1, auc_score2 = (
    roc_auc_score(y_test, probabilities[:,1])
    for probabilities in (pred_prob1, pred_prob2)
)

print(auc_score1, auc_score2)

In [None]:
# Matplotlib renamed its bundled seaborn style sheets to 'seaborn-v0_8*' and
# later removed the old 'seaborn' name, so pick whichever one this
# installation provides instead of failing on newer versions.
roc_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(roc_style)
# plot roc curves
plt.plot(fpr1, tpr1, linestyle='--',color='orange', label='Logistic Regression')
plt.plot(fpr2, tpr2, linestyle='--',color='green', label='Random Forest')
# diagonal reference line (tpr = fpr)
plt.plot(p_fpr, p_tpr, linestyle='--', color='blue')
# title
plt.title('ROC curve')
# x label
plt.xlabel('False Positive Rate')
# y label
plt.ylabel('True Positive rate')

plt.legend(loc='best')
# Save before show(): the figure must be written while it is still current.
plt.savefig('ROC',dpi=300)
plt.show();

## **Inference :**
## After plotting AUC-ROC curve we can observe Random Forest curve is higher than that for the Logistic Regression ROC curve. Therefore, we can say that Random Forest did a better job of classifying the positive class in the dataset.

# F1 Score
## The F1 score is the harmonic mean of precision and recall: $F_1 = 2 \cdot \dfrac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$

## **Logistic Regression**

In [None]:
# F1 score of the logistic-regression predictions against the test labels.
print(f1_score(y_test,log_pred))

## **Random Forest**

In [None]:
# F1 score of the random-forest predictions against the test labels.
print(f1_score(y_test,rf_pred))

## **Feature Importance For Random Forest Model**

In [None]:
# Importance score of each feature as learned by the fitted random forest,
# also kept as a name -> score mapping for later lookup.
feature_imp1 = rf.feature_importances_
feature_dict = dict(zip(X.columns, feature_imp1))

# Horizontal bar chart: one bar per feature column of X.
plt.figure(figsize=(9,7))
sns.barplot(x=feature_imp1, y=X.columns)
plt.title("Visualizing Important Features For Random Forest ",{'fontsize':25})
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.show();

# **Save Model**

In [None]:
import pickle

# Serialize the fitted random forest to 'stroke.sav' in the working directory.
# NOTE(review): only the classifier is saved; the fitted scaler and label
# encodings used to prepare its inputs are not persisted here.
with open('stroke.sav','wb') as model_file:
    pickle.dump(rf, model_file)

## **Inference :**
## Feature importance helps us understand how the model works.
## In this case we can see that Age, Average Glucose Level and BMI are the most important features for our model.
## Age is the most significant feature for our model.

# **Conclusion :**

## We started by reading the data and then separated the categorical features from the numerical features. After that we dealt with the missing values in the **BMI** feature.
## Then we performed EDA on the features. We concluded that the data is imbalanced, i.e. there are far more negative examples than positive examples.
## After visualization we handled the imbalanced data.
## After that we moved to the most important part: model building. Before training the models we split our data into training data (used to fit the models) and test data (used for validation) and performed **feature scaling**.
## Random Forest and Logistic Regression models were tried.
## To check which model performs best, we plotted ROC-AUC curves along with the **classification report** and **confusion matrices**.
## **Random Forest** won the race.
## **I therefore selected the Random Forest as my model.**