In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import cufflinks as cf
# Put cufflinks into offline mode (presumably so the `.iplot()` call used
# later renders inside the notebook without a Plotly account; confirm).
cf.go_offline()

In [None]:
# Read the stroke-prediction CSV into a DataFrame and preview its first rows.
raw_data = pd.read_csv(
    "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
raw_data.head()

In [None]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns.
raw_data.describe(include = "all")

In [None]:
# Draw the boolean isnull() mask as a heatmap to see where values are missing.
# Row tick labels and the colour bar are turned off; the trailing ';'
# suppresses the Axes repr in the cell output.
sns.heatmap(raw_data.isnull(),yticklabels = False ,cbar = False,cmap = 'viridis');

In [None]:
# Share of rows whose `bmi` value is missing, as a percentage of all rows.
# The mean of the boolean isna() mask is the fraction of missing entries.
bmi_missing_pct = raw_data['bmi'].isna().mean() * 100
print(f"Percentage of missing values in BMI is {bmi_missing_pct:0.2f}%")

In [None]:
# Remove, in place, every row that has a missing value in any column,
# then preview what is left.
raw_data.dropna(axis="index", how="any", inplace=True)
raw_data.head()

In [None]:
# Cast the label column to strings before plotting (presumably so Plotly
# colours the points as discrete categories rather than on a continuous scale).
# Note: this overwrites raw_data['stroke'], so every later cell sees strings.
raw_data['stroke'] = raw_data['stroke'].astype(str)

# Age vs. BMI, coloured by stroke label.
fig = px.scatter(
    data_frame=raw_data,
    x='age',
    y='bmi',
    color='stroke',
)
fig.show()

In [None]:
# Age histogram split by stroke label, with a box plot in the margin.
fig = px.histogram(
    data_frame=raw_data,
    x='age',
    color='stroke',
    marginal="box",
)
fig.show()

In [None]:
# Number of rows per work type (trailing ';' hides the Axes repr).
sns.countplot(x='work_type',data = raw_data);

In [None]:
# Row count for each distinct gender value.
raw_data['gender'].value_counts()

In [None]:
# Recode the 'Other' gender value as 'Female'.
# The result is assigned back to the column rather than calling an in-place
# method on the column selection: under pandas Copy-on-Write that in-place
# call only modifies a temporary object and leaves raw_data unchanged, so
# 'Other' would survive and later be mapped to NaN by the encoding step.
raw_data['gender'] = raw_data['gender'].replace(to_replace='Other', value='Female')

In [None]:
# Row counts per gender, with one bar per stroke label inside each gender.
sns.countplot(x='gender',data = raw_data,hue = 'stroke');

In [None]:
# Distribution of age: histogram with a kernel-density estimate on top.
sns.displot(data=raw_data['age'], kde=True)
plt.show()

In [None]:
# Distribution of average glucose level: histogram plus KDE curve, in green.
sns.displot(data=raw_data['avg_glucose_level'], color='green', kde=True)
plt.show()

In [None]:
# Row counts per residence type, split by stroke label.
sns.countplot(x='Residence_type',data = raw_data,hue = 'stroke');

In [None]:
# Percentage of rows falling in each smoking-status group, rounded to the
# nearest whole percent and keyed by status in the order listed here.
smoke_group = ['never smoked', 'Unknown', 'smokes', 'formerly smoked']
total_rows = len(raw_data)

smoke_dict = {
    status: round(
        int((raw_data["smoking_status"] == status).sum()) / total_rows * 100
    )
    for status in smoke_group
}

print(smoke_dict)

In [None]:
# Pie chart of the whole-percent smoking-status shares held in smoke_dict.
sizes, labels = smoke_dict.values(), smoke_dict.keys()

# One offset per wedge, in smoke_dict order: the first and last wedges are
# pulled out of the pie, the middle two are left in place.
explode = (0.1, 0, 0, 0.1)

fig1, ax = plt.subplots(figsize=(12, 5))
ax.pie(
    sizes,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    shadow=True,
)

# Equal axis scaling keeps the pie circular instead of elliptical.
ax.axis('equal')

ax.set_title("Smoking Status Percentage ")
plt.show()

In [None]:
# Row counts per smoking status, split by stroke label.
sns.countplot(x='smoking_status',data = raw_data,hue = 'stroke');

### The graph suggests an inverse relationship between smoking and stroke (the bars are raw counts, so compare them relative to each group's size)

In [None]:
# Interactive histogram of BMI; `.iplot` is presumably the plotting method
# cufflinks adds to pandas objects (confirm).
raw_data['bmi'].iplot(kind = 'hist')

In [None]:
# Fraction of rows in each smoking-status category.
# 'Unknown' will be merged with 'never smoked'.
status_counts = raw_data['smoking_status'].value_counts()
status_counts / status_counts.sum()

In [None]:
# Fraction of rows in each work-type category.
work_counts = raw_data['work_type'].value_counts()
work_counts / work_counts.sum()

# Converting categorical data to numerical

In [None]:
# Integer codes for each categorical column.
# Note: 'Unknown' smoking status deliberately shares code 0 with 'never smoked'.
category_codes = {
    'gender': {'Female': 1, "Male": 0},
    'ever_married': {'Yes': 1, "No": 0},
    'Residence_type': {'Urban': 0, 'Rural': 1},
    'smoking_status': {'never smoked': 0, 'Unknown': 0, 'smokes': 1, 'formerly smoked': 2},
    'work_type': {'Private': 1, 'Self-employed': 2, 'children': 3, 'Govt_job': 4, 'Never_worked': 5},
}

# Work on a copy so raw_data keeps its original labels.
data = raw_data.copy()
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

# Series.map turns any label absent from its mapping into NaN. Fail here with
# a clear message instead of letting NaNs surface later during model fitting.
unmapped_columns = [column for column in category_codes if data[column].isna().any()]
if unmapped_columns:
    raise ValueError(
        f"Categorical encoding left missing values in columns: {unmapped_columns}"
    )

In [None]:
# Preview the encoded frame.
data.head()

In [None]:
# Target: the stroke label. An earlier plotting cell converts this column to
# strings, so cast it back to integers to give the model numeric classes.
y = data['stroke'].astype(int)

# Features: every column except the target and `id` (presumably a row
# identifier; confirm). `columns=` already selects the axis, so the
# redundant `axis=1` is dropped.
X = data.drop(columns=['stroke', 'id'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature columns with StandardScaler.
# Note: the scaler is fitted on all rows of X, i.e. before any train/test split.
scaler = StandardScaler()
X_scale = scaler.fit(X).transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features first, then fit the scaler on the training
# rows only. Fitting it on all rows before splitting leaks test-set
# statistics (mean and variance) into training. With the same row count and
# random_state, the row partition is the same as when splitting X_scale.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Refit `scaler` so it holds training-only statistics, then apply that single
# transformation to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training
# split. fit() returns the estimator itself, so ending the cell with
# `logmodel` displays the fitted model.
logmodel = LogisticRegression().fit(X_train, y_train)
logmodel

In [None]:
# Predicted class labels for the held-out test split.
predictions = logmodel.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
import warnings

# Silence warnings only while the report is built (presumably to hide
# scikit-learn's undefined-metric warning when a class is never predicted;
# confirm). A blanket filterwarnings('ignore') would also hide every warning
# raised by later cells.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    report = classification_report(y_test, predictions)

print(report)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix for the test split (scikit-learn puts true labels on the
# rows and predicted labels on the columns).
confusion_matrix(y_test,predictions)

In [None]:
# Mean accuracy of the fitted classifier on the test split.
logmodel.score(X_test,y_test)