## Project: Stroke Prediction

A stroke occurs when the blood supply to part of your brain is interrupted or reduced, preventing brain tissue from getting oxygen and nutrients. Brain cells begin to die in minutes.

A stroke is a medical emergency, and prompt treatment is crucial. Early action can reduce brain damage and other complications.

importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import classification_report

loading data!!!

In [None]:
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

### Data Cleaning 

In [None]:
# The row identifier carries no predictive information, so remove it in place.
df.drop(columns='id', inplace=True)

In [None]:
df.columns

### any missing values??

In [None]:
df.isnull().sum()

In [None]:
# fill na

In [None]:
# Impute missing BMI values with the column mean.
# Assign the result back rather than calling fillna(..., inplace=True) on the
# selected column: that chained in-place form emits a FutureWarning on
# pandas 2.x and does not modify `df` at all under Copy-on-Write.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [None]:
df.isnull().sum()['bmi']

### Feature Engineering

Now deal with the categorical data:

1.hypertension

2.heart_disease

3.ever_married

4.work_type

5.Residence_type

6.smoking_status

7.Gender

##### 1.hypertension

In [None]:
df['hypertension'].value_counts()

In [None]:
print(df['hypertension'].dtype)

##### 2.heart_disease

In [None]:
df['heart_disease'].value_counts()

In [None]:
print(df['heart_disease'].dtype)

##### 3.ever_married

In [None]:
df['ever_married'].value_counts()

In [None]:
def married(ever_married):
    """Encode the marital-status label as a binary flag.

    Returns 1 when ``ever_married`` equals 'Yes' and 0 for any other value.
    """
    return 1 if ever_married == 'Yes' else 0

In [None]:
df['ever_married']=df['ever_married'].apply(married)

In [None]:
df['ever_married'].value_counts()

##### 4.work_type

In [None]:
df['work_type'].value_counts()

In [None]:
def work_type(work):
    """Translate a work-type label into its integer code.

    Mapping: 'Private' -> 4, 'Self-employed' -> 3, 'children' -> 2,
    'Govt_job' -> 1, 'Never_worked' -> 0. Any label outside this table
    yields None.
    """
    codes = {
        'Private': 4,
        'Self-employed': 3,
        'children': 2,
        'Govt_job': 1,
        'Never_worked': 0,
    }
    return codes.get(work)

In [None]:
df['work_type']= df['work_type'].apply(work_type)

In [None]:
df.work_type.value_counts()

##### 5.Residence_type

In [None]:
df['Residence_type'].value_counts()

In [None]:
def residence_type(residence):
    """Encode the residence label as a binary flag.

    Returns 1 when ``residence`` equals 'Urban' and 0 for any other value.
    """
    # The original body ended with a stray, unreachable `Residence_type`
    # expression after both branches had already returned; it is removed.
    return 1 if residence == 'Urban' else 0
df['Residence_type']=df['Residence_type'].apply(residence_type)

In [None]:
df.Residence_type.value_counts()

##### 6.smoking_status

In [None]:
df['smoking_status'].value_counts()

In [None]:
def smoking_status(smoking):
    """Partial label encoder for a smoking-status value.

    Returns 0 for 'never smoked'. Returns None for 'Unknown' (bare return)
    and, implicitly, for every other value.

    NOTE(review): this looks unfinished and is not called anywhere in the
    visible notebook; the column is one-hot encoded with pd.get_dummies
    instead. Confirm whether it can be removed.
    """
    if smoking=='never smoked':
        return 0
    if smoking=='Unknown':
        return  # bare return -> None
        
        

In [None]:
# Convert the smoking-status column to a categorical dtype.
df['smoking_status'] = pd.Categorical(df['smoking_status'])
# One-hot encode it: one indicator column per category, each named
# 'smoking_status_encoded_<category>'.
dfDummies_smoking_status = pd.get_dummies(df['smoking_status'], prefix = 'smoking_status_encoded')
# Bare expression so the notebook displays the resulting indicator frame.
dfDummies_smoking_status

In [None]:
df = pd.concat([df, dfDummies_smoking_status], axis=1)

##### 7.Gender

In [None]:
df['gender'].value_counts()


In [None]:
def gender(gender):
    """Encode the gender label as a binary flag.

    Returns 1 when ``gender`` equals 'Male' and 0 for any other value.
    """
    return 1 if gender == 'Male' else 0

In [None]:
df['gender']=df['gender'].apply(gender)
df['gender'].value_counts()

### Data Visualization

##### 1.Avg Glucose level

In [None]:
# Box plot of the average glucose level to show its spread and outliers.
plt.figure(figsize=(8,6))
plt.title('Avg Glucose level')  # fixed typo: was 'Agv Glucose level'
sns.boxplot(x='avg_glucose_level',data=df)
plt.show()

##### 2.Age

In [None]:
plt.figure(figsize=(10,8))
sns.histplot(data=df,x='age',hue='stroke')

##### 3.heart_disease

In [None]:
plt.figure(figsize=(8,6))
plt.title('heart_disease')
sns.histplot(x='heart_disease',data=df,hue='stroke')
plt.show()

##### 4.Hypertension

In [None]:
# Histogram of the hypertension flag, split by stroke outcome.
plt.figure(figsize=(8,6))
plt.title('hypertension')  # title was copy-pasted as 'heart_disease'
sns.histplot(x='hypertension',data=df,hue='stroke')
plt.show()

In [None]:
# Histogram of the encoded work type, split by stroke outcome.
plt.figure(figsize=(10,6))
plt.title('work_type')  # title was copy-pasted as 'heart_disease'
sns.histplot(x='work_type',data=df,hue='stroke')
plt.show()

Which factors affect stroke the most?

In [None]:
# Correlation of every numeric feature with the target, strongest first.
# `smoking_status` is still a categorical string column at this point, and
# DataFrame.corr() on pandas >= 2.0 raises on non-numeric columns instead of
# silently skipping them, so restrict the frame to numeric/bool columns.
df.select_dtypes(include=['number', 'bool']).corr()['stroke'].sort_values(ascending=False)

In [None]:
# Bar chart of each feature's correlation with the target, strongest first.
plt.figure(figsize=(10,8))
# Only numeric/bool columns can be correlated; the categorical
# `smoking_status` column is still present here and would make corr() raise
# on pandas >= 2.0.
numeric_df = df.select_dtypes(include=['number', 'bool'])
# Drop the target's self-correlation by label rather than by position.
numeric_df.corr()['stroke'].drop('stroke').sort_values(ascending=False).plot(kind='bar')

#### correlation heatmap

In [None]:
# Heatmap of pairwise correlations between the numeric features.
plt.figure(figsize=(12,8))
plt.title('Correlation Heatmap')
# Restrict to numeric/bool columns: the categorical `smoking_status` column
# is still in `df` here and makes corr() raise on pandas >= 2.0.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())
plt.show()

In [None]:
df.drop('smoking_status',axis=1,inplace=True)

In [None]:
df.info()

In [None]:
X=df.drop('stroke',axis=1)

In [None]:
y=df['stroke']

### Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing. Stratify on the target so the rare
# positive class keeps the same proportion in both the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [None]:
X_train.shape,y_train.shape

In [None]:
X_test.shape, y_test.shape

### Model Building

In [None]:
lr=LogisticRegression()

In [None]:
lr.fit(X_train,y_train)

In [None]:
y_pred=lr.predict(X_test)

#### model Evaluations

In [None]:
print(classification_report(y_test,y_pred))

The scores for class 1 (stroke) are poor because our data is imbalanced.

In [None]:
df['stroke'].value_counts()

### Handle imbalanced data

In [None]:
df_class_1=df[df['stroke']==1]
df_class_0=df[df['stroke']==0]

#### OverSampling Method

In [None]:
# Randomly oversample the minority class (with replacement) up to the size of
# the majority class. The target size is derived from the data instead of
# being hard-coded, and the draw is seeded so the notebook is reproducible.
df_class_1_over = df_class_1.sample(len(df_class_0), replace=True, random_state=42)

In [None]:
df_class_1_over.shape

In [None]:
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.stroke.value_counts())

In [None]:
# df_test_over

Splitting the data into training and testing sets

In [None]:
# Split the ORIGINAL data first and oversample only the training part.
# Oversampling before the split puts copies of the same minority rows in both
# the training and the test set (data leakage), which inflates the test scores.
X = df.drop('stroke',axis='columns')
y = df['stroke']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

# Rebuild a balanced training set: keep all majority rows and resample the
# minority rows (with replacement) up to the same count.
train_part = pd.concat([X_train, y_train], axis=1)
train_majority = train_part[train_part['stroke']==0]
train_minority = train_part[train_part['stroke']==1]
train_minority_over = train_minority.sample(len(train_majority), replace=True, random_state=15)
train_balanced = pd.concat([train_majority, train_minority_over], axis=0)

# Feature column order is unchanged, so models see the same layout as before.
X_train = train_balanced.drop('stroke',axis='columns')
y_train = train_balanced['stroke']

#### model building and model Evaluation

#### LogisticRegression

In [None]:
lr.fit(X_train,y_train)


In [None]:
predictions=lr.predict(X_test)

In [None]:
predictions

In [None]:
# y_test

In [None]:
print(classification_report(y_test,predictions))

#### RandomForestClassifier

In [None]:
model = RandomForestClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
predictions=model.predict(X_test)

In [None]:
print(classification_report(y_test,predictions))

#### predictions

In [None]:
X_train.head(1)

In [None]:
def rfc_model(data):
    """Describe the prediction of the module-level ``model`` for one sample.

    ``data`` is passed to ``model.predict`` unchanged and only the first
    prediction is used. Returns 'Stroke Occure' when that prediction equals 1,
    otherwise 'Stroke not occure'.
    """
    first_prediction = model.predict(data)[0]
    return 'Stroke Occure' if first_prediction == 1 else 'Stroke not occure'

rfc_model([[1,67.0,0,1,1,4,1,228.33,36.7,0,1,0,0]])
    
 

In [None]:
y_train[:1]