In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
!pip install statsmodels
import statsmodels.api as sm 
from scipy import stats

In [None]:
pwd

In [None]:
# Load the stroke-prediction CSV (the path is relative to the notebook's working directory).
csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)

In [None]:
data.head(5)

In [None]:
data.info()

In [None]:
# Drop the 'id' column; it is not used as a feature anywhere below.
# Re-binding `data` (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer fails with KeyError because 'id'
# has already been removed.
data = data.drop(columns=['id'], errors='ignore')

In [None]:
# The 'bmi' column has missing values, so we deal with those first. To decide how, we need to know how many values are missing in each column:
data.isnull().sum()
# Per the author, the output shows that every column except 'bmi' has zero missing values.

In [None]:
data.info()

In [None]:
# Impute the missing 'bmi' values with the column mean (sum of the non-missing values
# divided by their count).
# The result is assigned back to the column instead of calling
# `data['bmi'].fillna(..., inplace=True)`: that form is chained in-place assignment, which
# pandas has deprecated and which, under Copy-on-Write, only modifies a temporary object
# and leaves `data` unchanged.
bmi_mean = data['bmi'].mean()
data['bmi'] = data['bmi'].fillna(bmi_mean)
# Mean imputation is only one of several possible strategies; the right one depends on
# the kind of data at hand.
# NOTE(review): the mean is computed on the full dataset, i.e. before the train/test split
# further down, so test rows influence the imputed value - confirm this is acceptable.
# The count below should now be zero.
data['bmi'].isnull().sum()

In [None]:
data.head()

In [None]:
# Separate the dataset into the dependent variable `y` (the 'stroke' column)
# and the independent variables `x` (every other column).
target_column = 'stroke'
x = data.drop(columns=[target_column])
y = data[target_column]

In [None]:
x.info() # the independent (feature) columns should now show no missing values, so we can move on

In [None]:
# Per the author, only one row has the gender value 'Other'. It can either be folded into another category (the next cell replaces it with 'Male') or removed; removing it would cost a single row.
x['gender'].value_counts()

In [None]:
# Fold the 'Other' gender value into 'Male', then re-check the category counts.
gender_fixes = {'Other': 'Male'}
x['gender'] = x['gender'].replace(gender_fixes)
x['gender'].value_counts()

In [None]:
x.head()

In [None]:
# Check the category counts of the 'ever_married' column.
x['ever_married'].value_counts() # per the author, no category here is too rare to be a concern

In [None]:
x['Residence_type'].value_counts()

In [None]:
x['smoking_status'].value_counts() # per the author, no category here is rare enough to need special handling either

In [None]:
# The author suggests that the 'Never_worked' category could be merged into 'Govt_job'.
# NOTE(review): no such replacement is performed anywhere in the visible notebook - decide whether it is wanted.
x['work_type'].value_counts()

In [None]:
# Next we encode the categorical features. None of them is ordinal, so plain
# dummy (one-hot) variables are used rather than label encoding.
categorical_features = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

In [None]:
# Convert the categorical features into dummy (indicator) columns.
# NOTE(review): get_dummies is called without drop_first, so every level of every feature
# gets its own column; the columns of one feature are then linearly dependent, which is
# relevant to the multicollinearity discussion further down - confirm this is intended.
dummy = pd.get_dummies(x[categorical_features])

In [None]:
# Append the dummy columns to the feature frame (column-wise concatenation of the two frames).
frames_to_join = [x, dummy]
x = pd.concat(frames_to_join, axis='columns')
x.head()

In [None]:
# Drop the original categorical columns: their information is now carried by the
# dummy-encoded columns, so they are no longer needed for prediction.
# Re-binding `x` (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer fails with KeyError on columns that have
# already been removed.
x = x.drop(columns=categorical_features, errors='ignore')

In [None]:
x.head()

In [None]:
x.info()

In [None]:
# To check the numerical columns for outliers we first define a helper that
# computes the interquartile-range (IQR) limits of a column.
def outlier(x, column, whisker=1.5):
    """Return the IQR-based outlier limits of ``x[column]``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the data.
    column : str
        Name of the numerical column to analyse.
    whisker : float, optional
        Multiple of the IQR added above the third quartile and subtracted below
        the first quartile. Defaults to 1.5, the value that was previously
        hard-coded.

    Returns
    -------
    tuple
        ``(upper_limit, lower_limit)`` - note the order: upper first.
    """
    q1 = x[column].quantile(0.25)
    q3 = x[column].quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + whisker * iqr
    lower_limit = q1 - whisker * iqr
    return upper_limit, lower_limit

In [None]:
# Compute the (upper, lower) outlier limits of each numerical column in turn:
# age -> ul / ll, avg_glucose_level -> ul2 / ll2, bmi -> ul3 / ll3.
(ul, ll), (ul2, ll2), (ul3, ll3) = (
    outlier(x, name) for name in ('age', 'avg_glucose_level', 'bmi')
)


In [None]:
# Visualise the outliers with box plots, one numerical column per cell.
# NOTE(review): `numercal_columns` (sic) is not referenced again in the visible notebook.
numercal_columns = ['age', 'avg_glucose_level', 'bmi']
plt.boxplot(x['bmi']) # per the author, 'bmi' shows many outliers that must be removed or otherwise treated

In [None]:
plt.boxplot(x['avg_glucose_level']) # per the author, here too many points lie outside the whiskers; these are the outliers

In [None]:
plt.boxplot(x['age']) # per the author, no outliers are present for 'age'

In [None]:
x.describe()

In [None]:
# Helper for treating outliers: instead of dropping rows, a value outside the
# limits is pulled back to the nearest limit.
def outliers_deal(value, upper=None, lower=None):
    """Clamp ``value`` into the range ``[lower, upper]``.

    Parameters
    ----------
    value : scalar
        The number to clamp.
    upper, lower : scalar, optional
        Limits to clamp against. When omitted they fall back to the module-level
        ``ul2`` / ``ll2`` (the limits computed for 'avg_glucose_level'), which is
        what the one-argument form has always used.

    Returns
    -------
    ``upper`` if ``value > upper``, ``lower`` if ``value < lower``, otherwise
    ``value`` unchanged. NaN is returned unchanged because both comparisons
    are False for it.

    Notes
    -----
    A later cell defines a function with this same name for the 'bmi' limits,
    which replaces this one once it has run. Passing ``upper`` and ``lower``
    explicitly avoids depending on which definition was executed last.
    """
    if upper is None:
        upper = ul2
    if lower is None:
        lower = ll2
    if value > upper:
        return upper
    if value < lower:
        return lower
    return value

In [None]:
# Cap 'avg_glucose_level' at its own outlier limits (ll2 / ul2).
# The limits are passed explicitly to Series.clip so the result no longer depends on which
# `outliers_deal` definition happens to be live: that name is redefined for 'bmi' in a
# later cell, and re-running this cell afterwards would have applied the 'bmi' limits.
x['avg_glucose_level'] = x['avg_glucose_level'].clip(lower=ll2, upper=ul2)

In [None]:
# Second helper for treating outliers, this time for the 'bmi' limits.
# NOTE: it reuses the name `outliers_deal`, so from here on it replaces the earlier
# definition that used the 'avg_glucose_level' limits.
def outliers_deal(value, upper=None, lower=None):
    """Clamp ``value`` into the range ``[lower, upper]``.

    Parameters
    ----------
    value : scalar
        The number to clamp.
    upper, lower : scalar, optional
        Limits to clamp against. When omitted they fall back to the module-level
        ``ul3`` / ``ll3`` (the limits computed for 'bmi'), which is what the
        one-argument form has always used.

    Returns
    -------
    ``upper`` if ``value > upper``, ``lower`` if ``value < lower``, otherwise
    ``value`` unchanged. NaN is returned unchanged because both comparisons
    are False for it.
    """
    if upper is None:
        upper = ul3
    if lower is None:
        lower = ll3
    if value > upper:
        return upper
    if value < lower:
        return lower
    return value

In [None]:
# Cap 'bmi' at its own outlier limits (ll3 / ul3).
# The limits are passed explicitly to Series.clip so the result does not depend on which
# of the two `outliers_deal` definitions was executed last.
x['bmi'] = x['bmi'].clip(lower=ll3, upper=ul3)

In [None]:
x.describe() # the min and max of the treated columns should now be bounded by the outlier limits

In [None]:
x.info()

In [None]:
# With tree-based algorithms multicollinearity is not a problem, but with distance-based
# algorithms it affects the predictions. We therefore try to remove multicollinearity;
# doing so also reduces the number of feature variables, which the author expects to
# benefit the model's predictions.
# Start from the pairwise correlation matrix of the features.
corr = x.corr()
corr

In [None]:
# Heat map of the feature correlation matrix, drawn with seaborn on an explicit Axes.
fig, ax = plt.subplots(dpi=150)
sns.heatmap(
    x.corr(),
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    ax=ax,
)

In [None]:
# plt.figure(dpi=100)
# sns.pairplot(x,height=2,palette='OrRd')
# plt.show() # one can observe the correlation between different variable

In [None]:
# Replace the space in the two smoking-status dummy column names with an underscore.
smoking_renames = {
    'smoking_status_never smoked': 'smoking_status_never_smoked',
    'smoking_status_formerly smoked': 'smoking_status_formerly_smoked',
}
x = x.rename(columns=smoking_renames)

In [None]:
# Feature columns intended for a variance-inflation-factor (VIF) check, used to find
# and remove highly correlated features.
# Two fixes to the list itself:
#   * a comma was missing between 'Residence_type_Urban' and 'smoking_status_Unknown',
#     so Python silently fused them into one non-existent column name;
#   * the 'formerly smoked' / 'never smoked' entries now use the underscore spelling
#     that x's columns carry after the rename in the previous cell.
# NOTE(review): the VIF itself is not computed anywhere in the visible notebook.
columns = [
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'gender_Female',
    'gender_Male',
    'ever_married_No',
    'ever_married_Yes',
    'work_type_Govt_job',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'Residence_type_Rural',
    'Residence_type_Urban',
    'smoking_status_Unknown',
    'smoking_status_formerly_smoked',
    'smoking_status_never_smoked',
    'smoking_status_smokes',
]

In [None]:
# Tree-based models need neither multicollinearity removal nor feature scaling,
# so the data is split into train and test sets as it stands.
from sklearn.model_selection import train_test_split
# stratify=y keeps the class proportions of the target the same in the train and test
# sets, so the 20% hold-out cannot end up with an unrepresentative share of positives.
# NOTE(review): this matters most if the positive class is rare - check y.value_counts().
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A random forest is stochastic (bootstrap samples, random feature subsets); fixing
# random_state makes the fitted model, and every score derived from it, reproducible
# across kernel restarts. 42 matches the seed already used for the train/test split.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
# Predictions on the training set, used for the training accuracy in the next cell.
y_pred = clf.predict(x_train)

In [None]:
from sklearn import metrics
# Accuracy on the training set (y_pred holds the training-set predictions at this point).
train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=y_pred)
train_accuracy

In [None]:
# Accuracy on the held-out test set (y_pred is overwritten with the test-set predictions).
y_pred = clf.predict(x_test)
metrics.accuracy_score(y_test, y_pred)
# A very large absolute difference between the training and test accuracy would indicate
# overfitting and call for hyperparameter tuning. Per the author, the difference here is
# not that big, so we proceed.
# NOTE(review): accuracy alone can look good on an imbalanced target - consider also
# checking recall / precision for the positive class.

In [None]:
# Next we visualise the feature importances; for that the forest is fitted again.
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the importances shown below are the same on every run;
# without it each execution ranks the features slightly differently.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(x_train, y_train)
# Test-set predictions of the refitted model (not used further in the visible notebook).
predict = clf.predict(x_test)

In [None]:
import pandas as pd
# Pair each feature name with the forest's importance score and rank them, largest first.
importance_scores = clf.feature_importances_
feature_importace = pd.Series(data=importance_scores, index=x.columns)
feature_importace = feature_importace.sort_values(ascending=False)
feature_importace

In [None]:
# Horizontal bar chart of the feature importances: the larger the score,
# the more important the feature.
fig, ax = plt.subplots(dpi=100)
sns.barplot(x=feature_importace, y=feature_importace.index, ax=ax)
ax.set_xlabel('feature importance score')
ax.set_ylabel('features')
ax.set_title('visualizing important features')
plt.show()
# A possible next step is to retrain the model on a selection of the most important features.