In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, kstest
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

## WHO Dataset :
### Description:


### *1) id: unique identifier*

### *2) gender: "Male", "Female" or "Other"*

### *3) age: age of the patient*

### *4) hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension*

### *5) heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease*

### *6) ever_married: "No" or "Yes"*

### *7) work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"*

### *8) Residence_type: "Rural" or "Urban"*

### *9) avg_glucose_level: average glucose level in blood*

### *10) bmi: body mass index*

### *11) smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

### *12) stroke: 1 if the patient had a stroke or 0 if not*

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient*


In [None]:
data = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
data

## Let's Check For Null Values:
### Only bmi Has Null Values

In [None]:
data.info()

In [None]:
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

In [None]:
data.isnull().sum().plot(kind='bar')

In [None]:
data['id'].duplicated().sum()

In [None]:
# Distribution of patient age. distplot is deprecated (seaborn >= 0.11); histplot with a
# KDE overlay and density normalisation is its documented replacement.
sns.histplot(data['age'], bins=100, kde=True, stat='density');

In [None]:
# Bernoulli distribution: stroke is a binary 0/1 outcome, so show the count of each class.
# A smoothed density (as drawn by the deprecated distplot) is not meaningful for two discrete values.
sns.countplot(x=data['stroke']);

In [None]:
# Is avg_glucose_level normally distributed or not? Inspect its histogram first.
# distplot is deprecated (seaborn >= 0.11); histplot with a KDE overlay replaces it.
sns.histplot(data['avg_glucose_level'], bins=100, kde=True, stat='density');

In [None]:
def kl(p, q):
    """Return the discrete Kullback-Leibler style sum of p * log(p / q).

    Parameters
    ----------
    p, q : array_like
        Probability masses over the same bins (broadcastable to a common shape).

    Returns
    -------
    numpy.float64
        Sum of ``p * log(p / q)`` over the bins where both p and q are non-zero.
        Bins where either value is zero contribute 0, so the result stays finite
        even when one histogram has empty bins.
    """
    p, q = np.broadcast_arrays(np.asarray(p, dtype=float), np.asarray(q, dtype=float))
    # Select the valid bins before taking the log: np.where evaluates both of its
    # branches, so computing log(p / q) on every bin triggers divide-by-zero and
    # invalid-value warnings for the empty ones.
    valid = (p != 0) & (q != 0)
    res = np.sum(p[valid] * np.log(p[valid] / q[valid]))
    return res

In [None]:
count, devision = np.histogram(data['avg_glucose_level'], bins=100)

In [None]:
# Reference sample drawn from a normal distribution with the same mean/std as the data.
# A fixed seed keeps the comparison reproducible across Restart & Run All.
ideal = norm.rvs(
    size=len(data['avg_glucose_level']),
    loc=data['avg_glucose_level'].mean(),
    scale=data['avg_glucose_level'].std(),
    random_state=42,
)
# Reuse the bin edges of the observed histogram so that count[i] and icount[i] describe
# the same interval; with independently chosen edges the bin-wise comparison is meaningless.
# Samples falling outside the observed range are not counted.
icount, idevision = np.histogram(ideal, bins=devision)

In [None]:
# Turn each histogram into probabilities using its own total rather than a hard-coded
# row count, so the result stays correct if the number of rows or counted samples changes.
kl(count / count.sum(), icount / icount.sum())

In [None]:
# Cumulative sums of the sorted observations versus the sorted normal sample.
# Both series must be sorted: the normal sample is generated in random order, and an
# unsorted cumulative sum is not comparable with the sorted one.
plt.plot(np.cumsum(data['avg_glucose_level'].sort_values().reset_index(drop=True)))
plt.plot(np.cumsum(np.sort(ideal)), color='red')

In [None]:
data['bmi'].plot(kind='hist', bins=100)

In [None]:
# Normal reference sample with the same mean/std as bmi; seeded so reruns give the same draw.
ideal = norm.rvs(size=len(data['bmi']), loc=data['bmi'].mean(), scale=data['bmi'].std(), random_state=42)

In [None]:
# Overlay the normal reference sample and the observed bmi values.
# Transparency keeps the first histogram visible underneath the second; labels feed the legend.
plt.hist(ideal, bins=100, alpha=0.5, label='normal sample')
plt.hist(data['bmi'], bins=100, alpha=0.5, label='bmi')
plt.legend();

In [None]:
# Pass the column as `x=`: the first positional argument of countplot is `data` in
# current seaborn, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data['gender']);

## Clean The Data:

### **When I tried to remove "Other" from the gender column directly, I got an error,**
### so let's try another way to remove it

In [None]:
data.loc[data['gender'].str.contains('Other')]

In [None]:
data[data.gender.values == 'Other']

In [None]:
def remove(gender):
    """Replace the 'Other' gender label with NaN.

    Parameters
    ----------
    gender : object
        A single value from the gender column.

    Returns
    -------
    object
        ``np.nan`` when ``gender`` is a string containing 'Other'; otherwise the
        input value unchanged (including missing values such as NaN or None).
    """
    # Missing values are not strings; `'Other' in <float>` would raise TypeError.
    if isinstance(gender, str) and 'Other' in gender:
        return np.nan
    return gender

In [None]:
# Mark 'Other' as missing, then drop those rows from the frame.
# Assigning `data['gender'].dropna()` back to the column re-aligns on the index and
# restores the NaN entries, so the rows have to be removed at the DataFrame level.
data['gender'] = data['gender'].apply(remove)
data = data.dropna(subset=['gender'])

In [None]:
# Pass the column as `x=`: the first positional argument of countplot is `data` in
# current seaborn, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data['gender']);

In [None]:
data = data.drop('id', axis=1)

In [None]:
# Pass the column as `x=`: the first positional argument of countplot is `data` in
# current seaborn, so a positional Series is no longer interpreted as the x variable.
sns.countplot(x=data['smoking_status']);

In [None]:
data['smoking_status'].value_counts()

In [None]:
from numpy import nan as NA
#data[data['age'] < 20]
def remove_age(age):
    """Blank out the ages of minors.

    Returns NA (NaN) when ``age`` is below 18; any other value is returned unchanged.
    """
    return NA if age < 18 else age

In [None]:
# Mark ages below 18 as missing, then drop every row that has a missing value.
# Calling dropna(inplace=True) on the selected column never removes rows from `data`;
# only the frame-level drop does, and reassigning avoids in-place mutation.
data['age'] = data['age'].apply(remove_age)
data = data.dropna()

In [None]:
data

In [None]:
data['age'].describe()

### Children (00-14 years)
### Youth (15-24 years)
### Adults (25-64 years)
### Seniors (65 years and over)

In [None]:
# Bin ages with explicit edges that match the bands listed above
# (Youth 15-24, Adults 25-64, Seniors 65 and over). An integer bin count would split the
# observed age range into equal-width intervals, which do not correspond to those bands.
data['categori_age'] = pd.cut(data['age'], bins=[14, 24, 64, np.inf], labels=['Youth ', 'Adults ', 'Seniors '])

In [None]:
obj = data.select_dtypes(include='object')
obj

In [None]:
data

##  Now We Want To See Whether Being Older Than 40 or 50 Really Affects Stroke Or Not
### If p_value < 0.05 we have evidence that the difference is real; if p_value > 0.05 the difference may have happened by chance

In [None]:
from numpy import sqrt, round, abs

In [None]:
dataA = data[data['age'] >= 50]['stroke']
dataB = data[data['age'] < 50]['stroke']

In [None]:
Amean = dataA.mean()
Bmean = dataB.mean()
Astd = dataA.std()
Bstd = dataB.std()
Alen = len(dataA)
Blen = len(dataB)

In [None]:
# Two-sample z statistic for the difference in mean stroke value between the
# age >= 50 group (A) and the age < 50 group (B).
# Numerator: difference of the two group means.
above = Amean - Bmean
# Denominator: square root of the summed (std^2 / n) terms of both groups.
below = sqrt((Astd ** 2 / Alen) + (Bstd ** 2 / Blen))
z  = above / below 

In [None]:
z

In [None]:
# Two-sided p-value from the standard normal distribution.
# Use the survival function: 1 - cdf(|z|) rounds to exactly 0 for large |z|,
# whereas sf(|z|) keeps the small tail probability.
p_value = 2 * norm.sf(abs(z))
p_value

## Anova_Test

In [None]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

In [None]:
res = ols('stroke ~ age', data=data).fit()
sm.stats.anova_lm(res)

In [None]:
data

In [None]:
data.groupby(['gender', 'categori_age'])['stroke'].size()

In [None]:
data.pivot_table(values='age', columns='categori_age', index='stroke', aggfunc='count').plot(kind='bar', figsize=(15, 8))

In [None]:
data.groupby(['Residence_type', 'work_type'])['stroke'].size().plot(kind='bar')

In [None]:
data

In [None]:
# Bar chart of heart_disease per gender, with one bar per stroke value (hue);
# ci="sd" requests standard-deviation error bars.
# NOTE(review): `ci` is deprecated in newer seaborn releases in favour of errorbar="sd" — confirm against the installed version.
sns.catplot(
    data=data, kind="bar",
    x="gender", y="heart_disease", hue="stroke",
    ci="sd", palette="dark"
)

In [None]:
plt.figure(figsize=(10, 10))
plt.hlines(40, 20, 80, color='black')
plt.scatter(x=data['age'], y=data['bmi'], color='red')

# Convert To Numerical:

In [None]:
# Encode the two-valued columns as a single 0/1 indicator.
# pd.get_dummies returns one column per category, which cannot be stored in a single
# column (recent pandas raises "Columns must be same length as key"). Keep only the first
# indicator column, i.e. 1 marks the alphabetically first category of each feature.
data['gender'] = pd.get_dummies(data['gender'], dtype=int).iloc[:, 0]
data['Residence_type'] = pd.get_dummies(data['Residence_type'], dtype=int).iloc[:, 0]
data['ever_married'] = pd.get_dummies(data['ever_married'], dtype=int).iloc[:, 0]
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
##                              *** For Machine Learning use ohe ***
#ohe = make_column_transformer((OneHotEncoder(), ['work_type', 'smoking_status', 'categori_age']), remainder='passthrough')
#data = ohe.fit_transform(data)

In [None]:
# Integer-encode the remaining categorical columns.
# The same encoder object is re-fitted for every column, so after this cell it only
# holds the classes of the last column it was fitted on.
# NOTE(review): LabelEncoder presumably assigns codes in sorted label order, so the codes
# of categori_age would not follow the Youth < Adults < Seniors order — confirm this is acceptable.
label = LabelEncoder()
data['work_type'] = label.fit_transform(data['work_type'])
data['smoking_status'] = label.fit_transform(data['smoking_status'])
data['categori_age'] = label.fit_transform(data['categori_age'])


In [None]:
data.drop('age', inplace=True, axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the two continuous features; each column is reshaped to (n, 1) because the
# scaler is given one column at a time, and the scaler is re-fitted for each column.
# NOTE(review): the scaler is fitted on the full dataset before the train/test split
# performed later in the notebook, so test rows influence the scaling — confirm this is intended.
stn = StandardScaler()
data['avg_glucose_level'] = stn.fit_transform(data['avg_glucose_level'].values.reshape(-1, 1))
data['bmi'] = stn.fit_transform(data['bmi'].values.reshape(-1, 1))


In [None]:
data

# Feature Selection
## Chi2

In [None]:
from sklearn.feature_selection import chi2, SelectKBest

# Split the frame into the feature matrix and the target.
x = data.drop('stroke', axis=1)
y =  data['stroke']

# chi2 needs non-negative inputs. Min-max scale every feature to [0, 1]: this keeps the
# ordering of the values and puts all features on one scale. Taking abs() instead would
# fold the negative half of the standardised columns onto the positive half.
x_numeric = x.astype(float)
feature_span = (x_numeric.max() - x_numeric.min()).replace(0, 1)  # guard against constant columns
x_nonneg = (x_numeric - x_numeric.min()) / feature_span

best_f = SelectKBest(chi2, k=5)
best_f.fit(x_nonneg, y)

# Pair every feature name with its chi2 score and show the five highest.
dfscores = pd.DataFrame(best_f.scores_)
dfcolumns = pd.DataFrame(x.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)

featureScores.columns = ['feature','score']  
print(featureScores.nlargest(5,'score')) 

In [None]:
plt.figure(figsize=(8, 8))
sns.heatmap(x.corr(), annot=True)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation. A fixed seed makes the split reproducible, and
# stratifying on y keeps the stroke class proportions equal in both parts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y)

In [None]:
from  sklearn.metrics import confusion_matrix

# Random_Forest:


In [None]:
from sklearn.ensemble  import RandomForestClassifier
# Fit a small random forest; the seed makes the fitted trees reproducible.
clf_r = RandomForestClassifier(n_estimators=10, random_state=42)
clf_r.fit(x_train, y_train)
# Training accuracy alone only shows how well the model fits data it has already seen,
# so the held-out accuracy is printed right after it.
print(clf_r.score(x_train, y_train))
print(clf_r.score(x_test, y_test))
r_predict = clf_r.predict(x_test)
# Evaluate Model:
print(confusion_matrix(y_test, r_predict))

# Logistic Regression:

In [None]:
from  sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings.
clf_l = LogisticRegression()
clf_l.fit(x_train, y_train)
# Training accuracy alone only shows how well the model fits data it has already seen,
# so the held-out accuracy is printed right after it.
print(clf_l.score(x_train, y_train))
print(clf_l.score(x_test, y_test))
l_predict = clf_l.predict(x_test)
# Evaluate Model:
print(confusion_matrix(y_test, l_predict))

# SVM:

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Grid of SVC settings to search: kernel type and the C parameter.
parameters = {'kernel':('linear', 'poly'), 'C':[1,2, 3, 4, 5]}

svc = SVC()
clf_sg = GridSearchCV(svc, parameters)
clf_sg.fit(x_train, y_train)

print(clf_sg.best_estimator_)
# Training accuracy alone only shows how well the model fits data it has already seen,
# so the held-out accuracy is printed right after it.
print(clf_sg.score(x_train, y_train))
print(clf_sg.score(x_test, y_test))

sg_predict = clf_sg.predict(x_test)
# Evaluate Model:
print(confusion_matrix(y_test, sg_predict))

# PCA:

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x2d = pca.fit_transform(x_train)

In [None]:
plt.scatter(x = x2d[:, 0], y = x2d[:, 1])

In [None]:
pca.explained_variance_ratio_

# Another Way To Do Feature Selection:
### Use Boruta
#### See How It Works



Faster run times, thanks to scikit-learn

Scikit-learn like interface

Compatible with any ensemble method from scikit-learn

Automatic n_estimator selection

Ranking of features

Feature importances are derived from Gini impurity instead of RandomForest R package's MDA

In [None]:
import xgboost as xgb
model = xgb.XGBClassifier()
from boruta import BorutaPy
feature = BorutaPy(model, n_estimators='auto', verbose=2, random_state=1)

## Note: Using np.array Is Not Mandatory

In [None]:
feature.fit(np.array(x_train), np.array(y_train))

In [None]:
print(feature.support_)
print(feature.ranking_)
print(x_train.columns[feature.support_].to_list())

In [None]:
x_boruta = feature.transform(np.array(x_train))

In [None]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(x_boruta, y_train)

In [None]:
x_test_filtered = feature.transform(np.array(x_test))
prediction_xgb = xgb_model.predict(x_test_filtered)

In [None]:
from sklearn import metrics
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_xgb))

In [None]:
# Confusion matrix of the XGBoost predictions on the held-out set.
cm = confusion_matrix(y_test, prediction_xgb)
#print(cm)
# fmt='d' prints the integer counts as-is; the default annotation format switches to
# scientific notation once a count has three or more digits.
sns.heatmap(cm, annot=True, fmt='d')