# Heart failure(death) model

Information about the data:


    1.Sex - Gender of patient Male = 1, Female =0
    2.Age - Age of patient
    3.Diabetes - 0 = No, 1 = Yes
    4.Anaemia - 0 = No, 1 = Yes
    5.High_blood_pressure - 0 = No, 1 = Yes
    6.Smoking - 0 = No, 1 = Yes
    7.DEATH_EVENT - 0 = No, 1 = Yes
 



### Several factors affect the death event. This dataset contains each person's information, such as age, sex, blood pressure, smoking, diabetes, ejection fraction, creatinine phosphokinase, serum_creatinine, serum_sodium and time, and we have to predict their DEATH_EVENT.

#### Requirements

In [None]:
#Regular EDA
import warnings

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')


#Evaluation
from sklearn.model_selection import train_test_split,cross_val_score,cross_validate
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import precision_score,recall_score,f1_score
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    # plot_roc_curve was removed in scikit-learn 1.2. Keep the old name
    # available so the plotting cells keep working on newer releases.
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator


#Models
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from joblib import dump, load
  




## Importing the data 

In [None]:
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` keeps the same behaviour: malformed rows are skipped
# and a warning is emitted for each of them.
DATA_URL = "https://raw.githubusercontent.com/sunnymoon-sultan/Heart_failure_classifier/main/heart.csv"
data = pd.read_csv(DATA_URL, on_bad_lines="warn")

## EDA (exploratory data analysis)

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
print(f"the size of the data is: {data.size}. and the shape of the dataset is: {data.shape}" )

In [None]:
data.columns

In this dataset we don't need the `time` feature because it is not relevant to our prediction task,
so we are just going to drop it.

In [None]:
# Rebind instead of mutating in place, and ignore a missing column, so that
# re-running this cell does not raise KeyError once "time" is already gone.
data = data.drop(columns="time", errors="ignore")

Let's find out the unique values in our dataset!

In [None]:
data.nunique()

Let's see the distribution of the DEATH_EVENT column

In [None]:
data["DEATH_EVENT"].value_counts().plot(kind='bar',color=["salmon","lightblue"]);
print(data.DEATH_EVENT.value_counts())


As we can see there is a data imbalance in the dataset!

For this we will use oversampling technique to balance our data!

In [None]:
! pip install imblearn

In [None]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# Define the feature matrix x and the target y. The target is selected by name
# rather than by position, so a reordered CSV cannot silently change what is
# being predicted.
x = data.drop(columns="DEATH_EVENT")
y = data["DEATH_EVENT"]
# Oversample the full dataset; the bar chart below shows the resulting class
# counts. NOTE: rows in x_ros can be duplicates of each other, so splitting
# x_ros into train/test parts can put copies of one patient on both sides.
x_ros,y_ros = ros.fit_resample(x,y)
y_ros.value_counts().plot(kind="bar",color=["salmon","blue"],figsize=(10,6));

Let's focus on our EDA now!

Relationship between anaemia and DEATH_EVENT

In [None]:
pd.crosstab(data.anaemia,data.DEATH_EVENT).plot(kind="bar",color=["black","red"],figsize=(10,6));
plt.ylabel("Amount")
plt.title("Aneamia[0]=Don't have Aneamia,Aneamia[1]=Have Aneamia")

plt.xticks(rotation= 0);

In [None]:
pd.crosstab(data.diabetes,data.DEATH_EVENT).plot(kind="bar",color=["green","red"],figsize=(10,6));
plt.ylabel("Amount")
plt.xticks(rotation= 0);

### The corr() matrix

In [None]:
corr_matrix= data.corr()
fig,ax = plt.subplots(figsize=(15,10))
ax = sns.heatmap(corr_matrix,
annot=True,
linewidths=0.5,
fmt = ".2f",
cmap="YlGnBu");

In [None]:
sns.pairplot(data)

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(data.serum_sodium, kde=True, stat="density");


In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(data.age, kde=True, stat="density");

In [None]:
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(data.serum_creatinine, kde=True, stat="density");


## MODELING

Now we are going to fit our data to an algorithm/model.
For that we are going to create a function that gives us the accuracy of all the models that we have chosen in one line!

In [None]:
models = {"Logistic Regression":LogisticRegression(),
"KNN":KNeighborsClassifier(),
"Random Forest": RandomForestClassifier(),
         "SVC":SVC(),
          "Gradient_boosting":GradientBoostingClassifier(),
          "DecissionTree":DecisionTreeClassifier(),
         "lightgbm":lgb.LGBMClassifier(),
         "Xg boost":xgb.XGBClassifier()}
#the function!
def fit_and_score(models,x_train,x_test,y_train,y_test):
    """Fit every model on the training data and score it on the test data.

    models: mapping of display name -> estimator exposing ``fit`` and ``score``.
    x_train, y_train: data handed to each estimator's ``fit``.
    x_test, y_test: data handed to each estimator's ``score``.

    Returns a dict mapping each name to the value returned by ``score``,
    in the same order as ``models``.
    """
    # Seed NumPy's global RNG before any estimator is trained.
    np.random.seed(42)

    def _train_then_evaluate(estimator):
        estimator.fit(x_train, y_train)
        return estimator.score(x_test, y_test)

    return {label: _train_then_evaluate(estimator)
            for label, estimator in models.items()}
#splitting the data into test and train sets!
np.random.seed(42)
# Split the ORIGINAL rows first and oversample only the training part.
# Splitting the already-oversampled x_ros/y_ros lets duplicates of the same
# patient land in both the train and the test set, which inflates every test
# score. The test set now contains only real, unseen rows, and `stratify`
# keeps the class ratio the same on both sides of the split.
x_train_raw, x_test, y_train_raw, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)
x_train, y_train = ros.fit_resample(x_train_raw, y_train_raw)
x_t,x_te,y_t,y_te = train_test_split(x,y,test_size = 0.2)



In [None]:
model_scores = fit_and_score(models = models,
x_train=x_train,
x_test=x_test,
y_train = y_train,
y_test = y_test)
print(model_scores)

Let's visualize it!

In [None]:
model_compare = pd.DataFrame(model_scores,index=["accuracy"])
model_compare.T.plot.bar(color=["salmon"],figsize=(15,6))
plt.xticks(rotation=0);
print(f"{max(model_scores.values())*100}")

## The models we will work with:
### 1.XG boost,
### 2.lightgbm,
### 3.Random forest,
### 4.Gradient boosting

## Hyperparameter tuning!

Let's see if we can improve our model's score.

### XG BOOST tuning

In [None]:
## randomized search cv has been used to tune hyper parameter tuning

# Search space for the XGBoost classifier.
# `subsample` and `colsample_bytree` are fractions and must lie in (0, 1], so
# the out-of-range values (2 and 5) were removed. `maximize` was dropped
# because it is an argument of xgboost's training function, not an estimator
# hyperparameter.
xg_grid = {"n_estimators":np.arange(200,800),
"learning_rate":[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
"max_depth":[3,4,5,9,6],
'min_child_weight': [1, 5, 10,6],
"subsample":[0.8,0.884,0.9,1,0.7,0.5],
"gamma":[0.5,0.6,0.9,3,5,1,2,4],
"alpha":np.arange(0,5),
"objective":["reg:logistic","binary:logistic"],
"colsample_bytree":[0.1,1,0.5,0.3],
"booster":["gbtree","gblinear","dart"]}

### Tuning Light gbm

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
# Parameter "grid" for LightGBM. Every key maps to a single-value list, so
# this describes exactly one candidate configuration rather than a range of
# alternatives.
lgbm_grid={'objective': ['binary'],
             'metric': ['auc'],
             'is_unbalance':[True],
             'bagging_freq':[5],
             'boosting':['dart'],
             'num_boost_round':[300],
             'early_stopping_rounds':[30]}

# Randomized search over lgbm_grid with 5-fold CV, ranked by ROC AUC.
# n_iter=100 asks for more candidates than the single one the grid contains.
# NOTE(review): because scoring='roc_auc', rs_lgbm.score(...) presumably
# reports ROC AUC rather than accuracy - confirm against the scikit-learn docs.
estimator = lgb.LGBMClassifier()
rs_lgbm = RandomizedSearchCV(
    estimator, param_distributions=lgbm_grid, 
    n_iter=100,
    cv=5,
    scoring='roc_auc',
    random_state=314,
    verbose=True)

In [None]:
rs_lgbm.fit(x_train,y_train)


In [None]:
rs_lgbm.best_params_

In [None]:
# rs_lgbm was built with scoring='roc_auc', so rs_lgbm.score() returns ROC AUC,
# while model_scores holds accuracy. Score the refitted best estimator directly
# so that both numbers are accuracies and can be compared.
lgbm_tuned_accuracy = rs_lgbm.best_estimator_.score(x_test,y_test)
print(f"Before tuning:{model_scores['lightgbm']*100} after tuning:{lgbm_tuned_accuracy*100}")

As we can see, LightGBM is giving us a score of 95%. Excellent!

## Tuning Random forest

In [None]:
# Search space for the random forest.
# "auto" was removed from `max_features` in scikit-learn 1.3 and, for
# classifiers, meant the same as "sqrt", so it added no real alternative.
# "log2" is offered instead to keep two distinct choices.
rf_grid = {"n_estimators":np.arange(100,900),
"max_depth": [None,3,5,10,6,7],
"min_samples_split":np.arange(2,20),
"min_samples_leaf":np.arange(1,20),
"max_features":["sqrt","log2"]}
# Seed both the forest and the sampler of candidate configurations so the
# search returns the same best_params_ on every Restart & Run All.
estimator = RandomForestClassifier(random_state=42)
rs_rf = RandomizedSearchCV(
    estimator, param_distributions=rf_grid,
    n_iter=90,
    cv=5,
    random_state=42,
    verbose=True)


In [None]:
rs_rf.fit(x_train,y_train)

In [None]:
rs_rf.best_params_

In [None]:
print(f"Before tuning:{model_scores['Random Forest']*100} after tuning:{rs_rf.score(x_test,y_test)*100}")

NICE! Random forest is giving us much better level of accuracy! 

## Tuning Gradient boosting

In [None]:
# Search space for gradient boosting.
# - "max_depth" used to appear twice in this literal; Python keeps only the
#   last value, so the effective list [5,6,7,8] is now stated once.
# - `subsample` is a fraction in (0, 1]; the invalid value 2 made every
#   candidate that drew it fail to fit, so it was removed.
gb_grid = {"n_estimators":np.arange(200,800),
"learning_rate":[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
"max_depth":[5,6,7,8],
"subsample":[0.8,0.884,0.9,1.0,0.7,0.5],
"min_samples_leaf":[20,10,50,30],
"max_features":["sqrt"]}

# With n_jobs=-1 the candidates are fitted in worker processes, which do not
# share the notebook's np.random.seed state. Explicit random_state values on
# the estimator and on the search make the result reproducible.
estimator = GradientBoostingClassifier(random_state=42)
rs_gb = RandomizedSearchCV(
    estimator, param_distributions=gb_grid,
    n_iter=90,
    cv=5,
    n_jobs = -1,
    random_state=42,
    verbose=True)
rs_gb.fit(x_train,y_train)


In [None]:
rs_gb.best_params_

In [None]:
print(f"Before tuning:{model_scores['Gradient_boosting']*100} after tuning:{rs_gb.score(x_test,y_test)*100}")

Which one should we use?
Well as we can see 
our best model  so far is LIGHTGBM,Xgboost,Random_forest!


### Let's evaluate our model 

Let's see which one's performance is better! XGBOOST vs RANDOMFOREST!

### XG boost evaluation

Cross validation !

We are going to define a function which is going to plot the CV scores!

In [None]:
def cross_val_plot(model,x,y,CV):
    """Cross-validate ``model`` and draw a bar chart of four mean scores.

    model: estimator handed to scikit-learn's ``cross_validate``.
    x, y: features and target, passed through unchanged.
    CV: forwarded as the ``cv`` argument.

    The chart shows mean accuracy, precision, f1 and recall as percentages
    rounded to two decimals; the same numbers are written into the title.

    Returns the object produced by ``DataFrame.plot.bar``.
    """
    metric_names = ["accuracy", "precision", "f1", "recall"]
    # A single cross_validate call fits each fold once and scores all four
    # metrics on the same fitted models. Running one cross_val_score per
    # metric refitted the model four times, and for unseeded models each
    # metric came from a different fit.
    cv_results = cross_validate(model, x, y, cv=CV, scoring=metric_names)
    mean_scores = {
        metric: float(format(np.mean(cv_results[f"test_{metric}"] * 100), ".2f"))
        for metric in metric_names
    }
    score_dict = pd.DataFrame(mean_scores,index=["score"])
    ploting = score_dict.T.plot.bar()
    plt.title(f"Accuracy:{mean_scores['accuracy']},Precision:{mean_scores['precision']},f1:{mean_scores['f1']},recall:{mean_scores['recall']}")
    return ploting

In [None]:
model_xgb = xgb.XGBClassifier()
cross_val_plot(model_xgb,x_ros,y_ros,5)

## confusion matrix(XGB BOOST)

In [None]:
model_xgb.fit(x_train,y_train)
y_preds_xgb = model_xgb.predict(x_test)
sns.set(font_scale=1.5)
def plot_conf_mat(y_test,y_preds):
    """Draw a confusion-matrix heatmap of true labels against predictions.

    y_test: true labels.
    y_preds: predicted labels for the same samples.

    Draws on a new 3x3-inch figure and returns nothing.
    """
    fig,ax = plt.subplots(figsize=(3,3))
    # confusion_matrix(y_true, y_pred) puts true classes on the rows and
    # predicted classes on the columns. The heatmap draws rows along the
    # y-axis, so y is "True" and x is "Predicted".
    sns.heatmap(confusion_matrix(y_test,y_preds),
                annot=True,
                fmt="d",  # plain integer counts instead of e.g. 1e+02
                cbar=True,
                ax=ax)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

plot_conf_mat(y_test,y_preds_xgb)

# ROC AUC

In [None]:
plot_roc_curve(model_xgb,x_test,y_test);

## Classification  report

In [None]:
print(classification_report(y_test,y_preds_xgb))

## Random forest Evaluation!

In [None]:
model_rf = RandomForestClassifier(n_estimators= 531,
 min_samples_split= 2,
 min_samples_leaf= 1,
 max_features= "sqrt",
 max_depth= 10)
model_rf.fit(x_train,y_train)
y_preds_rf = model_rf.predict(x_test)

Cross validation of random forest

In [None]:
cross_val_plot(model_rf,x_ros,y_ros,5);

Confusion matrix of random forest

In [None]:
plot_conf_mat(y_test,y_preds_rf)

ROC/AUC curve of random forest 

In [None]:
plot_roc_curve(model_rf,x_test,y_test);

WOAH! our Random forest classifier is doing best!

Classification report of Random forest classifier

In [None]:
print(classification_report(y_test,y_preds_rf))

### As we can see our Random forest classifier is doing great ! So we can say that we will choose RFclassifier over xgboost!

### Let's see how well our lightgbm is doing!

In [None]:
clf_lgbm = lgb.LGBMClassifier(bagging_freq=5, boosting='dart', early_stopping_rounds=30,
               is_unbalance=True, metric='auc', num_boost_round=300,
               objective='binary')
clf_lgbm.fit(x_train,y_train)
clf_lgbm.score(x_test,y_test)*100


In [None]:
cross_val_plot(clf_lgbm,x_ros,y_ros,5);

In [None]:
y_preds_lgbm = clf_lgbm.predict(x_test)

In [None]:
plot_conf_mat(y_test,y_preds_lgbm);plot_roc_curve(clf_lgbm,x_test,y_test);


In [None]:
# Two separate statements: joining the print calls with a comma built a tuple,
# which the notebook echoed as "(None, None)" below the reports.
print(f"LGBM:{classification_report(y_test,y_preds_lgbm)}")
print(f"RANDOM FOREST:{classification_report(y_test,y_preds_rf)}")

In [None]:
# plot_conf_mat draws a figure and returns nothing, so interpolating it into
# an f-string printed "Random forest:None" / "LGBM:None". Call it directly
# and label each figure with a title instead.
plot_conf_mat(y_test,y_preds_rf)
plt.title("Random forest")
plot_conf_mat(y_test,y_preds_lgbm)
plt.title("LGBM");

## We have chosen RANDOM FOREST CLASSIFIER as our best estimator!

In [None]:
final_model = RandomForestClassifier(n_estimators= 531,
 min_samples_split= 2,
 min_samples_leaf= 1,
 max_features= "sqrt",
 max_depth= 10)

In [None]:
final_model.fit(x_train,y_train)
train_score=final_model.score(x_train,y_train)*100
test_score = final_model.score(x_test,y_test)*100
print(f"The test score is:{test_score}")
print(f"The train score is:{train_score}")

## Let's save our model!

In [None]:
dump(final_model, 'Heart_attack_algorithm.joblib')

# Thank you!