In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import io

  
# Read the clinical-records CSV from the relative ../input directory into `df`,
# then echo the frame as plain text.
df = pd.read_csv(
    filepath_or_buffer='../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
)
print(df)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

### EXPLORATORY DATA ANALYSIS

In [None]:
# `df1` is a copy of `df` in which the listed columns are re-typed as pandas
# categoricals; `df` itself keeps its original dtypes.
df1 = df.copy().astype(
    dict.fromkeys(
        ['anaemia', 'diabetes', 'sex', 'high_blood_pressure', 'smoking', 'DEATH_EVENT'],
        'category',
    )
)


#### *1) UNIVARIATE*

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np

In [None]:
def dist_box(data):
    """Plot a box plot stacked above a histogram for one numeric Series.

    The combined figure is meant for univariate analysis of a continuous
    variable: spread, central tendency, dispersion and outliers. Vertical
    lines on the histogram mark the mean, the median and the first mode.

    Parameters
    ----------
    data : pandas.Series
        Column to plot. Its ``name`` is upper-cased and used in the title.
    """
    Name = str(data.name).upper()
    fig, (ax_box, ax_dis) = plt.subplots(
        2, 1, gridspec_kw={"height_ratios": (.25, .75)}, figsize=(8, 5)
    )
    mean = data.mean()
    median = data.median()
    modes = data.mode()  # may hold several values, or none for an all-NaN Series
    fig.suptitle("SPREAD OF DATA FOR " + Name, fontsize=18, fontweight='bold')
    sns.boxplot(x=data, showmeans=True, orient='h', color="violet", ax=ax_box)
    ax_box.set(xlabel='')
    # histplot is the supported replacement for the deprecated distplot(kde=False).
    sns.histplot(data, kde=False, color='blue', ax=ax_dis)
    # Label each line explicitly so the legend does not depend on the order in
    # which artists happen to have been added to the axes.
    ax_dis.axvline(mean, color='r', linestyle='--', linewidth=2, label='Mean')
    ax_dis.axvline(median, color='g', linestyle='-', linewidth=2, label='Median')
    if not modes.empty:
        ax_dis.axvline(modes.iloc[0], color='y', linestyle='-', linewidth=2, label='Mode')
    ax_dis.legend()

In [None]:
# Draw the spread figure for every quantitative column.
# The column names are taken from df1's numeric dtypes; the values are read from df.
list_col = df1.select_dtypes([np.number]).columns
for numeric_col in list_col:
    dist_box(df[numeric_col])

In [None]:
# Function to create barplots that indicate percentage for each category.
def bar_perc(plot, feature):
    """Annotate every bar of ``plot`` with its share of the column.

    ``plot`` is the axes-like object whose ``patches`` are the bars;
    ``feature`` is the column the bars were counted from, and its length is
    the denominator of each percentage.
    """
    n_rows = len(feature)
    for bar in plot.patches:
        bar_height = bar.get_height()
        label = f"{100 * bar_height / n_rows:.1f}%"
        anchor_x = bar.get_x() + bar.get_width() / 2 - 0.05  # slightly left of the bar centre
        anchor_y = bar.get_y() + bar_height                  # top edge of the bar
        plot.annotate(label, (anchor_x, anchor_y), size=12)

In [None]:
# Count plot for every categorical column of df1, one panel per column,
# with each bar annotated by its share of the rows.
list_col = df1.select_dtypes(['category']).columns
# Size the grid from the number of categorical columns instead of a fixed 6.
# squeeze=False keeps the axes array two-dimensional even for a single panel,
# so flattening it always yields something indexable.
fig1, axes1 = plt.subplots(1, max(len(list_col), 1), figsize=(30, 10), squeeze=False)
axes1 = axes1.ravel()
for i, col in enumerate(list_col):
    order = df1[col].value_counts(ascending=False).index  # most frequent category first
    sns.countplot(x=col, data=df1, order=order, ax=axes1[i], palette='viridis').set(title=col.upper())
    bar_perc(axes1[i], df1[col])

**Observations :**
- Serum Creatinine is highly skewed and has a lot of outliers.
- Creatinine Phosphokinase also has a lot of outliers


#### *2) BIVARIATE AND MULTIVARIATE ANALYSIS*

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of `df`.
correlations = df.corr()
plt.figure(figsize=(15, 5))
sns.heatmap(data=correlations, cmap="YlGn", annot=True)
plt.show()

**Observations :**
- The most useful features are age and serum_creatinine.

Drop the 'time' feature, since it is considered a useless variable.

In [None]:
# Remove the `time` column from both working frames.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: running it a second time no longer raises KeyError because the
# column is already gone.
df1 = df1.drop(columns=['time'], errors='ignore')

df = df.drop(columns=['time'], errors='ignore')

In [None]:
# AGE VS DEATH_EVENT
# Box plot of `age` grouped by the value of `DEATH_EVENT`.

sns.boxplot(data=df1, x='DEATH_EVENT', y='age')


In [None]:
# ANAEMIA VS DEATH_EVENT
# The cell body used to be a bare `plt.scatter` reference, which draws nothing.
# Both columns are categorical in df1, so compare them with grouped counts:
# one group of bars per anaemia value, split by DEATH_EVENT.

sns.countplot(data=df1, x='anaemia', hue='DEATH_EVENT')

In [None]:
# SERUM_CREATININE VS DEATH_EVENT
# Box plot of `serum_creatinine` grouped by the value of `DEATH_EVENT`.

sns.boxplot(data=df1, x='DEATH_EVENT', y='serum_creatinine')


Scale the data, i.e. max-abs scaling: each numeric column is divided by its maximum absolute value.

In [None]:
# Max-abs scaling on a copy of `df`: every numeric column is divided by its
# largest absolute value.
df_max_scaled = df.copy()

# select_dtypes matches every numeric width. The previous
# `dtype in ['int', 'float']` test only matched the platform-default int/float
# sizes, so columns of another width would have been skipped silently.
num_cols = df_max_scaled.select_dtypes(include=[np.number]).columns.tolist()

# One vectorised division; the per-column maxima are aligned on column name.
df_max_scaled[num_cols] = df_max_scaled[num_cols] / df_max_scaled[num_cols].abs().max()

display(df_max_scaled)

In [None]:
# Repeat the spread figures on the scaled frame, one figure per numeric column.
list_col = df_max_scaled.select_dtypes([np.number]).columns
for scaled_col in list_col:
    dist_box(df_max_scaled[scaled_col])

In [None]:
# Split `df` by the target: rows with DEATH_EVENT equal to 1, and rows with it equal to 0.
death_1 = df[df['DEATH_EVENT'].eq(1)]

death_0 = df[df['DEATH_EVENT'].eq(0)]

In [None]:
# For every numeric column of `df`, draw the spread figure twice:
# first for the DEATH_EVENT == 0 rows, then for the DEATH_EVENT == 1 rows.
list_col = df.select_dtypes([np.number]).columns
for numeric_col in list_col:
    for outcome_frame in (death_0, death_1):
        dist_box(outcome_frame[numeric_col])

### MODEL BUILDING

#### *1) PREPARATIONS*

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold,StratifiedKFold

In [None]:
# Hold-out split on `df` (no outlier filtering), followed by standardisation.
X = df.drop(columns='DEATH_EVENT')
y = df["DEATH_EVENT"]
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X, y, test_size=0.20, random_state=0
)
# The scaler is fitted on the training part only and then applied to both parts.
scaler = StandardScaler().fit(X_train_orig)
X_train_orig = scaler.transform(X_train_orig)
X_test_orig = scaler.transform(X_test_orig)

In [None]:
df2 = df.copy()

## REMOVE OUTLIERS

def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` computed on
    ``df_in[col_name]``. Rows with a missing value in that column are dropped,
    because a NaN never compares as lying between the fences.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to filter; it is not modified.
    col_name : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df_in`` inside the fences (bounds included).
    """
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # Bounds are inclusive: a value sitting exactly on a fence is not an outlier.
    # With strict comparisons a column whose IQR is 0 (fence_low == fence_high)
    # would lose every row.
    within_fences = df_in[col_name].between(fence_low, fence_high)
    return df_in.loc[within_fences]

# Filter df2 on each listed column in turn (each pass recomputes the fences on
# the rows that survived the previous pass), then split and standardise.
cols = ['creatinine_phosphokinase', 'serum_creatinine', 'platelets']

for outlier_col in cols:
    df2 = remove_outlier(df2, outlier_col)

X = df2.drop(columns='DEATH_EVENT')
y = df2["DEATH_EVENT"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)
# The scaler is fitted on the training part only and then applied to both parts.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

**We will use the following models :**

*1) Logistic Regression*

*2) Naive Bayes*

*3) Random Forest Classifier*

*4) Extreme Gradient Boost*

*5) K-Nearest Neighbour*

*6) Decision Tree*

*7) Support Vector Machine*

In [None]:
## LOGISTIC REGRESSION
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m1 = 'Logistic Regression'
lr = LogisticRegression()
model = lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)
lr_acc_score = accuracy_score(y_test, lr_predict)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
# One print call with a newline separator emits the same text as three separate prints.
print("confussion matrix", lr_conf_matrix, "\n", sep="\n")
print("Accuracy of Logistic Regression:", lr_acc_score * 100, '\n')
print(classification_report(y_test, lr_predict))

In [None]:
# Same model on the *_orig split (built from `df` without outlier filtering).
# NOTE: this cell reuses the names m1 / lr / model / lr_predict / lr_conf_matrix /
# lr_acc_score, so the values from the previous logistic-regression cell are overwritten.
m1 = 'Logistic Regression'
lr = LogisticRegression()
model = lr.fit(X_train_orig, y_train_orig)
lr_predict = lr.predict(X_test_orig)
lr_acc_score = accuracy_score(y_test_orig, lr_predict)
lr_conf_matrix = confusion_matrix(y_test_orig, lr_predict)
print("confussion matrix", lr_conf_matrix, "\n", sep="\n")
print("Accuracy of Logistic Regression:", lr_acc_score * 100, '\n')
print(classification_report(y_test_orig, lr_predict))

In [None]:
## NAIVE BAYES
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m2 = 'Naive Bayes'
nb = GaussianNB()
nb.fit(X_train, y_train)
nbpred = nb.predict(X_test)
nb_acc_score = accuracy_score(y_test, nbpred)
nb_conf_matrix = confusion_matrix(y_test, nbpred)
print("confussion matrix", nb_conf_matrix, "\n", sep="\n")
print("Accuracy of Naive Bayes model:", nb_acc_score * 100, '\n')
print(classification_report(y_test, nbpred))

In [None]:
# Same model on the *_orig split (built from `df` without outlier filtering).
# NOTE: this cell reuses the names m2 / nb / nbpred / nb_conf_matrix / nb_acc_score,
# so the values from the previous Naive Bayes cell are overwritten.
m2 = 'Naive Bayes'
nb = GaussianNB()
nb.fit(X_train_orig, y_train_orig)
nbpred = nb.predict(X_test_orig)
nb_acc_score = accuracy_score(y_test_orig, nbpred)
nb_conf_matrix = confusion_matrix(y_test_orig, nbpred)
print("confussion matrix", nb_conf_matrix, "\n", sep="\n")
print("Accuracy of Naive Bayes model:", nb_acc_score * 100, '\n')
print(classification_report(y_test_orig, nbpred))

In [None]:
## RANDOM FOREST
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m3 = 'Random Forest Classfier'
rf = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    random_state=2,
)
rf.fit(X_train, y_train)
rf_predicted = rf.predict(X_test)
rf_acc_score = accuracy_score(y_test, rf_predicted)
rf_conf_matrix = confusion_matrix(y_test, rf_predicted)
print("confussion matrix", rf_conf_matrix, "\n", sep="\n")
print("Accuracy of Random Forest:", rf_acc_score * 100, '\n')
print(classification_report(y_test, rf_predicted))

In [None]:
## XGBOOSTING
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m4 = 'Extreme Gradient Boost'
xgb = XGBClassifier(
    booster='dart',
    learning_rate=0.01,
    n_estimators=25,
    max_depth=15,
    gamma=0.6,
    reg_lambda=2,
    subsample=0.52,
    colsample_bytree=0.6,
    colsample_bylevel=0.6,
    colsample_bynode=0.5,
    seed=27,
)
xgb.fit(X_train, y_train)
xgb_predicted = xgb.predict(X_test)
xgb_acc_score = accuracy_score(y_test, xgb_predicted)
xgb_conf_matrix = confusion_matrix(y_test, xgb_predicted)
print("confussion matrix", xgb_conf_matrix, "\n", sep="\n")
print("Accuracy of Extreme Gradient Boost:", xgb_acc_score * 100, '\n')
print(classification_report(y_test, xgb_predicted))

In [None]:
## KNN
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m5 = 'K-NeighborsClassifier'
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
knn_predicted = knn.predict(X_test)
knn_acc_score = accuracy_score(y_test, knn_predicted)
knn_conf_matrix = confusion_matrix(y_test, knn_predicted)
print("confussion matrix", knn_conf_matrix, "\n", sep="\n")
print("Accuracy of K-NeighborsClassifier:", knn_acc_score * 100, '\n')
print(classification_report(y_test, knn_predicted))

In [None]:
## DECISION TREE
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m6 = 'DecisionTreeClassifier'
dt = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=6,
    random_state=0,
)
dt.fit(X_train, y_train)
dt_predicted = dt.predict(X_test)
dt_acc_score = accuracy_score(y_test, dt_predicted)
dt_conf_matrix = confusion_matrix(y_test, dt_predicted)
print("confussion matrix", dt_conf_matrix, "\n", sep="\n")
print("Accuracy of DecisionTreeClassifier:", dt_acc_score * 100, '\n')
print(classification_report(y_test, dt_predicted))

In [None]:
## SVC
# Fit on X_train / y_train and report hold-out metrics on X_test / y_test.

m7 = 'Support Vector Classifier'
svc = SVC(C=2, kernel='rbf')
svc.fit(X_train, y_train)
svc_predicted = svc.predict(X_test)
svc_acc_score = accuracy_score(y_test, svc_predicted)
svc_conf_matrix = confusion_matrix(y_test, svc_predicted)
print("confussion matrix", svc_conf_matrix, "\n", sep="\n")
print("Accuracy of Support Vector Classifier:", svc_acc_score * 100, '\n')
print(classification_report(y_test, svc_predicted))

In [None]:
df2.columns

#### *2) HYPERPARAMETER TUNING*

**- Parameter grid for Logistic Regression**

In [None]:
# Grid for logistic regression: penalty type and inverse regularisation strength C.
# The solver is pinned to 'liblinear' because the estimator's default solver
# (lbfgs) does not support the l1 penalty; without it every l1 candidate fails
# to fit and is scored NaN, so only half of the grid is really searched.
grid_param = {
    'penalty': ['l1', 'l2'],
    'C' : [0.001, 0.01, 0.1, 0.005, 0.5, 1, 10],
    'solver': ['liblinear']
}

# 5-fold cross-validated search on the training split, using all CPU cores.
grid_search_lr = GridSearchCV(lr, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_lr.fit(X_train, y_train)

In [None]:
## best parameters
grid_search_lr.best_params_

In [None]:
## best score
grid_search_lr.best_score_

In [None]:
grid_search_lr.cv_results_

**- Parameter grid for Random Forest**

In [None]:
# Grid for the random forest: split criterion, tree depth and features tried per split.
# 'auto' is left out of max_features: for classifiers it was an alias of 'sqrt'
# (a duplicate candidate) and recent scikit-learn releases reject it, which
# makes those candidates fail to fit.
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'max_features' : ['sqrt', 'log2']
}

# 5-fold cross-validated search on the training split, using all CPU cores.
grid_search_rf = GridSearchCV(rf, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_rf.fit(X_train, y_train)

In [None]:
print(grid_search_rf.best_score_)
print(grid_search_rf.best_params_)

**- Parameter grid for Decision Tree**

In [None]:
# Grid for the decision tree.
# - min_samples_split starts at 2: an integer value of 1 is not a valid setting
#   (a node needs at least two samples to be split), so those candidates could
#   only fail to fit.
# - 'auto' is left out of max_features: for classifiers it was an alias of
#   'sqrt' and recent scikit-learn releases reject it.
grid_param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 7, 10],
    'splitter' : ['best', 'random'],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    'min_samples_split' : [2, 3, 5, 7],
    'max_features' : ['sqrt', 'log2']
}

# 5-fold cross-validated search on the training split, using all CPU cores.
grid_search_dtc = GridSearchCV(dt, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dtc.fit(X_train, y_train)

In [None]:
print(grid_search_dtc.best_params_)
print(grid_search_dtc.best_score_)

#### *3) ENSEMBLING*

In [None]:
from mlxtend.classifier import StackingCVClassifier

In [None]:
# Stack lr, knn and xgb as base classifiers under svc as the meta-classifier.
# random_state is fixed so that the internal cross-validation split used to
# build the meta-features is the same on every run (it was left unseeded).
scv=StackingCVClassifier(classifiers=[lr,knn,xgb],
                         meta_classifier= svc,
                         random_state=0)

scv.fit(np.asarray(X_train),np.asarray(y_train))
scv_predicted = scv.predict(X_test)
scv_conf_matrix = confusion_matrix(y_test, scv_predicted)
scv_acc_score = accuracy_score(y_test, scv_predicted)
print("confussion matrix")
print(scv_conf_matrix)
print("\n")
print("Accuracy of StackingCVClassifier:",scv_acc_score*100,'\n')
print(classification_report(y_test,scv_predicted))

In [None]:
predictions = scv.predict(X_test)

In [None]:
# Wrap the X_test matrix in a DataFrame with the feature names attached.
test = pd.DataFrame(
    data=X_test,
    columns=[
        'age',
        'anaemia',
        'creatinine_phosphokinase',
        'diabetes',
        'ejection_fraction',
        'high_blood_pressure',
        'platelets',
        'serum_creatinine',
        'serum_sodium',
        'sex',
        'smoking',
    ],
)

In [None]:
# Put each predicted label next to the feature values of the same test row and
# write the result to Submission.csv.
pred = pd.DataFrame(predictions)
pred.columns = ['DEATH_EVENT']
# axis=1 joins the frames side by side on their row index. The default axis=0
# stacked `pred` underneath `test`, giving twice as many rows with NaN in
# every cell that the other frame did not supply.
sub_df = pd.concat([test, pred], axis=1)
sub_df.to_csv('Submission.csv', index = False)

In [None]:
pred