**I found out from this dataset that the "time" feature should not be used as a predictor because it causes problems when applying different feature selection techniques. When using lasso, the other variables are affected by this feature, while the correlation technique shows that this variable is negatively correlated with the target variable. Although my model gets better prediction accuracy when using time as a predictor, I still decided to remove it.**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Read the heart-failure clinical records into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column (isna is an alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

#### Use PCA to check for multicollinearity in the dataset and reduce the dimensions to only two components

#### Scale the data, create a copy of `df` and perform PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Project the standardised predictors onto their first two principal components.
# The target column is left out: feeding DEATH_EVENT into the PCA would let the
# label shape the components and says nothing about collinearity among predictors.
sc = StandardScaler()
df_copy = df.drop(columns=['DEATH_EVENT'])  # drop() returns a new frame; df is untouched

# PCA is sensitive to feature scale, so standardise every predictor first.
# From here on df_copy is the scaled NumPy array, not a DataFrame.
df_copy = sc.fit_transform(df_copy)

pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_copy)

# Reuse df's index so the concat below aligns row-for-row with the target column.
principaldF = pd.DataFrame(data=df_pca, columns=['PC1', 'PC2'], index=df.index)
finaldf = pd.concat([principaldF, df[['DEATH_EVENT']]], axis=1)

finaldf.head()

#### The variance captured by the two principal components is low. This suggests that the variables are only weakly correlated with each other, so the reduced dimensions do not capture much of the variance in the dataset

In [None]:
import plotly.express as px

# Running total of the variance ratio explained by the fitted components.
exp_var_cumul = pca.explained_variance_ratio_.cumsum()
component_index = range(1, len(exp_var_cumul) + 1)

# Area chart of cumulative explained variance against the number of components.
px.area(
    x=component_index,
    y=exp_var_cumul,
    labels={"x": "Components", "y": "Explained Variance"},
)

#### Before proceeding with lasso and model prediction, an exploratory analysis is done

#### First, check the number of deaths for each gender

In [None]:
# Count plot of patients per gender, split by outcome.
sns.set_style("darkgrid", {"axes.facecolor": "0.95"})
fig, ax = plt.subplots(1, 1, figsize=(15, 6))

# Draw on the explicit axes and pin the category order so the labels set below
# are guaranteed to line up with the bars.
sns.countplot(x='sex',
              hue='DEATH_EVENT',
              order=[0, 1],
              hue_order=[0, 1],
              data=df,
              palette=["cornflowerblue", "khaki"],
              ax=ax)

# DEATH_EVENT == 0 means the patient survived and 1 means the patient died, so
# under the "Survival" title the first hue level is "Yes" and the second is "No".
# Reuse the handles seaborn registered for the hue levels: passing labels alone
# would pair them with the first artists on the axes instead.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles, ["Yes", "No"],
          bbox_to_anchor=(1, 1),
          title='Survival')

# sex == 0 is female and sex == 1 is male; this matches the per-gender smoking
# plots in this notebook, which filter sex == 1 for the panel titled "Male".
ax.set_xticklabels(['Female', 'Male'], fontdict={'fontsize': 10, 'fontweight': 'bold'})
# Customize the axes and title
ax.set_title("Death by heart failure amoung genders", fontdict={'fontsize': 20, 'fontweight': 'bold'})
ax.set_ylabel("Amount", fontdict={'fontsize': 15, 'fontweight': 'bold'})
ax.set_xlabel("Gender", fontdict={'fontsize': 15, 'fontweight': 'bold'})

# Annotate every real bar with its integer count. Patches without a positive
# height (NaN or zero-size placeholders that some seaborn versions may add) are skipped.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.text(p.get_x() + p.get_width() / 2., height + .3, str(int(height)), ha="center")


#### Number of deaths among smokers and non-smokers, and the distribution across genders

In [None]:
# Left panel: how many patients smoke. Right panel: smokers vs non-smokers per outcome.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# Category orders are pinned so the tick and legend labels below match the bars.
sns.countplot(x="smoking", data=df, order=[0, 1], ax=ax[0],
              palette=["cornflowerblue", "khaki"])
sns.countplot(x="DEATH_EVENT", hue='smoking', data=df, order=[0, 1], hue_order=[0, 1],
              ax=ax[1], palette=["cornflowerblue", "khaki"])

# Annotate every real bar with its integer count. Patches without a positive
# height (NaN or zero-size placeholders that some seaborn versions may add) are skipped.
for axis in ax:
    for p in axis.patches:
        height = p.get_height()
        if not height > 0:
            continue
        axis.text(p.get_x() + p.get_width() / 2., height + .3, str(int(height)), ha="center")

ax[0].set_xticklabels(['Non smoker', 'Smoker'], fontdict={'fontsize': 10, 'fontweight': 'bold'})
ax[0].set_title("Distrubution of smokers", fontdict={'fontsize': 20, 'fontweight': 'bold'})

# Reuse seaborn's hue handles so each label keeps the colour of its hue level;
# passing labels alone would pair them with the first artists on the axes.
handles, _ = ax[1].get_legend_handles_labels()
ax[1].legend(handles, ["Non smoker", "Smoker"],
             bbox_to_anchor=(1, 1))

# DEATH_EVENT == 0 means the patient survived and 1 means the patient died.
ax[1].set_xticklabels(['Survived', 'Didnt survive'], fontdict={'fontsize': 10, 'fontweight': 'bold'})
ax[1].set_title("Heart failure due to smoking", fontdict={'fontsize': 20, 'fontweight': 'bold'})



In [None]:
# One panel per gender: smoker / non-smoker counts split by outcome.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# sex == 1 is shown as "Male" (left) and sex == 0 as "Female" (right).
for axis, sex_code, panel_title in zip(ax, (1, 0), ('Male', 'Female')):
    sns.countplot(x="smoking", hue='DEATH_EVENT', data=df[df['sex'] == sex_code],
                  order=[0, 1], hue_order=[0, 1],
                  ax=axis, palette=["cornflowerblue", "khaki"])
    axis.set_title(panel_title)

    # DEATH_EVENT == 0 means the patient survived and 1 means the patient died.
    # Reuse seaborn's hue handles so each label keeps the colour of its hue level.
    handles, _ = axis.get_legend_handles_labels()
    axis.legend(handles, ["Survived", "Didnt Survive"],
                loc="upper right")

    # Annotate every real bar with its integer count; patches without a positive
    # height (NaN or zero-size placeholders) are skipped.
    for p in axis.patches:
        height = p.get_height()
        if not height > 0:
            continue
        axis.text(p.get_x() + p.get_width() / 2., height + .3, str(int(height)), ha="center")

fig.suptitle('Amount of deaths and smoking among genders', fontsize=15)

#### 67 males who didn't smoke died

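#### 3 females who did smoke survived (caution: `DEATH_EVENT = 1` marks a death, so the two counts above should be verified against the hue legend of the plot)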

#### Age might be an important factor

In [None]:
# Interactive bar chart of how many patients there are at each age.
age_counts = df["age"].value_counts()
fig = px.bar(age_counts, title="Age distribution")
fig.update_layout(
    xaxis_title="Age",
    yaxis_title="Frequency",
    title_x=0.5,
    showlegend=False
)
fig.show()

# Bucket ages into 8 equal-width bins and count outcomes within each bin.
age = pd.cut(df['age'], 8)
fig, axs = plt.subplots(figsize=(15, 8))
sns.countplot(x=age, hue='DEATH_EVENT', hue_order=[0, 1],
              data=df, palette=["cornflowerblue", "khaki"], ax=axs)
axs.set_title("Age distrubation with deaths",
              {'fontsize': 20, 'fontweight': 'bold'})

# DEATH_EVENT == 0 means the patient survived and 1 means the patient died.
# Reuse seaborn's hue handles so each label keeps the colour of its hue level;
# passing labels alone would pair them with the first artists on the axes.
handles, _ = axs.get_legend_handles_labels()
axs.legend(handles, ["Survived", "Didnt Survive"],
           loc="upper right");

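#### More people appear to have died in the lower age ranges, which is a bit strange. The dataset could be wrong, or the legend labels could be swapped (`DEATH_EVENT = 1` means the patient died)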

#### Even the probability distribution would then suggest that the younger you are, the higher your probability of dying...

In [None]:
# Convert ejection_fraction from a percentage to a fraction in [0, 1].
# The guard makes the cell idempotent: once converted, no value exceeds 1, so
# re-running the cell does not divide the column by 100 a second time.
if df['ejection_fraction'].max() > 1:
    df['ejection_fraction'] = df['ejection_fraction'].div(100).round(2)

In [None]:
# Interactive bar chart of how many patients there are at each ejection fraction value.
ejection_counts = df["ejection_fraction"].value_counts()
fig = px.bar(ejection_counts, title="Distribution of how much blood is pumped out (ejection) as a percentage (normal 50-70%)")
fig.update_layout(
    xaxis_title="Ejection percentage",
    yaxis_title="Frequency",
    title_x=0.5,
    showlegend=False
)
fig.show()

# Outcome counts for every distinct ejection fraction value.
fig, axs = plt.subplots(figsize=(15, 8))
sns.countplot(x='ejection_fraction', hue='DEATH_EVENT', hue_order=[0, 1],
              data=df, palette=["cornflowerblue", "khaki"], ax=axs)
axs.set_title("Ejection distribution and heart failure",
              {'fontsize': 20, 'fontweight': 'bold'})

# DEATH_EVENT == 0 means the patient survived and 1 means the patient died.
# Reuse seaborn's hue handles so each label keeps the colour of its hue level;
# passing labels alone would pair them with the first artists on the axes.
handles, _ = axs.get_legend_handles_labels()
axs.legend(handles, ["Survived", "Didnt Survive"],
           loc="upper right");

#### More people died with an ejection fraction below 50%. Still, a lot of people survived, which may indicate inconsistency in the dataset


#### Perform lasso regularization to shrink the coefficients of unimportant variables to zero and choose the most important features

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Candidate predictors are the first 11 columns; the target is the last column.
X = df.iloc[:, :11].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

# Standardise inside the pipeline so every CV fold is scaled using only its own
# training part, then fit a lasso model on the scaled features.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso()),
])

# 5-fold grid search over the regularisation strength.
# verbose=1 prints a single summary line; verbose=3 emitted one log line for
# each of the 495 fits and buried the result.
# NOTE(review): if the selected alpha is 0.1 it sits on the lower edge of this
# grid, so the true optimum may be smaller - consider extending the grid downwards.
search = GridSearchCV(
    pipeline,
    {'model__alpha': np.arange(0.1, 10, 0.1)},
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=1,
)
search.fit(X_train, y_train)
search.best_params_

#### The optimal shrinkage level is 0.1 (the smallest value in the search grid)

In [None]:
# Coefficients of the lasso step in the best pipeline found by the grid search.
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)

# Pair each magnitude with the name of its column (X was built from the first 11
# columns of df), largest first, so the surviving features can be read off by
# name instead of by array position.
pd.Series(importance, index=df.columns[:11], name='abs_coefficient').sort_values(ascending=False)

#### Only `age`, `ejection_fraction` and `serum_creatinine` are important here

#### Split the data into training and test sets and scale

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Keep only the columns at positions 0, 4 and 7 as predictors; the last column is the target.
selected_positions = [0, 4, 7]
X = df.iloc[:, selected_positions].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the scaler on the training split only, then apply that transform to the test split.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Random forest and 10-fold cross-validation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# Train a 100-tree random forest (entropy criterion) and score it on the hold-out split.
classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train.ravel())
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
holdout_accuracy = round(accuracy_score(y_test, y_pred), 2)
print(cm)
print("Accurecy score of CM: ", holdout_accuracy)

# 10-fold cross-validation on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
cv_mean_pct = accuracies.mean() * 100
cv_std_pct = accuracies.std() * 100
print("\nAccuracy with 10-kfold: {:.2f} %".format(cv_mean_pct))
print("Standard Deviation: {:.2f} %".format(cv_std_pct))

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes model and score it on the hold-out split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
holdout_accuracy = round(accuracy_score(y_test, y_pred), 2)
print(cm)
print("Accurecy score of CM: ", holdout_accuracy)

# 10-fold cross-validation on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
cv_mean_pct = accuracies.mean() * 100
cv_std_pct = accuracies.std() * 100
print("\nAccuracy with 10-kfold: {:.2f} %".format(cv_mean_pct))
print("Standard Deviation: {:.2f} %".format(cv_std_pct))

#### Decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a single entropy-based decision tree and score it on the hold-out split.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
holdout_accuracy = round(accuracy_score(y_test, y_pred), 2)
print(cm)
print("Accurecy score of CM: ", holdout_accuracy)

# 10-fold cross-validation on the training split.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
cv_mean_pct = accuracies.mean() * 100
cv_std_pct = accuracies.std() * 100
print("\nAccuracy with 10-kfold: {:.2f} %".format(cv_mean_pct))
print("Standard Deviation: {:.2f} %".format(cv_std_pct))

#### Light gradient boosting machine

In [None]:
import lightgbm as lgb

# Train a LightGBM classifier with default settings and score it on the hold-out split.
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)
print("Accurecy score of CM: ", round(accuracy_score(y_test, y_pred), 2))

# 10-fold cross-validation on the training split. The estimator must be `clf`:
# `classifier` still refers to the model fitted in an earlier cell, so passing it
# here would report that model's cross-validation score under the LightGBM heading.
accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
print("\nAccuracy with 10-kfold: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

#### Naive Bayes gave a good accuracy according to the cross-validation score. With time added as a feature the accuracy improves further, but I excluded it from the dataset

#### Future work: A more thorough exploratory analysis could further simplify the relationships in the dataset, or clear up my assumptions about time and about this dataset containing wrong data