# Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6
# and the old name was removed in 3.8; use whichever this installation provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)

import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


# Import the Dataset

In [None]:
# Load the heart-failure clinical records CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv("/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv")

In [None]:
# Preview the first 5 rows (head() returns 5 rows by default)
data.head()

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Column labels of the dataset
data.columns

## Variable description

1 - age = age of the patient

2 - anaemia = Decrease of red blood cells or hemoglobin (boolean)

3 - creatinine_phosphokinase = Level of the CPK enzyme in the blood (mcg/L)

4 - diabetes = If the patient has diabetes (boolean)

5 - ejection_fraction = Percentage of blood leaving the heart at each contraction (percentage)

6 - high_blood_pressure = If the patient has hypertension (boolean)

7 - platelets = Platelets in the blood (kiloplatelets/mL)

8 - serum_creatinine = Level of serum creatinine in the blood (mg/dL)

9 - serum_sodium = Level of serum sodium in the blood (mEq/L)

10 - sex = gender of the patient (boolean)

11 - smoking = If the patient smokes (boolean)

12 - time = Follow-up period (days)

13 - DEATH_EVENT = If the patient died during the follow-up period (boolean)

# Categorical Variables
* anaemia , diabetes , high_blood_pressure , sex , smoking , DEATH_EVENT

In [None]:
def bar_plot(variable):
    """Draw a bar chart of value frequencies for one column and print the counts.

    Args:
        variable: name of a column in the module-level ``data`` DataFrame.
    """
    # frequency of each distinct value in the column
    counts = data[variable].value_counts()
    categories = counts.index

    # one bar per distinct value, labelled with the value itself
    plt.figure(figsize=(5, 5))
    plt.bar(categories, counts)
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print("{} : {} ".format(variable, counts))

In [None]:
# Plot the value frequencies of every categorical column.
categorical_columns = [
    "anaemia",
    "diabetes",
    "high_blood_pressure",
    "sex",
    "smoking",
    "DEATH_EVENT",
]
for column in categorical_columns:
    bar_plot(column)

## Numerical Variables
* age , creatinine_phosphokinase , ejection_fraction , platelets , serum_creatinine , serum_sodium , time

In [None]:
def plot_hist(variable, bins=50):
    """Draw a histogram of one column of the module-level ``data`` DataFrame.

    Args:
        variable: name of the column to plot.
        bins: number of histogram bins passed to ``plt.hist`` (default 50,
            the value that was previously hard-coded).
    """
    plt.figure(figsize=(5, 5))
    plt.hist(data[variable], bins=bins)
    plt.xlabel(variable)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Plot a histogram for every numerical column.
numerical_columns = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]
for column in numerical_columns:
    plot_hist(column)

## Missing Values

In [None]:
# Names of the columns that contain at least one missing value
data.columns[data.isnull().any()]


In [None]:
data.isnull().sum()  # number of missing values in each column

## Fill Missing Values

The dataset has no missing values, so there is nothing to fill.


In [None]:
# Preview the first rows again
data.head()

## Correlation Map
* A correlation map shows the pairwise relationships between features (columns) and is an important visualization in machine learning.

In [None]:
import seaborn as sns

# Pairwise correlation of all columns, drawn as an annotated heatmap.
correlations = data.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(correlations, annot=True, linewidth=5, fmt=".1f", ax=ax)
plt.show()

In this dataset, DEATH_EVENT is the target column. I'm going to rename it to "Target".

In [None]:
# Rename the target column. `columns=` is the keyword form of axis = 1
# (axis = 1 refers to columns, axis = 0 refers to rows).
data = data.rename(columns={"DEATH_EVENT": "Target"})

In [None]:
# Preview the first rows after the rename
data.head()

## Detect Outlier Values

In [None]:
from collections import Counter

def detect_outliers(df, features, threshold=2, iqr_factor=1.5):
    """Return the index labels of rows that are IQR outliers in several features.

    A value counts as an outlier for a feature when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR`` of that
    feature's column.

    Args:
        df: DataFrame holding the data.
        features: iterable of numeric column names to inspect.
        threshold: a row is returned only when it is an outlier in strictly
            more than this many features (default 2, i.e. at least 3 features).
        iqr_factor: multiplier applied to the inter-quartile range to get the
            outlier step (default 1.5).

    Returns:
        list of index labels, ordered by first detection.
    """
    outlier_indices = []
    for c in features:
        column = df[c]

        # 1st and 3rd quartile of the column
        q1, q3 = np.percentile(column, [25, 75])

        # distance beyond the quartiles at which a value is called an outlier
        outlier_step = (q3 - q1) * iqr_factor

        # index labels of the rows outside [q1 - step, q3 + step]
        is_outlier = (column < q1 - outlier_step) | (column > q3 + outlier_step)
        outlier_indices.extend(df[is_outlier].index)

    # number of features in which each row was flagged
    hits_per_row = Counter(outlier_indices)
    return [idx for idx, hits in hits_per_row.items() if hits > threshold]

In [None]:
# Show the rows that detect_outliers flags across the numerical features.
outlier_features = [
    "age",
    "creatinine_phosphokinase",
    "ejection_fraction",
    "platelets",
    "serum_creatinine",
    "serum_sodium",
    "time",
]
data.loc[detect_outliers(data, outlier_features)]

The dataset has no outlier rows (no row is an outlier in more than two features).

# Features (x) and Target (y)

In [None]:
# Target labels as a NumPy array, and the feature frame without the target column.
y = data["Target"].values
x_data = data.drop(columns=["Target"])

# Normalization

In [None]:
# Min-max scale every feature column to the [0, 1] range.
# Use the DataFrame's own column-wise min()/max(): since pandas 2.0,
# np.min(df)/np.max(df) reduce over both axes and return a single scalar,
# which loses the per-column scaling and makes the `.values` access fail.
# NOTE(review): the min/max are computed on the full dataset before the
# train/test split, so test rows influence the scaling.
feature_min = x_data.min()
feature_max = x_data.max()
x = (x_data - feature_min) / (feature_max - feature_min)

In [None]:
# Display the normalized feature frame
x

In [None]:
# Display the target array
y

# Train - Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# K-Nearest Neighbour Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors is the K in K-nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)

# Predicted labels for the held-out rows, and the accuracy on them.
prediction = knn.predict(x_test)
knn_accuracy = knn.score(x_test, y_test)
print("KNN score: ", knn_accuracy)

In [None]:
# Descriptive statistics of `data` (the original, un-scaled columns)
data.describe()

In [None]:
# Try to find the best K (n_neighbors) value.
# K cannot exceed the number of training samples, so the upper bound is taken
# from the training set instead of being hard-coded (the previous bound of 240
# presumably matched this dataset's training size).
# NOTE(review): K is scored on the test set here, so the chosen K is tuned to it.
k_values = range(1, len(x_train) + 1)
score_list = []
for each in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=each)
    knn2.fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))

plt.plot(k_values, score_list)
plt.title("K-value & Accuracy")
plt.xlabel("K-value")
plt.ylabel("Accuracy")
plt.show()
    

# Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default hyper-parameters and a fixed seed.
svm = SVC(random_state=1)
svm.fit(x_train, y_train)

# accuracy on the held-out rows
svm_accuracy = svm.score(x_test, y_test)
print("print accuracy of SVM Algorithm: ", svm_accuracy)


# Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB

# Named `nb`, not `np`: binding the classifier to `np` would shadow the NumPy
# module imported at the top of the notebook and break every later np.* call
# (for example re-running detect_outliers).
nb = GaussianNB()
nb.fit(x_train, y_train)

# accuracy on the held-out rows
print("print accuracy of Navie Bayes Algorithm: ", nb.score(x_test, y_test))

# Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators is the number of trees in the forest.
rf = RandomForestClassifier(
    n_estimators=1000,
    random_state=1,
)
rf.fit(x_train, y_train)

# accuracy on the held-out rows
rf_accuracy = rf.score(x_test, y_test)
print("Random Forest Classification: ", rf_accuracy)


# Confusion Matrix

In [None]:
# Random-forest predictions for the test rows, and the true labels to compare against
y_pred = rf.predict(x_test)
y_true = y_test

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels against random-forest predictions
cm = confusion_matrix(y_true, y_pred)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap with integer cell labels.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(
    cm,
    annot=True,
    linewidths=0.5,
    linecolor="red",
    fmt=".0f",
    ax=ax,
)
plt.xlabel("y_pred")
plt.ylabel("y_true")
plt.show()

# K-Means Clustering

# Hierarchical Clustering