In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the heart-failure clinical records and preview the first five rows.
DATA_PATH = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Column names, dtypes and non-null counts: a quick check for missing values.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## Counting values in the Attributes

In [None]:
# Number of rows for each distinct value of `anaemia`.
df["anaemia"].value_counts()

In [None]:
# Number of rows for each distinct value of `diabetes`.
df["diabetes"].value_counts()

In [None]:
# Number of rows for each distinct value of `high_blood_pressure`.
df["high_blood_pressure"].value_counts()

In [None]:
# Number of rows for each distinct value of `sex`.
df["sex"].value_counts()

In [None]:
# Number of rows for each distinct value of `smoking`
# (this column is later used to stratify the train/test split).
df["smoking"].value_counts()

In [None]:
# Class balance of the prediction target `DEATH_EVENT`.
df["DEATH_EVENT"].value_counts()

## Describing the dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# One histogram per column; the trailing semicolon keeps the cell output to
# the figure only instead of also echoing the returned array of Axes objects.
df.hist(bins=50, figsize=(20, 15));

## Splitting the dataset for training and testing purposes

In [None]:
from sklearn.model_selection import train_test_split
# Plain random 80/20 split with a fixed seed so the row counts are reproducible.
train_set, test_set = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)
print(f"Rows in train set: {len(train_set)}\nRows in test set: {len(test_set)}\n")

## Splitting the dataset in an unbiased (stratified) form

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split that preserves the proportion of `smoking`
# values in both parts.
# NOTE(review): stratification is on the `smoking` feature, not on the
# `DEATH_EVENT` target; confirm that this is intended.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['smoking']))
# split() returns positional row indices, so select with .iloc; .loc would
# only give the same rows while the frame keeps its default 0..n-1 index.
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [None]:
# Distribution of `smoking` in the stratified test split.
strat_test_set['smoking'].value_counts()

In [None]:
# Ratio of the most frequent to the second most frequent `smoking` value in
# the test split. Computed from the data (value_counts sorts by descending
# frequency) rather than typed in by hand, so it cannot go stale if the data,
# seed or split size changes.
test_smoking_counts = strat_test_set['smoking'].value_counts()
float(test_smoking_counts.iloc[0] / test_smoking_counts.iloc[1])

In [None]:
# Distribution of `smoking` in the stratified training split.
strat_train_set['smoking'].value_counts()

In [None]:
# Ratio of the most frequent to the second most frequent `smoking` value in
# the training split. Computed from the data (value_counts sorts by descending
# frequency) rather than typed in by hand, so it cannot go stale if the data,
# seed or split size changes.
train_smoking_counts = strat_train_set['smoking'].value_counts()
float(train_smoking_counts.iloc[0] / train_smoking_counts.iloc[1])

In [None]:
# Explore on a copy so that `strat_train_set` itself is left untouched.
splitted_df = strat_train_set.copy()

## Checking the correlation between the dependent and independent variables

In [None]:
# Pairwise correlations of the training columns, then the `DEATH_EVENT`
# column of that matrix ordered from most positive to most negative.
corr_matrix = splitted_df.corr()
death_event_corr = corr_matrix.loc[:, 'DEATH_EVENT']
death_event_corr.sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the target and a handful of selected attributes.
attributes = ["DEATH_EVENT", "serum_creatinine", "age", "high_blood_pressure","anaemia"]
# The trailing semicolon keeps the cell output to the figure only, instead of
# also echoing the returned array of Axes objects.
scatter_matrix(splitted_df[attributes], figsize = (20,15));

In [None]:
# serum_creatinine against the DEATH_EVENT outcome; the trailing semicolon
# suppresses the Axes repr so that only the figure is shown.
splitted_df.plot(kind="scatter", y="serum_creatinine", x="DEATH_EVENT", alpha=0.8);

In [None]:
import seaborn as sns
# Age distribution per DEATH_EVENT outcome; the trailing semicolon suppresses
# the Axes repr so that only the figure is shown.
sns.boxplot(y="age", x="DEATH_EVENT", data=splitted_df);

In [None]:
# Row count per `anaemia` value in the training data; the trailing semicolon
# suppresses the Axes repr so that only the figure is shown.
sns.countplot(y="anaemia", data=splitted_df);

## Separating the target attribute from the features

In [None]:
# Training features: every column except the target.
# Note that `splitted_df` is rebound here; from this cell on it no longer
# contains `DEATH_EVENT`.
splitted_df = strat_train_set.drop(columns="DEATH_EVENT")
# Training labels, copied so they are independent of `strat_train_set`.
splitted_df_labels = strat_train_set.loc[:, "DEATH_EVENT"].copy()

## Creating Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Preprocessing pipeline. It currently has a single standardization step;
# further (name, transformer) pairs can be appended to this list.
preprocessing_steps = [
    ('std_scaler', StandardScaler()),
]
my_pipeline = Pipeline(steps=preprocessing_steps)


In [None]:
# Fit the preprocessing pipeline on the training features and keep the
# transformed result; `my_pipeline` is reused later with transform() only.
splitted_df_pipeline = my_pipeline.fit_transform(splitted_df)

## Training the model using Support Vector Machine

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier trained on the preprocessed features.
svc = SVC(kernel="linear", random_state=42)
svc.fit(splitted_df_pipeline, splitted_df_labels)
# fit() returns the estimator itself, so both names refer to the same fitted model.
svc_model = svc

In [None]:
# Spot check: predict the first five training rows after passing them through
# the already-fitted preprocessing pipeline (transform only, no refit).
some_data = splitted_df.head(5)
some_labels = splitted_df_labels.head(5)
prepared_data = my_pipeline.transform(some_data)
svc_model.predict(prepared_data)

In [None]:
# True labels of the same five rows, to compare against the predictions.
list(some_labels)

In [None]:
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score,classification_report
# Metrics on the same data the model was fitted on, so they are an optimistic
# estimate of real performance.
# Predict once and reuse the result rather than calling predict() for every metric.
train_predictions = svc_model.predict(splitted_df_pipeline)
print("confusion matrix:  \n",confusion_matrix(splitted_df_labels,train_predictions))
print("="*100)
print("f1-score: ",f1_score(splitted_df_labels,train_predictions))
print("="*100)
print("Accuracy:  ",accuracy_score(splitted_df_labels,train_predictions))
print("="*100)
print("Classification Report:  \n",classification_report(splitted_df_labels,train_predictions))

## Using a better evaluation technique - Cross-Validation

In [None]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
# Cross-validate preprocessing and classifier together on the raw training
# features. Scoring the SVC on `splitted_df_pipeline` would leak information:
# that array was standardized using statistics from the whole training set,
# including the rows of each validation fold. Wrapping both steps in one
# estimator makes every fold fit the scaler on its own training part only.
# clone() gives unfitted copies with the same hyperparameters, so the fitted
# `my_pipeline` and `svc_model` are left untouched.
cv_estimator = make_pipeline(clone(my_pipeline), clone(svc_model))
scores = cross_val_score(cv_estimator, splitted_df, splitted_df_labels, scoring="accuracy", cv=5)
print("Avg Score:  ",scores.mean(),"\n","Std deviation:  ",scores.std())

## Testing the model on test data

In [None]:
# Held-out features and labels from the stratified test split.
x_test = strat_test_set.drop(columns="DEATH_EVENT")
y_test = strat_test_set.loc[:, "DEATH_EVENT"].copy()
# Apply the preprocessing fitted on the training data (transform only, no
# refit), then predict with the trained model.
x_test_prepared = my_pipeline.transform(x_test)
final_predictions = svc_model.predict(x_test_prepared)

In [None]:
# Metrics of the trained model on the held-out test split.
divider = "="*100
print("confusion matrix:  \n",confusion_matrix(y_test,final_predictions))
print(divider)
print("f1-score: ",f1_score(y_test,final_predictions))
print(divider)
print("Accuracy:  ",accuracy_score(y_test,final_predictions))
print(divider)
print("Classification Report:  \n",classification_report(y_test,final_predictions))

In [None]:
from sklearn.model_selection import StratifiedKFold
# Spread of the trained model's accuracy across the held-out data.
# cross_val_score() must not be used here: it clones the estimator and
# re-fits it on folds of whatever data it receives, so on the test set it
# would train new models on test rows instead of evaluating `svc_model`.
# Instead, the predictions already made by the fitted model are scored on
# three disjoint stratified chunks of the test set, with no refitting.
test_chunks = StratifiedKFold(n_splits=3)
test_scores = np.array([
    accuracy_score(y_test.iloc[chunk_index], final_predictions[chunk_index])
    for _, chunk_index in test_chunks.split(x_test_prepared, y_test)
])
print("Avg Score:  ",test_scores.mean(),"\n","Std deviation:  ",test_scores.std())

## Saving the model

In [None]:
from joblib import dump, load
# The classifier was trained on features transformed by `my_pipeline`, so on
# its own it cannot score raw records. Persist the fitted preprocessing
# pipeline next to it so both can be reloaded together.
dump(my_pipeline, 'HealthFailurePrediction_preprocessing.joblib')
# Original artifact, unchanged: the fitted SVC, which expects inputs that have
# already been passed through the preprocessing pipeline.
dump(svc_model, 'HealthFailurePrediction.joblib')