In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _dirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Seed used for NumPy's global RNG (and available to any later step that takes a seed)
rnd_seed = 2021
# Call the seeding function. Assigning to `np.random.seed` / `np.random.set_state`
# would replace those functions with an int and leave the RNG unseeded.
np.random.seed(rnd_seed)

In [None]:
# Create (or reuse) a Spark session running locally with master "local[2]"
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Heart Failure")
    .getOrCreate()
)

In [None]:
# Display the session object to confirm it was created
spark

In [None]:
# Build an SQLContext on top of the session's SparkContext and display it.
# NOTE(review): SQLContext appears to be superseded by SparkSession in recent Spark
# releases, and `sqlContext` is not referenced again in the visible cells — confirm it is needed.
sqlContext = SQLContext(spark.sparkContext)
sqlContext

In [None]:
# Load the heart-failure CSV; first row is the header and column types are inferred
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
)

In [None]:
# Print the first rows of the Spark DataFrame as a text table
df.show()

In [None]:
# Pull only 10 rows to the driver and render them as a pandas frame
df.limit(10).toPandas()

In [None]:
# Summary statistics computed by Spark, rendered as a pandas frame
df.describe().toPandas()

In [None]:
# Show the column names and the types inferred when the CSV was read
df.printSchema()

In [None]:
def count_missings(spark_df,sort=True):
    """
    Count missing values per column of a Spark DataFrame.

    A value is missing when it is NULL; for floating-point columns
    ('float' / 'double') NaN is counted as missing too. Every column is
    checked, including string, date and timestamp columns.

    Parameters
    ----------
    spark_df : pyspark.sql.DataFrame
        Frame to inspect.
    sort : bool, default True
        If True, return one row per column with a single ``count`` column,
        sorted by count in descending order. If False, return a one-row
        frame with one column per input column.

    Returns
    -------
    pandas.DataFrame or None
        The counts, or None (after printing a message) when the frame has
        no columns or no column contains a missing value.
    """
    float_types = ('float', 'double')

    count_exprs = []
    for name, dtype in spark_df.dtypes:
        is_missing = F.col(name).isNull()
        # isnan is only meaningful for floating-point columns
        if dtype in float_types:
            is_missing = is_missing | F.isnan(F.col(name))
        # when() without otherwise() yields NULL for non-matching rows, which count() skips
        count_exprs.append(F.count(F.when(is_missing, F.lit(1))).alias(name))

    if not count_exprs:
        print("There are no any missing values!")
        return None

    counts = spark_df.select(count_exprs).toPandas()

    # The aggregate always yields exactly one row, so inspect the totals
    # rather than the row count to decide whether anything is missing.
    if int(counts.iloc[0].sum()) == 0:
        print("There are no any missing values!")
        return None

    if sort:
        return counts.rename(index={0: 'count'}).T.sort_values("count",ascending=False)

    return counts

In [None]:
# Per-column missing-value counts for the loaded dataset
count_missings(df)

The dataset contains no missing values.

## Correlation matrix

In [None]:
# Heatmap of the pairwise correlations between all columns
sns.heatmap(data=df.toPandas().corr())

In [None]:
# Flag columns whose absolute correlation with DEATH_EVENT exceeds 0.1
df.toPandas().corr()["DEATH_EVENT"].abs() > 0.1

In [None]:
# Columns chosen from the correlation results above.
# NOTE(review): this list also contains the target "DEATH_EVENT" itself; it must not
# be fed to a model as an input feature.
COLUMNS = ['age','ejection_fraction','serum_creatinine','serum_sodium','time',"DEATH_EVENT"]

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [None]:
# COLUMNS includes the label "DEATH_EVENT"; assembling it into the feature vector would
# leak the target into the model inputs, so drop it from the assembler's inputs.
FEATURE_COLUMNS = [c for c in COLUMNS if c != "DEATH_EVENT"]

assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol="features")
# transform() appends the "features" column; DEATH_EVENT stays available as the label column
assembled_df = assembler.transform(df)

In [None]:
# Preview 10 assembled rows without truncating the vector column
assembled_df.show(n=10, truncate=False)

In [None]:
# Scaler that reads the assembled "features" vector and writes "features_scaled"
standardScaler = StandardScaler(
    inputCol="features",
    outputCol="features_scaled",
)

In [None]:
# Learn the scaling statistics from the assembled data, then apply them to the same frame.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any train/test split —
# confirm this is acceptable for the evaluation.
scaler_model = standardScaler.fit(assembled_df)
scaled_df = scaler_model.transform(assembled_df)

In [None]:
# Compare raw and scaled feature vectors side by side for the first 10 rows
scaled_preview = scaled_df.select("features", "features_scaled")
scaled_preview.show(n=10, truncate=False)

In [None]:
# 80/20 train/test partition, seeded with rnd_seed so the split is repeatable
train_data, test_data = scaled_df.randomSplit(weights=[0.8, 0.2], seed=rnd_seed)

In [None]:
# Collects one metrics dict per model, keyed by model name
results = dict()

# Logistic Regression

In [None]:
# Logistic regression on the scaled feature vector, predicting DEATH_EVENT
lr = LogisticRegression(
    featuresCol="features_scaled",
    labelCol="DEATH_EVENT",
)
lrModel = lr.fit(train_data)

In [None]:
# Score the held-out rows with the fitted logistic regression and display the result frame
lr_prediction = lrModel.transform(test_data)
lr_prediction

In [None]:
# Bring true labels and predicted labels to pandas for metric computation
eval_df = lr_prediction.select("DEATH_EVENT", "prediction").toPandas()

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,auc,precision_score,recall_score,roc_auc_score

# Collect labels, hard predictions and the class-probability vector in a single pass so
# the rows stay aligned. ROC AUC must be computed from a continuous score; feeding it the
# thresholded 0/1 predictions collapses the curve to a single operating point.
eval_df = lr_prediction.select(["DEATH_EVENT","prediction","probability"]).toPandas()
# Element 1 of the probability vector is the probability of class 1 (death event)
positive_score = eval_df['probability'].apply(lambda v: float(v[1]))

results['logistic_regression'] = {
    'roc_auc': roc_auc_score(eval_df['DEATH_EVENT'], positive_score),
    'accuracy': accuracy_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'precision': precision_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'recall': recall_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
}

# Decision Tree Classifier

In [None]:
# Decision tree on the scaled feature vector, predicting DEATH_EVENT
dfc = DecisionTreeClassifier(
    featuresCol="features_scaled",
    labelCol="DEATH_EVENT",
)
dfc_model = dfc.fit(train_data)

In [None]:
# Score the held-out rows with the fitted decision tree and display the result frame
dfc_prediction = dfc_model.transform(test_data)
dfc_prediction

In [None]:
# Collect labels, hard predictions and the class-probability vector together.
# ROC AUC must be computed from a continuous score; feeding it the thresholded 0/1
# predictions collapses the curve to a single operating point.
eval_df = dfc_prediction.select(["DEATH_EVENT","prediction","probability"]).toPandas()
# Element 1 of the probability vector is the probability of class 1 (death event)
positive_score = eval_df['probability'].apply(lambda v: float(v[1]))

results['decision_tree'] = {
    'roc_auc': roc_auc_score(eval_df['DEATH_EVENT'], positive_score),
    'accuracy': accuracy_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'precision': precision_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'recall': recall_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
}

# Random Forest Classifier

In [None]:
# Random forest of 50 trees on the scaled feature vector, predicting DEATH_EVENT
rfc = RandomForestClassifier(
    featuresCol="features_scaled",
    labelCol="DEATH_EVENT",
    numTrees=50,
)
rfc_model = rfc.fit(train_data)

In [None]:
# Score the held-out rows with the fitted random forest and display the result frame
rfc_prediction = rfc_model.transform(test_data)
rfc_prediction

In [None]:
# Collect labels, hard predictions and the class-probability vector together.
# ROC AUC must be computed from a continuous score; feeding it the thresholded 0/1
# predictions collapses the curve to a single operating point.
eval_df = rfc_prediction.select(["DEATH_EVENT","prediction","probability"]).toPandas()
# Element 1 of the probability vector is the probability of class 1 (death event)
positive_score = eval_df['probability'].apply(lambda v: float(v[1]))

results['random_forest'] = {
    'roc_auc': roc_auc_score(eval_df['DEATH_EVENT'], positive_score),
    'accuracy': accuracy_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'precision': precision_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
    'recall': recall_score(eval_df['DEATH_EVENT'], eval_df['prediction']),
}

In [None]:
# Show the raw metrics dictionary collected for each model
results

In [None]:
# One row per model, one column per metric
metric_df = pd.DataFrame(results).transpose()

In [None]:
# Render the model-by-metric comparison table
metric_df

In [None]:
# Title the chart so it can be told apart from the other metric plots
metric_df['roc_auc'].plot(kind="bar", title="ROC AUC by model")

In [None]:
# Title the chart so it can be told apart from the other metric plots
metric_df['accuracy'].plot(kind="bar", title="Accuracy by model")

In [None]:
# Title the chart so it can be told apart from the other metric plots
metric_df['precision'].plot(kind="bar", title="Precision by model")

In [None]:

# Title the chart so it can be told apart from the other metric plots
metric_df['recall'].plot(kind="bar", title="Recall by model")