# Heart Failure Dataset: Pre-processing and Visualizations.

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the heart-failure clinical records CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv"
)

In [None]:
# Preview the first rows of the freshly loaded table.
data.head()

In [None]:
# Remove the 'DEATH_EVENT' and 'time' columns in place; the rest of the
# notebook only looks at the remaining columns.
data.drop(columns=['DEATH_EVENT', 'time'], inplace=True)

# Preview the table again to confirm the two columns are gone.
data.head()

In [None]:
# Print column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

In [None]:
# List the distinct values of every column, one column at a time.
for name in data.columns:
    header = "Values of '{}':\n".format(name)
    distinct = data[name].unique()
    print(header, distinct)

In [None]:
# Columns the notebook treats as categorical (recoded to text labels and
# plotted as pie/count charts in later cells).
categoricals = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

# Columns the notebook treats as numerical (plotted as density/box plots
# in later cells).
numericals = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]

In [None]:
# Recode the 0/1 indicator columns (every entry of `categoricals` except
# 'sex') to 'No'/'Yes' so the charts show readable labels.
swapper = {0: 'No', 1: 'Yes'}
for col in categoricals:
    if col == 'sex':
        # 'sex' is given Female/Male labels in its own cell.
        continue
    # Replace the whole column at once. Unlike the former per-cell loop over
    # range(299), this does not depend on the row count or on the index
    # being labelled 0..298, and it avoids writing strings one cell at a
    # time into an integer column. Values that are not keys of `swapper`
    # are left untouched, so re-running the cell is harmless.
    data[col] = data[col].replace(swapper)

In [None]:
# Recode 'sex' from 0/1 to 'Female'/'Male' for readable chart labels.
# Done column-wise so it works for any number of rows and any index labels
# (the former loop hard-coded 299 rows labelled 0..298). Because values that
# are neither 0 nor 1 are left as they are, re-running the cell no longer
# turns every already-recoded row into 'Male'.
data['sex'] = data['sex'].replace({0: 'Female', 1: 'Male'})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [None]:
# One figure row per categorical variable: a pie chart of the class shares
# on the left and a bar chart of the raw counts on the right.
n_rows, n_cols = len(categoricals), 2
# squeeze=False keeps `axes` two-dimensional even for a single row, so the
# axes[i, j] indexing below always works.
figure, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(25, 70),
                            squeeze=False)
figure.suptitle('\n\n\nDistributions of Categorical Variables', fontsize=80)

for i, column in enumerate(categoricals):
    pie_ax, count_ax = axes[i, 0], axes[i, 1]

    # value_counts() sorts by frequency, so the first slice is the largest.
    counts = data[column].value_counts()
    # Pull out only the first slice; the list is sized to the number of
    # categories because pie() requires one explode entry per slice.
    explode = [0.1 if k == 0 else 0 for k in range(len(counts))]
    counts.plot.pie(autopct='%1.1f%%',
                    ax=pie_ax,
                    colormap="Set3",
                    fontsize=25,
                    shadow=True,
                    explode=explode)
    pie_ax.set_ylabel('%', fontsize=25)
    pie_ax.set_title(column + ' (percentages)', fontsize=30)

    sns.countplot(x=column,
                  data=data,
                  palette='Set3',
                  ax=count_ax)
    count_ax.set_xlabel(None)
    count_ax.set_ylabel('Count', fontsize=25)
    # Size the x tick labels through tick_params: the previous
    # set_xticklabels(..., Fontsize=18) relied on case-insensitive property
    # names, which current matplotlib rejects.
    count_ax.tick_params(axis='x', labelsize=18)
    count_ax.set_title(column + ' (value counts)', fontsize=30)

In [None]:
# One figure row per numerical variable: histogram with density curve on
# the left and a box plot on the right.
n_rows, n_cols = len(numericals), 2

# squeeze=False keeps `axes` two-dimensional even for a single row, so the
# axes[i, j] indexing below always works.
figure, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(25, 70),
                            squeeze=False)
figure.suptitle('\nDistributions of Numerical Variables', fontsize=100)

for i, col in enumerate(numericals):
    density_ax, box_ax = axes[i, 0], axes[i, 1]
    values = data[col]

    # Legend text with the column's summary statistics; mean and standard
    # deviation are rounded to two decimals, the median is shown as-is.
    stats_label = 'Mean = {:.2f}\nMedian = {}\nStandard Deviation = {:.2f}'.format(
        values.mean(), values.median(), values.std())

    # NOTE(review): sns.distplot is deprecated in newer seaborn releases in
    # favour of histplot/displot; kept to stay compatible with the seaborn
    # version this notebook was written against -- confirm before upgrading.
    sns.distplot(values,
                 color="navy",
                 ax=density_ax,
                 kde_kws={"lw": 4},
                 label=stats_label)
    density_ax.legend(loc='best', fontsize=20)
    density_ax.set_title(col + '\n(Probability Density)', fontsize=30)
    density_ax.set_xlabel(None)
    # Axis label spelling corrected (was "Pobability Density").
    density_ax.set_ylabel("Probability Density", fontsize=18)

    sns.boxplot(x=col,
                data=data,
                ax=box_ax,
                color='lavender',
                fliersize=8)
    box_ax.set_xlabel(None)
    box_ax.set_title(col + '\n(Quartiles)', fontsize=30)


plt.show()