# **🗃️ Data Loading**

Useful dataset:
- [Heart failure](https://drive.google.com/file/d/1JNtrwHJukzeK6l5ljvUgB8_8yESZ_qDK/view?usp=sharing)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import probplot, norm

import warnings
warnings.filterwarnings('ignore')

## Load data and take a small look

In [None]:
# Read the dataset into a DataFrame; `_` is a template placeholder for the CSV path.
data = pd.read_csv( _ )

In [None]:
# plot continuous-continuous variables


In [None]:
# plot categorical-continuous variables

## Data Cleansing

In [None]:
# Count the missing values in each column, largest count first.
num_missing = data.isnull().sum().sort_values(ascending=False)
# `_` is a template placeholder. Later cells compare this value against 0.10,
# so it presumably holds the fraction of missing values per column — TODO confirm.
missing_percentage = _
num_missing.head(20)

In [None]:
# Summary table with one row per column of `data`: missing count and
# missing share, placed side by side.
missing = pd.concat(
    [num_missing, missing_percentage],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(missing.head(20))

In [None]:
# Rows of the summary table whose missing share exceeds 10%.
too_sparse = missing['Percentage'] > 0.10
drop_columns = missing[too_sparse]
print(drop_columns)

Empty DataFrame
Columns: [Total, Percentage]
Index: []


In [None]:
# Remove every column whose missing share exceeds 10%, then show the
# per-column missing counts that remain (largest first).
sparse_columns = missing.loc[missing['Percentage'] > 0.10].index.tolist()
data_cl = data.drop(columns=sparse_columns)
data_cl.isnull().sum().sort_values(ascending=False).head(20)

In [None]:
# List the column names, ordered by how many missing values each one still has.
data_cl.isnull().sum().sort_values(ascending=False).keys().tolist()

In [None]:
# Echo the columns in descending order of missing count (same output as before).
for col in data_cl.isnull().sum().sort_values(ascending=False).index:
    print(col)

# Drop every row that has a missing value in any column.
# dropna() removes rows by position in a single pass. The previous loop dropped
# rows by index label once per column, which would also discard complete rows
# that happen to share an index label with an incomplete one.
data_cl = data_cl.dropna()

# Number of rows left after cleaning.
print(len(data_cl))

## Outliers

In [None]:
# `_` is a template placeholder for the scaled data; the indexing below
# treats it as a 2-D array.
scaled_data_cl = _

# Order the rows by their first column, then show the 10 lowest and the
# 10 highest rows.
order = scaled_data_cl[:, 0].argsort()
sorted_rows = scaled_data_cl[order]
lower_bound = sorted_rows[:10]
upper_bound = sorted_rows[-10:]
print(lower_bound, '\n', upper_bound)

## Normality test

In [None]:
# Histogram and Q-Q plot on the distributions
# Each `_` is a template placeholder for the data to inspect.

# Histogram with a normal curve fitted on top.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# against the installed version.
sns.distplot( _ , fit = norm);
# Open a new figure so the probability plot does not draw over the histogram.
fig = plt.figure()
res = probplot( _ , plot = plt)

In [None]:
# Keep only the float64 / int64 columns and print a summary of what is left.
data_cl = data_cl.select_dtypes(include = ['float64', 'int64'])
data_cl.info()

In [None]:
# Data transformation: take the logarithm of the non-zero entries of every
# column, working on a copy so data_cl itself stays untouched.
data_cl_tf = data_cl.copy()

for col in data_cl.columns:
    # Zeros are skipped and keep their value.
    nonzero = data_cl_tf[col] != 0
    # Assign through one .loc call on the frame itself. The chained form
    # data_cl_tf[col].loc[...] = ... may write to a temporary copy and leave
    # data_cl_tf unchanged.
    # `_` is a template placeholder for the values to transform.
    data_cl_tf.loc[nonzero, col] = np.log( _ )

In [None]:
# Histogram and Q-Q plot on the distributions
# Each `_` is a template placeholder for the data to inspect.

# Histogram with a normal curve fitted on top.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# against the installed version.
sns.distplot( _ , fit = norm);
# Open a new figure so the probability plot does not draw over the histogram.
fig = plt.figure()
res = probplot( _ , plot = plt)

In [None]:
# Histogram and Q-Q plot on the distributions
# Each `_` is a template placeholder for the data to inspect.

# Histogram with a normal curve fitted on top.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# against the installed version.
sns.distplot( _ , fit = norm);
# Open a new figure so the probability plot does not draw over the histogram.
fig = plt.figure()
res = probplot( _ , plot = plt)

In [None]:
# Pairwise plots of the transformed columns; corner=True is passed so that
# only one triangle of the grid is requested.
sns.pairplot(data_cl_tf, corner=True)

In [None]:
# Drop unnecessary columns (`_` is a template placeholder for that statement)
_

In [None]:
# Convert data to arrays
# Each `_` is a template placeholder: X for the features, y for the target
# (both are passed to the models in later cells).
X = _
y = _

# Quick check of the resulting shapes.
print(X.shape, y.shape)

## Data Modeling

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# `_` is a template placeholder for the split of X and y into train and test
# parts — presumably a train_test_split call, which is imported above; confirm.
X_train, X_test, y_train, y_test = _

# Quick check of the four resulting shapes.
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3927, 5) (982, 5) (3927,) (982,)


### Heart failure prediction

In [None]:
# Gradient boosting classifier with 100 estimators.
gbr = GradientBoostingClassifier(n_estimators=100)
# `_` is a template placeholder — presumably the fitting step; confirm.
_

In [None]:
# `_` is a template placeholder for the predictions that are scored
# against y_test below.
predictions = _
metrics_ = classification_report(y_test, predictions)

print(f"Report: \n{metrics_}")

In [None]:
# Logistic regression classifier with default settings.
lgr = LogisticRegression()
# `_` is a template placeholder — presumably the fitting step; confirm.
_

In [None]:
# `_` is a template placeholder for the predictions that are scored
# against y_test below.
predictions = _
metrics_ = classification_report(y_test, predictions)

print(f"Report: {metrics_}")

In [None]:
# Two side-by-side scatter panels of one feature against another.
_, axes = plt.subplots(1,2, figsize=(10,5))

# Positions of the horizontal and vertical features.
var_h = 0
var_v = 4
# NOTE(review): the +1 offset presumably skips a leading column of
# data_cl_tf that is not part of the feature array — confirm.
name_h = data_cl_tf.columns[var_h+1]
name_v = data_cl_tf.columns[var_v+1]

# Each `_` below is a template placeholder for the x values, y values and
# colour labels of the panel.
axes[0].scatter( _ , _ , c = _ , alpha=0.6, cmap='Paired', edgecolors='k')
axes[1].scatter( _ , _ , c = _ , alpha=0.6, cmap='Paired', edgecolors='k')

# Label both panels. The previous code set the y-label of axes[0] twice and
# left axes[1] without one.
for ax in axes:
    ax.set_xlabel(name_h)
    ax.set_ylabel(name_v)

### Handling imbalanced data

In [None]:
# Number of samples per class (`_` is a template placeholder for that statement)
_

In [None]:
from sklearn.utils import resample

# Draw a resampled set of features and labels with a fixed seed.
# Each `_` is a template placeholder for the arrays to sample from and the
# number of samples to draw.
Xr_train, yr_train = resample( _ , _ ,
                              n_samples = _ ,    # to match majority class
                              random_state = 1)  # reproducible results


# Concatenate the resampled data to original data
# (each `_` is a template placeholder for the concatenation)
Xr_train = _
yr_train = _

In [None]:
# Check the new data (`_` is a template placeholder for that statement)
_

In [None]:
## Retrain models
# A fresh gradient boosting classifier with 100 estimators.
gbr = GradientBoostingClassifier(n_estimators=100)
# `_` is a template placeholder — presumably the fitting step on the
# resampled training data; confirm.
_

In [None]:
# `_` is a template placeholder for the predictions that are scored
# against y_test below.
predictions = _
metrics_ = classification_report(y_test, predictions)

print(f"Report: \n{metrics_}")

In [None]:
# A fresh logistic regression classifier with default settings.
lgr = LogisticRegression()
# `_` is a template placeholder — presumably the fitting step on the
# resampled training data; confirm.
_

In [None]:
# `_` is a template placeholder for the predictions that are scored
# against y_test below.
predictions = _
metrics_ = classification_report(y_test, predictions)

print(f"Report: {metrics_}")

In [None]:
# Two side-by-side scatter panels of one feature against another.
_, axes = plt.subplots(1,2, figsize=(10,5))

# Positions of the horizontal and vertical features.
var_h = 0
var_v = 4
# NOTE(review): the +1 offset presumably skips a leading column of
# data_cl_tf that is not part of the feature array — confirm.
name_h = data_cl_tf.columns[var_h+1]
name_v = data_cl_tf.columns[var_v+1]

# Each `_` below is a template placeholder for the x values, y values and
# colour labels of the panel.
axes[0].scatter( _ , _ , c = _ , alpha=0.6, cmap='Paired', edgecolors='k')
axes[1].scatter( _ , _ , c = _ , alpha=0.6, cmap='Paired', edgecolors='k')

# Label both panels. The previous code set the y-label of axes[0] twice and
# left axes[1] without one.
for ax in axes:
    ax.set_xlabel(name_h)
    ax.set_ylabel(name_v)