<div>
    <img src="https://storage.googleapis.com/kaggle-datasets-images/29414/37484/9a4417b65ea46ec36477358cbbf4bdd2/dataset-cover.jpg?t=2018-05-31-18-56-03"/>
</div>

In [None]:
from numpy import concatenate
from numpy import hstack
import numpy as np
import pandas as pd

import seaborn as sns

# Apply the default seaborn theme and enlarge every figure in one call
# (11.7 x 8.27 inches); `sns.set` is an alias of `sns.set_theme`.
sns.set_theme(rc={'figure.figsize': (11.7, 8.27)})

from sklearn.utils import shuffle
from abc import abstractmethod

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelSpreading, LabelPropagation

from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter

# Silence scikit-learn's ConvergenceWarning for the whole notebook so the
# cell outputs below are not cluttered by it.
simplefilter(action="ignore", category=ConvergenceWarning)

<h2 class="list-group-item list-group-item-action active" data-toggle="list" style="color:black; background:white; border:0.5 black dotted;" role="tab" aria-controls="home"><center>Prepare Data</center></h2>

In [None]:
# Directory prefix of the heartbeat CSV files; file names are appended to it
# by plain string concatenation, hence the trailing slash.
path = '../input/heartbeat/'

# Both files are read without a header row, so columns are numbered from 0.
df_normal = pd.read_csv(path + 'ptbdb_normal.csv', header=None)
df_abnormal = pd.read_csv(path + 'ptbdb_abnormal.csv', header=None)

# Dataset

In [None]:
class HeartbeatDataset():
    """Labelled, shuffled table built from the two ptbdb heartbeat CSV files.

    Reads ``ptbdb_abnormal.csv`` and ``ptbdb_normal.csv`` from ``path``,
    stacks them (abnormal rows first), appends a ``labels`` column
    (1 = abnormal, 0 = normal) and shuffles the rows.
    """

    def __init__(self, path, random_state=None):
        """Load both CSV files and build the combined, shuffled frame.

        Args:
            path: Directory prefix of the CSV files. File names are appended
                by plain string concatenation, so it must end with a path
                separator (e.g. ``'../input/heartbeat/'``).
            random_state: Optional seed forwarded to the row shuffle. The
                default ``None`` yields a different row order on every run;
                pass an int to make the order reproducible.
        """
        # Neither file is read with a header row, so columns are numbered 0..n-1.
        self.df_abnormal = pd.read_csv(path + 'ptbdb_abnormal.csv', header=None)
        self.df_normal = pd.read_csv(path + 'ptbdb_normal.csv', header=None)

        # Combine the frames loaded by THIS instance. Relying on module-level
        # `df_abnormal` / `df_normal` would break on a fresh kernel and could
        # silently pair the labels with unrelated data.
        self.df = pd.concat([self.df_abnormal, self.df_normal])

        # Labels follow the concatenation order: abnormal rows first (1),
        # then normal rows (0).
        self.df['labels'] = [1] * len(self.df_abnormal) + [0] * len(self.df_normal)

        # Shuffle all rows; the original index labels travel with their rows.
        self.df = self.df.sample(frac=1, random_state=random_state)

    def get_features_labels(self, end):
        """Split the frame into a feature matrix and a label vector.

        Args:
            end: Upper slice bound on the columns; columns ``[:end]`` are
                returned as features.

        Returns:
            A ``(features, labels)`` tuple of NumPy arrays: ``features`` holds
            the first ``end`` columns of every row and ``labels`` holds the
            last column of the frame (the ``labels`` column added in
            ``__init__``).
        """
        values = self.df.values
        return values[:, :end], values[:, -1]

In [None]:
# Directory prefix of the heartbeat CSV files (trailing slash required:
# the loader appends file names by string concatenation).
path = '../input/heartbeat/'

# Combined, labelled and shuffled dataset used by every cell below.
hds = HeartbeatDataset(path=path)

# Filter data using correlation

In [None]:
# Pairwise correlation between all columns except the last one (`labels`).
feature_frame = hds.df.iloc[:, :-1]
correlations = feature_frame.corr()

sns.heatmap(correlations);

In [None]:
# Keep only the first 95 columns (0 through 94), selected for their higher
# correlation, and redraw the heatmap for that subset.
leading_frame = hds.df.iloc[:, :95]
correlations = leading_frame.corr()

sns.heatmap(correlations);

In [None]:
# Features are the first 95 columns; labels are the last column of the frame.
features, labels = hds.get_features_labels(95)

# Hold out 25% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=1,
)

<h2 class="list-group-item list-group-item-action active" data-toggle="list" style="color:black; background:white; border:0.5 black dotted;" role="tab" aria-controls="home"><center>Semi Supervised Learning</center></h2>

## Definitions

<b>Label propagation</b>: assigns labels to previously unlabeled data points. At the start of the algorithm, a (generally small) subset of the data points has labels (or classifications). These labels are propagated to the unlabeled points over the course of the algorithm.

<b>Label spreading</b>: this model is similar to the basic label propagation algorithm, but it uses an affinity matrix based on the normalized graph Laplacian and applies soft clamping across the labels.


## Train each model

In [None]:
# (display name, estimator) pairs; both estimators are capped at 100 iterations.
models = [
    ("LabelSpreading", LabelSpreading(max_iter=100)),
    ("LabelPropagation", LabelPropagation(max_iter=100)),
]

In [None]:
# Fit each base model on the training split and report its accuracy on the
# held-out split.
# NOTE(review): every training label is 0 or 1, so there appear to be no
# unlabeled points for the algorithms to propagate to; confirm this is intended.
for model in models:
    name, estimator = model[0], model[1]
    estimator.fit(X_train, y_train)
    yhat = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, yhat)
    print('{:16s} Accuracy: {:.3f}'.format(name, accuracy))

## Combine models using logistic regression

In [None]:
def get_meta(models, X):
    """Build meta-features by stacking each model's class probabilities.

    Args:
        models: Sequence of entries whose element at index 1 is a fitted
            estimator exposing ``predict_proba`` (e.g. ``(name, estimator)``).
        X: Samples passed unchanged to every estimator's ``predict_proba``.

    Returns:
        The per-model probability arrays joined column-wise with ``hstack``,
        in the same order as ``models``.
    """
    probabilities = [entry[1].predict_proba(X) for entry in models]
    return hstack(probabilities)

In [None]:
# Class-probability features from every base model, one matrix per split.
# NOTE(review): the training meta-features come from the same rows the base
# models were fitted on, which may make the meta-model look better than it is.
meta_train = get_meta(models=models, X=X_train)
meta_test = get_meta(models=models, X=X_test)

In [None]:
# Second-stage classifier: learns how to weigh the base models' probabilities.
meta_model = LogisticRegression(solver='liblinear')
meta_model.fit(X=meta_train, y=y_train)

In [None]:
# Score the stacked model on the held-out meta-features.
yhat = meta_model.predict(X=meta_test)
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)
print('Combined Accuracy: {:.3f}'.format(accuracy))