In [None]:
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.compose import ColumnTransformer

import seaborn as sns
import matplotlib.pyplot as plt

## Prepare and Preprocess Data

In [None]:
# Load the breast-cancer CSV; dropna(axis=1) discards every column that
# contains at least one missing value.
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df = df.dropna(axis=1)
# Snapshot taken before any scaling is applied to `df`.
df_orig = df.copy()
# Model inputs: every remaining column except the row id and the label.
feats = df.columns.difference(['id', 'diagnosis'])
# Min-max scale the feature columns and write them back in place. The
# transformer is given the columns in `feats` order, matching the `.loc`
# column selection on the left-hand side.
minmax = ColumnTransformer(transformers=[('mmx', MinMaxScaler(), feats.tolist())])
df.loc[:, feats] = minmax.fit_transform(df)

In [None]:
# Split the raw frame once (80/20, stratified on the label, fixed seed).
train_df_orig, val_df_orig = train_test_split(
    df_orig, stratify=df_orig.diagnosis, test_size=0.2, random_state=42
)
# Scale from the raw values using min/max learned from the training rows only.
# Fitting the scaler on every row would leak validation-set statistics into the
# features the autoencoder and the downstream classifier are trained on.
# NOTE: validation values can therefore fall slightly outside [0, 1].
split_scaler = MinMaxScaler().fit(train_df_orig[feats])
train_df, val_df = train_df_orig.copy(), val_df_orig.copy()
train_df[feats] = split_scaler.transform(train_df_orig[feats])
val_df[feats] = split_scaler.transform(val_df_orig[feats])
# Plain NumPy matrices (columns in `feats` order) for Keras.
train_X, val_X = train_df[feats].values, val_df[feats].values

## Autoencoder 

### Build

In [None]:
class Autoencoder(tf.keras.Model):
    """Fully connected autoencoder: input -> hidden -> latent -> hidden -> input.

    The encoder and decoder are exposed as the ``encoder`` and ``decoder``
    attributes so either half can be called on its own.

    Args:
        latent_dim: Number of units in the bottleneck (encoder output) layer.
        inp_shp: Number of units in the decoder output layer, i.e. the width
            of the reconstruction.
        hidden_dim: Number of units in the single hidden layer of both the
            encoder and the decoder. Defaults to 20.
    """

    def __init__(self, latent_dim, inp_shp, hidden_dim=20):
        super().__init__()
        self.latent_dim = latent_dim
        self.inp_shp = inp_shp
        self.hidden_dim = hidden_dim
        # Encoder: hidden ReLU layer followed by the ReLU bottleneck.
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.hidden_dim, activation='relu'),
            tf.keras.layers.Dense(self.latent_dim, activation='relu'),
        ])
        # Decoder: hidden ReLU layer, then a sigmoid output layer, so every
        # reconstructed value lies in (0, 1).
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Dense(self.hidden_dim, activation='relu'),
            tf.keras.layers.Dense(self.inp_shp, activation='sigmoid'),
        ])

    def call(self, x):
        """Encode ``x`` and return the decoder's reconstruction of it."""
        return self.decoder(self.encoder(x))

In [None]:
# Fix TensorFlow's global seed before the model is created so that weight
# initialisation (and other TensorFlow-side randomness) is repeatable from one
# "Restart & Run All" to the next.
tf.random.set_seed(42)
# 15-unit bottleneck; the reconstruction width equals the number of feature columns.
autoencoder = Autoencoder(15, len(feats))
# Train with Adam (default settings) against mean-squared reconstruction error.
autoencoder.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())

In [None]:
# Stop once validation loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the best-scoring epoch;
# without it the model keeps the weights from the final (worse) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

### Train

In [None]:
# Reconstruction task: the network's target is its own input, for both the
# training data and the validation data used by the early-stopping callback.
autoencoder.fit(
    x=train_X,
    y=train_X,
    epochs=100,
    shuffle=True,
    validation_data=(val_X, val_X),
    callbacks=[early_stop],
)

In [None]:
# Sanity check: display (rows, columns) of the validation partition.
val_df.shape

## Visualize (requires Latent Dimension = 2)

In [None]:
# reduced_train = pd.DataFrame(autoencoder.encoder(train_df[feats].values).numpy(), columns=['x1','x2'])
# reduced_val =  pd.DataFrame(autoencoder.encoder(val_df[feats].values).numpy(),  columns=['x1','x2'])

In [None]:
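# `.values` avoids index alignment: reduced_* carry a fresh RangeIndex, while train_df/val_df keep the shuffled original index.
# reduced_train['target'] = train_df.diagnosis.values
# reduced_val['target'] = val_df.diagnosis.values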

In [None]:
# fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10,20))
# sns.scatterplot(x='x1', y='x2', hue='target', data=reduced_train, ax=ax1)
# sns.scatterplot(x='x1', y='x2', hue='target', data=reduced_val, ax=ax2)

## Binary Classification Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, classification_report

In [None]:
# Baseline: logistic regression on the feature columns of the unscaled copy.
# max_iter is raised to 5000 iterations.
lr1 = LogisticRegression(max_iter=5000).fit(
    X=train_df_orig[feats],
    y=train_df_orig['diagnosis'],
)

In [None]:
# Same classifier, but trained on the autoencoder's reconstruction of the
# scaled training features (train_X is train_df[feats].values).
lr2 = LogisticRegression(max_iter=5000).fit(
    X=autoencoder(train_X).numpy(),
    y=train_df['diagnosis'],
)

In [None]:
# Column 1 of predict_proba is the probability of the second class label.
# Baseline model scores the unscaled validation features.
pred1 = lr1.predict_proba(val_df_orig.loc[:, feats])[:, 1]
# Second model scores the autoencoder's reconstruction of the scaled
# validation features (val_X is val_df[feats].values).
pred2 = lr2.predict_proba(autoencoder(val_X).numpy())[:, 1]

In [None]:
# Encode the validation labels as integers: 'B' -> 0, 'M' -> 1.
y_true = val_df['diagnosis'].map(dict(B=0, M=1)).to_numpy()

### Plain Data Results

In [None]:
# (ROC AUC on the probabilities, F1 on labels thresholded at 0.5) for the baseline.
(
    roc_auc_score(y_true, pred1),
    f1_score(y_true, (pred1 > 0.5).astype(int)),
)

In [None]:
# Per-class precision/recall/F1 for the baseline, thresholding at 0.5.
labels1 = (pred1 > 0.5).astype(int)
print(classification_report(y_true, labels1))

### Autoencoder "Denoised" Results

In [None]:
# (ROC AUC on the probabilities, F1 on labels thresholded at 0.5) for the
# classifier trained on autoencoder reconstructions.
(
    roc_auc_score(y_true, pred2),
    f1_score(y_true, (pred2 > 0.5).astype(int)),
)

In [None]:
# Per-class precision/recall/F1 for the reconstruction-based classifier,
# thresholding at 0.5.
labels2 = (pred2 > 0.5).astype(int)
print(classification_report(y_true, labels2))