In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import os, torch
import torch.nn as nn
from torch import optim
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Location of the raw CSV, relative to the notebook's working directory.
data_path = '../input/electrode-stability-dataset/DATA.csv'
# Row 0 supplies the column names; index_col=False tells pandas not to use
# the first column as the index.
data = pd.read_csv(
    filepath_or_buffer=data_path,
    index_col=False,
    header=0,
)

In [None]:
#show data columns
# Prints the column Index of the raw dataframe, i.e. every field name read from the CSV.
print(data.columns)

In [None]:
#count the distinct values (missing values included) that occur in each column
for col in data.columns:
    distinct_values = data[col].unique()
    print(f'{col} has {len(distinct_values)} unique values')

In [None]:
#report each column's dtype so numerical columns can be told apart from categorical ones
for col, col_dtype in data.dtypes.items():
    print(f'the {col} is of {col_dtype.name} data type')

In [None]:
#visualize stability against the categorical electrode label, one chunk of rows per figure
batch = 600
n_samples = len(data['Electrode'])
for idx in range(0, n_samples, batch):
    # Clamp the chunk end so the final (shorter) chunk is titled with the rows
    # it really contains instead of always claiming a full `batch` rows.
    end = min(idx + batch, n_samples)
    # .iloc makes the positional slicing explicit, independent of the index labels.
    plt.scatter(x=data['Electrode'].iloc[idx:end], y=data['Stability. (H)'].iloc[idx:end])
    plt.xlabel('Electrode')
    plt.ylabel('Stability. (H)')
    plt.title(f'Stability. (H) vs Electrode {idx+1} to {end} samples')
    plt.show()

In [None]:
#keep only the object, boolean and category columns, then drop the
#'Ref. Publication date [year:mm:dd]' column from that selection
data_categorical = (
    data
    .select_dtypes(include=['object', 'bool', 'category'])
    .drop(columns=['Ref. Publication date [year:mm:dd]'])
)
data_categorical.head()


In [None]:
#drop columns with lots of missing or irrelevant data
valid_categorical_col = data_categorical.drop(['TL1-front', 'TL2-front', 'TL3-front', 'TL4-back', 'TL5-back', 'TL6-back', 'TL7-back'], axis=1).columns
valid_numerical_col = data.select_dtypes('float64').columns

#visualize and check if 'blank' still exists in any column
# The categorical and numerical column lists are walked separately: zipping
# them would stop at the shorter list and silently leave the remaining
# columns of the longer one uninspected.
for cat_col in valid_categorical_col:
    print(f'{cat_col}: \n{data[cat_col].value_counts()}\n____________________________________')
for num_col in valid_numerical_col:
    print(f'{num_col}: \n{data[num_col].value_counts()}\n____________________________________\n\n\n\n\n')

In [None]:
#assemble the working dataframe: the kept categorical columns first, then the numerical ones
# .copy() gives an independent frame, so columns added to data_df later never touch `data`.
data_df = data[[*valid_categorical_col, *valid_numerical_col]].copy()
data_df.head()

In [None]:
#scatter the Stability. (H) column against every column of the working dataframe
for col, col_values in data_df.items():
    plt.scatter(x=col_values, y=data_df['Stability. (H)'])
    # Label the axes that scatter() just drew on in a single call.
    plt.gca().set(
        xlabel=f'{col}',
        ylabel='Stability. (H)',
        title=f'Stability. (H) vs {col}',
    )
    plt.show()

In [None]:
#pairwise linear (Pearson) correlation between the numerical columns
data_df.loc[:, valid_numerical_col].corr(method='pearson')

In [None]:
#integer-encode every categorical feature into a new '<column> Encode' column
# A single LabelEncoder is re-fitted for each column in turn, so after the loop
# it only holds the classes of the last column processed.
label_enc = LabelEncoder()
for col in valid_categorical_col:
    encoded_values = label_enc.fit_transform(data_df[col])
    data_df[f'{col} Encode'] = encoded_values.ravel()
data_df.head(100)


In [None]:
#draw a kernel density estimate for each raw numerical column
for col in valid_numerical_col:
    kde_axes = data_df[col].plot.kde()
    kde_axes.set_title(f'{col} distribution curve')
    plt.show()

In [None]:
#normalize the target with ln(x + 1); every other numerical column is copied through unchanged
for col in valid_numerical_col:
    normalized_name = f'{col} (Normailzed)'
    if col != 'Stability. (H)':
        # Pass-through column: copy it as-is and print its summary statistics.
        data_df[normalized_name] = data_df[col]
        print(f'{normalized_name} \n')
        print(data_df[normalized_name].describe())
    else:
        # Target column: log-transform it; no summary is printed for it.
        data_df[normalized_name] = np.log(data_df[col] + 1)
data_df.head(1000)

In [None]:
#draw a kernel density estimate for each '(Normailzed)' numerical column
for col in valid_numerical_col:
    new_col_name = f'{col} (Normailzed)'
    density_axes = data_df[new_col_name].plot.kde()
    density_axes.set_title(f'distribution curve of {new_col_name}')
    plt.show()

In [None]:
#Neural Network Model

class NNregressor(nn.Module):
    """Fully connected regression network.

    Four hidden stages, each ``Linear -> BatchNorm1d -> LeakyReLU(0.3)``, widen
    the representation to 3x, 6x, 9x and 12x ``in_features``; a final linear
    layer maps to ``out_features``. A ``Dropout(0.2)`` module is created as
    ``dropout_layer`` but is not applied in ``forward``.

    Args:
        in_features: number of input features per sample.
        out_features: number of values predicted per sample.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Layer widths: the input width followed by the four hidden widths.
        widths = [self.in_features * factor for factor in (1, 3, 6, 9, 12)]

        # Stages are created in order 1..5 so parameter registration (and the
        # resulting state_dict keys) match the attribute names exactly.
        self.dense_layer_1 = self._hidden_stage(widths[0], widths[1])
        self.dense_layer_2 = self._hidden_stage(widths[1], widths[2])
        self.dense_layer_3 = self._hidden_stage(widths[2], widths[3])
        self.dense_layer_4 = self._hidden_stage(widths[3], widths[4])
        self.dense_layer_5 = nn.Sequential(
            nn.Linear(widths[4], self.out_features),
        )
        self.dropout_layer = nn.Dropout(0.2)

    @staticmethod
    def _hidden_stage(n_in, n_out):
        """Build one ``Linear -> BatchNorm1d -> LeakyReLU(0.3)`` stage."""
        return nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.BatchNorm1d(n_out),
            nn.LeakyReLU(0.3),
        )

    def forward(self, feature):
        """Run ``feature`` through the five dense stages and return the result."""
        stages = (
            self.dense_layer_1,
            self.dense_layer_2,
            self.dense_layer_3,
            self.dense_layer_4,
            self.dense_layer_5,
        )
        output = feature
        for stage in stages:
            output = stage(output)
        # dropout_layer is intentionally not applied to the output.
        return output


In [None]:
#select feature and target columns to train with
target_label = ['Stability. (H) (Normailzed)']

# Categorical inputs are the label-encoded '<column> Encode' columns.
feature_label = [f'{col} Encode' for col in valid_categorical_col]

# Numerical inputs are the '<column> (Normailzed)' columns. valid_numerical_col
# holds the RAW column names, so the suffixed name must be compared against the
# target list; comparing the raw name never matches and would feed the target
# column to the model as an input feature (target leakage).
for col in valid_numerical_col:
    normalized_name = f'{col} (Normailzed)'
    if normalized_name in target_label:
        continue
    feature_label.append(normalized_name)

# Guard against the target ever slipping back into the inputs.
assert not set(feature_label) & set(target_label), 'target column leaked into the features'

feature_data = data_df[feature_label].to_numpy().reshape(-1, len(feature_label))
target_data = data_df[target_label].to_numpy().reshape(-1, 1)

print(f'feature data size: {feature_data.shape}')
print(f'Target data size: {target_data.shape}')
data_df[target_label].head()

In [None]:
#hyper-param
in_features = len(feature_label)
out_features = len(target_label)
test_size = 0.1
lr = 1e-3
epochs = 200
train_batch_size = 600
seed = 42

#set model
# Seed torch before the model is built so the initial weights are the same on
# every run.
torch.manual_seed(seed)
regression_model = NNregressor(in_features, out_features)
lossFunc = nn.MSELoss()
optimizer = optim.Adam(regression_model.parameters(), lr=lr)

#data_splice
def splice(X, y, test_size=test_size):
    """Split ``X`` and ``y`` into train and test parts with a fixed random state of 42.

    The default ``test_size`` is the value the module-level ``test_size`` had
    when this function was defined.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(X, y, test_size=test_size, random_state=42))

#numpy to tensor
def numpy_to_tensor(*array):
    """Convert every positional array into a ``torch.Tensor``.

    Args:
        *array: any number of array-likes accepted by the ``torch.Tensor`` constructor.

    Returns:
        tuple: one tensor per input, in the same order (empty tuple for no inputs).
    """
    converted = []
    for item in array:
        converted.append(torch.Tensor(item))
    return tuple(converted)

# Split the arrays into train/test parts, then convert all four parts to tensors.
X_train, X_test, y_train, y_test = numpy_to_tensor(*splice(feature_data, target_data))



In [None]:
#Compute R2Score
def R2Score(actual, pred):
    """Return the coefficient of determination (R^2) of ``pred`` against ``actual``.

    Both tensors are detached, moved to the CPU, flattened and promoted to
    float64 before the score is computed, so either argument may carry
    gradients.

    Args:
        actual: tensor of ground-truth values (any shape).
        pred: tensor of predictions with the same number of elements.

    Returns:
        float: ``1 - SS_res / SS_tot``. When ``actual`` has zero variance (a
        constant or single-sample target) the ratio is undefined; 1.0 is
        returned for a perfect prediction and 0.0 otherwise, so that averages
        over batches stay finite. ``nan`` is returned for empty input.
    """
    actual = actual.detach().cpu().numpy().astype(np.float64).reshape(-1)
    pred = pred.detach().cpu().numpy().astype(np.float64).reshape(-1)
    if actual.size == 0:
        return float('nan')
    residual_ss = np.sum(np.square(actual - pred))
    total_ss = np.sum(np.square(actual - np.mean(actual)))
    if total_ss == 0:
        # Zero-variance target: avoid dividing by zero (would give nan / -inf).
        return 1.0 if residual_ss == 0 else 0.0
    return float(1 - (residual_ss / total_ss))

In [None]:
#training process
def training_process(X, y, epochs, batch_size):
    """Train the module-level ``regression_model`` on ``(X, y)`` in mini-batches.

    Relies on the module-level ``regression_model``, ``optimizer``,
    ``lossFunc`` and ``R2Score``. Batches are taken in order, ``batch_size``
    rows at a time; the loss is the square root of ``lossFunc`` (RMSE).

    Args:
        X: feature tensor, one row per sample.
        y: target tensor aligned with ``X``.
        epochs: number of passes over the data.
        batch_size: number of rows per optimisation step.

    Returns:
        tuple[list, list]: per-epoch mean batch RMSE and per-epoch mean batch R^2.
    """
    total_loss = list()
    total_R2Score = list()
    for epoch in range(epochs):
        print(f'Epoch no: {epoch+1}')
        batch_loss = list()
        batch_R2Score = list()
        regression_model.train()
        for idx in tqdm(range(0, len(X), batch_size)):
            X_batch = X[idx:idx+batch_size]
            y_batch = y[idx:idx+batch_size]
            if len(X_batch) < 2:
                # BatchNorm1d cannot compute batch statistics from a single
                # sample in train mode (it raises), and R^2 is undefined for
                # one point, so a one-row trailing batch is skipped.
                continue
            optimizer.zero_grad()
            pred = regression_model(X_batch)
            loss = torch.sqrt(lossFunc(pred, y_batch))
            train_R2score = R2Score(y_batch, pred)
            batch_R2Score.append(train_R2score)
            batch_loss.append(loss.item())
            loss.backward()
            optimizer.step()
        mean_batch_loss, mean_batch_R2Score = np.mean(batch_loss), np.mean(batch_R2Score)
        print(f'RMSE batch_loss is: {mean_batch_loss} \n R^2 Score: {mean_batch_R2Score}')
        total_loss.append(mean_batch_loss)
        total_R2Score.append(mean_batch_R2Score)
    # Hand the collected histories back instead of discarding them.
    return total_loss, total_R2Score

# Keep the per-epoch histories so the learning curves can be inspected later;
# assigning the result also stops the notebook from echoing the raw lists.
train_loss_history, train_R2Score_history = training_process(X_train, y_train, epochs=epochs, batch_size=train_batch_size)


In [None]:
#testing process

def inference(X, y):
    """Evaluate the module-level ``regression_model`` on ``(X, y)`` and plot the results.

    Relies on the module-level ``regression_model``, ``lossFunc`` and
    ``R2Score``. Prints the RMSE and R^2 of the predictions, then draws a
    histogram of actual vs. predicted values and a bar chart of the first ten
    rows.

    Args:
        X: feature tensor, one row per sample.
        y: target tensor aligned with ``X``.
    """
    comparison_df = pd.DataFrame()
    comparison_df['actual'] = np.array(y).reshape(-1)
    regression_model.eval()
    with torch.no_grad():
        pred = regression_model(X)
        comparison_df['predicted'] = np.array(pred.detach()).reshape(-1)
        RMSE_loss = torch.sqrt(lossFunc(pred, y))
        test_R2Score = R2Score(y, pred)
        # .item() prints the plain number rather than the `tensor(...)` repr,
        # matching how the training loop reports its loss.
        print(f'RMSE: {RMSE_loss.item()} \n R^2 Score: {test_R2Score}')

    #plot histogram of comparison
    comparison_df.hist()

    #plot bar chart of comparison
    comparison_df.head(10).plot.bar()

    # Flush both figures explicitly, as the other plotting cells do.
    plt.show()
inference(X_test, y_test)