In [None]:
# import all the libraries
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn.functional as F
from collections import Counter

In [None]:
# gpu usage
torch.cuda.is_available()

In [None]:
# import the dataset 
data = pd.read_csv('./MMdata.csv')
data_original = data.copy()

In [None]:
data.shape

In [None]:
#make 5 copies of each row for using for self-supervision
data_original = np.array(data_original)
data_original = np.repeat(data_original, repeats=5, axis=0)

In [None]:
# Re-wrap the repeated array as a DataFrame and restore the CSV header that
# np.array() discarded; later cells copy `data_original.columns` onto `data`
# and onto the exported reconstruction, so integer labels would propagate.
data_original = pd.DataFrame(data_original, columns=data.columns)
#data_original = data_original.to_csv('./data_new.csv')

In [None]:
# drop the nas and reset index and Quality columns
data = data.dropna()

In [None]:
# Get all the features 
data = pd.DataFrame(data)
Quality = data.iloc[:, -1]

In [None]:
data.columns = data_original.columns
#data = data.drop(['Quality'], axis = 1)

In [None]:
data

In [None]:
data = data.iloc[:, :-1]

In [None]:
# Rescale every feature column to [0, 1] (swap in StandardScaler for z-scores).
# NOTE(review): the scaler is fitted on the full table before the train/val/test
# split made further down, so held-out rows influence the scaling — confirm this is intended.
scaler = MinMaxScaler()
data_norm = scaler.fit_transform(data)

In [None]:
data_norm = pd.DataFrame(data_norm)

In [None]:
data_norm = data_norm.reset_index()

In [None]:
data_norm.shape, data_norm.min(), data_norm.max()

In [None]:
data_norm = data_norm.drop(['index'], axis = 1)

In [None]:
data_norm

In [None]:
Quality = Quality.reset_index()

In [None]:
Quality = Quality.drop(['index'], axis = 1)

In [None]:
Quality

In [None]:
# Below is the final transformed dataset with the quality levels
data_norm = pd.concat([data_norm,Quality], axis = 1)

In [None]:
data_norm

In [None]:
#data_norm.to_csv('./data_copy.csv')

In [None]:
#data_norm.columns = data_original.columns

In [None]:
# transform into a numpy array and check for any na values
data_norm = np.array(data_norm)

In [None]:
# True as soon as a single entry is NaN (np.all would only flag an all-NaN array)
np.any(np.isnan(data_norm))

In [None]:
# define X and y
X = data_norm[:, :-1]

In [None]:
y = data_norm[:, -1]

In [None]:
data_norm.shape

In [None]:
# define the indicator matrix of ones and zeros to mark the missing entries in the data
hadamard = np.ones(X.shape)

In [None]:
# mask entries in the dataset. Mark entries in either a MAR or NMAR manner depending on the mechanism of missingness present in the dataset.
import random
def mask_randomly(X, block_size=8, n_blocks=100):
    """Overwrite randomly placed rectangular blocks of ``X`` with the sentinel -1.

    ``n_blocks`` top-left corners are drawn uniformly at random (blocks may
    overlap) and every entry inside each window is set to -1.  The callers in
    this notebook later turn the -1 entries into zeros of the indicator
    matrix and into NaN.

    Parameters
    ----------
    X : np.ndarray
        2-D array; it is modified in place.
    block_size : int, default 8
        Side length of each masked block.  It is clipped to the array's
        dimensions, so arrays smaller than the block no longer raise.
    n_blocks : int, default 100
        Number of blocks to draw.

    Returns
    -------
    np.ndarray
        The same object as ``X`` (masked in place).
    """
    n_rows, n_cols = X.shape
    if n_rows == 0 or n_cols == 0 or n_blocks <= 0:
        return X
    block_h = min(block_size, n_rows)
    block_w = min(block_size, n_cols)
    # randint's upper bound is exclusive, hence the +1 so that a block can
    # also touch the last row / column.  Both draws use numpy's generator so
    # that a single np.random.seed() call makes the mask reproducible.
    row_starts = np.random.randint(0, n_rows - block_h + 1, n_blocks)
    col_starts = np.random.randint(0, n_cols - block_w + 1, n_blocks)
    for row, col in zip(row_starts, col_starts):
        X[row:row + block_h, col:col + block_w] = -1
    return X

In [None]:
# split into train, val and test sets for original dataset as well as indicator matrix
# First call: 33% of the rows become the test set; second call: 1% of the
# remaining rows become the validation set.
# NOTE(review): no random_state is passed, so the partition differs on every run.
# NOTE(review): `hadamard` is all ones at this point and hadamard_train/val/test
# are rebuilt with np.ones(...) in later cells, so the pieces returned here are placeholders.
X_train, X_test, y_train, y_test, hadamard_train, hadamard_test = train_test_split(X, y, hadamard,test_size=0.33,shuffle=True)
X_train, X_val, y_train, y_val, hadamard_train, hadamard_val = train_test_split(X_train, y_train, hadamard_train, test_size=0.01 ,shuffle=True)

In [None]:
# keep the originals for metric calcultaion
X_train_original = X_train.copy()
X_val_original = X_val.copy()
X_test_original = X_test.copy()

In [None]:
# incomplete dataset
X_train_inc = mask_randomly(X_train)

In [None]:
# define the different percentage of deletion code - this is another mechanism of missingness 
def codefordeletion(data, hadamard, frac):
    """Delete ``frac`` percent of the entries of ``data`` uniformly at random.

    The chosen entries are overwritten with the sentinel -1 and the indicator
    matrix is zeroed wherever ``data`` equals -1 afterwards.

    Parameters
    ----------
    data : np.ndarray
        Array to corrupt; it is modified in place.
    hadamard : np.ndarray
        Indicator matrix broadcastable against ``data``; it is not modified,
        an updated copy is returned instead.
    frac : float
        Percentage (0-100) of entries to delete; the resulting count is
        truncated to an integer.

    Returns
    -------
    tuple
        ``(data, hadamard)`` where ``data`` is the same object that was passed
        in and ``hadamard`` is a new array with zeros at the deleted positions.
    """
    # number of entries to remove (has to be an integer)
    n_remove = int(data.size * frac / 100)
    # distinct row-major positions to delete
    flat_idx = np.random.choice(data.size, n_remove, replace=False)
    # .flat writes through to `data` itself, whatever its memory layout
    data.flat[flat_idx] = -1
    hadamard = np.where(data == -1, 0, hadamard)
    return data, hadamard

In [None]:
X_train.shape

In [None]:
# define corresponding hadamard for train, validation and test sets
hadamard_train = np.ones(X_train.shape)

In [None]:
#X_train_inc, hadamard_train = codefordeletion(X_train, hadamard_train, 20)

In [None]:
X_train.shape, hadamard_train.shape

In [None]:
hadamard_train = np.where(X_train_inc == -1, 0, hadamard_train)

In [None]:
#hadamard_train = pd.DataFrame(hadamard_train)
#hadamard_train.to_csv('had_new.csv')

In [None]:
#X_train_inc, hadamard_train  = codefordeletion(X_train, hadamard_train, 60)

In [None]:
#X_train_inc = pd.DataFrame(X_train_inc)
#X_train_inc.to_csv('./X_train_inc1.csv')

In [None]:
X_train_inc = np.where(X_train_inc == -1, np.nan, X_train_inc)

In [None]:
#hadamard_train = pd.DataFrame(hadamard_train)
#hadamard_train.to_csv('./hadamard1.csv')

In [None]:
print(X_train_inc)

# per-column average taken over the non-NaN values only
col_mean_train = np.nanmean(X_train_inc, axis=0)

print(col_mean_train)

# (row_indices, column_indices) of every NaN cell
inds = np.nonzero(np.isnan(X_train_inc))

# each NaN receives the mean of the column it sits in
X_train_inc[inds] = col_mean_train[inds[1]]

print(X_train_inc)

In [None]:
X_train_inc.shape

In [None]:
# Do the same for the validation test
#X_val_inc, hadamard_val = codefordeletion(X_val, hadamard_val, 20)
#X_val_inc = np.where(X_val_inc == -1, np.nan, X_val_inc)

In [None]:
# similarly for the validation and test sets
X_val_inc = mask_randomly(X_val)

In [None]:
hadamard_val = np.ones(X_val_inc.shape)

In [None]:
hadamard_val = np.where(X_val_inc == -1, 0, hadamard_val)

In [None]:
X_val_inc = np.where(X_val_inc == -1, np.nan, X_val_inc)

In [None]:
# mean-impute the missing validation entries
print(X_val_inc)

# per-column average taken over the non-NaN values only
col_mean_val = np.nanmean(X_val_inc, axis=0)

print(col_mean_val)

# (row_indices, column_indices) of every NaN cell
inds = np.nonzero(np.isnan(X_val_inc))

# each NaN receives the mean of the column it sits in
X_val_inc[inds] = col_mean_val[inds[1]]

print(X_val_inc)

In [None]:
y_train = np.array(y_train)
y_train = np.repeat(y_train, repeats=10, axis=0)

In [None]:
y_val = np.array(y_val)
y_val = np.repeat(y_val, repeats=10, axis=0)

In [None]:
# make copies of the train and validation sets
X_train_inc = np.array(X_train_inc)
X_train_inc = np.repeat(X_train_inc, repeats=10, axis=0)

In [None]:
X_train_inc.shape

In [None]:
# Do the same for the test set
#X_test_inc, hadamard_test = codefordeletion(X_test, hadamard_test, 20)

In [None]:
# make copies of the train and validation sets
X_val_inc = np.array(X_val_inc)
X_val_inc = np.repeat(X_val_inc, repeats=10, axis=0)

In [None]:
hadamard_train = np.array(hadamard_train)
hadamard_train = np.repeat(hadamard_train, repeats=10, axis=0)

In [None]:
hadamard_val = np.array(hadamard_val)
hadamard_val = np.repeat(hadamard_val, repeats=10, axis=0)

In [None]:
X_train_original = np.array(X_train_original)
X_train_original = np.repeat(X_train_original, repeats=10, axis=0)
X_val_original = np.array(X_val_original)
X_val_original = np.repeat(X_val_original, repeats=10, axis=0)


In [None]:
X_test_inc = mask_randomly(X_test)

In [None]:
hadamard_test = np.ones(X_test.shape)

In [None]:
#X_test_inc, hadamard_test = codefordeletion(X_test, hadamard_test, 20)

In [None]:
hadamard_test = np.where(X_test_inc == -1, 0, hadamard_test)

In [None]:
X_test_inc = np.where(X_test_inc == -1, np.nan, X_test_inc)

In [None]:
print(X_test_inc)
# per-column average taken over the non-NaN values only
col_mean_test = np.nanmean(X_test_inc, axis=0)

print(col_mean_test)

# (row_indices, column_indices) of every NaN cell
inds = np.nonzero(np.isnan(X_test_inc))

# each NaN receives the mean of the column it sits in
X_test_inc[inds] = col_mean_test[inds[1]]

print(X_test_inc)

In [None]:
X_train_inc.shape

In [None]:
# Now these are the targets for training the network
X_train_target = X_train_inc.copy()
X_val_target = X_val_inc.copy()
X_test_target = X_test_inc.copy()

In [None]:
col_mean_test = list(col_mean_test)
col_mean_train = list(col_mean_train)
col_mean_val = list(col_mean_val)

In [None]:
# Now define the inputs
X_train_inputs = X_train_target.copy()
X_val_inputs = X_val_target.copy()
X_test_inputs = X_test_target.copy()

In [None]:
X_train_target.shape

In [None]:
# define the different percentage of deletion code - This is for missing at random as was done until now. 
def codefordeletion2(data, hadamards,  frac):
    """Randomly delete entries of ``data`` that are still marked as observed.

    ``frac`` percent of all positions are drawn without replacement; a drawn
    position is overwritten with the sentinel -1 only when the matching entry
    of ``hadamards`` is non-zero, so fewer than ``frac`` percent may end up
    deleted.

    Parameters
    ----------
    data : np.ndarray
        2-D array, modified in place.
    hadamards : np.ndarray
        Indicator matrix indexed with the same (row, column) positions as ``data``.
    frac : float
        Percentage (0-100) of positions to draw; the count is truncated to an integer.

    Returns
    -------
    np.ndarray
        The same object as ``data``.
    """
    n_rows, n_cols = data.shape
    total = n_rows * n_cols
    # number of candidate positions (has to be an integer)
    n_candidates = int(total * frac / 100)
    candidates = np.random.choice(total, n_candidates, replace=False)
    # translate row-major positions back into (row, column) pairs
    rows, cols = np.divmod(candidates, n_cols)
    # positions whose indicator is already 0 are left untouched
    still_observed = hadamards[rows, cols] != 0
    data[rows[still_observed], cols[still_observed]] = -1
    return data

In [None]:
def mask_randomly2(X, hadamard, block_size=8, n_blocks=30):
    """Mask random rectangular blocks of ``X``, skipping entries already missing.

    ``n_blocks`` top-left corners are drawn uniformly at random (blocks may
    overlap).  Inside each window, entries whose indicator in ``hadamard`` is
    non-zero are overwritten with the sentinel -1; entries whose indicator is
    0 keep their current value.

    Parameters
    ----------
    X : np.ndarray
        2-D array; it is modified in place.
    hadamard : array-like
        Indicator matrix with the same shape as ``X``.
    block_size : int, default 8
        Side length of each block, clipped to the array's dimensions so that
        arrays smaller than the block no longer raise.
    n_blocks : int, default 30
        Number of blocks to draw.

    Returns
    -------
    np.ndarray
        The same object as ``X``.
    """
    n_rows, n_cols = X.shape
    if n_rows == 0 or n_cols == 0 or n_blocks <= 0:
        return X
    observed = np.asarray(hadamard) != 0
    block_h = min(block_size, n_rows)
    block_w = min(block_size, n_cols)
    # randint's upper bound is exclusive, hence the +1 so that a block can
    # also touch the last row / column.
    row_starts = np.random.randint(0, n_rows - block_h + 1, n_blocks)
    col_starts = np.random.randint(0, n_cols - block_w + 1, n_blocks)
    for row, col in zip(row_starts, col_starts):
        window = (slice(row, row + block_h), slice(col, col + block_w))
        X[window] = np.where(observed[window], -1, X[window])
    return X

In [None]:
X_train_inputs = codefordeletion2(X_train_inputs, hadamard_train, 0)

In [None]:
#X_train_inputs = mask_randomly2(X_train_inputs, hadamard_train)


In [None]:
X_train_inputs = np.where(X_train_inputs == -1, np.nan, X_train_inputs)

In [None]:
print(X_train_inputs)

# per-column average taken over the non-NaN values only
col_mean = np.nanmean(X_train_inputs, axis=0)

print(col_mean)

# (row_indices, column_indices) of every NaN cell
inds = np.nonzero(np.isnan(X_train_inputs))

# each NaN receives the mean of the column it sits in
X_train_inputs[inds] = col_mean[inds[1]]

print(X_train_inputs)

In [None]:
# Do the same for the validation test
X_val_inputs = codefordeletion2(X_val_inputs, hadamard_val, 0)
#X_val_inputs = mask_randomly2(X_val_inputs, hadamard_val)
X_val_inputs = np.where(X_val_inputs == -1, np.nan, X_val_inputs)

In [None]:
print(X_val_inputs)

# per-column average taken over the non-NaN values only
col_mean = np.nanmean(X_val_inputs, axis=0)

print(col_mean)

# (row_indices, column_indices) of every NaN cell
inds = np.nonzero(np.isnan(X_val_inputs))

# each NaN receives the mean of the column it sits in
X_val_inputs[inds] = col_mean[inds[1]]

print(X_val_inputs)

In [None]:
'''
X_train_inputs = pd.DataFrame(X_train_inputs)
X_train_target = pd.DataFrame(X_train_target)
hadamard_train = pd.DataFrame(hadamard_train)
X_train_inputs.to_csv('inputs.csv')
X_train_target.to_csv('targets.csv')
hadamard_train.to_csv('hadamard.csv')
'''

In [None]:
# Do the same for the test set
#X_test_inputs = codefordeletion2(X_test_inputs, hadamard_test, 20)

In [None]:
#X_test_inputs = np.where(X_test_inputs == -1, np.nan, X_test_inputs)

In [None]:
#print(X_test_inputs)
'''
#Obtain mean of columns as you need, nanmean is convenient.
col_mean = np.nanmean(X_test_inputs, axis=0)

print(col_mean)

#Find indices that you need to replace
inds = np.where(np.isnan(X_test_inputs))

#Place column means in the indices. Align the arrays using take
X_test_inputs[inds] = np.take(col_mean, inds[1])

print(X_test_inputs)
'''

In [None]:
# define the dataset
class MMData(Dataset):
    """Dataset yielding aligned rows of four arrays.

    Each item is the tuple ``(X_input[idx], X_target[idx], X_original[idx],
    hadamard[idx])``.  The arrays are copied on construction.
    """

    def __init__(self, X_input, X_target, X_original, hadamard):
        # snapshot every array so the dataset does not alias the caller's data
        self.X_input, self.X_target, self.X_original, self.hadamard = (
            X_input.copy(),
            X_target.copy(),
            X_original.copy(),
            hadamard.copy(),
        )

    def __len__(self):
        # the number of samples is taken from the input array
        return len(self.X_input)

    def __getitem__(self, idx):
        fields = (self.X_input, self.X_target, self.X_original, self.hadamard)
        return tuple(field[idx] for field in fields)

In [None]:
# define the autoencoder architecture
class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input -> hidden -> latent -> hidden -> input.

    Parameters
    ----------
    input_dim : int, default 97
        Number of features per sample (also the output width).
    hidden_dim : int, default 45
        Width of the intermediate encoder/decoder layer.
    latent_dim : int, default 16
        Width of the bottleneck.

    The defaults reproduce the original fixed 97-45-16-45-97 layout, so
    ``AutoEncoder()`` builds the same network as before.
    """

    def __init__(self, input_dim=97, hidden_dim=45, latent_dim=16):
        super(AutoEncoder, self).__init__()

        self.input_dim = input_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, latent_dim)
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Map a batch of inputs to the latent code (no activation on the code)."""
        h1 = F.relu(self.fc1(x))
        return self.fc2(h1)

    def decode(self, z):
        """Map latent codes back to feature space; the sigmoid keeps outputs in (0, 1)."""
        h2 = F.relu(self.fc3(z))
        # torch.sigmoid replaces the deprecated F.sigmoid
        return torch.sigmoid(self.fc4(h2))

    def forward(self, x):
        """Flatten ``x`` to ``(-1, input_dim)``, encode it and decode it again."""
        z = self.encode(x.view(-1, self.input_dim))
        return self.decode(z)

In [None]:
# disabled alternative architecture (97-34-8-34-97); it is wrapped in a string literal below, so it is never executed
'''
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()

        self.fc1 = nn.Linear(97, 34)
        self.fc2 = nn.Linear(34, 8)
        self.fc3 = nn.Linear(8, 34)
        self.fc4 = nn.Linear(34, 97)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return self.fc2(h1)

    def decode(self, z):
        h2 = F.relu(self.fc3(z))
        return F.sigmoid(self.fc4(h2))

    def forward(self, x):
        z = self.encode(x.view(-1, 97))
        return self.decode(z)
'''

In [None]:
# build one MMData per split from its (inputs, targets, originals, indicator matrix) arrays
train_ds, valid_ds, test_ds = (
    MMData(inputs, targets, originals, indicator)
    for inputs, targets, originals, indicator in (
        (X_train_inputs, X_train_target, X_train_original, hadamard_train),
        (X_val_inputs, X_val_target, X_val_original, hadamard_val),
        (X_test_inputs, X_test_target, X_test_original, hadamard_test),
    )
)

batch_size = 64
# all three loaders use the same batch size and reshuffle on every pass
train_dl, valid_dl, test_dl = (
    DataLoader(split_ds, batch_size=batch_size, shuffle=True)
    for split_ds in (train_ds, valid_ds, test_ds)
)

In [None]:
# define the loss function
def loss_function(Y_hat, Y, Omega):
    """Sum of squared errors between ``Y_hat`` and ``Y``, weighted element-wise by ``Omega``.

    Entries where ``Omega`` is 0 contribute nothing; the result is a sum
    (not a mean) over every element of the batch.
    """
    residual = Y_hat - Y
    weighted_sq = Omega * residual ** 2
    return torch.sum(weighted_sq)

In [None]:
# define the model and optimizer
# use the first GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = AutoEncoder()
model.to(device)
optimizer = optim.Adam(list(model.parameters()), lr=1e-3)

In [None]:
# define the train and validation loops
def train(epoch, final_epoch=200):
    """Run one optimisation pass over ``train_dl``.

    Uses the module-level ``model``, ``optimizer``, ``train_dl``,
    ``loss_function`` and ``train_losses``.  The mean per-batch loss is
    appended to ``train_losses`` and printed.

    Parameters
    ----------
    epoch : int
        Current epoch number.
    final_epoch : int, default 200
        Epoch on which reconstructions, originals and indicator batches are
        collected; on every other epoch the three returned lists are empty.

    Returns
    -------
    tuple
        ``(train_losses, preds_train, orig_train, hadamards_train)`` where the
        last three are lists of per-batch numpy arrays.
    """
    model.train()
    # follow whatever device the model lives on instead of assuming CUDA
    device = next(model.parameters()).device
    collect = epoch == final_epoch
    train_loss = 0
    preds_train = []
    orig_train = []
    hadamards_train = []
    for batch_inputs, batch_target, batch_original, batch_mask in tqdm(train_dl):
        batch_inputs = batch_inputs.to(device)
        batch_target = batch_target.to(device)
        batch_mask = batch_mask.to(device)
        optimizer.zero_grad()
        recon = model(batch_inputs.float())
        loss = loss_function(recon, batch_target, batch_mask)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        if collect:
            preds_train.append(recon.detach().cpu().numpy())
            # the originals are only stored, so they never need to leave the CPU
            orig_train.append(batch_original.cpu().numpy())
            hadamards_train.append(batch_mask.cpu().numpy())
    train_loss=train_loss/len(train_dl)
    train_losses.append(train_loss)
    print('Train Loss: %.3f'%(train_loss))
    return train_losses, preds_train, orig_train, hadamards_train

In [None]:
#define the validation epochs
def val(epoch, final_epoch=200):
    """Evaluate the model on ``valid_dl`` without updating any weights.

    Uses the module-level ``model``, ``valid_dl``, ``loss_function`` and
    ``eval_losses``.  The mean per-batch loss is appended to ``eval_losses``
    and printed.

    Parameters
    ----------
    epoch : int
        Current epoch number.
    final_epoch : int, default 200
        Epoch on which reconstructions, originals and indicator batches are
        collected; on every other epoch the three returned lists are empty.

    Returns
    -------
    tuple
        ``(eval_losses, preds_val, orig_val, hadamards_val)`` where the last
        three are lists of per-batch numpy arrays.
    """
    model.eval()
    # follow whatever device the model lives on instead of assuming CUDA
    device = next(model.parameters()).device
    collect = epoch == final_epoch
    running_loss=0
    preds_val = []
    orig_val = []
    hadamards_val = []
    with torch.no_grad():
        for batch_inputs, batch_target, batch_original, batch_mask in tqdm(valid_dl):
            batch_inputs = batch_inputs.to(device)
            batch_target = batch_target.to(device)
            batch_mask = batch_mask.to(device)
            recon=model(batch_inputs.float())
            loss = loss_function(recon, batch_target, batch_mask)
            running_loss+=loss.item()
            if collect:
                preds_val.append(recon.detach().cpu().numpy())
                # the originals are only stored, so they never need to leave the CPU
                orig_val.append(batch_original.cpu().numpy())
                hadamards_val.append(batch_mask.cpu().numpy())
        eval_loss=running_loss/len(valid_dl)
        eval_losses.append(eval_loss)
    print('Validation Loss: %.3f' %(eval_loss))
    return eval_losses, preds_val, orig_val, hadamards_val

In [None]:
# run the train and val loops
epochs=200
# loss histories; train() and val() append one value per call to these module-level lists
train_losses = []
eval_losses = []
# train()/val() only collect predictions when the epoch number equals 200, so
# the preds_*/orig_*/hadamards_* lists are non-empty only after the last pass.
for epoch in range(1,epochs+1): 
    train_losses, preds_train, orig_train, hadamards_train = train(epoch)
    eval_losses, preds_val, orig_val, hadamards_val = val(epoch)

In [None]:
preds_val

In [None]:
# plot the train and validation loss to monitor convergence
# (pyplot is already imported as plt in the first cell)
plt.figure(figsize=(10,5))
plt.title("Training and Validation Loss", fontsize = 15)
plt.plot(eval_losses, '-o', label="Val_loss")
plt.plot(train_losses, '-o', label="Train_loss")
# one value is appended to each list per epoch, so the x axis counts epochs
plt.xlabel("Epochs", fontsize=15)
plt.ylabel("Loss", fontsize=15)
plt.legend()
plt.grid(linestyle = '--', linewidth = 0.2)
#plt.savefig('./loss.png')
plt.show()

In [None]:
# define the test loop
def test():
    """Evaluate the model on ``test_dl`` and collect every batch.

    Uses the module-level ``model``, ``test_dl`` and ``loss_function``.  The
    mean per-batch loss is printed as the test loss; it is no longer appended
    to ``eval_losses``, which holds the validation history.

    Returns
    -------
    tuple
        ``(recon_test, original_test, hadamards_test)``: three lists of
        per-batch numpy arrays, aligned batch by batch.
    """
    model.eval()
    # follow whatever device the model lives on instead of assuming CUDA
    device = next(model.parameters()).device
    running_loss=0
    recon_test = []
    original_test = []
    hadamards_test = []
    with torch.no_grad():
        for batch_inputs, batch_target, batch_original, batch_mask in tqdm(test_dl):
            batch_inputs = batch_inputs.to(device)
            batch_target = batch_target.to(device)
            batch_mask = batch_mask.to(device)
            recon=model(batch_inputs.float())
            loss = loss_function(recon, batch_target, batch_mask)
            running_loss+=loss.item()
            recon_test.append(recon.detach().cpu().numpy())
            # the originals are only stored, so they never need to leave the CPU
            original_test.append(batch_original.cpu().numpy())
            hadamards_test.append(batch_mask.cpu().numpy())
        test_loss=running_loss/len(test_dl)
    print('Test Loss: %.3f' %(test_loss))
    return recon_test, original_test, hadamards_test

In [None]:
recon_test, original_test, hadamards_test = test()

In [None]:
recon_test

In [None]:
# Get the original as well as reconstructed datasets
recon_test = np.vstack(recon_test)
original_test = np.vstack(original_test)

In [None]:
hadamards_test = np.vstack(hadamards_test)

In [None]:
hadamards_test.shape, recon_test.shape

In [None]:
recon_test

In [None]:
# Fix the observed entries in the reconstructed set
recon_test[hadamards_test == 1.0] = original_test[hadamards_test == 1.0]

In [None]:
recon_test.shape, original_test.shape

In [None]:
recon_test = pd.DataFrame(recon_test)

In [None]:
original_test = pd.DataFrame(original_test)

In [None]:
recon_test.columns = data_original.columns[:-1]

In [None]:
original_test.columns = data_original.columns[:-1]

In [None]:
recon_test = pd.DataFrame(recon_test)
# write next to the notebook (like the './MMdata.csv' input) instead of a
# user-specific absolute Windows path that exists on one machine only
recon_test.to_csv('./recon_test.csv')

In [None]:
original_test = pd.DataFrame(original_test)
# write next to the notebook (like the './MMdata.csv' input) instead of a
# user-specific absolute Windows path that exists on one machine only
original_test.to_csv('./original_test.csv')

In [None]:
# calculate the metrics on the unobserved entries of the test set
from sklearn.metrics import mean_squared_error
# Observed entries of recon_test were overwritten with the ground truth in an
# earlier cell, so only the unobserved ones (indicator == 0) are scored; the
# square root turns the MSE into the RMSE that the variable name promises.
unobserved = np.asarray(hadamards_test) == 0
if unobserved.any():
    rmse = np.sqrt(mean_squared_error(np.asarray(original_test, dtype=float)[unobserved],
                                      np.asarray(recon_test, dtype=float)[unobserved]))
else:
    rmse = float('nan')  # nothing was masked, so there is nothing to score

In [None]:
rmse

In [None]:
hadamards_test = np.array(hadamards_test)

In [None]:
# Model evaluation: relative and mean absolute error on the unobserved entries of the test set.
[r, c] = original_test.shape
meane = []
abse = []

##################################################################
#  mean and absolute error calculation
print("-------------- Calculating error -----------------")

# original_test / recon_test were converted to DataFrames in earlier cells,
# where frame[i, j] is a column-label lookup rather than a positional one,
# so the comparison is done on plain arrays.
recon_values = np.asarray(recon_test, dtype=float)
original_values = np.asarray(original_test, dtype=float)

# only the unobserved entries (indicator == 0) contribute to the error;
# boolean indexing keeps the same row-major order as a nested i/j loop
unobserved = np.asarray(hadamards_test) == 0
hop = recon_values[unobserved]
ori = original_values[unobserved]
p = len(ori)

# The features are min-max scaled and the network ends in a sigmoid, so the
# differences lie in (-1, 1); they are kept unrounded because rounding to
# integers would wipe out every error smaller than 0.5.
x = hop - ori
abs_x = np.abs(x)

print ("numerator:", np.sum(abs_x))
print ("sum of unobserved entries:", np.sum(ori))
print ("b: total unobserved entries:", len(ori))

# guard both ratios against an empty mask / a zero denominator
mean_err = np.sum(abs_x) / np.sum(ori) * 100 if np.sum(ori) != 0 else float('nan')
mean_std = np.std(abs_x) if len(ori) else float('nan')

abs_err = np.sum(abs_x) / len(ori) if len(ori) else float('nan')  # divided by the number of unobserved entries
abs_std = mean_std

print(mean_err)
print(abs_err)