# Importing packages

In [None]:
# Install torchviz, the package that provides `make_dot` (imported in the next cell).
!pip install torchviz

In [None]:
import pickle
import nltk
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, models
from torchsummary import summary
from torchviz import make_dot
from sklearn.metrics import log_loss, hamming_loss, accuracy_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
from PIL import Image
from tqdm import tqdm
# Download NLTK's 'punkt' resource; presumably this is the tokenizer data that
# word_tokenize (used by the dataset classes) needs — confirm.
nltk.download('punkt')
# Default figure size applied to every matplotlib figure created afterwards.
plt.rcParams['figure.figsize'] = (10, 8)

In [None]:
# Mount Google Drive at /content/drive; every '/content/drive/Shareddrives/...'
# path used in this notebook is read from or written to this mount.
from google.colab import drive
drive.mount('/content/drive')

# Copying the files from Google Drive to the local Colab machine to speed up performance

In [None]:
# Copy the dataset archive from the mounted shared drive into the current working directory.
!cp -r "/content/drive/Shareddrives/CIS 522 Final Project/shopee-product-matching.zip" .
# Extract the archive. NOTE(review): this reads it from /content, so the copy above
# only lands in the right place when the working directory is /content — confirm.
!unzip "/content/shopee-product-matching.zip"

In [None]:
# Number of samples per batch; passed to the train, validation and inference DataLoaders.
batch_size = 256

In [None]:
# Show the GPU attached to this runtime (the model and every sample are moved to CUDA below).
!nvidia-smi

In [None]:
# Preview the first rows of the training triplet table.
pd.read_csv('/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_train.csv').head()

# Defining image preprocessing and augmentation

In [None]:
def _deterministic_steps():
    """Return fresh instances of the resize / crop / tensor / normalize steps.

    Both pipelines end with these steps: resize to 256, center-crop to 224,
    convert to a tensor and normalize each channel with the given mean / std.
    """
    return [
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]


# Training pipeline: random crop, colour jitter and horizontal flip first,
# followed by the same deterministic steps that inference uses.
train_transforms = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(),
    ]
    + _deterministic_steps()
)

# Inference pipeline: deterministic steps only, no augmentation.
inference_transforms = transforms.Compose(_deterministic_steps())

# Defining the training dataset structure

In [None]:
class ShopeeDatasetTrain(Dataset):
    """Triplet dataset: each item is an (anchor, positive, negative) group.

    Every row of the csv file names three images and three titles. An item is
    the three transformed images followed by the three Doc2Vec title vectors,
    all returned as float tensors that already live on the GPU.
    """

    def __init__(self, csv_file, transform=None, folder='train',
                 nlp_model_path='/content/drive/Shareddrives/CIS 522 Final Project/Models/d2v.model'):
        """
        Args:
            csv_file (string): Path to the csv file with the triplet annotations
                (columns 'image_*' and 'title_*' for anchor, positive, negative).
            transform (callable, optional): Transform applied to every PIL image.
                It has to return a tensor, because each image is moved to the
                GPU afterwards.
            folder (string): Prefix of the image directory; images are read
                from '<folder>_images/'.
            nlp_model_path (string): Path of the saved Doc2Vec model used to
                embed the titles.
        """
        self.dataframe = pd.read_csv(csv_file)
        self.nlp_model = Doc2Vec.load(nlp_model_path)
        self.transform = transform
        self.folder = folder

    def __len__(self):
        """Return the number of triplets."""
        return len(self.dataframe)

    def _image_path(self, filename):
        """Return the relative path of an image file inside '<folder>_images/'."""
        return '{}_images/{}'.format(self.folder, filename)

    def _load_image(self, filename):
        """Open one image, force 3 channels, transform it and move it to the GPU."""
        # convert('RGB') guarantees three channels, so grayscale / RGBA / palette
        # files do not break the 3-channel Normalize step or batch collation.
        # The context manager closes the underlying file once pixels are loaded.
        with Image.open(self._image_path(filename)) as handle:
            image = handle.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image.cuda().float()

    def _embed_title(self, title):
        """Infer the Doc2Vec vector of a lower-cased, tokenized title as a GPU tensor."""
        vector = self.nlp_model.infer_vector(word_tokenize(title.lower()))
        return torch.from_numpy(vector).cuda().float()

    def __getitem__(self, idx):
        """Return (anchor_img, positive_img, negative_img, anchor_txt, positive_txt, negative_txt)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        roles = ('anchor', 'positive', 'negative')
        images = tuple(
            self._load_image(self.dataframe.loc[idx, 'image_' + role]) for role in roles
        )
        texts = tuple(
            self._embed_title(self.dataframe.loc[idx, 'title_' + role]) for role in roles
        )

        return images + texts

# Defining the inference dataset structure

In [None]:
class ShopeeDatasetInference(Dataset):
    """Inference dataset: one (image, title vector, label) item per unique anchor posting.

    Rows of the triplet csv that repeat a 'posting_id_anchor' are dropped, so
    every anchor posting is embedded once. Image and title vector are returned
    as float tensors on the GPU; the label is the row's 'label_group_positive'.
    """

    def __init__(self, csv_file, transform=None, folder='train',
                 nlp_model_path='/content/drive/Shareddrives/CIS 522 Final Project/Models/d2v.model'):
        """
        Args:
            csv_file (string): Path to the csv file with the triplet annotations.
            transform (callable, optional): Transform applied to every PIL image.
                It has to return a tensor, because the image is moved to the
                GPU afterwards.
            folder (string): Prefix of the image directory; images are read
                from '<folder>_images/'.
            nlp_model_path (string): Path of the saved Doc2Vec model used to
                embed the titles.
        """
        self.dataframe = self._load_unique_anchors(csv_file)
        self.nlp_model = Doc2Vec.load(nlp_model_path)
        self.transform = transform
        self.folder = folder

    @staticmethod
    def _load_unique_anchors(csv_file):
        """Read the csv, keep the first row per anchor posting, and renumber the rows."""
        frame = pd.read_csv(csv_file).drop_duplicates(subset=['posting_id_anchor'])
        # drop_duplicates keeps the original row labels, leaving gaps. __getitem__
        # receives positions 0..len-1 and looks them up with .loc, so the index
        # must be contiguous again; otherwise lookups raise KeyError or skip rows.
        return frame.reset_index(drop=True)

    def __len__(self):
        """Return the number of unique anchor postings."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return (image, title_vector, label) for the anchor at position idx."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = '{}_images/{}'.format(self.folder, self.dataframe.loc[idx, 'image_anchor'])
        # convert('RGB') guarantees three channels for the 3-channel Normalize step;
        # the context manager closes the file once the pixels are loaded.
        with Image.open(image_path) as handle:
            image = handle.convert('RGB')
        text = self.nlp_model.infer_vector(word_tokenize(self.dataframe.loc[idx, 'title_anchor'].lower()))
        label = self.dataframe.loc[idx, 'label_group_positive']

        if self.transform:
            image = self.transform(image)

        return image.cuda().float(), torch.from_numpy(text).cuda().float(), label

In [None]:
# Training triplets, with random augmentation.
train_dataset = ShopeeDatasetTrain(
    csv_file='/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_train.csv',
    transform=train_transforms,
    folder='train',
)
# Validation triplets, deterministic preprocessing only.
valid_dataset = ShopeeDatasetTrain(
    csv_file='/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_valid.csv',
    transform=inference_transforms,
    folder='train',
)
# Unique anchors of the validation csv, used to compute embeddings after training.
inference_dataset = ShopeeDatasetInference(
    csv_file='/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_valid.csv',
    transform=inference_transforms,
    folder='train',
)

# Defining the dataloader to fetch batches from the dataset

In [None]:
# Only the training batches are shuffled; validation and inference keep the csv order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    shuffle=False,
)
inference_loader = DataLoader(
    dataset=inference_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Defining the Neural Network architecture

In [None]:
class FullNet(nn.Module):
    """Fully connected network that embeds a flattened image together with a text vector.

    The image is flattened per sample, concatenated with the text embedding and
    passed through three linear layers (with ReLU and dropout in between).

    Args:
        image_features (int): Number of values in one flattened image. The
            default 150528 equals 3 * 224 * 224.
        text_features (int): Length of the text embedding.
        embedding_dim (int): Length of the output vector.
    """

    def __init__(self, image_features=150528, text_features=300, embedding_dim=300):
        super(FullNet, self).__init__()

        # Attribute name and layer order are kept as-is so that state_dict
        # checkpoints saved with the default sizes keep loading.
        self.linear_layers = nn.Sequential(
            nn.Linear(image_features + text_features, 1024),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, image, text_embedding):
        """Return the embedding of each (image, text_embedding) pair in the batch.

        Args:
            image: Batch of images; everything after the first dimension is flattened.
            text_embedding: Batch of text vectors, one row per image.
        """
        # flatten() also accepts non-contiguous tensors, which view() rejects.
        flat_image = image.flatten(start_dim=1)
        return self.linear_layers(torch.cat((flat_image, text_embedding), dim=1))

In [None]:
# Instantiate the network and move its parameters to the GPU.
dnn = FullNet().cuda()

# Defining the training loop

In [None]:
def _embed_triplet(model, anchor_image, positive_image, negative_image, anchor_text, positive_text, negative_text):
  """Run the model on the anchor, positive and negative (image, text) pairs, in that order."""
  return (
      model(anchor_image, anchor_text),
      model(positive_image, positive_text),
      model(negative_image, negative_text),
  )


def train_model(model, train_loader, valid_loader, num_epochs=19,
                save_dir='/content/drive/Shareddrives/CIS 522 Final Project',
                similarity_threshold=0.5):
  """Train the model with a triplet margin loss and validate it after every epoch.

  After each epoch the model weights are saved to
  '<save_dir>/dnn_model_epoch_<epoch>' and the validation results collected so
  far are written to '<save_dir>/dnn_valid_{losses,actuals,predictions}.npy'.

  Args:
    model: Network called as model(image, text) that returns one embedding per sample.
    train_loader: Iterable of batches with six tensors (anchor / positive /
      negative images followed by anchor / positive / negative texts).
    valid_loader: Iterable with the same batch layout, used for validation.
    num_epochs (int): Number of epochs to run; epochs are numbered from 1.
    save_dir (string): Directory that receives the checkpoints and .npy files.
    similarity_threshold (float): A pair is predicted as a match (1) when the
      cosine similarity of its two embeddings is above this value.

  Returns:
    Tuple (epoch_losses, epoch_predictions, epoch_actuals) with one entry per epoch:
    the validation loss summed over batches, the 0/1 predictions for all
    anchor-positive then anchor-negative pairs of each batch, and the matching
    1/0 ground truth.
  """
  criterion = nn.TripletMarginLoss()
  optimizer = torch.optim.Adam(model.parameters())

  epoch_losses = []
  epoch_predictions = []
  epoch_actuals = []

  for epoch in range(1, num_epochs + 1):

    # ---- training pass ----
    total_loss = 0
    model.train()

    for batch in tqdm(train_loader):
      optimizer.zero_grad()

      anchor_output, positive_output, negative_output = _embed_triplet(model, *batch)

      loss = criterion(anchor_output, positive_output, negative_output)
      total_loss += loss.item()

      loss.backward()
      optimizer.step()

    print()
    print('epoch: {}, train loss: {}'.format(epoch, total_loss/len(train_loader)))

    torch.save(model.state_dict(), '{}/dnn_model_epoch_{}'.format(save_dir, epoch))

    # ---- validation pass ----
    model.eval()
    total_loss = 0

    actual = []
    predicted = []

    with torch.no_grad():
      for batch in tqdm(valid_loader):
        anchor_output, positive_output, negative_output = _embed_triplet(model, *batch)

        loss = criterion(anchor_output, positive_output, negative_output)
        total_loss += loss.item()

        # Threshold the cosine similarity of each pair into a 0/1 prediction.
        positive_similarity = F.cosine_similarity(anchor_output, positive_output, dim=1)
        negative_similarity = F.cosine_similarity(anchor_output, negative_output, dim=1)
        positive_predicted = (positive_similarity > similarity_threshold).long().cpu().tolist()
        negative_predicted = (negative_similarity > similarity_threshold).long().cpu().tolist()

        predicted += positive_predicted + negative_predicted
        # Anchor-positive pairs are true matches, anchor-negative pairs are not.
        actual += [1] * len(anchor_output) + [0] * len(anchor_output)

    print()
    print('epoch: {}, valid loss: {}'.format(epoch, total_loss/len(valid_loader)))

    # NOTE: the stored value is the loss summed over batches, whereas the
    # printed one is the per-batch mean. The sum is kept so that the saved
    # files stay comparable with earlier runs.
    epoch_losses.append(total_loss)
    epoch_predictions.append(predicted)
    epoch_actuals.append(actual)

    # All three files are rewritten after every epoch with the results so far.
    with open('{}/dnn_valid_losses.npy'.format(save_dir), 'wb') as f:
      np.save(f, np.array(epoch_losses))
    with open('{}/dnn_valid_actuals.npy'.format(save_dir), 'wb') as f:
      np.save(f, np.array(epoch_actuals))
    with open('{}/dnn_valid_predictions.npy'.format(save_dir), 'wb') as f:
      np.save(f, np.array(epoch_predictions))

  return epoch_losses, epoch_predictions, epoch_actuals

# Defining the inference loop

In [None]:
def inference_model(model, inference_loader):
  """Embed every batch of the loader and collect embeddings and labels.

  Args:
    model: Network called as model(images, texts).
    inference_loader: Iterable of (images, texts, labels) batches.

  Returns:
    Tuple (outputs, labels) of plain Python lists: one embedding (list of
    floats) and one label per sample, in loader order.
  """
  # Evaluation mode, and no gradient tracking while embedding.
  model.eval()

  embeddings, targets = [], []

  with torch.no_grad():
    for images, texts, batch_targets in tqdm(inference_loader):
      batch_embeddings = model(images, texts)

      embeddings.extend(batch_embeddings.detach().cpu().numpy().tolist())
      targets.extend(batch_targets.detach().cpu().numpy().tolist())

  return embeddings, targets

In [None]:
# Train the network. train_model also writes the per-epoch checkpoints and the
# validation losses / predictions / labels to the shared drive.
epoch_losses, epoch_predictions, epoch_actuals = train_model(dnn, train_loader, valid_loader)

# Computing and displaying the results for all the metrics (loss, accuracy, F1-micro, F1-macro)

In [None]:
# Reload the validation results that train_model saved (presumably so that the
# analysis below can be re-run without retraining).
epoch_losses = np.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_losses.npy')
epoch_predictions = np.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_predictions.npy')
epoch_actuals = np.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_actuals.npy')

In [None]:
# Inspect the array shape; train_model stores one row of labels per epoch.
epoch_actuals.shape

In [None]:
# Validation loss per epoch (x axis is the 0-based position in epoch_losses).
fig, loss_axes = plt.subplots()
loss_axes.plot(epoch_losses)
loss_axes.set_xlabel('epoch', fontsize=16)
loss_axes.set_ylabel('Validation Triplet Loss', fontsize=16)

In [None]:
# Position of the lowest validation loss. This is a 0-based index into
# epoch_losses, while the saved checkpoints are numbered from 1, so the
# matching checkpoint file is 'dnn_model_epoch_<best_epoch + 1>'.
best_epoch = np.argmin(epoch_losses)
best_epoch

In [None]:
# ROC curve of the pair predictions from the epoch with the lowest validation loss.
# NOTE(review): epoch_predictions holds 0/1 values that were already thresholded
# during validation, not similarity scores, so the curve has a single interior
# operating point — confirm this is what is wanted.
fpr, tpr, thresholds = roc_curve(epoch_actuals[best_epoch].reshape(-1), epoch_predictions[best_epoch].reshape(-1))
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Dashed diagonal: reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=16)
plt.show()

In [None]:
# Accuracy of the thresholded pair predictions, one value per epoch.
points = [
    accuracy_score(actual_row.reshape(-1), predicted_row.reshape(-1))
    for actual_row, predicted_row in zip(epoch_actuals, epoch_predictions)
]

fig, accuracy_axes = plt.subplots()
accuracy_axes.plot(points)
accuracy_axes.set_xlabel('epoch', fontsize=16)
accuracy_axes.set_ylabel('Validation accuracy score', fontsize=16)
accuracy_axes.set_ylim([0, 1])

In [None]:
# F1 score of the thresholded pair predictions, one value per epoch.
points = [
    f1_score(actual_row.reshape(-1), predicted_row.reshape(-1))
    for actual_row, predicted_row in zip(epoch_actuals, epoch_predictions)
]
fig, f1_axes = plt.subplots()
f1_axes.plot(points)
f1_axes.set_xlabel('epoch', fontsize=16)
f1_axes.set_ylabel('Validation F1 score', fontsize=16)

In [None]:
# Embed the inference dataset with the checkpoint of every epoch.
all_outputs = []
all_labels = []

# Checkpoint files are numbered from 1, so all_outputs[i] / all_labels[i]
# belong to the checkpoint of epoch i + 1.
for epoch in range(1, len(epoch_actuals)+1):
  dnn.load_state_dict(torch.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_model_epoch_{}'.format(epoch)))
  outputs, labels = inference_model(dnn, inference_loader)

  all_outputs.append(outputs)
  all_labels.append(labels)

  # Both files are rewritten after every epoch with the results accumulated so far.
  with open('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_outputs.npy', 'wb') as f:
    np.save(f, np.array(all_outputs))
  with open('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_labels.npy', 'wb') as f:
    np.save(f, np.array(all_labels))

In [None]:
# Reload the per-epoch embeddings and labels saved by the previous cell;
# the first axis of both arrays is the epoch (0-based).
outputs = np.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_outputs.npy')
labels = np.load('/content/drive/Shareddrives/CIS 522 Final Project/dnn_valid_labels.npy')

In [None]:
# 1-nearest-neighbour evaluation of the embeddings produced by every epoch.
accuracies = []
f1_micros = []
f1_macros = []

# outputs / labels hold one entry per epoch at positions 0..n-1, so iterate
# over them directly. The previous range(1, len(epoch_actuals)) started at
# position 1 and therefore never evaluated the first epoch.
for epoch_outputs, epoch_labels in zip(outputs, labels):
  # Fixed random_state: every epoch is scored on the same 80/20 split.
  X_train, X_test, y_train, y_test = train_test_split(epoch_outputs, epoch_labels, test_size=0.20, random_state=1)

  # Each test embedding is assigned the label of its single nearest training embedding.
  knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
  knn.fit(X_train, y_train)

  results = knn.predict(X_test)

  accuracies.append(accuracy_score(y_test, results))
  f1_micros.append(f1_score(y_test, results, average='micro'))
  f1_macros.append(f1_score(y_test, results, average='macro'))

In [None]:
# 1-NN accuracy of the embeddings, one point per evaluated epoch.
fig, knn_accuracy_axes = plt.subplots()
knn_accuracy_axes.plot(accuracies)
knn_accuracy_axes.set_xlabel('epoch', fontsize=16)
knn_accuracy_axes.set_ylabel('Validation accuracy score', fontsize=16)
knn_accuracy_axes.set_ylim([0, 1])

In [None]:
# 1-NN micro-averaged F1 of the embeddings, one point per evaluated epoch.
fig, f1_micro_axes = plt.subplots()
f1_micro_axes.plot(f1_micros)
f1_micro_axes.set_xlabel('epoch', fontsize=16)
f1_micro_axes.set_ylabel('Validation f1-micro score', fontsize=16)
f1_micro_axes.set_ylim([0, 1])

In [None]:
# 1-NN macro-averaged F1 of the embeddings, one point per evaluated epoch.
fig, f1_macro_axes = plt.subplots()
f1_macro_axes.plot(f1_macros)
f1_macro_axes.set_xlabel('epoch', fontsize=16)
f1_macro_axes.set_ylabel('Validation f1-macro score', fontsize=16)
f1_macro_axes.set_ylim([0, 1])