# Importing packages

In [None]:
import pickle
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, hamming_loss, accuracy_score, f1_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize
from PIL import Image
from tqdm import tqdm
import gc
# Tokenizer data needed by nltk's word_tokenize. Older NLTK releases read the
# 'punkt' resource; NLTK >= 3.9 looks for 'punkt_tab' instead, so fetch both to
# keep the notebook working regardless of the installed version.
# nltk.download reports an unavailable resource by returning False rather than raising.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = (10, 8)

In [None]:
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; the shared-drive paths used in the
# cells below all live under this mount point.
drive.mount('/content/drive')

# Copying the files from Google Drive to the local Colab machine to speed up file access

In [None]:
!cp -r "/content/drive/Shareddrives/CIS 522 Final Project/shopee-product-matching.zip" .
!unzip "/content/shopee-product-matching.zip"

In [None]:
# Quick look at the first five rows of the triplet training CSV
# (the frame is not kept; it is only displayed as the cell output).
pd.read_csv(
    '/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_train.csv'
).head(n=5)

# Importing the dataset, splitting it into train and validation sets, and loading the NLP model

In [None]:
# Full triplet table; it is split into train/validation frames further down.
train_dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/Shareddrives/CIS 522 Final Project/Data/triplet_train.csv',
)

# Previously trained gensim Doc2Vec model, used later to embed product titles.
nlp_model = Doc2Vec.load(
    '/content/drive/Shareddrives/CIS 522 Final Project/Models/d2v.model'
)

In [None]:
# Keep a single row (the first occurrence) per anchor posting, so each anchor
# appears exactly once before the data is split.
train_dataset = train_dataset.drop_duplicates(
    subset='posting_id_anchor',
    keep='first',
)

In [None]:
# Hold out 10% of the rows as a validation set; the fixed random_state makes
# the shuffle (and therefore the split) reproducible across runs.
train_dataset, valid_dataset = train_test_split(
    train_dataset,
    random_state=1,
    test_size=0.10,
)

In [None]:
# De-duplicate each split on the anchor posting id, keeping the first occurrence.
# If the frame was already de-duplicated on this column before the split, these
# calls leave the contents of both frames unchanged.
train_dataset, valid_dataset = (
    split_frame.drop_duplicates(subset='posting_id_anchor', keep='first')
    for split_frame in (train_dataset, valid_dataset)
)

In [None]:
# Classification targets: the 'label_group_positive' column of each split.
# NOTE(review): presumably the positive item's group equals the anchor's own
# group in the triplet data -- confirm against how the CSV was built.
train_labels, valid_labels = (
    train_dataset['label_group_positive'],
    valid_dataset['label_group_positive'],
)

# Loading the images as numpy arrays and saving the results

In [None]:
def _load_flat_image(file_name, size=(224, 224)):
    """Load one image from ``train_images/`` as a flat RGB pixel vector.

    Args:
        file_name: File name of the image inside the ``train_images`` directory.
        size: ``(width, height)`` the image is resized to before flattening.

    Returns:
        A 1-D uint8 numpy array of length ``width * height * 3``.
    """
    # The context manager closes the underlying file handle as soon as the
    # pixels are read; opening tens of thousands of images without closing
    # them leaks file descriptors.
    with Image.open('train_images/{}'.format(file_name)) as img:
        # Force three channels so grayscale / RGBA / palette images yield the
        # same vector length as RGB ones; otherwise the stacked array is ragged.
        rgb = img.convert('RGB').resize(size)
        # flatten() returns a copy, so the data stays valid after the file closes.
        return np.asarray(rgb).flatten()


# One row of raw pixel values per anchor image, in the same order as the dataframes.
train_image_inputs = np.array([_load_flat_image(image) for image in train_dataset['image_anchor']])
valid_image_inputs = np.array([_load_flat_image(image) for image in valid_dataset['image_anchor']])

In [None]:
# Cache the pixel arrays on Drive so they can be reloaded without decoding
# every image again. np.save opens and closes the target file itself when it
# is given a path.
np.save('/content/drive/Shareddrives/CIS 522 Final Project/ml_train_image_inputs.npy', train_image_inputs)
np.save('/content/drive/Shareddrives/CIS 522 Final Project/ml_valid_image_inputs.npy', valid_image_inputs)

In [None]:
# Reload the cached pixel arrays from Drive (replaces whatever is currently
# bound to these names).
train_image_inputs = np.load(
    file='/content/drive/Shareddrives/CIS 522 Final Project/ml_train_image_inputs.npy',
)
valid_image_inputs = np.load(
    file='/content/drive/Shareddrives/CIS 522 Final Project/ml_valid_image_inputs.npy',
)

In [None]:
# NOTE(review): both statements assign a name to itself and therefore change
# nothing; this looks like a leftover (perhaps from an earlier slicing
# experiment) and can probably be removed -- confirm with the author.
train_labels = train_labels
valid_labels = valid_labels

# Freeing up unused memory

In [None]:
# Run a full garbage-collection pass. This only reclaims objects that are no
# longer reachable; arrays still bound to notebook variables stay in memory.
# As the last expression in the cell, the call's return value is displayed.
gc.collect()

# Tokenizing the titles and generating embeddings for them

In [None]:
def _embed_titles(titles):
    """Return one Doc2Vec vector per title, stacked into a numpy array.

    Each title is lower-cased, split into tokens with NLTK's ``word_tokenize``
    and passed to ``nlp_model.infer_vector``.
    """
    vectors = []
    for title in titles:
        tokens = word_tokenize(title.lower())
        vectors.append(nlp_model.infer_vector(tokens))
    return np.array(vectors)


# Title embeddings, row-aligned with the train / validation dataframes.
train_text_inputs = _embed_titles(train_dataset['title_anchor'])
valid_text_inputs = _embed_titles(valid_dataset['title_anchor'])

# Combining the title embeddings with the image array representations

In [None]:
# Place each sample's title embedding after its pixel values (column-wise join),
# giving one combined feature row per sample.
# NOTE(review): the classifier cell below is fitted on the image arrays only,
# so these combined arrays appear to be unused in the visible cells -- confirm
# whether that is intentional.
train_inputs, valid_inputs = (
    np.concatenate(feature_pair, axis=1)
    for feature_pair in (
        (train_image_inputs, train_text_inputs),
        (valid_image_inputs, valid_text_inputs),
    )
)

# Defining the machine learning model

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings, using
# all available CPU cores (n_jobs=-1). fit() returns the estimator itself, so
# construction and fitting can be chained.
# NOTE(review): the variable is called `lr` although it holds a KNN model, and
# it is trained on the raw image features only (not the combined image+text
# features) -- confirm both are intentional.
lr = KNeighborsClassifier(n_jobs=-1).fit(train_image_inputs, train_labels)

# Predicted label group for every validation image.
results = lr.predict(valid_image_inputs)

# Reporting the results for all the metrics (accuracy, F1-micro, F1-macro)

In [None]:
# Fraction of validation samples whose predicted label group matches the true one.
accuracy_score(y_true=valid_labels, y_pred=results)

In [None]:
# Micro-averaged F1: computed from true/false positives pooled over all classes.
f1_score(y_true=valid_labels, y_pred=results, average='micro')

In [None]:
# Macro-averaged F1: per-class F1 scores averaged with equal weight per class.
f1_score(y_true=valid_labels, y_pred=results, average='macro')