In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cv2
from PIL import Image
import torch
import numpy as np
from torch import nn
from torchvision import transforms
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import torchvision.models as models
from torch.utils.data import random_split
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from textblob import TextBlob


In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device available now:', device)

In [None]:
# Load the training metadata; later cells read its posting_id, image, title and label_group columns.
df_train = pd.read_csv("../input/shopee-product-matching/train.csv")
df_train.head(3)

In [None]:
#See different titles for same product
# For the label groups of the first three rows, list two titles that belong to the same group.
# The group id is read from "label_group" — the same column the filter uses; the original looked
# it up under "label", which no other cell of this notebook reads or creates.
labels = df_train["label_group"].values[:3]

rows = []
for label in labels:
    titles = df_train.loc[df_train["label_group"] == label, "title"].tolist()
    # A group can hold any number of postings: keep the first two, pad with None if there are fewer.
    rows.append((titles + [None, None])[:2])

# Build the frame once from the collected rows instead of assigning row by row.
df = pd.DataFrame(rows, columns=["Title 1", "Title 2"])

df

In [None]:
#Print a pair of product
# Show the first two images that share the label group of the first row.
# The group id comes from "label_group" (the column the filter compares against), not "label".
image_dir = '../input/shopee-product-matching/train_images/'
indexes = df_train[df_train["label_group"]==df_train["label_group"].values[0]].index.values
fig, ax = plt.subplots(1,2)
# `indexes` holds index labels, so look the file names up with .loc rather than .iloc.
for axis, row_label in zip(ax, indexes[:2]):
    axis.imshow(np.array(Image.open(image_dir + df_train["image"].loc[row_label])))

In [None]:
# Ground truth: every row gets the array of posting_ids that share its label_group.
tmp = df_train.groupby('label_group')['posting_id'].unique().to_dict()
df_train['target'] = df_train['label_group'].map(tmp)
df_train.head(3)

In [None]:
# Re-encode label_group as consecutive integers 0..n_groups-1, numbered in order of first
# appearance (the same numbering the original dict produced). These are the class indices
# later handed to nn.CrossEntropyLoss.
#
# pd.factorize replaces `Series.replace(d, inplace=True)` for two reasons:
#   * an in-place replace on the `df_train.label_group` accessor is not guaranteed to write
#     back into the frame (pandas copy-on-write), and a dict with one entry per group makes
#     replace very slow;
#   * factorize is idempotent — re-running this cell leaves the codes unchanged.
codes, original_groups = pd.factorize(df_train["label_group"])
d = dict(zip(original_groups, range(len(original_groups))))  # original id -> class index, kept for reference
df_train["label_group"] = codes

In [None]:
df_train.head(2)

## Creating shopee dataset

In [None]:
class ShopeeDataset(Dataset):
    """Image dataset backed by a metadata frame with an ``image`` (file name) column.

    Parameters
    ----------
    csv : pandas.DataFrame
        Metadata frame. It is re-indexed with ``reset_index()`` so items can be
        addressed by position 0..len-1 (the previous index is kept as a column).
    train : bool, default True
        When True, images are read from ``train_images/`` and ``__getitem__``
        returns ``(image, label)`` with the label taken from ``label_group``.
        When False, images are read from ``test_images/`` and only the image is
        returned.
    """

    def __init__(self, csv, train=True):
        self.csv = csv.reset_index()
        self.train = train
        # Fixed 256x256 resize followed by conversion to a float tensor.
        self.transform = transforms.Compose([
                                             transforms.Resize((256, 256)),
                                             transforms.ToTensor()
                                            ])

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.csv)

    def __getitem__(self, index):
        """Return the transformed image at ``index`` (plus its label in train mode)."""
        folder = 'train_images/' if self.train else 'test_images/'
        path = '../input/shopee-product-matching/' + folder + self.csv.image[index]

        # Force three channels: a grayscale/RGBA/CMYK file would otherwise yield a tensor
        # with a different channel count, which cannot be stacked into a batch with RGB
        # images. The context manager closes the file handle once the pixels are loaded.
        with Image.open(path) as raw_image:
            image = self.transform(raw_image.convert('RGB'))

        if self.train:
            label = torch.tensor(self.csv.label_group[index])
            return image, label

        return image

In [None]:
# Batch size passed to the DataLoader(s) built in this notebook.
batch_size = 16

In [None]:
# Dataset and loader over the full training frame. The loader is not shuffled, so batches
# follow the row order of df_train.
# NOTE(review): train(...) further down is called with this same unshuffled loader — confirm
# that is intended.
train_dataset = ShopeeDataset(csv=df_train)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=4)

sample_image = train_dataset[0][0]
print("Dataset length: {}".format(len(train_dataset)),
      "\n" + "Shape of images: {}".format(sample_image.shape))

## Importing model ResNet18

In [None]:
# torchvision ResNet-18 loaded with pretrained weights; passed to ResNetWithArcFaceNet further
# down and referenced by the (commented-out) embedding-extraction cells.
resnet18 = models.resnet18(pretrained=True)

## Extracting embeddings from train images using ResNet18

In [None]:
def extract_embeddings(model, dataloader):
    """Run ``model`` over every batch of ``dataloader`` and stack the outputs.

    Parameters
    ----------
    model : torch.nn.Module
        Network applied to each image batch. It is moved to the GPU when one is
        available and evaluated in inference mode; its previous train/eval mode
        is restored before returning.
    dataloader : iterable
        Yields either ``(images, labels)`` pairs or bare image batches; labels
        are ignored.

    Returns
    -------
    numpy.ndarray
        Model outputs for all batches, concatenated along the first axis in
        loader order.

    Raises
    ------
    ValueError
        From ``np.concatenate`` if the loader yields no batches.
    """
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(run_device)  # once, not once per batch

    # Inference must run in eval mode: layers such as BatchNorm/Dropout behave differently
    # while training, which would make an embedding depend on the other images in its batch.
    was_training = model.training
    model.eval()

    embeddings = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                # Labelled loaders yield (image, label); unlabelled ones yield the image only.
                image = batch[0] if isinstance(batch, (list, tuple)) else batch
                img_emd = model(image.to(run_device))
                embeddings.append(img_emd.cpu().numpy())
    finally:
        # The same module objects may be reused for training afterwards.
        model.train(was_training)

    embeddings = np.concatenate(embeddings)
    print("Shape of embeddings: {}".format(embeddings.shape))

    return embeddings

In [None]:
#embeddings = extract_embeddings(resnet18, train_dataloader)

In [None]:
#np.save("image_embeddings", embeddings)

# Text prediction

In [None]:
class color:
    """ANSI escape sequences for styling text printed to the cell output."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset: append after styled text to return to the default rendering
    END = '\033[0m'
    
# Preprocess titles
original_titles = df_train["title"]
lemmatizer = WordNetLemmatizer()

# Built once at definition time. The original evaluated stopwords.words() for every single
# token; called without a language it loads the stop-word lists of every language shipped
# with the corpus, so that lookup dominated the runtime. A frozenset also makes the
# membership test O(1).
_STOPWORD_SET = frozenset(stopwords.words())
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess_title(title):
    """Normalise a product title for text matching.

    Steps: lower-case, strip ASCII punctuation and surrounding whitespace,
    tokenize with ``word_tokenize``, drop tokens found in the NLTK stop-word
    lists, lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    title : str
        Raw title.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly empty).
    """
    title_clean = title.lower().translate(_PUNCTUATION_TABLE).strip()
    title_tokenize = word_tokenize(title_clean)
    title_w_stopword = [w for w in title_tokenize if w not in _STOPWORD_SET]
    title_lemmatized = [lemmatizer.lemmatize(w) for w in title_w_stopword]
    return " ".join(title_lemmatized)
    

In [None]:
# Print the first five titles next to their preprocessed form.
for i in range(5):
    raw_title = original_titles.iloc[i]
    print(color.BOLD + "Orginial Title -" + color.END, raw_title)
    print(color.BOLD + "Preprocessed Title -" + color.END, preprocess_title(raw_title))
    print('\n')

In [None]:
from sklearn.metrics.pairwise import linear_kernel

def predict_text(val_df, threshold = 0.4, get_score=False, max_candidates=9):
    """Predict matching postings from title similarity (binary TF-IDF + cosine).

    Adds a ``matches`` column to ``val_df`` in place: for each row, the list of
    ``posting_id`` values of the most similar titles whose similarity exceeds
    ``threshold``.

    Parameters
    ----------
    val_df : pandas.DataFrame
        Needs ``posting_id`` and a text column: ``preprocess_title`` is used
        when present, otherwise the raw ``title`` column. With
        ``get_score=True`` it also needs ``target``.
    threshold : float, default 0.4
        Similarity a candidate must strictly exceed to count as a match.
    get_score : bool, default False
        When True, also adds an ``f1_score`` column computed by
        ``get_f1_score`` (which must already be defined).
    max_candidates : int, default 9
        Number of most-similar rows examined per posting (the original slice
        ``[:-10:-1]`` looked at nine).

    Returns
    -------
    None
        ``val_df`` is modified in place.
    """
    # Fall back to the raw titles when the preprocessing step has not been run;
    # TfidfVectorizer lower-cases and tokenizes on its own.
    text_column = "preprocess_title" if "preprocess_title" in val_df.columns else "title"
    titles = val_df[text_column].astype(str).values

    tf_idf = TfidfVectorizer(stop_words='english', binary=True, max_features=25000)
    # Keep the matrix sparse: a dense (rows x 25000) array is large and linear_kernel
    # accepts sparse input. Rows are L2-normalised by the vectorizer's default settings,
    # so the dot product below is the cosine similarity.
    text_embeddings = tf_idf.fit_transform(titles)
    posting_ids = val_df["posting_id"].values

    predictions = []

    for index in range(text_embeddings.shape[0]):
        cosine_similarities = linear_kernel(text_embeddings[index], text_embeddings).flatten()
        pair_indexes = cosine_similarities.argsort()[::-1][:max_candidates]
        matched = pair_indexes[cosine_similarities[pair_indexes] > threshold]
        # One list per row, holding the ids of the *matched* rows. The original appended
        # the query's own id to a single flat list, whose length did not match the frame.
        predictions.append(posting_ids[matched].tolist())

    val_df['matches'] = pd.Series(predictions, index=val_df.index)

    if get_score:
        val_df['f1_score'] = get_f1_score(val_df['target'], val_df['matches'])

In [None]:
#Preprocess the titles
tqdm.pandas()

#df_train["preprocess_title"] = df_train["title"].progress_apply(lambda x: preprocess_title(x))
#df_train["preprocess_title"] = df_train["preprocess_title"].astype('U')
#preprocess_title = df_train["preprocess_title"].values


#Creating Train, Val dataframe
# 80/20 random row split. The permutation is seeded so the split (and every score derived
# from it) is reproducible across kernel restarts.
# NOTE(review): rows of one label_group can land on both sides of the split while `target`
# was built from the full frame — confirm this is the intended validation protocol.
SPLIT_SEED = 42
N = len(df_train)
N_train = int(0.8 * N)
idx = np.random.default_rng(SPLIT_SEED).permutation(N)

idx_train, idx_val = idx[:N_train], idx[N_train:]
# idx_* are positions, so select with .iloc (independent of df_train's index labels).
train = df_train.iloc[idx_train].reset_index(drop=True)
val = df_train.iloc[idx_val].reset_index(drop=True)

In [None]:
# predict_text works in place: it adds 'matches' and 'f1_score' to `val`, so the score is
# read back from `val` (the original read `val_df`, which no executed cell defines).
# NOTE: get_score=True needs get_f1_score, which is defined in the "Making predictions"
# section below — that cell has to be run before this one.
predict_text(val, threshold=0.4, get_score=True)
print("F1 score: {}".format(val['f1_score'].mean()))

## Making predictions

In [None]:
def get_f1_score(y, y_pred):
    """Row-wise F1 between true and predicted match collections.

    Parameters
    ----------
    y, y_pred : pandas.Series
        Each element is a collection of ids; the two series are paired
        element by element.

    Returns
    -------
    numpy.ndarray
        ``2 * |true ∩ pred| / (|true| + |pred|)`` for every row.
    """
    shared_counts = []
    for truth, guess in zip(y, y_pred):
        shared_counts.append(len(set(truth).intersection(guess)))
    shared_counts = np.array(shared_counts)

    true_sizes = y.apply(len).values
    pred_sizes = y_pred.apply(len).values

    return 2 * shared_counts / (true_sizes + pred_sizes)

In [None]:
def predict(model, train_emd, val_emd,
            train_df, val_df, threshold, get_score=False):
    """Predict matches for ``val_emd`` by cosine nearest-neighbour search in ``train_emd``.

    Adds a ``matches`` column to ``val_df`` in place: for each query embedding,
    the ``posting_id`` values of its nearest training rows whose cosine
    distance is below ``threshold``.

    Parameters
    ----------
    model : any
        Unused; kept so existing calls keep working.
    train_emd, val_emd : array-like, 2-D
        Reference and query embeddings, one row per posting.
    train_df : pandas.DataFrame
        Row ``i`` must describe ``train_emd[i]``; needs ``posting_id``.
    val_df : pandas.DataFrame
        Row ``i`` must describe ``val_emd[i]``; needs ``target`` when
        ``get_score`` is True.
    threshold : float
        Maximum (exclusive) cosine distance for a neighbour to count as a match.
    get_score : bool, default False
        When True, also adds an ``f1_score`` column via ``get_f1_score``.

    Returns
    -------
    None
        ``val_df`` is modified in place.
    """
    # kneighbors cannot return more neighbours than there are fitted samples.
    n_neighbors = min(50, len(train_emd))
    knn_model = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute', metric='cosine')
    knn_model.fit(train_emd)
    dists, idx = knn_model.kneighbors(val_emd)

    # kneighbors returns row positions in train_emd, so index the ids positionally
    # (``.loc`` would only agree when train_df carries a default 0..n-1 index).
    train_posting_ids = train_df["posting_id"].values
    predictions = [train_posting_ids[idx[i][dists[i] < threshold]]
                   for i in range(val_emd.shape[0])]

    val_df['matches'] = predictions

    if get_score:
        val_df['f1_score'] = get_f1_score(val_df['target'], val_df['matches'])

In [None]:
"""
num_imgs = len(train_dataset)
num_train_imgs = int(num_imgs*.8)
idx = np.array(range(num_imgs))
np.random.shuffle(idx)
idx_train, idx_val = idx[:num_train_imgs], idx[num_train_imgs:]
train_df = df_train.loc[idx_train].reset_index(drop=True)
val_df = df_train.loc[idx_val].reset_index(drop=True)
train_set = ShopeeDataset(csv=train_df)
train_loader = DataLoader(train_set, batch_size=batch_size, num_workers=4)
val_set = ShopeeDataset(csv=val_df)
val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=4)
train_embd = extract_embeddings(resnet18, train_loader)
val_embd = extract_embeddings(resnet18, val_loader)
predict(resnet18, train_embd, val_embd, train_df, val_df, threshold=.3, get_score=True)
print("F1 score: {}".format(val_df['f1_score'].mean()))
"""

In [None]:
#predict(resnet18, train_embd, val_embd, train_df, val_df, threshold=.3, get_score=True)

In [None]:
#predict(resnet18, embeddings, embeddings, df_train, df_train, threshold=.2, get_score=True)

In [None]:
#print("F1 score: {}".format(df_train['f1_score'].mean()))

In [None]:
"""
df_test = pd.read_csv("../input/shopee-product-matching/test.csv")
df_test.head(3)
"""

In [None]:
#test_set = ShopeeDataset(csv=df_test, train=False)
#test_loader = DataLoader(test_set, batch_size=batch_size, num_workers=4)

In [None]:
def extract_test_embeddings(model, dataloader):
    """Run ``model`` over an unlabelled ``dataloader`` and stack the outputs.

    Parameters
    ----------
    model : torch.nn.Module
        Network applied to each image batch. It is moved to the GPU when one is
        available and evaluated in inference mode; its previous train/eval mode
        is restored before returning.
    dataloader : iterable
        Yields image batches (no labels).

    Returns
    -------
    numpy.ndarray
        Model outputs for all batches, concatenated along the first axis in
        loader order.

    Raises
    ------
    ValueError
        From ``np.concatenate`` if the loader yields no batches.
    """
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(run_device)  # once, not once per batch

    # Inference must run in eval mode: layers such as BatchNorm/Dropout behave differently
    # while training, which would make an embedding depend on the other images in its batch.
    was_training = model.training
    model.eval()

    embeddings = []
    try:
        with torch.no_grad():
            for image in tqdm(dataloader):
                img_emd = model(image.to(run_device))
                embeddings.append(img_emd.cpu().numpy())
    finally:
        # The same module objects may be reused for training afterwards.
        model.train(was_training)

    embeddings = np.concatenate(embeddings)
    print("Shape of embeddings: {}".format(embeddings.shape))

    return embeddings

In [None]:
#test_embd = extract_test_embeddings(resnet18, test_loader)

In [None]:
#predict(resnet18, embeddings, test_embd, df_train, df_test, threshold=.5)

In [None]:
#df_test['matches'] = df_test['matches'].apply(lambda x: " ".join(x))

In [None]:
#df_test[['posting_id', 'matches']].to_csv('submission.csv', index=False)

## ResNet18 with ArcFace

In [None]:
class ArcFace(nn.Module):
    """Additive angular margin (ArcFace) classification head.

    Computes the cosine between L2-normalised inputs and L2-normalised class
    weights, adds the angular margin ``m`` to the target class only, and
    scales the result by ``s``. The output is meant to be fed to a
    cross-entropy loss.

    Parameters
    ----------
    in_feat : int
        Size of the incoming feature vectors.
    out_feat : int
        Number of classes.
    s : float, default 30.0
        Scale applied to the output logits.
    m : float, default 0.30
        Angular margin in radians.
    margin : bool, default False
        When True, the margin is applied only where ``cos > 0`` and the plain
        cosine is used elsewhere. When False, the margin is applied where
        ``cos > cos(pi - m)`` and ``cos - sin(pi - m) * m`` is used elsewhere.
    """

    def __init__(self, in_feat, out_feat, s=30.0, m=0.30, margin=False):
        super(ArcFace, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.s = float(s)  # the original assigned `m` here, so logits were scaled by the margin
        self.m = float(m)

        # Registered as a parameter on every device: this makes it visible to
        # `model.parameters()` (so the optimizer trains it), to `state_dict()`, and to
        # `.to(device)`. The original kept a `.to('cuda')` copy, which is a plain tensor,
        # and defined it only when CUDA was available.
        self.weights = nn.Parameter(torch.empty(out_feat, in_feat))
        nn.init.xavier_uniform_(self.weights)

        self.margin = margin
        # Plain Python floats: usable with tensors on any device.
        self.cos_m = float(np.cos(m))
        self.sin_m = float(np.sin(m))
        self.theta = float(np.cos(np.pi - m))
        self.mm = float(np.sin(np.pi - m) * m)

    def forward(self, x, label):
        """Return scaled logits of shape ``(batch, out_feat)``.

        ``x`` is a ``(batch, in_feat)`` feature tensor and ``label`` holds one
        class index per row (any shape that flattens to ``batch`` entries).
        """
        cos = F.linear(F.normalize(x), F.normalize(self.weights))
        sin = (1. - cos**2).clamp(0, 1)**.5
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cos * self.cos_m - sin * self.sin_m
        if self.margin:
            phi = torch.where(cos > 0, phi, cos)
        else:
            phi = torch.where(cos > self.theta, phi, cos - self.mm)

        # Built on the same device/dtype as the logits instead of relying on a global `device`.
        one_hot = torch.zeros_like(cos)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # Margin-adjusted value for the target class, plain cosine for all others.
        output = (one_hot * phi) + ((1. - one_hot) * cos)

        return output * self.s

In [None]:
class ResNetWithArcFaceNet(nn.Module):
    """Backbone (all child modules of ``model`` except the last) followed by an ArcFace head.

    Parameters
    ----------
    num_classes : int
        Number of classes of the ArcFace head.
    model : torch.nn.Module
        Network whose children, minus the final one, form the feature extractor.
    s : float, default 30.0
        ArcFace scale.
    margin : float, default 0.50
        ArcFace angular margin (passed to ``ArcFace`` as ``m``).
    """

    def __init__(self, num_classes, model, s=30.0, margin=0.50):
        super(ResNetWithArcFaceNet, self).__init__()

        # Build the backbone from the `model` argument; the original ignored it and read
        # the global `resnet18` instead.
        self.model = nn.Sequential(*list(model.children())[:-1])
        # Same GPU placement as before, but no crash on CPU-only machines.
        if torch.cuda.is_available():
            self.model = self.model.cuda()

        self.num_classes = num_classes
        self.s = s
        self.margin = margin
        #self.fc = nn.Linear(1000, 512)
        self.flatten = nn.Sequential(nn.Flatten())

        # Feature size = input width of the dropped classifier when the backbone exposes
        # one as `fc` (as torchvision ResNets do); otherwise the previous hard-coded 512.
        head = getattr(model, "fc", None)
        feat_dim = head.in_features if isinstance(head, nn.Linear) else 512
        self.arcface = ArcFace(feat_dim, self.num_classes, s=self.s, m=self.margin)
        #self.softmax = nn.Softmax(dim=1)

    def forward(self, x, label):
        """Return ArcFace logits for image batch ``x`` given the target ``label`` per row."""
        out = self.model(x)
        out = self.flatten(out)
        #out = self.fc(out)
        out = self.arcface(out, label)
        #out = self.softmax(out)

        return out

In [None]:
# Optimisation setup for the ArcFace model: one output class per distinct label_group value.
learning_rate = 1e-3
num_classes = len(df_train.label_group.unique())
model = ResNetWithArcFaceNet(num_classes=num_classes, model=resnet18)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def train(model, optimizer, num_classes, train_loader, criterion, n_epoch=5):
    """Train ``model`` and write one checkpoint file per epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(images, labels)``; moved to the GPU when available.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to the model's parameters.
    num_classes : int
        Unused; kept so existing calls keep working.
    train_loader : iterable
        Yields ``(images, labels)`` batches.
    criterion : callable
        Loss called as ``criterion(outputs, labels)``.
    n_epoch : int, default 5
        Number of passes over ``train_loader``.

    Side effects
    ------------
    After every epoch prints the loss of the last batch and saves
    ``model_arcface_epoch<epoch>.pt`` (epoch index, model and optimizer state,
    last batch loss) to the working directory.

    Raises
    ------
    ValueError
        If ``train_loader`` yields no batches.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.to('cuda')

    # Set training mode explicitly: the backbone modules can be shared with code that
    # switches them to eval mode.
    model.train()
    loss_train = []

    for epoch in range(n_epoch):
        for images, labels in tqdm(train_loader):
            if use_cuda:
                images = images.to('cuda')
                labels = labels.to('cuda')

            optimizer.zero_grad()
            outputs = model(images, labels)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            loss_train.append(loss.item())

        if not loss_train:
            # Explicit error instead of a NameError/IndexError on the lines below.
            raise ValueError("train_loader produced no batches")

        print("Epoch {}".format(epoch))
        print("Loss: {}".format(loss_train[-1]))

        torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss_train[-1],
                    }, 'model_arcface_epoch' + str(epoch) + '.pt')

    print('Finished Training')

In [None]:
train(model, optimizer, num_classes, train_dataloader, loss, n_epoch=50)

In [None]:
# Final snapshot of the model and optimizer state, labelled with the hard-coded epoch count 50
# (the n_epoch passed to train(...) above). train() itself saves one file per epoch, named
# with the 0-based epoch index.
torch.save({
            'epoch': 50,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, 'model_arcface_epoch50.pt')