## Outline
- The TF-IDF representation of the text and the EfficientNet output are fed into an ArcFace layer.
- During inference, those 2 embeddings are separated and separate predictions are created for each.
- The BERT-based model's weights are taken from [here](https://www.kaggle.com/tanulsingh077/best-multilingual-model), and predictions are created from this model too.
- The union of those 3 embeddings' predictions is taken as the final predictions.
- The final part of the notebook also runs experiments similar to this [notebook](https://www.kaggle.com/anlgrbz/how-optimum-threshold-changes-with-embed-test-size) for different embedding models.

In [None]:
from shopee_helper_script import *
from torch.nn import functional as F
import transformers
import plotly.express as px

# --- Image / embedding sizes ------------------------------------------------
IMAGE_SIZE = 512  # default `img_size` of get_transforms()

tfidf_dim = 15000    # forwarded as image_tfidf_embedder(tfidf_dim=...)
img_emb_dim = 256    # forwarded as image_tfidf_embedder(img_emb=...)
tfidf_emb_dim = 256  # forwarded as image_tfidf_embedder(text_emb=...)

# --- Matching thresholds passed to cosine_find_matches_cupy -----------------
# Each one is written as "<base value> - <offset>"; the expressions are kept
# as-is so the resulting floats do not change.
image_threshold = 0.1 - 0.03
bert_threshold = 0.4 - 0.18
tfidf_threshold = 0.15 - 0.08

# --- Input locations --------------------------------------------------------
data_folder = "../input/shopee-product-matching"
pre_trained_image_model_folder = "../input/pre-trained-models/"
# MODIFY THIS: checkpoint of the trained image + TF-IDF model.
model_path = "../input/pre-trained-models/tfidf_15000_256_256_11014_10.pth"
model_file_name = "efficientnet-b1-f1951068.pth"
model_name = "efficientnet-b1"

# --- Text model -------------------------------------------------------------
transformer_model = '../input/sentence-transformer-models/paraphrase-xlm-r-multilingual-v1/0_Transformer'
TEXT_MODEL_PATH = '../input/best-multilingual-model/sentence_transfomer_xlm_best_loss_num_epochs_25_arcface.bin'
# Module-level tokenizer shared by ShopeeTextDataset.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(transformer_model)

!cp -r ../input/eff-net-whl/EfficientNet-PyTorch .
!pip install -e EfficientNet-PyTorch/.


In [None]:
# The package directory "EfficientNet-PyTorch" contains a hyphen, which is not
# valid in a regular `import` statement, so the module is loaded by its string
# name and the EfficientNet class is then pulled out of it.
import importlib  
ENet = importlib.import_module("EfficientNet-PyTorch.efficientnet_pytorch")
EfficientNet = ENet.EfficientNet

In [None]:
class image_tfidf_embedder(Base_model):
    """Joint image + TF-IDF embedder trained through a single ArcFace head.

    The EfficientNet output and the TF-IDF vector are each projected by a
    linear layer; the two projections are concatenated (image part first)
    and, in training mode, fed to the ArcFace head.

    Args:
        tfidf_dim: Input size of the TF-IDF projection layer.
        img_emb: Output size of the image projection layer.
        text_emb: Output size of the TF-IDF projection layer.
        out_classes: Number of classes given to the ArcFace head.
    """

    def __init__(self, tfidf_dim=15000, img_emb=256, text_emb=256, out_classes=11014):
        super().__init__()

        try:
            self.effnet = EfficientNet.from_name(model_name)
            self.effnet.load_state_dict(torch.load(pre_trained_image_model_folder + model_file_name))
        except Exception:
            # Best-effort fallback kept from the original code, but no longer a
            # bare `except:` so KeyboardInterrupt / SystemExit are not swallowed.
            # NOTE(review): from_pretrained is handed a file path here, not a
            # model name - confirm this is what the EfficientNet API expects.
            self.effnet = EfficientNet.from_pretrained(pre_trained_image_model_folder + model_file_name)

        # presumably 1000 is the size of the EfficientNet output - TODO confirm
        self.linear1 = nn.Linear(1000, img_emb)
        self.linear2 = nn.Linear(tfidf_dim, text_emb)
        self.arcface_head = ArcFace(img_emb + text_emb, out_classes)

        self.loss = nn.CrossEntropyLoss()
        self.metric = metrics.accuracy_score
        self.threshold = None


    def forward(self, data_batch):
        """Embed one batch.

        Args:
            data_batch: Mapping with the keys "image", "text_vec" and "label".

        Returns:
            In training mode ``(arcface_output, loss, 0)``; otherwise
            ``(embedding, 0, 0)`` where ``embedding`` is the image projection
            followed by the TF-IDF projection along dim 1.
        """
        images = data_batch["image"]
        text_vec = data_batch["text_vec"]
        label = data_batch["label"]

        images = self.effnet(images)
        img_emb = self.linear1(images)
        text_emb = self.linear2(text_vec)

        # Image part first: the inference cells split the embedding by column
        # position to recover the two parts.
        full_emb = torch.cat([img_emb,text_emb], dim=1)


        if self.training:
            out = self.arcface_head(full_emb, label)
            loss = self.loss(out, label)
            metric = 0

            return out, loss, metric
        else:
            return full_emb, 0, 0


    def set_optimizer(self, lr):
        """Attach an Adam optimizer over all parameters with learning rate ``lr``."""
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)


    def validate_all(self, valid_dataset):
        """Embed ``valid_dataset``, pick the best threshold and log the F1 to wandb.

        Relies on ``self.predict``, ``self.valid_batch_size`` and
        ``self.current_train_step``, none of which are defined in this class
        (presumably provided by ``Base_model``).
        """
        embeddings = self.predict(valid_dataset, batch_size=self.valid_batch_size)

        # Find best threshold for cosine distance and log the f1
        best_cosine_threshold = get_best_threshold(cosine_find_matches_cupy, embeddings, valid_dataset.df.posting_id, valid_dataset.df.target, np.arange(0.05,0.30,0.05))
        matches = cosine_find_matches_cupy(embeddings, valid_dataset.df.posting_id, best_cosine_threshold,create_submission=False)
        f1_score = matches_to_f1_score(valid_dataset.df.target, pd.Series(matches))
        wandb.log({"Valid_cosine_F1": f1_score}, step=self.current_train_step)


class bert_embedder(Base_model):
    """Text embedder that wraps the pretrained transformer at ``transformer_model``."""

    def __init__(self, out_classes=11014, dropout=0.3):
        # `out_classes` and `dropout` are accepted but not used by this class.
        super().__init__()
        self.transformer = transformers.AutoModel.from_pretrained(transformer_model)

    def forward(self, data_batch):
        """Return ``(embedding, None, None)`` for a batch.

        ``data_batch`` must provide the keys "input_id" and "attention_mask".
        """
        model_output = self.transformer(
            input_ids=data_batch["input_id"],
            attention_mask=data_batch["attention_mask"],
        )

        # First element of the model output, sliced at index 0 of its second
        # axis (presumably the first token of each sequence - TODO confirm).
        first_position = model_output[0][:, 0, :]

        # F.normalize with its defaults: L2 norm along dim 1.
        return F.normalize(first_position), None, None



In [None]:
def get_transforms(img_size=IMAGE_SIZE):
    """Build the albumentations pipeline: resize to ``img_size`` x ``img_size``, then normalize."""
    pipeline_steps = [
        albumentations.Resize(img_size, img_size),
        albumentations.Normalize(),
    ]
    return albumentations.Compose(pipeline_steps)

def get_tfidf(titles, max_features = 15500):
    """Return TF-IDF vectors for ``titles`` using a vocabulary fitted on the train titles.

    Args:
        titles: Iterable of title strings to transform.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
            NOTE(review): the default (15500) differs from the notebook-level
            ``tfidf_dim`` (15000) - confirm which one the trained model expects.

    Returns:
        Whatever ``TfidfVectorizer.transform`` returns for ``titles``.
    """
    # Fit on the "title" column only. Passing the whole DataFrame would make
    # the vectorizer iterate over the column names, i.e. learn its vocabulary
    # from the CSV header instead of from the product titles.
    train_titles = pd.read_csv(data_folder + "/train.csv")["title"]

    vectorizer = TfidfVectorizer(stop_words = 'english', binary = True, max_features = max_features)
    vectorizer.fit(train_titles)
    return vectorizer.transform(titles)

class ShopeeDataset(Dataset):
    """Dataset yielding an image tensor, a TF-IDF vector and a label per row.

    Args:
        df: DataFrame with at least the columns "title" and "file_path";
            "label_group" is read when present.
        mode: Stored on the instance; not used inside this class.
        transforms: Callable applied as ``transforms(image=...)`` whose result
            is indexed with ``["image"]``.
    """

    def __init__(self, df, mode, transforms=get_transforms()):
        self.df = df.reset_index(drop=True)
        self.transform = transforms
        self.mode = mode
        # TF-IDF vectors for all titles are computed once, up front.
        self.text_vec = get_tfidf(df["title"])

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``{"image", "text_vec", "label"}`` for row ``index``.

        Raises:
            FileNotFoundError: If the image at ``row.file_path`` cannot be read.
        """
        row = self.df.iloc[index,]
        text_vec = self.text_vec[index,]

        # Rows without a usable label_group get an empty tensor as label.
        try:
            label_group = torch.tensor(row.label_group)
        except (ValueError, AttributeError):
            label_group = torch.Tensor()

        image = cv2.imread(row.file_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of an opaque error inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {row.file_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transform(image=image)
        image = image["image"].astype(np.float32)
        image = image.transpose(2, 0, 1) # Move the last axis (channels) to the front
        image = torch.tensor(image)

        return {"image":image,  "text_vec":torch.tensor(np.squeeze(text_vec.toarray().astype(np.float32))) , "label":label_group}

        
class ShopeeTextDataset(Dataset):
    """Dataset yielding tokenized titles for the transformer-based text model.

    Args:
        df: DataFrame with a "title" column.
    """

    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        """Return ``{"input_id", "attention_mask"}`` for the title in row ``index``."""
        title = self.df.iloc[index,].title

        encoded = TOKENIZER(title, padding='max_length', truncation=True, max_length=128, return_tensors="pt")

        # return_tensors="pt" already yields tensors; indexing with [0] takes the
        # single encoded title. Wrapping these in torch.tensor() again would
        # copy-construct them and emit a UserWarning for every item.
        return {"input_id": encoded['input_ids'][0],  "attention_mask": encoded['attention_mask'][0]}

In [None]:
#train = pd.read_csv("../input/shopee-product-matching/train.csv")
#text_train_ds = ShopeeTextDataset(train)

# Test set for the text-only model; the path is built from `data_folder` so
# the dataset location is configured in a single place.
test = pd.read_csv(data_folder + "/test.csv")
text_test_ds = ShopeeTextDataset(test)

text_model = bert_embedder(out_classes=11014)
# The last entry of the checkpoint is dropped before loading (presumably a head
# weight that bert_embedder does not define - TODO confirm).
# map_location="cpu" lets a checkpoint saved on a GPU load on any machine; the
# model is moved to `device` right after.
text_model.load_state_dict(dict(list(torch.load(TEXT_MODEL_PATH, map_location="cpu").items())[:-1]))
text_model.device=device
text_model.to(device)

text_embed = text_model.predict(text_test_ds, batch_size=1000)

text_matches = cosine_find_matches_cupy(text_embed, text_test_ds.df.posting_id, bert_threshold, create_submission=False)

In [None]:
#image_train_ds , _ = create_train_test(mode="train")
test_ds , _ = create_train_test(mode="inference")

# Load trained model
model = image_tfidf_embedder(tfidf_dim=tfidf_dim,  img_emb=img_emb_dim, text_emb=tfidf_emb_dim)
# map_location="cpu" lets a checkpoint saved on a GPU load on any machine.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.device=device
# Moved to the device explicitly, the same way the text model is handled.
model.to(device)

# Generate the joint embeddings
img_tfidf_embed = model.predict(test_ds) # If doesn't work, self.device attribute might be missing

# The model concatenates [image part | TF-IDF part]; split at the configured
# image embedding width instead of a hard-coded 256.
img_embed = img_tfidf_embed[:, :img_emb_dim]
tfidf_embed = img_tfidf_embed[:, img_emb_dim:]

img_matches = cosine_find_matches_cupy(img_embed, test_ds.df.posting_id, image_threshold, create_submission=False)
tfidf_matches = cosine_find_matches_cupy(tfidf_embed, test_ds.df.posting_id, tfidf_threshold, create_submission=False)

In [None]:
# Union of the per-model predictions: each entry is a space-separated string
# of ids, so split, merge through a set, and join back.
# Three-way variant (image + TF-IDF + BERT), kept for reference:
#matches = list(map(lambda x,y,z: " ".join(set(x.split(" ") + y.split(" ") + z.split(" "))), img_matches, tfidf_matches, text_matches ))
matches = [
    " ".join(set(img_ids.split(" ") + text_ids.split(" ")))
    for img_ids, text_ids in zip(img_matches, text_matches)
]

submission = pd.DataFrame({"posting_id": test_ds.df.posting_id, "matches": matches})
submission.to_csv("submission.csv", index=False)

## Run Optimum Threshold Search for Different Data Size

In [None]:
def get_best_threshold2(method, embeddings, posting_ids, correct_matches, candidates):
    """Score every candidate threshold and return the best one with its F1.

    ``method`` is called as ``method(embeddings, posting_ids, threshold,
    create_submission=False)`` and its result is scored against
    ``correct_matches`` with ``matches_to_f1_score``.

    Returns:
        ``(best_threshold, best_f1)``.
    """
    scores = {}
    for threshold in candidates:
        predicted = method(embeddings, posting_ids, threshold, create_submission=False)
        scores[threshold] = matches_to_f1_score(
            pd.Series(predicted),
            pd.Series(correct_matches),
        )
        print(f"Method:{method.__name__},   Threshold:{threshold},   F1-Score: {scores[threshold]}")

    # Key with the highest score; raises ValueError when `candidates` is empty.
    best_threshold = max(scores, key=scores.get)
    print(f"Best Threshold:{best_threshold},  Best F1-Score: {scores[best_threshold]}")

    return best_threshold, scores[best_threshold]

In [None]:
def create_submission_format(df):
    """Return, per row, the space-joined posting_ids that share its label_group."""
    # Join each group's unique posting_ids (in order of first appearance) once.
    ids_per_group = df.groupby("label_group")["posting_id"].unique()
    joined_ids = {group: " ".join(ids) for group, ids in ids_per_group.items()}
    # Look the prepared string up for every row's label_group.
    return df["label_group"].map(lambda group: joined_ids[group])

train = pd.read_csv(data_folder+"/train.csv")
train["target"] = create_submission_format(train)

# Assign every row to one of 5 validation folds, grouped by label_group
# (GroupKFold keeps each group inside a single fold).
train["fold"] = -1
cv_splitter = GroupKFold(n_splits=5)
fold_splits = cv_splitter.split(train, None, train.label_group)
for fold, (train_idx, valid_idx) in enumerate(fold_splits):
    # Only the validation indices are needed to tag the rows of this fold.
    train.loc[valid_idx, "fold"] = fold

In [None]:
model_no = 1 # "Bert"


text_train_ds = ShopeeTextDataset(train)
text_model = bert_embedder(out_classes=11014)
# The last checkpoint entry is dropped before loading (presumably a head weight
# that bert_embedder does not define - TODO confirm). map_location="cpu" lets a
# checkpoint saved on a GPU load on any machine.
text_model.load_state_dict(dict(list(torch.load(TEXT_MODEL_PATH, map_location="cpu").items())[:-1]))
text_model.device=device
text_model.to(device)
text_embed = text_model.predict(text_train_ds, batch_size=500)


# One row per (model, cumulative fold subset): 3 models x 5 subsets = 15 rows.
tracker = pd.DataFrame(columns=["model_no", "n_label_group", "n_post", "optimum_threshold", "score"], data=np.zeros((15,5)))

# Search the best threshold on growing subsets: folds 0..k for k = 0..4.
for folds_before in range(5):
    print("="*50)
    print("All Folds up to Fold:", folds_before)
    print("="*50)
    valid_emb = text_embed[train.fold <= folds_before,]
    valid_df = train.loc[train.fold <= folds_before,]
    n_label_group = valid_df.label_group.nunique()
    n_post = valid_df.shape[0]
    print("Number of Label Groups: ", n_label_group)
    print("Number of Posts: ", n_post)
    best_threshold, best_score = get_best_threshold2(cosine_find_matches_cupy, valid_emb, valid_df.posting_id.values, valid_df.target.values, np.arange(0.35, 0.70, 0.05))
    # Rows 0-4 of the tracker belong to this model.
    tracker.iloc[folds_before,] = (model_no, n_label_group, n_post, best_threshold, best_score)



In [None]:
model_no = 2 #"image"
train_ds , _ = create_train_test(mode="train")

# Load trained model
model = image_tfidf_embedder(tfidf_dim=tfidf_dim,  img_emb=img_emb_dim, text_emb=tfidf_emb_dim)
# map_location="cpu" lets a checkpoint saved on a GPU load on any machine.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.device=device
# Moved to the device explicitly, the same way the text model is handled.
model.to(device)

# Generate the joint embeddings for the training data
img_tfidf_embed = model.predict(train_ds) # If doesn't work, self.device attribute might be missing

# The model concatenates [image part | TF-IDF part]; split at the configured
# image embedding width instead of a hard-coded 256.
img_embed = img_tfidf_embed[:, :img_emb_dim]
tfidf_embed = img_tfidf_embed[:, img_emb_dim:]


# NOTE(review): the fold mask comes from `train`, so this assumes the rows of
# `train_ds` are in the same order as `train` - confirm against create_train_test.
for folds_before in range(5):
    print("="*50)
    print("All Folds up to Fold:", folds_before)
    print("="*50)
    valid_emb = img_embed[train.fold <= folds_before,]
    valid_df = train.loc[train.fold <= folds_before,]
    n_label_group = valid_df.label_group.nunique()
    n_post = valid_df.shape[0]
    print("Number of Label Groups: ", n_label_group)
    print("Number of Posts: ", n_post)
    best_threshold, best_score = get_best_threshold2(cosine_find_matches_cupy, valid_emb, valid_df.posting_id.values, valid_df.target.values, np.arange(0.05, 0.45, 0.05))
    # Rows 5-9 of the tracker belong to this model.
    tracker.iloc[5+folds_before,] = (model_no, n_label_group, n_post, best_threshold, best_score)



In [None]:
model_no = 3 #"tfidf"

# Same sweep as for the other models, on the TF-IDF part of the joint embedding.
threshold_candidates = np.arange(0.05, 0.45, 0.05)

for folds_before, _ in enumerate(range(5)):
    separator = "="*50
    print(separator)
    print("All Folds up to Fold:", folds_before)
    print(separator)

    # Cumulative subset: every row whose fold index is <= the current one.
    subset_mask = train.fold <= folds_before
    valid_emb = tfidf_embed[subset_mask,]
    valid_df = train.loc[subset_mask,]

    n_label_group = valid_df.label_group.nunique()
    n_post = valid_df.shape[0]
    print("Number of Label Groups: ", n_label_group)
    print("Number of Posts: ", n_post)

    best_threshold, best_score = get_best_threshold2(
        cosine_find_matches_cupy,
        valid_emb,
        valid_df.posting_id.values,
        valid_df.target.values,
        threshold_candidates,
    )
    # Rows 10-14 of the tracker belong to this model.
    tracker.iloc[10+folds_before,] = (model_no, n_label_group, n_post, best_threshold, best_score)



In [None]:
# Replace the numeric model ids with readable names, then plot the optimum
# threshold against the number of posts, one facet per model.
model_labels = {1:"Bert", 2:"effnet" ,3:"tfidf"}

tracker = tracker.iloc[:15,]
tracker["model_no"] = tracker["model_no"].map(model_labels)

fig = px.scatter(
    tracker,
    x="n_post",
    y="optimum_threshold",
    facet_col="model_no",
    trendline="ols",
)
fig.show()

In [None]:
# # to have target column 
# tmp_ds , _ = create_train_test(mode="train")

# ############ SEARCH THRESHOLD ON TRAIN ####################
# text_train_ds = ShopeeTextDataset(pd.read_csv("../input/shopee-product-matching/train.csv"))

# text_model = text_embedder(out_classes=11014)
# text_model.load_state_dict(dict(list(torch.load(TEXT_MODEL_PATH).items())[:-1]))
# text_model.device=device
# text_model.to(device)
# text_embed = text_model.predict(text_train_ds, batch_size=1000)
# print("For data size:", len(train_ds))
# get_best_threshold2(cosine_find_matches_cupy, text_embed, text_train_ds.df.posting_id,  tmp_ds.df.target, np.arange(0.05,0.60,0.05))


In [None]:
# ############ SEARCH THRESHOLD ON TRAIN ####################
# train_ds , _ = create_train_test(mode="train")
# # Load trained model
# model = image_embedder(tfidf_dim=tfidf_dim,  img_emb=img_emb, text_emb=text_emb)
# model.load_state_dict(torch.load(model_path))
# model.device=device

# # Generate embeddings and then submission file
# embeddings = model.predict(train_ds) # If doesn't work, self.device attribute might be missing

# img_emb = embeddings[:,:256]
# text_emb = embeddings[:,256:]
# best_threshold_img = get_best_threshold(cosine_find_matches_cupy, img_emb, train_ds.df.posting_id,  train_ds.df.target, np.arange(0.05,0.50))
# best_threshold_text = get_best_threshold(cosine_find_matches_cupy, text_emb, train_ds.df.posting_id,  train_ds.df.target, np.arange(0.05,0.50))

# best_threshold_img = get_best_threshold(cosine_find_matches_cupy, img_emb, train_ds.df.posting_id,  train_ds.df.target, np.arange(0.05,0.50,0.05))
# best_threshold_text = get_best_threshold(cosine_find_matches_cupy, text_emb, train_ds.df.posting_id,  train_ds.df.target, np.arange(0.05,0.50,0.05))

# print(best_threshold_img)
# print(best_threshold_text)

In [None]:
# ####### MAKE TEST PREDICTIONS FOR IMAGE  ###############
# # Create test dataset
# # train_ds , _ = create_train_test(mode="train")
# test_ds , _ = create_train_test(mode="inference")

# # Load trained model
# model = image_embedder(img_emb=img_emb)
# model.load_state_dict(torch.load(model_path))
# model.device=device

# # Generate embeddings and then submission file
# embeddings = model.predict(test_ds) # If doesn't work, self.device attribute might be missing

# cosine_find_matches_cupy(embeddings, test_ds.df.posting_id, threshold, create_submission=True)

In [None]:
# # Create seperate predictions for text & image then union
# img_emb = embeddings[:,:256]
# text_emb = embeddings[:,256:]

# #img_matches = cosine_find_matches_cupy(img_emb, test_ds.df.posting_id, 0.1, create_submission=False)
# text_matches = cosine_find_matches_cupy(text_emb, test_ds.df.posting_id, 0.20, create_submission=True) # 0.75 on data that it is trained
# # Take union of the predictions
# #matches = list(map(lambda x,y: " ".join(set(x.split(" ") + y.split(" "))), img_matches, text_matches))

# #pd.DataFrame({"posting_id": test_ds.df.posting_id, "matches": matches}).to_csv("submission.csv", index=False)