In [None]:
! pip install sentence-transformers wandb --quiet  

In [None]:
import pandas as pd
import numpy as np
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


In [None]:
from torch.utils.data import DataLoader
import torch
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
import logging
from datetime import datetime
import os
import gzip
import pandas as pd
import wandb

In [None]:
# Take the Weights & Biases API key from the WANDB_API_KEY environment variable
# instead of embedding a credential in the notebook. If the variable is unset,
# key=None is passed and wandb resolves credentials on its own.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Load the cooking and DIY JSON files into one DataFrame each.
df_cooking, df_diy = (
    pd.read_json(json_path)
    for json_path in (
        '/kaggle/input/alexa-ctr-data/wizard_of_tasks_cooking_v1.0.json',
        '/kaggle/input/alexa-ctr-data/wizard_of_tasks_diy_v1.0.json',
    )
)

In [None]:
def _collect_turns(conversations, seen_labels, all_turns, test_turns):
    """Flatten one dataset's conversations into per-turn records.

    Each column of ``conversations`` is read as a mapping with a ``'turns'``
    list and a ``'data_split'`` value (assumes one conversation per column --
    TODO confirm against the dataset schema). Turns whose ``'text'`` is None
    are skipped.

    Args:
        conversations: DataFrame to iterate column by column.
        seen_labels: list extended in place with each intent label the first
            time it is encountered (insertion order is preserved).
        all_turns: list that receives every kept record.
        test_turns: list that additionally receives the records whose split
            is ``'test'``.
    """
    for conversation_id in conversations.columns:
        conversation = dict(conversations[conversation_id])
        for turn in conversation['turns']:
            if turn['text'] is None:
                continue
            record = {
                'text': turn['text'],
                'split': conversation['data_split'],
                'label': turn['intent'],
            }
            if record['split'] == 'test':
                test_turns.append(record)
            if record['label'] not in seen_labels:
                seen_labels.append(record['label'])
            all_turns.append(record)


# `result` holds every kept turn from both datasets (cooking first, then DIY);
# `test` holds only the turns from the test split. The two target lists keep
# the distinct intent labels seen in each dataset.
test = []
result = []
target_cooking = []
target_diy = []
_collect_turns(df_cooking, target_cooking, result, test)
_collect_turns(df_diy, target_diy, result, test)

In [None]:
# Display the distinct intent labels collected from the cooking conversations.
target_cooking

In [None]:
# Intent labels that occur in both the DIY and the cooking data.
set(target_diy) & set(target_cooking)

In [None]:
# Name of the pretrained checkpoint handed to SentenceTransformer; it is also
# used to build the "architecture" tag logged to wandb.
model_name = 'all-MiniLM-L12-v2'

In [None]:
# Build (utterance, intent phrase) pairs for every turn: one positive pair with
# the turn's own intent and one negative pair for each other DIY intent.
logging.info("Read train dataset")

train_samples, dev_samples, test_samples = [], [], []

# Any split other than 'validation' or 'test' is treated as training data.
samples_by_split = {'validation': dev_samples, 'test': test_samples}

for row in result:
    bucket = samples_by_split.get(row['split'], train_samples)

    # Positive pair: the utterance with its gold intent, underscores shown as spaces.
    bucket.append(
        InputExample(texts=[row['text'], row['label'].replace('_', ' ')], label=1)
    )

    # Negative pairs: the same utterance with every DIY intent that is not the gold one.
    bucket.extend(
        InputExample(texts=[row['text'], candidate.replace('_', ' ')], label=0)
        for candidate in target_diy
        if candidate != row['label']
    )

    

In [None]:
# Number of InputExample pairs (positives and negatives) in the training split.
len(train_samples)

In [None]:

# Grid search over batch size, number of epochs and warm-up ratio. Every
# combination is logged as its own wandb run and scored on the test turns by
# nearest-intent accuracy (cosine similarity between utterance and intent phrase).
weight_decay = 0.01

train_batch_size_l = [64, 128, 256]
num_epochs_l = [2, 4, 8]
warmup = [0.01, 0.1, 0.5]

# Candidate intent phrases and test utterances do not depend on the
# hyper-parameters, so they are prepared once.
sentences2 = [' '.join(label.split('_')) for label in target_diy]
test_texts = [turn['text'] for turn in test]

for train_batch_size in train_batch_size_l:
    for num_epochs in num_epochs_l:
        for warmup_ratio in warmup:
            wandb.init(
                # set the wandb project where this run will be logged
                project="alexa-ctr",
                config={
                    "architecture": "bi-encoder-" + model_name,
                    "epochs": num_epochs,
                    "train_batch_size": train_batch_size,
                    "weight_decay": weight_decay,
                    # Record the evaluator and loss classes that are actually used below.
                    "evaluator": "EmbeddingSimilarityEvaluator",
                    "loss": "OnlineContrastiveLoss",
                    "warmup": warmup_ratio,
                },
            )

            # Start every configuration from the pretrained checkpoint; reusing one
            # model across runs would stack fine-tuning and make runs incomparable.
            model = SentenceTransformer(model_name).to(device)

            train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
            train_loss = losses.OnlineContrastiveLoss(model=model)

            # Development set: measure correlation between cosine score and gold labels
            logging.info("Read dev dataset")
            evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='actr-dev')

            # Warm up over the requested fraction of all optimisation steps
            # (batches per epoch * epochs), rounded up to a whole step.
            warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warmup_ratio)
            logging.info("Warmup-steps: {}".format(warmup_steps))

            # Train the model, evaluating on the dev set with `evaluator`
            model.fit(train_objectives=[(train_dataloader, train_loss)],
                      evaluator=evaluator,
                      epochs=num_epochs,
                      output_path='/kaggle/working/model',
                      warmup_steps=warmup_steps,
                      weight_decay=weight_decay)
            model.eval()

            if test:
                # Encode all intents and all test utterances in one pass each, then
                # pick the intent with the highest cosine similarity per utterance.
                label_embeddings = model.encode(sentences2)
                text_embeddings = model.encode(test_texts)
                scores = util.cos_sim(text_embeddings, label_embeddings)
                best_label_idx = scores.argmax(dim=1).tolist()

                c = sum(
                    target_diy[idx] == turn['label']
                    for idx, turn in zip(best_label_idx, test)
                )
                wandb.log({'test_accuracy': (c / len(test)) * 100})
            else:
                # Nothing to score; avoid dividing by zero.
                logging.warning("No test turns available; skipping test accuracy")
            wandb.finish()