# Siamese Network Training
This notebook contains our code to train a siamese network using question and paragraph embeddings, to improve retrieval.<br>
This method was influenced by Sentence-Transformers' <a href='https://www.sbert.net/examples/training/multilingual/README.html#extend-your-own-models'>Multilingual Models: Extend your own models</a>.

In [None]:
# Mount Google Drive at /content/drive; the data, training-file and output paths used
# in the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install sentence_transformers
!pip install transformers
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, evaluation
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.datasets import ParallelSentencesDataset
import logging
from datetime import datetime
import os
import random
import torch
import time

# Create Train File
We sample approximately 7,500 questions from the training set and write them to a text file for training, which keeps training within our time constraints. Note that we train only on answerable questions and their corresponding paragraphs.

In [9]:
# The training data given to us is split into 5 folds ('fold' column).
# Four folds are used for training; the rows with fold == 4 are held out for validation.
train_folds_data_path = '/content/drive/MyDrive/InterIIT/train_5folds.csv'
df = pd.read_csv(train_folds_data_path)
# Training positives: answerable rows (Answer_possible == 1) outside the held-out fold.
df_positive = df[df['Answer_possible'].eq(1) & df['fold'].ne(4)]

In [None]:
sample_proportion = 0.195 # how much to take from each theme. Use 1 to use entire dataset
sample_seed = 42 # fixed seed so the sampled training set is the same on every run
themes = list(df_positive.Theme.unique())
# One shared generator, so each theme gets a different draw while the run stays reproducible.
sample_rng = np.random.RandomState(sample_seed)
# Collect the per-theme samples and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every iteration.
theme_samples = []
for theme in themes:
  df_theme = df_positive.loc[df_positive.Theme == theme]
  # Truncates towards zero, so very small themes may contribute no rows.
  theme_num = int(df_theme.shape[0]*sample_proportion)
  theme_samples.append(df_theme.sample(theme_num, random_state=sample_rng))
# pd.concat raises on an empty list; fall back to an empty frame when there are no themes.
df_sample = pd.concat(theme_samples) if theme_samples else pd.DataFrame()
print(df_sample.shape)

In [None]:
def train_data_maker(df_sample,output_path):
  '''
  Write question/paragraph pairs to a text file, one pair per line.

  Input dataframes with only positive question and paragraph pairs.
  Each output line is the cleaned question, a single tab character, then the
  cleaned paragraph. Lines are joined with a newline and there is no trailing
  newline.

  Parameters
  ----------
  df_sample : pandas.DataFrame
      Must provide string columns 'Question' and 'Paragraph'. The frame is
      only read; it is not modified.
  output_path : str
      File to write (UTF-8). An existing file is overwritten.

  Returns
  -------
  None
  '''
  def _clean(column):
    # Newlines are removed (as before) so every pair stays on one line.
    # Tabs become spaces so the only tab on a line is the column separator.
    no_newlines = column.str.replace('\n', '', regex=False)
    no_tabs = no_newlines.str.replace('\t', ' ', regex=False)
    return no_tabs.str.strip()

  # Build the lines from derived Series instead of assigning columns on the caller's frame.
  pair_lines = _clean(df_sample['Question']) + '\t' + _clean(df_sample['Paragraph'])
  sample_text = '\n'.join(pair_lines.tolist())
  # Explicit encoding keeps the file readable as UTF-8 regardless of the platform default;
  # the context manager closes the handle even if the write fails.
  with open(output_path, "w", encoding="utf-8") as f:
    f.write(sample_text)

In [None]:
# Write the sampled pairs to the training text file on Drive, then drop the
# notebook-level name for the sample since it is not used after this point.
train_data_maker(
    df_sample=df_sample,
    output_path='/content/drive/MyDrive/Siamese FineTuning/sample_theme_5folds.text',
)
del df_sample

# Training

In [18]:
# We take positives from the 4th fold for validation
df_val = df.loc[(df['Answer_possible']==1) & (df['fold']==4)]
val_questions = [q for q in df_val['Question']][:2]
val_context = [c for c in df_val['Paragraph']][:2]
del df_val

In [None]:
# Both sides of the siamese setup start from the same pretrained checkpoint.
# Teacher: the model we want to distill, takes questions as input.
teacher_model_name = 'multi-qa-mpnet-base-dot-v1'
# Student: takes paragraphs as input.
student_model_name = 'multi-qa-mpnet-base-dot-v1'
# Not referenced anywhere else in the visible notebook.
use_layer_reduction = True

teacher_model = SentenceTransformer(teacher_model_name)

# Timestamped output directory; the training cell assigns output_path again before fitting.
output_path = f"/content/drive/MyDrive/Siamese FineTuning/output/model-distillation-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"

student_model = SentenceTransformer(student_model_name)

Downloading (…)16ebc/.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/README.md:   0%|          | 0.00/8.65k [00:00<?, ?B/s]

Downloading (…)b5d16ebc/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ebc/data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)16ebc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)6ebc/train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading (…)b6b5d16ebc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5d16ebc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Dataset pairing the student with the teacher, loaded from the tab-separated
# question/paragraph file written earlier in this notebook.
train_data = ParallelSentencesDataset(
    student_model=student_model,
    teacher_model=teacher_model,
    use_embedding_cache=True,
)
# max_sentence_length is explicitly None here; presumably this turns off the
# library's length filter so long paragraphs are kept. Confirm against the library docs.
train_data.load_data(
    '/content/drive/MyDrive/Siamese FineTuning/sample_theme_5folds.text',
    max_sentence_length=None,
)
train_dataloader = DataLoader(dataset=train_data, batch_size=8, shuffle=True)
train_loss = losses.MSELoss(model=student_model)
# Removes only the notebook-level name; train_dataloader was built from the dataset.
del train_data

In [None]:
# Evaluator over the validation questions (source) and paragraphs (target) with the
# teacher model; it is handed to student_model.fit() as `evaluator` in the training cell.
val_mse = evaluation.MSEEvaluator(
    source_sentences=val_questions,
    target_sentences=val_context,
    teacher_model=teacher_model,
)
# Drop the notebook-level name for the teacher once the evaluator has been built.
del teacher_model

In [None]:
%%time
# Train the student model to imitate the teacher on MSE loss.
# Each path gets its own timestamp suffix taken at the moment it is built.
output_path = f"/content/drive/MyDrive/Siamese FineTuning/output/model-distillation-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
# NOTE(review): checkpoint_path is computed but never passed to fit() below;
# confirm whether intermediate checkpointing was intended.
checkpoint_path = f"/content/drive/MyDrive/InterIIT/checkpoints/model-distillation-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_mse,
    epochs=10,
    warmup_steps=1000,
    # presumably 0 means the evaluator runs only at the end of each epoch; confirm in the library docs
    evaluation_steps=0,
    output_path=output_path,
    save_best_model=True,
    optimizer_params={'lr': 1e-4, 'eps': 1e-6},
    use_amp=True,
)