## Checking CPU


In [None]:
# Print the CPU details of the runtime this notebook is executing on.
!lscpu

## Imports

In [None]:
# Mount Google Drive at /content/drive; the dataset files read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the sentence_transformers and datasets packages (pip output suppressed by --quiet).
!pip install --quiet sentence_transformers datasets

In [None]:
import pandas as pd
import numpy as np
import gc
from sentence_transformers import InputExample, datasets, models, SentenceTransformer, losses, util
from tqdm.auto import tqdm
import time
from datasets import Dataset, load_dataset

## Data

In [None]:
# Generated (question, paragraph) pairs read from Drive.
generated_data = pd.read_csv('/content/drive/MyDrive/Inter_IIT/Datasets/generated_data.csv')

# Additional QA pairs. The file carries two theme columns ('theme_x' / 'theme_y',
# presumably left over from a merge — confirm); keep 'theme_x' under the name
# 'theme' and discard 'theme_y'. All other columns keep their names.
qa_pairs = (
    pd.read_csv('/content/drive/MyDrive/Inter_IIT/Datasets/qa_paras.csv')
    .drop(columns='theme_y')
    .rename(columns={'theme_x': 'theme'})
)

# Stack both sources into one frame with a fresh 0..n-1 index.
generated_data = pd.concat([generated_data, qa_pairs], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
generated_data.head()

## Single Theme Fine Tuning

In [None]:
# Fine-tune a fresh copy of all-mpnet-base-v2 on each theme's
# (question, paragraph) pairs and save one model directory per theme.

# Position in generated_data["theme"].unique() at which this run starts.
# NOTE(review): themes before this index are skipped; presumably they were
# trained in an earlier session — confirm, or set to 0 to train every theme.
START_THEME_INDEX = 28

# Themes whose raw name contains one of these substrings are saved under a
# fixed, simplified stem (presumably to keep parentheses and dots out of the
# output directory name — confirm).
OUTPUT_STEM_OVERRIDES = {
    "Cardinal_(": "Cardinal_Catholicism",
    "Imamah_(": "Imamah_Shia_doctrine",
    "Mary_(": "Mary_mother_of_Jesus",
    "Everton_F.C.": "Everton_FC",
}

# Hyperparameters shared by every theme.
batch_size = 8
epochs = 1

for theme in generated_data["theme"].unique().tolist()[START_THEME_INDEX:]:
  # The first matching override wins; otherwise the theme name is used as-is.
  text = next(
      (stem for marker, stem in OUTPUT_STEM_OVERRIDES.items() if marker in theme),
      theme,
  )

  theme_data = generated_data.loc[generated_data['theme'] == theme]

  # Walk the two needed columns directly instead of materialising one Series
  # per row with .loc[idx]; the resulting examples are the same.
  train_samples = [
      InputExample(texts=[question, paragraph])
      for question, paragraph in tqdm(
          zip(theme_data['question'], theme_data['paragraph']),
          total=len(theme_data),
      )
  ]

  # NOTE(review): NoDuplicatesDataLoader keeps texts unique within a batch; a
  # theme with fewer than `batch_size` distinct paragraphs may be unable to
  # fill a batch — verify against the installed sentence-transformers version.
  loader_8 = datasets.NoDuplicatesDataLoader(
      train_samples, batch_size=batch_size
  )

  # The loader holds its own reference to the examples; release the
  # intermediates before the model is loaded.
  del train_samples, theme_data
  gc.collect()

  # Start from the pretrained checkpoint for every theme so the themes do not
  # influence each other.
  model = SentenceTransformer('all-mpnet-base-v2')
  loss = losses.MultipleNegativesRankingLoss(model)

  # Warm up the learning rate over the first 10% of the training steps.
  warmup_steps = int(len(loader_8) * epochs * 0.1)

  model.fit(
      train_objectives=[(loader_8, loss)],
      epochs=epochs,
      warmup_steps=warmup_steps,
      output_path=f'{text}-no-train-gen-tuned-all-mpnet-base-v2',
      show_progress_bar=True
  )

In [None]:
# Fine-tune a single model on the 'Unknown' and 'DevRev' themes combined.
text = 'Unk_DevRev'
theme_data = generated_data.loc[generated_data['theme'].isin(['Unknown', 'DevRev'])]

# Walk the two needed columns directly instead of materialising one Series per
# row with .loc[idx]; the resulting examples are the same.
train_samples = [
    InputExample(texts=[question, paragraph])
    for question, paragraph in tqdm(
        zip(theme_data['question'], theme_data['paragraph']),
        total=len(theme_data),
    )
]

batch_size = 8
loader_8 = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size
)

# The loader holds its own reference to the examples; release the
# intermediates before the model is loaded.
del train_samples, theme_data
gc.collect()

# Start from the pretrained checkpoint.
model = SentenceTransformer('all-mpnet-base-v2')
loss = losses.MultipleNegativesRankingLoss(model)

epochs = 1
# Warm up the learning rate over the first 10% of the training steps.
warmup_steps = int(len(loader_8) * epochs * 0.1)

model.fit(
    train_objectives=[(loader_8, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path=f'{text}-no-train-gen-tuned-all-mpnet-base-v2',
    show_progress_bar=True
)

## Cluster-based fine tuning

In [None]:
import json

# Read the cluster definitions from Drive; leaving the `with` block closes the file.
with open("/content/drive/MyDrive/Inter_IIT/Datasets/clusters.json", "r") as cluster_file:
  clusters = json.load(cluster_file)

In [None]:
# Show the top-level keys of the loaded cluster mapping.
clusters.keys()

In [None]:
# Fine-tune a fresh copy of all-mpnet-base-v2 per cluster. Each key of
# `clusters` maps to the theme names whose rows form that cluster's training set.

# Hyperparameters shared by every cluster.
batch_size = 8
epochs = 1

for key in clusters:
  cluster_data = generated_data.loc[generated_data['theme'].isin(clusters[key])]

  # Walk the two needed columns directly instead of materialising one Series
  # per row with .loc[idx]; the resulting examples are the same.
  train_samples = [
      InputExample(texts=[question, paragraph])
      for question, paragraph in tqdm(
          zip(cluster_data['question'], cluster_data['paragraph']),
          total=len(cluster_data),
      )
  ]

  loader_8 = datasets.NoDuplicatesDataLoader(
      train_samples, batch_size=batch_size
  )

  # The loader holds its own reference to the examples; release the
  # intermediates before the model is loaded.
  del train_samples, cluster_data
  gc.collect()

  # Start from the pretrained checkpoint for every cluster.
  model = SentenceTransformer('all-mpnet-base-v2')
  loss = losses.MultipleNegativesRankingLoss(model)

  # Warm up the learning rate over the first 10% of the training steps.
  warmup_steps = int(len(loader_8) * epochs * 0.1)

  model.fit(
      train_objectives=[(loader_8, loss)],
      epochs=epochs,
      warmup_steps=warmup_steps,
      output_path=f'{key}-no-train-gen-squad-validation-tuned-all-mpnet-base-v2',
      show_progress_bar=True,
      # use_amp = True
  )
  # Not read in this cell; kept because later cells may rely on it.
  # NOTE(review): it is overwritten on every iteration, so only the last
  # cluster's value survives the loop — confirm that is intended.
  path = f"InterIIT/{key}-no-train-gen-squad-validation-tuned-all-mpnet-base-v2"

## Global fine tuning (all themes)

In [None]:
# Fine-tune one model on every (question, paragraph) pair in generated_data.

# Walk the two needed columns directly instead of materialising one Series per
# row with .loc[idx]; the resulting examples are the same, and on the full
# dataset this avoids one Series construction per row.
train_samples = [
    InputExample(texts=[question, paragraph])
    for question, paragraph in tqdm(
        zip(generated_data['question'], generated_data['paragraph']),
        total=len(generated_data),
    )
]

batch_size = 8
loader_8 = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size
)

# The loader holds its own reference to the examples; release the list before
# the model is loaded.
del train_samples
gc.collect()

# Start from the pretrained checkpoint.
model = SentenceTransformer('all-mpnet-base-v2')
loss = losses.MultipleNegativesRankingLoss(model)

epochs = 1
# Warm up the learning rate over the first 10% of the training steps.
warmup_steps = int(len(loader_8) * epochs * 0.1)

model.fit(
    train_objectives=[(loader_8, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='global-no-train-gen-squad-validation-tuned-all-mpnet-base-v2',
    show_progress_bar=True,
    # use_amp = True
)
# Not read in this cell; kept because later cells may rely on it.
path = "InterIIT/global-no-train-gen-squad-validation-tuned-all-mpnet-base-v2"