# Baseline 1 (Direct regression on IMDB)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_scheduler
from datasets import load_dataset
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm

# Load IMDb dataset (its 'train' and 'test' splits are used for the loaders below)
dataset = load_dataset("imdb")

# Define tokenizer and model
# num_labels=1 requests a single output unit, so each review gets one scalar
# score; the loops below squeeze that last dimension before computing MSE.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Sequences are truncated and padded to the tokenizer's maximum length so
    every example has the same number of tokens.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

# batched=True hands tokenize_function a batch of examples per call instead of
# one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: only the listed columns are returned, as tensors
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Convert labels to float
def convert_labels_to_float(example):
    """Replace ``example['label']`` with its ``float`` value.

    The example is modified in place and the same object is returned, which is
    the shape ``Dataset.map`` expects from a per-example function.
    """
    label_as_float = float(example['label'])
    example['label'] = label_as_float
    return example

# NOTE(review): the training/evaluation loops below cast batch['label'] with
# .float() again, so this per-example conversion may be redundant — confirm
# before removing it.
tokenized_datasets = tokenized_datasets.map(convert_labels_to_float)

# Create data loaders (only the training split is shuffled)
batch_size = 8
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch_size)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_epochs):
    # Re-enter train mode every epoch: the evaluation phase below switches
    # the model to eval mode.
    model.train()
    train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training", leave=False)
    for batch in train_progress_bar:
        # Move every tensor in the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # Passing labels makes the model return a loss next to the logits.
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'].float())
        loss = outputs.loss
        loss.backward()
        train_loss += loss.item()

        # Scheduler is stepped once per optimizer step, matching
        # num_training_steps = num_epochs * len(train_dataloader).
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_progress_bar.set_postfix({"loss": loss.item()})

    # Mean of the per-batch training losses for this epoch.
    avg_train_loss = train_loss / len(train_dataloader)

    # Evaluation
    model.eval()
    predictions = []
    labels = []
    eval_loss = 0
    eval_progress_bar = tqdm(eval_dataloader, desc=f"Epoch {epoch+1} Evaluation", leave=False)
    for batch in eval_progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'].float())
            eval_loss += outputs.loss.item()
        # Drop the trailing singleton dimension so each prediction is a scalar.
        logits = outputs.logits.squeeze(-1).cpu().numpy()
        label = batch['label'].cpu().numpy()
        predictions.extend(logits)
        labels.extend(label)

    # avg_eval_loss averages per-batch losses; mse is computed over all
    # individual predictions collected above.
    avg_eval_loss = eval_loss / len(eval_dataloader)
    mse = mean_squared_error(labels, predictions)
    print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}, Mean Squared Error: {mse:.4f}")

print("Training complete!")


In [None]:
# Save the model
# Only the weights (state_dict) are written; loading them back requires
# re-creating a model with the same architecture first.
model_save_path = "bert_imdb_regression.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the tokenizer (written as a directory of tokenizer files)
tokenizer_save_path = "bert_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)
print(f"Tokenizer saved to {tokenizer_save_path}")

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer
tokenizer_load_path = "bert_tokenizer"
tokenizer = BertTokenizer.from_pretrained(tokenizer_load_path)

# Pick the device before loading so the checkpoint can be mapped onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the model
# The architecture must match the one that produced the state_dict
# (single-output head, num_labels=1).
model_load_path = "bert_imdb_regression.pt"
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)
# map_location lets a checkpoint that was saved from a GPU be restored on a
# CPU-only machine instead of failing while deserializing CUDA tensors.
model.load_state_dict(torch.load(model_load_path, map_location=device))

# Move model to the appropriate device
model.to(device)

# Set the model to evaluation mode (disables dropout for inference)
model.eval()

print("Model and tokenizer loaded successfully!")

# Function to preprocess and predict sentiment score
def predict_sentiment(text):
    """Score ``text`` with the loaded model and return the result as a NumPy array.

    The text is tokenized to a fixed length of 512 tokens, run through the
    model without gradient tracking, and the trailing singleton dimension of
    the logits is dropped.
    """
    encoded = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**model_inputs).logits
    return logits.squeeze(-1).cpu().numpy()

# Test the model with some example texts
# Quick sanity check: one positive, one negative and one mixed review.
test_texts = [
    "This movie was fantastic! The plot was gripping and the characters were well-developed.",
    "I didn't enjoy this film. The storyline was weak and the acting was subpar.",
    "An average movie. Some good parts but also some very boring scenes."
]

# Print the raw model output for each example text.
for text in test_texts:
    score = predict_sentiment(text)
    print(f"Text: {text}\nPredicted Sentiment Score: {score}\n")


# Baseline 2: Contrastive Learning -> IMDB

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset
from sklearn.metrics import mean_squared_error
import time

print("Loading snli dataset")
# Load a different dataset for contrastive learning, e.g., SNLI
snli_dataset = load_dataset('snli')
# Extract the premises and hypotheses (text only; the NLI labels are not used)
premises = snli_dataset['train']['premise']
hypotheses = snli_dataset['train']['hypothesis']

# Combine premises and hypotheses
all_texts = premises + hypotheses

# Get unique sentences
# set() removes duplicates; the resulting list order is arbitrary.
contrastive_texts = list(set(all_texts))

# Keep a copy of the de-duplicated sentences on disk.
import pandas as pd
contrastive_df = pd.DataFrame({'Contrastive Text': contrastive_texts})
contrastive_df.to_csv('contrastive_texts.csv', index=False)


# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts for contrastive learning
# All sentences are encoded in a single call, so padding=True pads every
# sequence to one common length (capped at 256 tokens by truncation).
contrastive_encodings = tokenizer(contrastive_texts, truncation=True, padding=True, max_length=256)

class ContrastiveDataset(Dataset):
    """Map-style dataset over pre-tokenized, unlabeled encodings.

    ``encodings`` maps a field name (e.g. ``'input_ids'``) to a sequence that
    holds one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field.
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one tensor per field for the requested example.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

# Wrap the tokenized SNLI sentences so they can be batched.
contrastive_dataset = ContrastiveDataset(contrastive_encodings)

# DataLoader for contrastive learning
# Shuffled so each batch holds a different random set of sentences per epoch.
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True)

# Define the model
class BERTSimCSE(nn.Module):
    """Sentence encoder that returns BERT's pooled output as the embedding."""

    def __init__(self):
        super().__init__()
        # Pre-trained English BERT backbone.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        sentence_embedding = encoded.pooler_output
        return sentence_embedding

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Contrastive loss over two views of the same batch.

    For every embedding, the embedding of the same example in the other view
    is the positive and all remaining embeddings of both views are negatives.
    Cosine similarities are divided by ``temperature`` and fed to a
    cross-entropy loss.

    Args:
        temperature: Scaling applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean contrastive loss for the paired views.

        Args:
            z_i: Embeddings of the first view, one row per example.
            z_j: Embeddings of the second view, aligned row-by-row with z_i.
        """
        batch_size = z_i.size(0)
        # Build helper tensors on the inputs' device so the loss also works
        # on CPU-only hosts.
        device = z_i.device

        z = torch.cat([z_i, z_j], dim=0)
        # Pairwise cosine similarity between all 2N embeddings.
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Remove self-comparisons: every row loses its diagonal entry, leaving
        # 2N - 1 candidate columns per row.
        mask = torch.eye(2 * batch_size, dtype=torch.bool, device=device)
        similarity_matrix = similarity_matrix[~mask].view(2 * batch_size, -1)

        # Column of the positive AFTER the diagonal was dropped:
        #  * row i (first view): the positive was at column i + N, which lies
        #    to the right of the removed diagonal entry, so it shifts left by
        #    one to i + N - 1;
        #  * row i + N (second view): the positive was at column i, which lies
        #    to the left of the removed entry and keeps its index.
        index = torch.arange(batch_size, device=device)
        labels = torch.cat([index + batch_size - 1, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

num_contrastive_batches = len(contrastive_loader)
# Report progress about ten times per epoch. max(1, ...) keeps the modulo
# below valid when the loader has fewer than ten batches.
print_interval = max(1, num_contrastive_batches // 10)

# Contrastive learning loop
# Train mode keeps dropout active; model.train() applies to all submodules,
# so the encoder does not need to be switched separately.
model.train()
for epoch in range(3):  # Train for 3 epochs
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Two forward passes over the same batch: with dropout active each
        # pass yields a different representation of the same sentences.
        z_i = model(input_ids, attention_mask)
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        batches_done = batch_idx + 1
        if batches_done % print_interval == 0 or batches_done == num_contrastive_batches:
            elapsed_time = time.time() - start_time
            # Linear extrapolation from the average time per batch so far.
            remaining_time = elapsed_time / batches_done * (num_contrastive_batches - batches_done)

            print(f'Epoch [{epoch+1}/3], Batch [{batches_done}/{num_contrastive_batches}], Loss: {total_loss/batches_done:.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

print("Loading imdb dataset")

# Load the IMDB dataset for fine-tuning
# The 'label' column is later fed to CrossEntropyLoss as class indices.
imdb_dataset = load_dataset('imdb')
train_texts = imdb_dataset['train']['text']
train_labels = imdb_dataset['train']['label']
test_texts = imdb_dataset['test']['text']
test_labels = imdb_dataset['test']['label']

# Tokenize the texts for IMDB
# Each split is encoded in one call, so padding=True gives every sequence of
# that split the same length (at most 256 tokens).
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

class IMDBDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encodings with their labels."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One label per example, so the label count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoded fields first, then the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its labels.
train_dataset = IMDBDataset(train_encodings, train_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

# DataLoader for fine-tuning (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Fine-tuning setup
class IMDBRatingModel(nn.Module):
    """BERT encoder followed by a two-way linear classification head."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head maps the encoder's hidden size to two class logits.
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's pooled output."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.classifier(pooled)
    
# Fine-tuning setup
# model.bert is the encoder trained in the contrastive stage above; it becomes
# the backbone of the classifier. A fresh optimizer is created so that it also
# covers the new classification head.
model = IMDBRatingModel(model.bert).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Return the mean squared error between labels and predicted classes.

    Predictions are the argmax over the model's output logits, so the MSE is
    computed on class indices. Progress is printed roughly ten times.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this between
    training epochs does not silently disable dropout for later epochs.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning logits.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors; must support ``len()``.
        device: Device the batch tensors are moved to.
    """
    was_training = model.training
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    num_batches = len(test_loader)
    # max(1, ...) avoids a modulo-by-zero when there are fewer than ten batches.
    print_interval = max(1, num_batches // 10)
    try:
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                preds = torch.argmax(outputs, dim=1)

                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                batches_done = batch_idx + 1
                if batches_done % print_interval == 0 or batches_done == num_batches:
                    elapsed_time = time.time() - start_time
                    remaining_time = elapsed_time / batches_done * (num_batches - batches_done)
                    print(f'Evaluating batch [{batches_done}/{num_batches}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

num_train_batches = len(train_loader)
# Report progress about ten times per epoch; max(1, ...) keeps the modulo
# below valid for loaders with fewer than ten batches.
print_interval = max(1, num_train_batches // 10)

# Fine-tuning loop with additional metrics
for epoch in range(3):  # Fine-tune for 3 epochs
    # Re-enable train mode at the start of EVERY epoch: the evaluation at the
    # end of the previous epoch switches the model to eval mode, which would
    # otherwise leave dropout disabled for all following epochs.
    model.train()
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        batches_done = batch_idx + 1
        if batches_done % print_interval == 0 or batches_done == num_train_batches:
            elapsed_time = time.time() - start_time
            # Linear extrapolation from the average time per batch so far.
            remaining_time = elapsed_time / batches_done * (num_train_batches - batches_done)

            print(f'Epoch [{epoch+1}/3], Batch [{batches_done}/{num_train_batches}], Loss: {total_loss/batches_done:.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/3], Test MSE: {mse}')


In [None]:
# Save the model
# Only the weights (state_dict) are written. At this point `model` is the
# IMDBRatingModel built above, i.e. an encoder under `bert` plus a two-output
# `classifier` head, despite the "regression" in the file name.
model_save_path = "bert_contrastive_imdb_regression.pt"
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the tokenizer (written as a directory of tokenizer files)
tokenizer_save_path = "bert_contrastive_tokenizer"
tokenizer.save_pretrained(tokenizer_save_path)
print(f"Tokenizer saved to {tokenizer_save_path}")

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer
tokenizer_load_path = "bert_contrastive_tokenizer"
tokenizer = BertTokenizer.from_pretrained(tokenizer_load_path)

# Pick the device before loading so the checkpoint can be mapped onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the model
# num_labels=2 matches the two-output classifier head of the saved weights.
# NOTE(review): the checkpoint was produced by a custom wrapper module;
# load_state_dict is strict by default, so any key mismatch with
# BertForSequenceClassification will raise here — verify on first run.
model_load_path = "bert_contrastive_imdb_regression.pt"
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# map_location lets a checkpoint that was saved from a GPU be restored on a
# CPU-only machine instead of failing while deserializing CUDA tensors.
model.load_state_dict(torch.load(model_load_path, map_location=device))

# Move model to the appropriate device
model.to(device)

# Set the model to evaluation mode (disables dropout for inference)
model.eval()

print("Model and tokenizer loaded successfully!")

# Function to preprocess and predict sentiment score
def predict_sentiment(text):
    """Run the loaded model on ``text`` and return its logits as a NumPy array.

    The text is tokenized to a fixed length of 512 tokens and evaluated
    without gradient tracking.

    NOTE(review): the model in this cell is loaded with num_labels=2, so
    ``squeeze(-1)`` has no singleton dimension to remove and the result holds
    two raw class logits rather than a single score — confirm this is the
    intended output.
    """
    encoded = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**model_inputs).logits
    return logits.squeeze(-1).cpu().numpy()

# Test the model with some example texts
# Quick sanity check: one positive, one negative and one mixed review.
test_texts = [
    "This movie was fantastic! The plot was gripping and the characters were well-developed.",
    "I didn't enjoy this film. The storyline was weak and the acting was subpar.",
    "An average movie. Some good parts but also some very boring scenes."
]

# Print the raw model output for each example text.
for text in test_texts:
    score = predict_sentiment(text)
    print(f"Text: {text}\nPredicted Sentiment Score: {score}\n")


# Baseline 3: Direct training on Tamil movie reviews

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_scheduler
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from tqdm import tqdm

# Custom Dataset class to handle loading and processing
class MovieReviewsDataset(Dataset):
    """Dataset that tokenizes one review at a time, on access.

    Args:
        reviews: Sequence of review texts.
        ratings: Sequence of numeric ratings aligned with ``reviews``.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors with a leading batch dimension.
        max_length: Length every review is padded/truncated to.
    """

    def __init__(self, reviews, ratings, tokenizer, max_length):
        self.reviews = reviews
        self.ratings = ratings
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        review = self.reviews[idx]
        rating = self.ratings[idx]
        encoding = self.tokenizer(
            review,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch dimension added by
            # return_tensors='pt'. A bare squeeze() would also collapse the
            # sequence dimension when it happens to have length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(rating, dtype=torch.float)
        }

# Load datasets (pre-split train/test CSV files)
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
test_df = pd.read_csv("tamil_movie_reviews_test.csv")

# Extract text and labels
# 'ReviewInTamil' holds the review text and 'Rating' the regression target.
train_texts = train_df['ReviewInTamil'].tolist()
train_labels = train_df['Rating'].tolist()
test_texts = test_df['ReviewInTamil'].tolist()
test_labels = test_df['Rating'].tolist()

# Define tokenizer and model
# Multilingual checkpoint with a single-output head (num_labels=1), so each
# review gets one scalar prediction.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=1)

# Create datasets (reviews are tokenized lazily, on item access)
max_length = 512
train_dataset = MovieReviewsDataset(train_texts, train_labels, tokenizer, max_length)
test_dataset = MovieReviewsDataset(test_texts, test_labels, tokenizer, max_length)

# Create data loaders (only the training split is shuffled)
batch_size = 8
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
eval_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
# One scheduler step is taken per training batch, so the schedule spans
# every batch of every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(num_epochs):
    # Re-enter train mode every epoch: the evaluation phase below switches
    # the model to eval mode.
    model.train()
    train_loss = 0
    train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1} Training", leave=False)
    for batch in train_progress_bar:
        # Move every tensor in the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # Passing labels makes the model return a loss next to the logits.
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'].float())
        loss = outputs.loss
        loss.backward()
        train_loss += loss.item()

        # Scheduler is stepped once per optimizer step, matching
        # num_training_steps = num_epochs * len(train_dataloader).
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_progress_bar.set_postfix({"loss": loss.item()})

    # Mean of the per-batch training losses for this epoch.
    avg_train_loss = train_loss / len(train_dataloader)

    # Evaluation
    model.eval()
    predictions = []
    labels = []
    eval_loss = 0
    eval_progress_bar = tqdm(eval_dataloader, desc=f"Epoch {epoch+1} Evaluation", leave=False)
    for batch in eval_progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'].float())
            eval_loss += outputs.loss.item()
        # Drop the trailing singleton dimension so each prediction is a scalar.
        logits = outputs.logits.squeeze(-1).cpu().numpy()
        label = batch['label'].cpu().numpy()
        predictions.extend(logits)
        labels.extend(label)

    # avg_eval_loss averages per-batch losses; mse is computed over all
    # individual predictions collected above.
    avg_eval_loss = eval_loss / len(eval_dataloader)
    mse = mean_squared_error(labels, predictions)
    print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Eval Loss: {avg_eval_loss:.4f}, Mean Squared Error: {mse:.4f}")

print("Training complete!")


# Baseline 4: Contrastive learning on Tamil Murasu, then regression fine-tuning on ratings

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.metrics import mean_squared_error
import pandas as pd
import time

# Load contrastive learning dataset
print("Loading Tamil Murasu dataset")
tamil_murasu_df = pd.read_csv('tamilmurasu_dataset.csv')
contrastive_texts = tamil_murasu_df['news_article'].tolist()

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the texts for contrastive learning with progress statements
print("Tokenizing texts for contrastive learning")
batch_size = 500  # number of texts tokenized per call (not the training batch size)
total_texts = len(contrastive_texts)
# Report progress about every 10%; max(1, ...) keeps the modulo below valid
# when there are fewer than ten texts.
print_interval = max(1, total_texts // 10)
contrastive_encodings = {'input_ids': [], 'attention_mask': []}

start_time = time.time()
for i in range(0, total_texts, batch_size):
    batch_texts = contrastive_texts[i:i+batch_size]
    # Pad to the fixed max_length rather than to the longest text of this
    # chunk: with per-chunk padding, different chunks can end up with
    # different sequence lengths, and a shuffled DataLoader then cannot stack
    # examples from different chunks into one batch.
    encodings = tokenizer(batch_texts, truncation=True, padding='max_length', max_length=256)
    contrastive_encodings['input_ids'].extend(encodings['input_ids'])
    contrastive_encodings['attention_mask'].extend(encodings['attention_mask'])

    processed = i + len(batch_texts)
    # True for the first chunk that crosses each multiple of print_interval.
    if processed % print_interval < batch_size:
        elapsed_time = time.time() - start_time
        remaining_time = elapsed_time / processed * (total_texts - processed)
        print(f"Processed {processed} / {total_texts} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

print("Finished tokenizing texts for contrastive learning")

class ContrastiveDataset(Dataset):
    """Unlabeled dataset backed by a dict of pre-tokenized fields.

    Each value of ``encodings`` is a sequence indexed by example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Dataset size is the number of 'input_ids' entries.
        token_ids = self.encodings['input_ids']
        return len(token_ids)

    def __getitem__(self, idx):
        # Convert the idx-th entry of every field into a tensor.
        return dict(
            (name, torch.tensor(column[idx]))
            for name, column in self.encodings.items()
        )

# Wrap the tokenized news articles so they can be batched.
contrastive_dataset = ContrastiveDataset(contrastive_encodings)

# DataLoader for contrastive learning
# Shuffled so each batch holds a different random set of texts per epoch.
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True)

# Define the model
class BERTSimCSE(nn.Module):
    """Sentence encoder that returns multilingual BERT's pooled output."""

    def __init__(self):
        super().__init__()
        # Pre-trained multilingual BERT backbone.
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return the encoder's ``pooler_output``."""
        encoder_out = self.bert(input_ids, attention_mask=attention_mask)
        pooled = encoder_out.pooler_output
        return pooled

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Contrastive loss over two views of the same batch.

    For every embedding, the embedding of the same example in the other view
    is the positive and all remaining embeddings of both views are negatives.
    Cosine similarities are divided by ``temperature`` and fed to a
    cross-entropy loss.

    Args:
        temperature: Scaling applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean contrastive loss for the paired views.

        Args:
            z_i: Embeddings of the first view, one row per example.
            z_j: Embeddings of the second view, aligned row-by-row with z_i.
        """
        batch_size = z_i.size(0)
        device = z_i.device

        z = torch.cat([z_i, z_j], dim=0)
        # Pairwise cosine similarity between all 2N embeddings.
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Remove self-comparisons: every row loses its diagonal entry, leaving
        # 2N - 1 candidate columns per row.
        mask = torch.eye(2 * batch_size, dtype=torch.bool, device=device)
        similarity_matrix = similarity_matrix[~mask].view(2 * batch_size, -1)

        # Column of the positive AFTER the diagonal was dropped:
        #  * row i (first view): the positive was at column i + N, which lies
        #    to the right of the removed diagonal entry, so it shifts left by
        #    one to i + N - 1;
        #  * row i + N (second view): the positive was at column i, which lies
        #    to the left of the removed entry and keeps its index.
        index = torch.arange(batch_size, device=device)
        labels = torch.cat([index + batch_size - 1, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

num_contrastive_batches = len(contrastive_loader)
# Report progress about ten times per epoch. max(1, ...) keeps the modulo
# below valid when the loader has fewer than ten batches.
print_interval = max(1, num_contrastive_batches // 10)

# Contrastive learning loop
# Train mode keeps dropout active; model.train() applies to all submodules,
# so the encoder does not need to be switched separately.
model.train()
for epoch in range(3):  # Train for 3 epochs
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Two forward passes over the same batch: with dropout active each
        # pass yields a different representation of the same texts.
        z_i = model(input_ids, attention_mask)
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        batches_done = batch_idx + 1
        if batches_done % print_interval == 0 or batches_done == num_contrastive_batches:
            elapsed_time = time.time() - start_time
            # Linear extrapolation from the average time per batch so far.
            remaining_time = elapsed_time / batches_done * (num_contrastive_batches - batches_done)

            print(f'Epoch [{epoch+1}/3], Batch [{batches_done}/{num_contrastive_batches}], Loss: {total_loss/batches_done:.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

In [None]:
print("Loading Tamil movie reviews dataset")

# Load the Tamil movie reviews dataset for regression
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
test_df = pd.read_csv("tamil_movie_reviews_test.csv")

# Extract text and labels
train_texts = train_df['ReviewInTamil'].tolist()
train_labels = train_df['Rating'].tolist()
test_texts = test_df['ReviewInTamil'].tolist()
test_labels = test_df['Rating'].tolist()


def _tokenize_with_progress(texts, split_name, chunk_size=500):
    """Tokenize ``texts`` in chunks and print progress about every 10%.

    The chunk size and the reporting interval are computed here instead of
    being read from globals left behind by earlier cells, so this cell no
    longer depends on which cells ran before it.

    Args:
        texts: List of strings to tokenize.
        split_name: Word used in the progress message ("training"/"test").
        chunk_size: Number of texts passed to the tokenizer per call.

    Returns:
        Dict with 'input_ids' and 'attention_mask' lists, one entry per text.
    """
    collected = {'input_ids': [], 'attention_mask': []}
    total = len(texts)
    # max(1, ...) keeps the modulo below valid for fewer than ten texts.
    interval = max(1, total // 10)
    started = time.time()
    for offset in range(0, total, chunk_size):
        chunk = texts[offset:offset + chunk_size]
        # Fixed-length padding keeps every example the same length across
        # chunks, which the shuffled DataLoader needs in order to stack
        # examples from different chunks into one batch.
        encoded = tokenizer(chunk, truncation=True, padding='max_length', max_length=256)
        collected['input_ids'].extend(encoded['input_ids'])
        collected['attention_mask'].extend(encoded['attention_mask'])

        processed = offset + len(chunk)
        if processed % interval < chunk_size:
            elapsed_time = time.time() - started
            remaining_time = elapsed_time / processed * (total - processed)
            print(f"Processed {processed} / {total} {split_name} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")
    return collected


# Tokenize the texts for regression with progress statements
print("Tokenizing texts for regression")
train_encodings = _tokenize_with_progress(train_texts, "training")
print("Finished tokenizing training texts for regression")

test_encodings = _tokenize_with_progress(test_texts, "test")
print("Finished tokenizing test texts for regression")

class ReviewsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized reviews with float ratings."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One rating per review, so the rating count is the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Ratings are stored as float tensors (regression targets).
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Pair each split's encodings with its ratings.
train_dataset = ReviewsDataset(train_encodings, train_labels)
test_dataset = ReviewsDataset(test_encodings, test_labels)

# DataLoader for fine-tuning (only the training split is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Fine-tuning setup
class RegressionModel(nn.Module):
    """BERT encoder followed by a single-output linear regression head."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head maps the encoder's hidden size to one scalar per example.
        hidden_size = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return the head's output computed from the encoder's pooled output."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.regressor(pooled)

# Fine-tuning setup
# model.bert is the encoder trained in the contrastive stage; it becomes the
# backbone of the regressor. A fresh optimizer is created so that it also
# covers the new regression head.
model = RegressionModel(model.bert).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.MSELoss()


In [None]:
# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Return the mean squared error of the model's predictions.

    Progress is printed roughly ten times during the pass.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards, so calling this between
    training epochs does not silently disable dropout for later epochs.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning one
            value per example with a trailing singleton dimension.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors; must support ``len()``.
        device: Device the batch tensors are moved to.
    """
    was_training = model.training
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    num_batches = len(test_loader)
    # max(1, ...) avoids a modulo-by-zero when there are fewer than ten batches.
    print_interval = max(1, num_batches // 10)

    try:
        # Evaluation needs no gradients; skipping them avoids building an
        # autograd graph for every batch.
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids, attention_mask)
                # squeeze(-1) removes only the trailing output dimension. A
                # bare squeeze() would turn a final batch of size one into a
                # 0-d tensor, which extend() below cannot iterate.
                preds = outputs.squeeze(-1)

                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                batches_done = batch_idx + 1
                if batches_done % print_interval == 0 or batches_done == num_batches:
                    elapsed_time = time.time() - start_time
                    remaining_time = elapsed_time / batches_done * (num_batches - batches_done)
                    print(f'Evaluating batch [{batches_done}/{num_batches}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    # Sanity output: both counts should be equal.
    print(len(all_labels), len(all_preds))
    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

# The loop runs 30 epochs; the progress messages use this value so the
# reported "epoch x/y" matches what is actually executed.
num_finetune_epochs = 30
num_train_batches = len(train_loader)
# Report progress about ten times per epoch; max(1, ...) keeps the modulo
# below valid for loaders with fewer than ten batches.
print_interval = max(1, num_train_batches // 10)

# Fine-tuning loop with additional metrics
for epoch in range(num_finetune_epochs):
    # Re-enable train mode at the start of EVERY epoch: the evaluation at the
    # end of the previous epoch switches the model to eval mode, which would
    # otherwise leave dropout disabled for all following epochs.
    model.train()
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # squeeze(-1) keeps the batch dimension even for a batch of one, so
        # predictions and labels always have the same shape.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        batches_done = batch_idx + 1
        if batches_done % print_interval == 0 or batches_done == num_train_batches:
            elapsed_time = time.time() - start_time
            # Linear extrapolation from the average time per batch so far.
            remaining_time = elapsed_time / batches_done * (num_train_batches - batches_done)

            print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Batch [{batches_done}/{num_train_batches}], Loss: {total_loss/batches_done:.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Test MSE: {mse}')


# Baseline 5: Machine-translated SNLI for contrastive learning

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset
# from googletrans import Translator
from sklearn.metrics import mean_squared_error
import pandas as pd

from translate import Translator
import time

def translate_texts(texts, target_language='ta'):
    """Translate each text and return the translations in input order.

    Args:
        texts: Sequence of strings to translate.
        target_language: Language code passed to ``Translator(to_lang=...)``.

    Returns:
        List with one translation per input text (empty for empty input).
    """
    translator = Translator(to_lang=target_language)
    translations = []
    total_texts = len(texts)
    # Print progress about every 0.1% of the input. max(1, ...) keeps the
    # modulo below valid when there are fewer than 1000 texts.
    print_interval = max(1, total_texts // 1000)
    start_time = time.time()

    for i, text in enumerate(texts, start=1):
        translation = translator.translate(text)
        # NOTE(review): this prints every single translation; for large
        # inputs that is a lot of output — consider removing once debugged.
        print(translation)
        translations.append(translation)

        if i % print_interval == 0 or i == total_texts:
            elapsed_time = time.time() - start_time
            completed_percentage = (i / total_texts) * 100
            # Linear extrapolation from the average time per text so far.
            remaining_time = (elapsed_time / i) * (total_texts - i)
            print(f"Translated {i}/{total_texts} texts ({completed_percentage:.2f}% complete), ETA: {remaining_time:.2f}s")

    return translations

print("Loading SNLI dataset")
# Load the SNLI dataset
# Premises and hypotheses of the training split are concatenated into one
# list of sentences (duplicates are kept; the NLI labels are not used).
snli_dataset = load_dataset('snli')
english_texts = snli_dataset['train']['premise'] + snli_dataset['train']['hypothesis']

import pandas as pd

# Create a DataFrame with English texts
english_df = pd.DataFrame({'English Text': english_texts})

# Save the DataFrame to a CSV file
english_df.to_csv('snli_texts.csv', index=False)

print("English texts saved to 'snli_texts.csv'")

# Translate every sentence with translate_texts' default target language ('ta').
print("Translating texts to Tamil")
tamil_texts = translate_texts(english_texts)

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the texts for contrastive learning with progress statements
print("Tokenizing texts for contrastive learning")
batch_size = 500
total_texts = len(tamil_texts)
# Aim for about ten progress reports. Clamp to 1 so the modulo below cannot
# divide by zero when there are fewer than 10 texts.
print_interval = max(1, total_texts // 10)
contrastive_encodings = {'input_ids': [], 'attention_mask': []}

start_time = time.time()
for i in range(0, total_texts, batch_size):
    batch_texts = tamil_texts[i:i+batch_size]
    # padding=True pads to the longest sequence of this chunk only, so rows
    # from different chunks may have different lengths.
    encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=256)
    contrastive_encodings['input_ids'].extend(encodings['input_ids'])
    contrastive_encodings['attention_mask'].extend(encodings['attention_mask'])

    processed = i + len(batch_texts)
    if processed % print_interval < batch_size:
        elapsed_time = time.time() - start_time
        remaining_time = elapsed_time / processed * (total_texts - processed)
        print(f"Processed {processed} / {total_texts} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

print("Finished tokenizing texts for contrastive learning")

class ContrastiveDataset(Dataset):
    """Dataset view over a mapping of field name -> per-sample rows."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of samples is the number of 'input_ids' rows.
        id_rows = self.encodings['input_ids']
        return len(id_rows)

    def __getitem__(self, idx):
        # Build one tensor per field for the requested sample.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

contrastive_dataset = ContrastiveDataset(contrastive_encodings)


def _pad_contrastive_batch(batch):
    """Pad the samples of one batch to that batch's longest sequence.

    The encodings were padded per 500-text tokenization chunk, so samples
    drawn from different chunks can differ in length and the default collate
    (which stacks tensors) would fail on them.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    return {
        'input_ids': pad_sequence(
            [item['input_ids'] for item in batch],
            batch_first=True, padding_value=tokenizer.pad_token_id),
        'attention_mask': pad_sequence(
            [item['attention_mask'] for item in batch],
            batch_first=True, padding_value=0),
    }


# DataLoader for contrastive learning
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True, collate_fn=_pad_contrastive_batch)

# Define the model
class BERTSimCSE(nn.Module):
    """Encoder wrapper that returns the ``pooler_output`` of a BERT model.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to the multilingual
            checkpoint this section was written for, so ``BERTSimCSE()``
            behaves as before.
    """

    def __init__(self, model_name='bert-base-multilingual-cased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return its ``pooler_output``."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.pooler_output

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Contrastive loss over two views (``z_i``, ``z_j``) of the same batch.

    The two views are stacked into ``2 * batch`` rows. For every row the
    positive is the other view of the same sample; all remaining rows are
    negatives. Similarities are cosine similarities divided by
    ``temperature``.

    Args:
        temperature: Divisor applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean cross-entropy of picking each row's positive.

        Args:
            z_i: Embeddings of the first view, one row per sample.
            z_j: Embeddings of the second view, aligned row-by-row with z_i.
        """
        batch_size = z_i.size(0)
        z = torch.cat([z_i, z_j], dim=0)
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Exclude self-comparisons by masking the diagonal with -inf instead
        # of deleting it: column indices stay aligned with row indices, so
        # the positive of row r is simply r + B (first half) or r - B (second
        # half). Deleting the diagonal shifts columns and made the previous
        # first-half labels point at a negative sample.
        self_mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z.device)
        similarity_matrix = similarity_matrix.masked_fill(self_mask, float('-inf'))

        index = torch.arange(batch_size, device=z.device)
        labels = torch.cat([index + batch_size, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

# Contrastive learning loop
# train() puts every submodule (including model.bert) in training mode, so
# dropout is active and two forward passes over the same batch differ.
model.train()
for epoch in range(3):  # Train for 3 epochs
    start_time = time.time()
    total_loss = 0
    # Report about ten times per epoch. Clamp to 1 so the modulo below cannot
    # divide by zero when the loader has fewer than 10 batches.
    print_interval = max(1, len(contrastive_loader) // 10)
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Two forward passes over the same inputs give the two views that
        # the loss contrasts.
        z_i = model(input_ids, attention_mask)
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(contrastive_loader):
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (batch_idx + 1) * (len(contrastive_loader) - (batch_idx + 1))

            print(f'Epoch [{epoch+1}/3], Batch [{batch_idx+1}/{len(contrastive_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

print("Loading Tamil movie reviews dataset")

# Load the Tamil movie reviews dataset for regression
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
test_df = pd.read_csv("tamil_movie_reviews_test.csv")

# Extract text and labels
train_texts = train_df['ReviewInTamil'].tolist()
train_labels = train_df['Rating'].tolist()
test_texts = test_df['ReviewInTamil'].tolist()
test_labels = test_df['Rating'].tolist()


def _tokenize_in_batches(texts, description):
    """Tokenize ``texts`` in chunks of ``batch_size`` and print progress.

    Returns a dict with 'input_ids' and 'attention_mask' lists, one row per
    text. ``description`` is the word used in the progress message.
    """
    encoded = {'input_ids': [], 'attention_mask': []}
    total = len(texts)
    # Computed per split (the loop used to reuse whatever print_interval an
    # earlier cell left behind) and clamped so the modulo cannot divide by 0.
    interval = max(1, total // 10)
    started = time.time()
    for i in range(0, total, batch_size):
        batch_texts = texts[i:i+batch_size]
        # Pad every row to max_length: per-chunk padding would give rows of
        # different lengths, which the default DataLoader collate cannot stack.
        encodings = tokenizer(batch_texts, truncation=True, padding='max_length', max_length=256)
        encoded['input_ids'].extend(encodings['input_ids'])
        encoded['attention_mask'].extend(encodings['attention_mask'])

        processed = i + len(batch_texts)
        if processed % interval < batch_size:
            elapsed_time = time.time() - started
            remaining_time = elapsed_time / processed * (total - processed)
            print(f"Processed {processed} / {total} {description} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")
    return encoded


# Tokenize the texts for regression with progress statements
print("Tokenizing texts for regression")
train_encodings = _tokenize_in_batches(train_texts, "training")
print("Finished tokenizing training texts for regression")

test_encodings = _tokenize_in_batches(test_texts, "test")
print("Finished tokenizing test texts for regression")

class ReviewsDataset(Dataset):
    """Pairs tokenizer rows with a float label per sample."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # Labels are stored as float tensors under the 'labels' key.
        target = self.labels[idx]
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample

# Wrap the tokenized reviews and their ratings as datasets.
train_dataset = ReviewsDataset(train_encodings, train_labels)
test_dataset = ReviewsDataset(test_encodings, test_labels)

# DataLoader for fine-tuning
# Training batches are shuffled; test batches keep their order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Fine-tuning setup
class RegressionModel(nn.Module):
    """An encoder followed by a single-output linear layer."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head maps the encoder's hidden size to one scalar.
        hidden_dim = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Return the linear head applied to the encoder's ``pooler_output``."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.regressor(pooled)

# Fine-tuning setup
# Rebinds `model`: the `bert` submodule of the current model is wrapped in a
# RegressionModel, so the encoder weights carry over into fine-tuning.
model = RegressionModel(model.bert).to(device)
# New optimizer over the wrapped model's parameters (encoder and regressor).
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.MSELoss()

# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Compute the mean squared error of ``model`` over ``test_loader``.

    Args:
        model: Callable as ``model(input_ids, attention_mask)``.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries; must support ``len``.
        device: Device the batch tensors are moved to.

    Returns:
        The value of ``mean_squared_error(all_labels, all_preds)``.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training afterwards must call ``model.train()`` again.
    """
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    # Clamp to 1 so the modulo below cannot divide by zero when the loader
    # has fewer than 10 batches.
    print_interval = max(1, len(test_loader) // 10)
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            # Flatten to 1-D. A bare squeeze() turns a final batch of size 1
            # into a 0-d tensor, which extend() below cannot iterate.
            preds = outputs.reshape(-1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(test_loader):
                elapsed_time = time.time() - start_time
                remaining_time = elapsed_time / (batch_idx + 1) * (len(test_loader) - (batch_idx + 1))
                print(f'Evaluating batch [{batch_idx+1}/{len(test_loader)}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

# Fine-tuning loop with additional metrics
for epoch in range(3):  # Fine-tune for 3 epochs
    # evaluate_model() switches the model to eval mode at the end of every
    # epoch, so training mode must be re-enabled per epoch, not once up front.
    model.train()
    start_time = time.time()
    total_loss = 0
    # Clamp to 1 so the modulo below cannot divide by zero on short loaders.
    print_interval = max(1, len(train_loader) // 10)
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Flatten the (batch, 1) head output to (batch,). A bare squeeze()
        # yields a 0-d tensor for a batch of size 1, which no longer matches
        # the shape of `labels`.
        loss = criterion(outputs.reshape(-1), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(train_loader):
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (batch_idx + 1) * (len(train_loader) - (batch_idx + 1))

            print(f'Epoch [{epoch+1}/3], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/3], Test MSE: {mse}')

In [None]:
# Utility cell: run Python garbage collection, then ask PyTorch to empty its
# CUDA cache.
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
from googletrans import Translator

# Demo cell: translate one English sentence to Tamil with googletrans.
# Create a Translator object
translator = Translator(service_urls=['translate.googleapis.com'])

# Define the sentence to be translated
english_sentence = "Hello, how are you?"

# Translate the sentence to Tamil
translation = translator.translate(english_sentence, dest='ta')

# Print the translated text
print("English:", english_sentence)
print("Tamil:", translation.text)


# Translate Contrastive Sentences Into Tamil

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Initialize the pipeline with the Tamil Llama model
# `pipe` is a module-level text-generation pipeline reused by later cells.
pipe = pipeline("text-generation", model="abhinand/tamil-llama-7b-base-v0.1")

In [None]:
def translate_to_tamil(sentences):
    """Run every prompt through the global ``pipe`` and collect the outputs.

    Each raw pipeline result is printed; the 'generated_text' of its first
    entry is kept. Returns the kept texts in input order.
    """
    generation_kwargs = {'max_length': 512, 'num_return_sequences': 1, 'do_sample': False}
    generated_texts = []
    for prompt in sentences:
        generations = pipe(prompt, **generation_kwargs)
        print(generations)
        generated_texts.append(generations[0]['generated_text'])
    return generated_texts

# List of sentences to translate
# Each entry already carries the "Translate to Tamil:" instruction, so the
# whole string is what gets sent to the model.
sentences_to_translate = [
    "Translate to Tamil: The quick brown fox jumps over the lazy dog.",
    "Translate to Tamil: A journey of a thousand miles begins with a single step.",
    "Translate to Tamil: To be or not to be, that is the question."
]

# Translate the sentences
translated_sentences = translate_to_tamil(sentences_to_translate)

# Print the translated sentences
for original, translated in zip(sentences_to_translate, translated_sentences):
    print(f"Original: {original}\nTranslated: {translated}\n")


In [None]:
from transformers import pipeline

# Initialize the pipeline with the Tamil Llama model
# model_name = "abhinand/tamil-llama-7b-base-v0.1"
# pipe = pipeline("text-generation", model=model_name)

def generate_response(prompt):
    """Return the 'generated_text' of the first sampled generation for ``prompt``.

    Uses the module-level ``pipe`` with sampling enabled and max_length=150.
    """
    generations = pipe(prompt, max_length=150, num_return_sequences=1, do_sample=True)
    first_generation = generations[0]
    return first_generation['generated_text']

def chat():
    """Interactive loop: read a line, print the model's reply, repeat.

    Typing 'exit' (in any letter case) ends the loop.
    """
    print("You are now chatting with the model. Type 'exit' to end the conversation.")
    user_input = input("You: ")
    while user_input.lower() != "exit":
        reply = generate_response(user_input)
        print(f"Model: {reply}")
        user_input = input("You: ")
    print("Ending the conversation. Goodbye!")

# Start the interactive chat only when this code runs as the main module.
if __name__ == "__main__":
    chat()


# Create SNLI Paraphrasings

In [None]:
import pandas as pd
from langchain_community.llms import Ollama
import time

# Initialize the model
# `llm` is the module-level Ollama client that paraphrase_sentence() calls.
llm = Ollama(model="llama3")

def paraphrase_sentence(sentence):
    """Ask the module-level ``llm`` for a paraphrase of ``sentence``.

    Returns whatever ``llm.invoke`` returns for the instruction prompt.
    """
    instruction = f"Give me a paraphrase of the following sentence \"{sentence}\" Your response should only contain the response"
    return llm.invoke(instruction)

# Cell script: paraphrase every row of the 'Contrastive Text' column and
# write the input plus a new 'paraphrased_sentence' column to a CSV file.
# Read the input CSV file
input_file = "contrastive_texts.csv"
output_file = "paraphrased_texts.csv"

df = pd.read_csv(input_file)

# Assuming the column containing the sentences to be paraphrased is named 'Contrastive Text'
if 'Contrastive Text' not in df.columns:
    raise ValueError("The input CSV file must contain a 'Contrastive Text' column.")

# Create a new column for the paraphrased sentences
paraphrased_sentences = []
total_sentences = len(df)
progress_interval = 10  # Adjust this value based on your preference for progress updates

start_time = time.time()

# One LLM call per row; results are held in memory and only written to disk
# after the loop has finished.
for index, sentence in enumerate(df['Contrastive Text']):
    paraphrased_sentence = paraphrase_sentence(sentence)
    paraphrased_sentences.append(paraphrased_sentence)

    # Print progress with ETA
    if (index + 1) % progress_interval == 0 or (index + 1) == total_sentences:
        elapsed_time = time.time() - start_time
        # ETA is a linear extrapolation from the average time per sentence.
        avg_time_per_sentence = elapsed_time / (index + 1)
        sentences_left = total_sentences - (index + 1)
        eta = sentences_left * avg_time_per_sentence
        eta_minutes, eta_seconds = divmod(eta, 60)
        print(f"Processed {index + 1}/{total_sentences} sentences. ETA: {int(eta_minutes)}m {int(eta_seconds)}s")

# Add the paraphrased sentences to the DataFrame
df['paraphrased_sentence'] = paraphrased_sentences

# Save the result to a new CSV file
df.to_csv(output_file, index=False)

print(f"Paraphrased sentences saved to {output_file}")


# Train Contrastive Learning -> IMDB on dataset with incorporated paraphrases

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset
from sklearn.metrics import mean_squared_error
import time
import pandas as pd

print("Loading paraphrased_texts.csv")
df = pd.read_csv('paraphrased_texts.csv')

# Combine sentences from both columns, filter out any " characters
raw_texts = df['Contrastive Text'].tolist() + df['paraphrased_sentence'].tolist()
# pandas reads empty CSV cells as float NaN, which has no .replace(), so only
# real strings are kept. dict.fromkeys de-duplicates after the quotes are
# removed and keeps first-seen order (set() order varies between runs).
contrastive_texts = list(dict.fromkeys(
    text.replace('"', '') for text in raw_texts if isinstance(text, str)
))
print(len(contrastive_texts))

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the texts for contrastive learning
# A single call with padding=True pads every row to the same length.
contrastive_encodings = tokenizer(contrastive_texts, truncation=True, padding=True, max_length=256)

class ContrastiveDataset(Dataset):
    """Exposes tokenizer output as per-sample dicts of tensors."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Sample count equals the number of 'input_ids' rows.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        fields = self.encodings.items()
        return {name: torch.tensor(column[idx]) for name, column in fields}

# Wrap the tokenized contrastive texts as a dataset.
contrastive_dataset = ContrastiveDataset(contrastive_encodings)

# DataLoader for contrastive learning
# Shuffled batches of 16 samples using the default collate.
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True)

# Define the model
class BERTSimCSE(nn.Module):
    """Encoder wrapper that returns the ``pooler_output`` of a BERT model.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to 'bert-base-uncased',
            so ``BERTSimCSE()`` behaves as before.
    """

    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return its ``pooler_output``."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.pooler_output

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Contrastive loss over two views (``z_i``, ``z_j``) of the same batch.

    The two views are stacked into ``2 * batch`` rows. For every row the
    positive is the other view of the same sample; all remaining rows are
    negatives. Similarities are cosine similarities divided by
    ``temperature``.

    Args:
        temperature: Divisor applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean cross-entropy of picking each row's positive.

        Args:
            z_i: Embeddings of the first view, one row per sample.
            z_j: Embeddings of the second view, aligned row-by-row with z_i.
        """
        batch_size = z_i.size(0)
        z = torch.cat([z_i, z_j], dim=0)
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Helper tensors are created on the inputs' device; a hard-coded
        # .cuda() fails on CPU-only machines.
        # Self-comparisons are masked with -inf rather than deleted, so
        # column indices stay aligned with row indices and the positive of
        # row r is r + B (first half) or r - B (second half). Deleting the
        # diagonal shifts columns and made the previous first-half labels
        # point at a negative sample.
        self_mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z.device)
        similarity_matrix = similarity_matrix.masked_fill(self_mask, float('-inf'))

        index = torch.arange(batch_size, device=z.device)
        labels = torch.cat([index + batch_size, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

# Contrastive learning loop
# train() puts every submodule (including model.bert) in training mode, so
# dropout is active and two forward passes over the same batch differ.
model.train()
for epoch in range(3):  # Train for 3 epochs
    start_time = time.time()
    total_loss = 0
    # Report about ten times per epoch. Clamp to 1 so the modulo below cannot
    # divide by zero when the loader has fewer than 10 batches.
    print_interval = max(1, len(contrastive_loader) // 10)
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Two forward passes over the same inputs give the two views that
        # the loss contrasts.
        z_i = model(input_ids, attention_mask)
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(contrastive_loader):
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (batch_idx + 1) * (len(contrastive_loader) - (batch_idx + 1))

            print(f'Epoch [{epoch+1}/3], Batch [{batch_idx+1}/{len(contrastive_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

print("Loading imdb dataset")

# Load the IMDB dataset for fine-tuning
imdb_dataset = load_dataset('imdb')
# Pull the text and label columns of the train and test splits.
train_texts = imdb_dataset['train']['text']
train_labels = imdb_dataset['train']['label']
test_texts = imdb_dataset['test']['text']
test_labels = imdb_dataset['test']['label']

# Tokenize the texts for IMDB
# Each split is tokenized in one call, truncated to at most 256 tokens and
# padded to the longest sequence of that call.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

class IMDBDataset(Dataset):
    """Pairs tokenizer rows with one label per sample."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # The label keeps the dtype torch infers from the stored value.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized IMDB splits and their labels as datasets.
train_dataset = IMDBDataset(train_encodings, train_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

# DataLoader for fine-tuning
# Training batches are shuffled; test batches keep their order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Fine-tuning setup
class IMDBRatingModel(nn.Module):
    """An encoder followed by a two-output linear classifier."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head maps the encoder's hidden size to two logits.
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, 2)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits for the encoder's ``pooler_output``."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.classifier(pooled)
    
# Fine-tuning setup
# Rebinds `model`: the `bert` submodule of the current model is wrapped in an
# IMDBRatingModel, so the encoder weights carry over into fine-tuning.
model = IMDBRatingModel(model.bert).to(device)
# New optimizer over the wrapped model's parameters (encoder and classifier).
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Score ``model`` on ``test_loader`` using argmax class predictions.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning one
            row of class scores per sample.
        test_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' entries; must support ``len``.
        device: Device the batch tensors are moved to.

    Returns:
        ``mean_squared_error`` between the labels and the argmax class
        indices. NOTE(review): if the labels are 0/1 this equals the
        misclassification rate -- confirm that is the intended metric.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training afterwards must call ``model.train()`` again.
    """
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    # Clamp to 1 so the modulo below cannot divide by zero when the loader
    # has fewer than 10 batches.
    print_interval = max(1, len(test_loader) // 10)
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            # Predicted class = index of the largest score in each row.
            preds = torch.argmax(outputs, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(test_loader):
                elapsed_time = time.time() - start_time
                remaining_time = elapsed_time / (batch_idx + 1) * (len(test_loader) - (batch_idx + 1))
                print(f'Evaluating batch [{batch_idx+1}/{len(test_loader)}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

# Fine-tuning loop with additional metrics
for epoch in range(3):  # Fine-tune for 3 epochs
    # evaluate_model() switches the model to eval mode at the end of every
    # epoch, so training mode must be re-enabled per epoch, not once up front.
    model.train()
    start_time = time.time()
    total_loss = 0
    # Clamp to 1 so the modulo below cannot divide by zero on short loaders.
    print_interval = max(1, len(train_loader) // 10)
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(train_loader):
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (batch_idx + 1) * (len(train_loader) - (batch_idx + 1))

            print(f'Epoch [{epoch+1}/3], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/3], Test MSE: {mse}')


In [None]:
import pandas as pd
from langchain_community.llms import Ollama
import time

# Initialize the model
# `llm` is the module-level Ollama client that translate_text() calls.
llm = Ollama(model="tamil-llama")

def translate_text(text):
    """Send ``text``, wrapped in a Tamil-language instruction, to ``llm``.

    Returns whatever ``llm.invoke`` returns for that prompt.
    """
    instruction = f"நான் நேரடியாக நகலெடுக்கக்கூடிய வடிவத்தில் பின்வரும் உரையை தமிழில் மொழிபெயர்க்கவும்: \"{text}\""
    return llm.invoke(instruction)

# Read the input file
input_file = "contrastive_texts.csv"
output_file = "translated_paraphrased_texts.csv"

with open(input_file, "r", encoding="utf-8") as file:
    # Strip the trailing newline so it does not end up inside the quoted
    # prompt, and skip blank lines so no empty prompt is sent.
    # NOTE(review): the file is read as raw lines, so a CSV header row (if
    # present) is translated like any other line -- confirm this is intended.
    paraphrased_texts = [line.strip() for line in file if line.strip()]

# Translate each paraphrased text
translated_texts = []
total_texts = len(paraphrased_texts)
progress_interval = 10  # Adjust this value based on your preference for progress updates
print_interval = 40  # Print original and translated sentences
start_time = time.time()

for index, text in enumerate(paraphrased_texts):
    # Collapse the response onto a single line: the output file is consumed
    # line by line, so every translation must occupy exactly one line.
    translated_text = " ".join(translate_text(text).split())
    translated_texts.append(translated_text)

    # Print original and translated sentences every 40 sentences
    if (index + 1) % print_interval == 0 or (index + 1) == total_texts:
        print(f"Original: {text}\nTranslation: {translated_text}\n")

    # Print progress with ETA
    if (index + 1) % progress_interval == 0 or (index + 1) == total_texts:
        elapsed_time = time.time() - start_time
        avg_time_per_text = elapsed_time / (index + 1)
        texts_left = total_texts - (index + 1)
        eta = texts_left * avg_time_per_text
        eta_minutes, eta_seconds = divmod(eta, 60)
        print(f"Processed {index + 1}/{total_texts} texts. ETA: {int(eta_minutes)}m {int(eta_seconds)}s")

# Write translated texts to a new file
with open(output_file, "w", encoding="utf-8") as file:
    # writelines() adds no separators, so each newline is appended explicitly.
    file.writelines(f"{line}\n" for line in translated_texts)

print(f"Translated texts saved to {output_file}")

# Paraphrased Translations -> SimCSE -> Tamil Movie Review

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.metrics import mean_squared_error
import time

# Load contrastive learning dataset
print("Loading Tamil Paraphrased dataset")
with open('translated_paraphrased_texts.csv', 'r', encoding='utf-8') as file:
    # One text per line. Blank lines are skipped so that no empty sample
    # reaches the tokenizer.
    contrastive_texts = [line.strip() for line in file if line.strip()]

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the texts for contrastive learning with progress statements
print("Tokenizing texts for contrastive learning")
batch_size = 500
total_texts = len(contrastive_texts)
# Aim for about ten progress reports. Clamp to 1 so the modulo below cannot
# divide by zero when there are fewer than 10 texts.
print_interval = max(1, total_texts // 10)
contrastive_encodings = {'input_ids': [], 'attention_mask': []}

start_time = time.time()
for i in range(0, total_texts, batch_size):
    batch_texts = contrastive_texts[i:i+batch_size]
    # padding=True pads to the longest sequence of this chunk only, so rows
    # from different chunks may have different lengths.
    encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=256)
    contrastive_encodings['input_ids'].extend(encodings['input_ids'])
    contrastive_encodings['attention_mask'].extend(encodings['attention_mask'])

    processed = i + len(batch_texts)
    if processed % print_interval < batch_size:
        elapsed_time = time.time() - start_time
        remaining_time = elapsed_time / processed * (total_texts - processed)
        print(f"Processed {processed} / {total_texts} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

print("Finished tokenizing texts for contrastive learning")

class ContrastiveDataset(Dataset):
    """Serves tokenized texts one sample at a time as dicts of tensors."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # Sample count equals the number of 'input_ids' rows.
        id_rows = self.encodings['input_ids']
        return len(id_rows)

    def __getitem__(self, idx):
        tensors = {}
        for name in self.encodings:
            row = self.encodings[name][idx]
            tensors[name] = torch.tensor(row)
        return tensors

# Wrap the tokenized contrastive texts as a dataset.
contrastive_dataset = ContrastiveDataset(contrastive_encodings)

def collate_fn(batch):
    """Merge samples into one batch, padding to the longest sequence.

    'input_ids' are padded with the global tokenizer's pad token id and
    'attention_mask' with 0. Returns a dict with both padded tensors.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    ids_per_sample = []
    masks_per_sample = []
    for sample in batch:
        ids_per_sample.append(sample['input_ids'])
        masks_per_sample.append(sample['attention_mask'])

    # Pad sequences to the maximum length in this batch
    padded_ids = pad(ids_per_sample, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_masks = pad(masks_per_sample, batch_first=True, padding_value=0)
    return {'input_ids': padded_ids, 'attention_mask': padded_masks}

# DataLoader for contrastive learning
# Shuffled batches of 16 samples; collate_fn pads each batch to a common length.
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Define the model
class BERTSimCSE(nn.Module):
    """Encoder wrapper that returns the ``pooler_output`` of a BERT model.

    Args:
        model_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to the multilingual
            checkpoint this section was written for, so ``BERTSimCSE()``
            behaves as before.
    """

    def __init__(self, model_name='bert-base-multilingual-cased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and return its ``pooler_output``."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.pooler_output

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Contrastive loss over two views (``z_i``, ``z_j``) of the same batch.

    The two views are stacked into ``2 * batch`` rows. For every row the
    positive is the other view of the same sample; all remaining rows are
    negatives. Similarities are cosine similarities divided by
    ``temperature``.

    Args:
        temperature: Divisor applied to the cosine similarities.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean cross-entropy of picking each row's positive.

        Args:
            z_i: Embeddings of the first view, one row per sample.
            z_j: Embeddings of the second view, aligned row-by-row with z_i.
        """
        batch_size = z_i.size(0)
        z = torch.cat([z_i, z_j], dim=0)
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Exclude self-comparisons by masking the diagonal with -inf instead
        # of deleting it: column indices stay aligned with row indices, so
        # the positive of row r is simply r + B (first half) or r - B (second
        # half). Deleting the diagonal shifts columns and made the previous
        # first-half labels point at a negative sample.
        self_mask = torch.eye(2 * batch_size, dtype=torch.bool, device=z.device)
        similarity_matrix = similarity_matrix.masked_fill(self_mask, float('-inf'))

        index = torch.arange(batch_size, device=z.device)
        labels = torch.cat([index + batch_size, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

# Contrastive learning loop
# train() puts every submodule (including model.bert) in training mode, so
# dropout is active and two forward passes over the same batch differ.
model.train()
for epoch in range(3):  # Train for 3 epochs
    start_time = time.time()
    total_loss = 0
    # Report about ten times per epoch. Clamp to 1 so the modulo below cannot
    # divide by zero when the loader has fewer than 10 batches.
    print_interval = max(1, len(contrastive_loader) // 10)
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Two forward passes over the same inputs give the two views that
        # the loss contrasts.
        z_i = model(input_ids, attention_mask)
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (batch_idx + 1) % print_interval == 0 or (batch_idx + 1) == len(contrastive_loader):
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / (batch_idx + 1) * (len(contrastive_loader) - (batch_idx + 1))

            print(f'Epoch [{epoch+1}/3], Batch [{batch_idx+1}/{len(contrastive_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')


In [None]:
# This section's import cell does not import pandas, so bring it in here.
import pandas as pd

print("Loading Tamil movie reviews dataset")

# Load the Tamil movie reviews dataset for regression
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
test_df = pd.read_csv("tamil_movie_reviews_test.csv")

# Extract text and labels
train_texts = train_df['ReviewInTamil'].tolist()
train_labels = train_df['Rating'].tolist()
test_texts = test_df['ReviewInTamil'].tolist()
test_labels = test_df['Rating'].tolist()


def _tokenize_in_batches(texts, description):
    """Tokenize ``texts`` in chunks of ``batch_size`` and print progress.

    Returns a dict with 'input_ids' and 'attention_mask' lists, one row per
    text. ``description`` is the word used in the progress message.
    """
    encoded = {'input_ids': [], 'attention_mask': []}
    total = len(texts)
    # Computed per split (the loop used to reuse whatever print_interval an
    # earlier cell left behind) and clamped so the modulo cannot divide by 0.
    interval = max(1, total // 10)
    started = time.time()
    for i in range(0, total, batch_size):
        batch_texts = texts[i:i+batch_size]
        # Pad every row to max_length: per-chunk padding would give rows of
        # different lengths, which the default DataLoader collate cannot stack.
        encodings = tokenizer(batch_texts, truncation=True, padding='max_length', max_length=256)
        encoded['input_ids'].extend(encodings['input_ids'])
        encoded['attention_mask'].extend(encodings['attention_mask'])

        processed = i + len(batch_texts)
        if processed % interval < batch_size:
            elapsed_time = time.time() - started
            remaining_time = elapsed_time / processed * (total - processed)
            print(f"Processed {processed} / {total} {description} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")
    return encoded


# Tokenize the texts for regression with progress statements
print("Tokenizing texts for regression")
train_encodings = _tokenize_in_batches(train_texts, "training")
print("Finished tokenizing training texts for regression")

test_encodings = _tokenize_in_batches(test_texts, "test")
print("Finished tokenizing test texts for regression")

class ReviewsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized reviews with their ratings."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # labels: per-example targets, indexed in step with the encodings
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as tensors, with its target under 'labels'."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # Targets are stored as float tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each tokenized split together with its ratings
test_dataset = ReviewsDataset(test_encodings, test_labels)
train_dataset = ReviewsDataset(train_encodings, train_labels)

# Batch iterators for fine-tuning: training batches are shuffled, test order is kept
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Fine-tuning setup
class RegressionModel(nn.Module):
    """Encoder followed by a single linear unit that emits one value per example."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head's input width is read from the encoder's own config.
        hidden_dim = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its ``pooler_output`` through the linear head."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.regressor(pooled)

# Put a regression head on the encoder held by the current `model`,
# then build the loss and the optimizer over the combined parameters
model = RegressionModel(model.bert)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Compute the mean squared error of ``model`` over ``test_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; its
            output is flattened to one prediction per example.
        test_loader: iterable of batches holding 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        The value of ``mean_squared_error`` over all labels and predictions.
    """
    # Remember the caller's mode so a training loop is not silently left in
    # eval mode (dropout off) after this call.
    was_training = model.training
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    num_batches = len(test_loader)
    # At least 1 so the modulo below cannot divide by zero on short loaders.
    print_interval = max(1, num_batches // 10)

    try:
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels']

                outputs = model(input_ids, attention_mask)
                # Flatten to 1-D. A bare squeeze() turns a final batch of one
                # example into a 0-d tensor, which list.extend cannot iterate.
                preds = outputs.reshape(-1)

                all_labels.extend(labels.reshape(-1).cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                done = batch_idx + 1
                if done % print_interval == 0 or done == num_batches:
                    elapsed_time = time.time() - start_time
                    remaining_time = elapsed_time / done * (num_batches - done)
                    print(f'Evaluating batch [{batch_idx+1}/{len(test_loader)}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')
    finally:
        model.train(was_training)

    print(len(all_labels), len(all_preds))
    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

# Fine-tuning loop with additional metrics
# One constant drives both the loop bound and the "[epoch/total]" messages,
# which previously said 3 while the loop ran 30 epochs.
num_finetune_epochs = 30
num_train_batches = len(train_loader)
# At least 1: with fewer than 10 batches `// 10` is 0 and the modulo below
# would raise ZeroDivisionError.
print_interval = max(1, num_train_batches // 10)

for epoch in range(num_finetune_epochs):
    # evaluate_model() switches the model to eval mode; re-enter train mode at
    # the start of every epoch so dropout is active while training.
    model.train()
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Drop only the trailing size-1 dimension of the (batch, 1) regressor
        # output; a bare squeeze() would also collapse a batch of one example
        # to a scalar that no longer matches the labels' shape.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        done = batch_idx + 1
        if done % print_interval == 0 or done == num_train_batches:
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / done * (num_train_batches - done)

            print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Test MSE: {mse}')

# Shorter Translations -> SimCSE -> Tamil Movie Review

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.metrics import mean_squared_error
import time

# Load contrastive learning dataset
print("Loading Tamil Paraphrased dataset")
with open('translated_paraphrased_texts.csv', 'r', encoding='utf-8') as file:
    # Each line of the file is treated as one text (no CSV parsing is applied).
    contrastive_texts = [line.strip() for line in file]

# Define BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the texts for contrastive learning with progress statements
print("Tokenizing texts for contrastive learning")
batch_size = 500
total_texts = len(contrastive_texts)
# At least 1: with fewer than 10 texts `// 10` is 0 and the modulo in the
# progress check would raise ZeroDivisionError.
print_interval = max(1, total_texts // 10)
contrastive_encodings = {'input_ids': [], 'attention_mask': []}

start_time = time.time()
for i in range(0, total_texts, batch_size):
    batch_texts = contrastive_texts[i:i+batch_size]
    # padding=True pads only to the longest text of this chunk; the
    # contrastive DataLoader's collate function pads again per batch.
    encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=256)
    contrastive_encodings['input_ids'].extend(encodings['input_ids'])
    contrastive_encodings['attention_mask'].extend(encodings['attention_mask'])

    processed = i + len(batch_texts)
    # True whenever this chunk crossed a multiple of `print_interval`.
    if processed % print_interval < batch_size:
        elapsed_time = time.time() - start_time
        remaining_time = elapsed_time / processed * (total_texts - processed)
        print(f"Processed {processed} / {total_texts} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

print("Finished tokenizing texts for contrastive learning")

class ContrastiveDataset(Dataset):
    """Map-style dataset over pre-tokenized texts; items carry no label."""

    def __init__(self, encodings):
        # encodings: mapping of field name -> per-example sequences
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, counted from the 'input_ids' field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return every field of example ``idx`` converted to a tensor."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

# Expose the tokenized contrastive texts through the Dataset interface
contrastive_dataset = ContrastiveDataset(contrastive_encodings)

def collate_fn(batch):
    """Merge a list of samples into one padded batch.

    Each sample is a dict with 'input_ids' and 'attention_mask' tensors.
    Sequences are right-padded to the longest one in ``batch``: token ids with
    the tokenizer's pad id, attention masks with 0.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    id_rows = [sample['input_ids'] for sample in batch]
    mask_rows = [sample['attention_mask'] for sample in batch]

    padded = {}
    padded['input_ids'] = pad(id_rows, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded['attention_mask'] = pad(mask_rows, batch_first=True, padding_value=0)
    return padded

# DataLoader for contrastive learning: shuffled batches of 16 samples, padded
# to a common length per batch by collate_fn
contrastive_loader = DataLoader(contrastive_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Define the model
class BERTSimCSE(nn.Module):
    """Wrap 'bert-base-multilingual-cased' and return its pooled output as the embedding."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        embedding = encoded.pooler_output
        return embedding

# SimCSE Loss function
class SimCSELoss(nn.Module):
    """Cross-entropy contrastive loss over two views of the same batch.

    Row ``k`` of ``z_i`` and row ``k`` of ``z_j`` form a positive pair; every
    other embedding of the concatenated batch is a negative for that row.
    """

    def __init__(self, temperature=0.05):
        super(SimCSELoss, self).__init__()
        self.temperature = temperature
        self.cosine_similarity = nn.CosineSimilarity(dim=-1)

    def forward(self, z_i, z_j):
        """Return the mean contrastive loss for the paired embeddings.

        Args:
            z_i: first view, one embedding per row.
            z_j: second view with the same number of rows as ``z_i``.

        Returns:
            Scalar tensor: cross-entropy of each row's temperature-scaled
            cosine similarities against the index of its positive partner.
        """
        batch_size = z_i.size(0)
        total = 2 * batch_size
        z = torch.cat([z_i, z_j], dim=0)
        # Pairwise cosine similarities between all 2B embeddings: (2B, 2B).
        similarity_matrix = self.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0)) / self.temperature

        # Remove self-comparisons: drop the diagonal, leaving (2B, 2B - 1).
        mask = torch.eye(total, dtype=torch.bool, device=z.device)
        similarity_matrix = similarity_matrix[~mask].view(total, -1)

        # Column of the positive partner AFTER the diagonal was removed:
        #  - row k (first half): partner was column k + B; the removed diagonal
        #    entry (column k) lies to its left, so it shifts to k + B - 1.
        #  - row k + B (second half): partner is column k, left of the removed
        #    diagonal entry, so it stays at k.
        # Using k for both halves would make the first half target a negative.
        index = torch.arange(batch_size, device=z.device)
        labels = torch.cat([index + batch_size - 1, index], dim=0)

        # Compute the loss
        return F.cross_entropy(similarity_matrix, labels)

# Training setup for contrastive learning
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BERTSimCSE().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = SimCSELoss()

print("Contrastive Learning Loop")

# One constant drives both the loop bound and the "[epoch/total]" message.
num_contrastive_epochs = 3
num_contrastive_batches = len(contrastive_loader)
# At least 1: with fewer than 10 batches `// 10` is 0 and the modulo in the
# progress check would raise ZeroDivisionError.
print_interval = max(1, num_contrastive_batches // 10)

# Contrastive learning loop
# Train mode keeps dropout enabled for the whole loop; the two forward passes
# below rely on it to produce two different views of the same batch.
model.train()
for epoch in range(num_contrastive_epochs):
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(contrastive_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Forward pass
        z_i = model(input_ids, attention_mask)

        # Second pass over the same inputs; dropout makes it differ from z_i
        z_j = model(input_ids, attention_mask)

        loss = criterion(z_i, z_j)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        done = batch_idx + 1
        if done % print_interval == 0 or done == num_contrastive_batches:
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / done * (num_contrastive_batches - done)

            print(f'Epoch [{epoch+1}/{num_contrastive_epochs}], Batch [{batch_idx+1}/{len(contrastive_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')


In [None]:
print("Loading Tamil movie reviews dataset")

# Load the Tamil movie reviews dataset for regression
train_df = pd.read_csv("tamil_movie_reviews_train.csv")
test_df = pd.read_csv("tamil_movie_reviews_test.csv")

# Extract text and labels
train_texts = train_df['ReviewInTamil'].tolist()
train_labels = train_df['Rating'].tolist()
test_texts = test_df['ReviewInTamil'].tolist()
test_labels = test_df['Rating'].tolist()


def _tokenize_for_regression(texts, split_name, chunk_size, max_length=256):
    """Tokenize ``texts`` chunk by chunk, printing progress roughly every 10%.

    Args:
        texts: list of strings to tokenize.
        split_name: word used in the progress message ("training" / "test").
        chunk_size: number of texts handed to the tokenizer per call.
        max_length: length every sequence is truncated/padded to.

    Returns:
        Dict with 'input_ids' and 'attention_mask' lists, one entry per text.
    """
    encoded = {'input_ids': [], 'attention_mask': []}
    total = len(texts)
    # Derive the reporting interval from this dataset. The global
    # `print_interval` was last set by the contrastive training loop to a
    # batch count, and it can be 0, which would raise ZeroDivisionError in the
    # modulo below.
    report_every = max(1, total // 10)
    started = time.time()

    for offset in range(0, total, chunk_size):
        chunk = texts[offset:offset + chunk_size]
        # Pad to a fixed length: padding=True only pads to the longest text of
        # the current chunk, so examples from different chunks could end up
        # with different lengths, and the DataLoaders built below use the
        # default collate, which needs equal-length tensors to stack a batch.
        chunk_encodings = tokenizer(chunk, truncation=True, padding='max_length', max_length=max_length)
        encoded['input_ids'].extend(chunk_encodings['input_ids'])
        encoded['attention_mask'].extend(chunk_encodings['attention_mask'])

        done = offset + len(chunk)
        # True whenever this chunk crossed a multiple of `report_every`.
        if done % report_every < chunk_size:
            elapsed_time = time.time() - started
            remaining_time = elapsed_time / done * (total - done)
            print(f"Processed {done} / {total} {split_name} texts, Elapsed time: {elapsed_time:.2f}s, Estimated remaining time: {remaining_time:.2f}s")

    return encoded


# Tokenize the texts for regression with progress statements
print("Tokenizing texts for regression")
train_encodings = _tokenize_for_regression(train_texts, "training", batch_size)
print("Finished tokenizing training texts for regression")

test_encodings = _tokenize_for_regression(test_texts, "test", batch_size)
print("Finished tokenizing test texts for regression")

class ReviewsDataset(Dataset):
    """Map-style dataset pairing pre-tokenized reviews with their ratings."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example sequences
        # labels: per-example targets, indexed in step with the encodings
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as tensors, with its target under 'labels'."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        # Targets are stored as float tensors.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each tokenized split together with its ratings
test_dataset = ReviewsDataset(test_encodings, test_labels)
train_dataset = ReviewsDataset(train_encodings, train_labels)

# Batch iterators for fine-tuning: training batches are shuffled, test order is kept
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Fine-tuning setup
class RegressionModel(nn.Module):
    """Encoder followed by a single linear unit that emits one value per example."""

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        # The head's input width is read from the encoder's own config.
        hidden_dim = self.bert.config.hidden_size
        self.regressor = nn.Linear(hidden_dim, 1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and map its ``pooler_output`` through the linear head."""
        pooled = self.bert(input_ids, attention_mask=attention_mask).pooler_output
        return self.regressor(pooled)

# Put a regression head on the encoder held by the current `model`,
# then build the loss and the optimizer over the combined parameters
model = RegressionModel(model.bert)
model = model.to(device)
criterion = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

# Evaluation function to calculate MSE on the test set
def evaluate_model(model, test_loader, device):
    """Compute the mean squared error of ``model`` over ``test_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``; its
            output is flattened to one prediction per example.
        test_loader: iterable of batches holding 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: device the input tensors are moved to before the forward pass.

    Returns:
        The value of ``mean_squared_error`` over all labels and predictions.
    """
    # Remember the caller's mode so a training loop is not silently left in
    # eval mode (dropout off) after this call.
    was_training = model.training
    model.eval()
    all_labels = []
    all_preds = []
    start_time = time.time()
    num_batches = len(test_loader)
    # At least 1 so the modulo below cannot divide by zero on short loaders.
    print_interval = max(1, num_batches // 10)

    try:
        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels']

                outputs = model(input_ids, attention_mask)
                # Flatten to 1-D. A bare squeeze() turns a final batch of one
                # example into a 0-d tensor, which list.extend cannot iterate.
                preds = outputs.reshape(-1)

                all_labels.extend(labels.reshape(-1).cpu().numpy())
                all_preds.extend(preds.cpu().numpy())

                done = batch_idx + 1
                if done % print_interval == 0 or done == num_batches:
                    elapsed_time = time.time() - start_time
                    remaining_time = elapsed_time / done * (num_batches - done)
                    print(f'Evaluating batch [{batch_idx+1}/{len(test_loader)}], Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')
    finally:
        model.train(was_training)

    print(len(all_labels), len(all_preds))
    mse = mean_squared_error(all_labels, all_preds)
    return mse

print("Fine Tuning Loop")

# Fine-tuning loop with additional metrics
# One constant drives both the loop bound and the "[epoch/total]" messages,
# which previously said 3 while the loop ran 30 epochs.
num_finetune_epochs = 30
num_train_batches = len(train_loader)
# At least 1: with fewer than 10 batches `// 10` is 0 and the modulo below
# would raise ZeroDivisionError.
print_interval = max(1, num_train_batches // 10)

for epoch in range(num_finetune_epochs):
    # evaluate_model() switches the model to eval mode; re-enter train mode at
    # the start of every epoch so dropout is active while training.
    model.train()
    start_time = time.time()
    total_loss = 0
    for batch_idx, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        # Drop only the trailing size-1 dimension of the (batch, 1) regressor
        # output; a bare squeeze() would also collapse a batch of one example
        # to a scalar that no longer matches the labels' shape.
        loss = criterion(outputs.squeeze(-1), labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        done = batch_idx + 1
        if done % print_interval == 0 or done == num_train_batches:
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / done * (num_train_batches - done)

            print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {total_loss/(batch_idx+1):.4f}, '
                  f'Elapsed time: {elapsed_time:.2f}s, Remaining time: {remaining_time:.2f}s')

    # Evaluate the model on the test set
    mse = evaluate_model(model, test_loader, device)
    print(f'Epoch [{epoch+1}/{num_finetune_epochs}], Test MSE: {mse}')