<h2> Training Model </h2>

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

# Load the data / excel
df = pd.read_excel("/Users/rohanraval/Desktop/PlayingAround/TrainData/lec01EDIT.xlsx")

# Concat the dataframes if there are more than one

#df = pd.concat([df, df_second, df_third, df_fourth, df_fifth, df_sixth, df_seventh, df_eighth, df_ninth, df_tenth])
print(df.shape)
# Keep only rows whose 'text' cell is a real string (drops NaN / numeric cells).
# .copy() detaches the filtered frame from the original so the column
# assignment below writes to an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[df['text'].apply(lambda x: isinstance(x, str))].copy()
# Collapse 'Title' to a binary label: 0 stays 0, every other value becomes 1.
df['Title'] = df['Title'].apply(lambda x: 1 if x != 0 else 0)

# Define the simplified model with two classes: Topic and Content
class SimplifiedTopicContentModel(nn.Module):
    """Per-token two-class (Topic / Content) tagger.

    Token embeddings are summed with linear projections of three scalar
    per-token features (font size, contextual feature, highlight flag), run
    through an LSTM, and mapped to two logits per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Each scalar feature is lifted to the embedding width so it can be
        # added element-wise to the token embeddings.
        self.font_size_proj = nn.Linear(1, embedding_dim)
        self.context_proj = nn.Linear(1, embedding_dim)
        self.highlight_proj = nn.Linear(1, embedding_dim)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # Two output classes: Topic and Content.
        self.output_linear = nn.Linear(hidden_dim, 2)

    def forward(self, text_input, font_sizes, contextual_features, highlight_features):
        """Return per-token logits over the two classes.

        The three feature tensors must be 3-D with a trailing dimension of 1
        (they are fed to ``nn.Linear(1, embedding_dim)`` and then expanded
        with three sizes).
        """
        token_embeds = self.embedding(text_input)
        width = token_embeds.size(-1)

        projections = (
            (self.font_size_proj, font_sizes),
            (self.context_proj, contextual_features),
            (self.highlight_proj, highlight_features),
        )

        # Accumulate in the order: tokens + font size + context + highlight.
        combined = token_embeds
        for projector, feature in projections:
            combined = combined + projector(feature).expand(-1, -1, width)

        sequence_out, _ = self.lstm(combined)
        return self.output_linear(sequence_out)

# Function to create contextual features
def create_contextual_features(sizes):
    """Return each line's font-size change from the previous line, scaled to [-1, 1].

    Args:
        sizes: Sequence of numeric font sizes, one per line, in order.

    Returns:
        Float tensor of shape ``(len(sizes), 1)``. Entry 0 is always 0; entry
        ``i`` is ``(sizes[i] - sizes[i-1])`` divided by the largest absolute
        difference in the sequence. All entries are 0 when the size never
        changes, and the result has shape ``(0, 1)`` for empty input.
    """
    # len() rather than truthiness so array-like inputs are handled too.
    if len(sizes) == 0:
        return torch.zeros((0, 1), dtype=torch.float32)

    relative_sizes = [0] + [sizes[i] - sizes[i - 1] for i in range(1, len(sizes))]
    max_diff = max(abs(x) for x in relative_sizes)
    # Uniform sizes (or a single line) give a largest difference of 0;
    # divide by 1 instead so the features are all zeros rather than raising
    # ZeroDivisionError.
    if max_diff == 0:
        max_diff = 1
    normalized_relative_sizes = [x / max_diff for x in relative_sizes]
    return torch.tensor(normalized_relative_sizes).unsqueeze(1).float()

# Prepare data
# Pull the per-line inputs and labels out of the dataframe as plain lists.
texts = list(df.get("text"))
font_sizes = list(df.get("font_sizes"))
highlighted = list(df.get("is_bold"))
labels_title = list(df.get("Title"))

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Prepare features
sequence_length = tokenized_inputs['input_ids'].size(1)
font_size_tensor = torch.tensor(font_sizes).unsqueeze(1).float()
highlight_tensor = torch.tensor(highlighted).unsqueeze(1).float()

# Expand to match sequence length: every token of a line carries that line's
# font size, highlight flag and contextual feature.
font_sizes_padded = font_size_tensor.unsqueeze(1).expand(-1, sequence_length, -1)
highlight_features_padded = highlight_tensor.unsqueeze(1).expand(-1, sequence_length, -1)
contextual_features_padded = create_contextual_features(font_sizes).unsqueeze(1).expand(-1, sequence_length, -1)

# Prepare labels: 1 for Topic, 0 for Content
# The number of labelled positions per line is taken from the attention mask
# of the encoded batch, so it matches input_ids exactly: it includes the
# special tokens the tokenizer adds and respects truncation. Counting
# tokenizer.tokenize(t) instead leaves the labels out of step with the encoded
# row and produces over-long rows for truncated lines.
token_counts = tokenized_inputs['attention_mask'].sum(dim=1).tolist()
labels = [[title_label] * count for title_label, count in zip(labels_title, token_counts)]

# Padding positions keep label 0 and still take part in the loss.
padded_labels = [label + [0] * (sequence_length - len(label)) for label in labels]
labels_tensor = torch.tensor(padded_labels, dtype=torch.long)

# Combine inputs and labels into a TensorDataset and DataLoader
dataset = TensorDataset(tokenized_inputs['input_ids'], font_sizes_padded, contextual_features_padded, highlight_features_padded, labels_tensor)
batch_size = 32  # Set your desired batch size
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model initialization
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
num_layers = 2
dropout = 0.2

model = SimplifiedTopicContentModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    for batch in dataloader:
        input_ids, font_sizes_batch, context_features_batch, highlight_features_batch, labels_batch = batch
        optimizer.zero_grad()

        logits = model(input_ids, font_sizes_batch, context_features_batch, highlight_features_batch)

        # Flatten (batch, seq, 2) logits and (batch, seq) labels so every
        # token position is scored as one classification example.
        loss = F.cross_entropy(logits.view(-1, 2), labels_batch.view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean loss over all batches of the epoch rather than only the
    # last batch; max(..., 1) keeps an empty loader from dividing by zero.
    mean_loss = epoch_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}")

# Prediction function (unchanged)
def predict_with_highlight(model, texts, sizes, highlighted, tokenizer, sequence_length):
    """Predict a class id for every token position of every line.

    Args:
        model: Callable taking (input_ids, font sizes, contextual features,
            highlight features) and returning logits with the classes on
            dimension 2.
        texts: List of line strings.
        sizes: Font size of each line, same length as ``texts``.
        highlighted: Highlight flag of each line, same length as ``texts``.
        tokenizer: Tokenizer called with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        sequence_length: Token length every line is padded / truncated to.

    Returns:
        Tensor of argmax class ids over dimension 2 of the logits, one row
        per line.
    """
    model.eval()
    # Pad every line to exactly `sequence_length`. padding=True would pad only
    # to the longest line in this batch, which no longer matches the feature
    # tensors below (expanded to `sequence_length`) whenever all lines are
    # shorter than that.
    tokenized_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=sequence_length, return_tensors="pt")

    font_size_tensor = torch.tensor(sizes).unsqueeze(1).float()
    highlight_tensor = torch.tensor(highlighted).unsqueeze(1).float()

    # Repeat each line-level feature across all of that line's token positions.
    font_sizes_padded = font_size_tensor.unsqueeze(1).expand(-1, sequence_length, -1)
    highlight_features_padded = highlight_tensor.unsqueeze(1).expand(-1, sequence_length, -1)
    contextual_features_padded = create_contextual_features(sizes).unsqueeze(1).expand(-1, sequence_length, -1)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(tokenized_inputs['input_ids'], font_sizes_padded, contextual_features_padded, highlight_features_padded)
        predictions = torch.argmax(logits, dim=2)

    return predictions

# Example prediction (unchanged)
#predictions = predict_with_highlight(model, texts, font_sizes, highlighted, tokenizer, sequence_length)

# Function to interpret predictions (unchanged)
def interpret_predictions(texts, predictions):
    """Print each line followed by its Topic / Content verdict.

    A line counts as a Topic when class 1 appears anywhere in its row of
    ``predictions``; otherwise it is reported as Content.
    """
    for row, text in enumerate(texts):
        token_classes = predictions[row].numpy()
        print(f"Text: {text}")
        verdict = (
            " -> This line is predicted as a Topic."
            if 1 in token_classes
            else " -> This line is predicted as Content."
        )
        print(verdict)

# Interpret the predictions (unchanged)
# interpret_predictions(texts, predictions)


<h2> Simple interpretation of the model </h2>

In [None]:
# Load the dataframe for prediction
# NOTE(review): the user directory previously read "rohanra2val", which does
# not match the "rohanraval" directory the training data is loaded from;
# treated as a typo — confirm the file really lives under this path.
df2 = pd.read_excel("/Users/rohanraval/Desktop/PlayingAround/TrainData/1completeEDIT.xlsx")

In [None]:
# Per-line prediction inputs, read from df2 in this column order:
#   "text"       -> text input for prediction
#   "font_sizes" -> font size for each line
#   "is_bold"    -> highlight flag (1 means highlighted, 0 means not highlighted)
# To ignore highlighting, override prediction_highlighted afterwards with
# [False] * len(prediction_texts).
prediction_texts, prediction_font_sizes, prediction_highlighted = (
    list(df2.get(column)) for column in ("text", "font_sizes", "is_bold")
)


In [None]:
# Tokenize the prediction text
prediction_tokenized_inputs = tokenizer(prediction_texts, padding=True, truncation=True, return_tensors="pt")
prediction_sequence_length = prediction_tokenized_inputs['input_ids'].size(1)

# Prepare features
prediction_font_size_tensor = torch.tensor(prediction_font_sizes).unsqueeze(1).float()
prediction_highlight_tensor = torch.tensor(prediction_highlighted).unsqueeze(1).float()

# Expand to match sequence length: each line-level feature is repeated across
# all token positions of that line.
prediction_font_sizes_padded = prediction_font_size_tensor.unsqueeze(1).expand(-1, prediction_sequence_length, -1)
prediction_highlight_features_padded = prediction_highlight_tensor.unsqueeze(1).expand(-1, prediction_sequence_length, -1)
prediction_contextual_features_padded = create_contextual_features(prediction_font_sizes).unsqueeze(1).expand(-1, prediction_sequence_length, -1)

# Make predictions
# The training loop leaves the model in train mode, where dropout is active
# and predictions vary from run to run; switch to eval mode and skip gradient
# tracking for inference.
model.eval()
with torch.no_grad():
    prediction_logits = model(prediction_tokenized_inputs['input_ids'], prediction_font_sizes_padded, prediction_contextual_features_padded, prediction_highlight_features_padded)
prediction_classes = torch.argmax(prediction_logits, dim=2)

# Collects one 0/1 verdict per interpreted line (1 = Topic, 0 = Content).
test_answer = []


# Function to interpret/ print predictions
def interpret_predictions(texts, predictions):
    """Print a one-line verdict per text and record it in ``test_answer``.

    A line is a Topic (1) when class 1 appears anywhere in its row of
    ``predictions``; otherwise it is Content (0). Verdicts are appended to the
    module-level ``test_answer`` list in input order.
    """
    for position, text in enumerate(texts):
        token_classes = predictions[position].numpy()
        is_topic = int(1 in token_classes)
        test_answer.append(is_topic)
        if is_topic:
            print(f"Text: {text} -> This line is predicted as a Topic.")
        else:
            print(f"Text: {text} -> This line is predicted as Content.")

# Print a Topic / Content verdict for every prediction line
interpret_predictions(texts=prediction_texts, predictions=prediction_classes)
