In [None]:
%pip install transformers pandas torch scikit-learn openpyxl accelerate
%pip install openpyxl
%pip install ipywidgets

In [None]:
import pandas as pd
import os

# Turn off the tokenizers library's internal parallelism before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Read the spreadsheet and keep the non-null entries of the WORD column as a plain list.
df = pd.read_excel(
    'EnglishMaterial.xlsx',
    engine='openpyxl',
)
corpus = (
    df['WORD']
    .dropna()
    .tolist()
)

In [None]:
### Prepare the data for training

from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

# Use the end-of-sequence token for padding.
# The previous version first registered a brand-new '[PAD]' special token and then
# immediately overwrote pad_token with EOS, which left an unused entry in the vocabulary
# (and an extra, untrained embedding row after the model resize). Reusing EOS directly
# keeps the vocabulary identical to the pretrained checkpoint.
tokenizer.pad_token = tokenizer.eos_token

class TextDataset(Dataset):
    """Pre-tokenized causal language-modeling examples built from raw texts.

    Every string in ``texts`` is tokenized once at construction time, truncated or
    padded to ``max_length`` and stored as a dict of 1-D tensors:

    * ``input_ids``      -- token ids, padded to ``max_length``.
    * ``attention_mask`` -- the mask returned by the tokenizer (1 = real token, 0 = padding).
    * ``labels``         -- copy of ``input_ids`` with padded positions set to ``-100`` so
      they are ignored by the cross-entropy loss.

    Non-string entries (e.g. numbers read from a spreadsheet) are skipped, as are texts
    that produce no real tokens at all (e.g. the empty string), since such an example
    would contribute nothing but ignored labels.

    Args:
        texts: Iterable of candidate texts; only ``str`` items are used.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``
            tensors of shape ``(1, max_length)`` when called with ``return_tensors='pt'``.
        max_length: Fixed sequence length used for truncation and padding.
    """

    # Label value skipped by the loss (the default ignore_index of torch's cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.examples = []
        for text in self.texts:
            if not isinstance(text, str):
                continue
            encoding = self.tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
            )
            input_ids = encoding['input_ids'].squeeze(0)
            # Trust the tokenizer's mask instead of comparing against pad_token_id:
            # when the pad token is shared with EOS, that comparison cannot tell a
            # genuine EOS apart from padding.
            attention_mask = encoding['attention_mask'].squeeze(0)
            if int(attention_mask.sum()) == 0:
                # Nothing but padding: no position would be scored, so drop the example.
                continue
            labels = input_ids.clone()
            # Do not train the model to predict padding.
            labels[attention_mask == 0] = self.IGNORE_INDEX
            self.examples.append({
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels,
            })

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example dict at position ``idx``."""
        return self.examples[idx]

# Tokenize the whole corpus up front, then wrap it in a shuffled loader that yields
# full batches of four (the trailing partial batch is dropped).
text_dataset = TextDataset(texts=corpus, tokenizer=tokenizer)
text_dataloader = DataLoader(
    dataset=text_dataset,
    batch_size=4,
    shuffle=True,
    drop_last=True,
)


In [None]:
### Load Pre-trained model and define training parameters
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

# Start from the pretrained distilgpt2 causal-LM checkpoint and make sure its embedding
# matrix has one row per tokenizer entry (new rows, if any, are not mean-initialised).
model = AutoModelForCausalLM.from_pretrained('distilgpt2')
model.resize_token_embeddings(
    len(tokenizer),
    mean_resizing=False,
)

# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # --- where things go ---
    output_dir='./gpt2-finetuned',    # checkpoints are written here
    overwrite_output_dir=True,
    logging_dir='./logs',             # training logs
    run_name='distilgpt2_finetune',   # explicit run name (avoids a warning)
    report_to='none',                 # no external experiment trackers
    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=100,                 # linear LR warmup
    # --- checkpointing / logging cadence ---
    save_steps=500,                   # checkpoint every 500 steps
    save_total_limit=2,               # keep only the two most recent checkpoints
    logging_steps=10,
)

# Bind the model, the run configuration and the tokenized corpus together.
# No evaluation metrics are computed during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=text_dataset,
    compute_metrics=None,
)


In [None]:
### Train the Model

# Run the fine-tuning loop configured above.
trainer.train()

### Save the fine-tuned model
# Write the model weights first, then the tokenizer files, into the same directory so
# both can be reloaded together with from_pretrained.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fine_tuned_gpt2')

### Step 7: Extract Embeddings for Analysis
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Reload the artifacts saved by the training cell.
fine_tuned_model = AutoModelForCausalLM.from_pretrained('./fine_tuned_gpt2')
fine_tuned_tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_gpt2')

# The feature-extraction pipeline returns the first element of the model output.
# For a model with a language-modeling head that element is the vocabulary logits,
# not hidden states, so give the pipeline the headless base transformer to obtain
# contextual token embeddings.
feature_extractor = pipeline(
    'feature-extraction',
    model=fine_tuned_model.base_model,
    tokenizer=fine_tuned_tokenizer,
)

# Example: extract embeddings for a sample sentence.
text = "The children saw themselves in the mirror."
# The pipeline output carries a leading batch axis; [0] drops it, leaving one vector per token.
embeddings = feature_extractor(text)[0]

import numpy as np

# Convert to a (num_tokens, hidden_size) numpy array for further analysis.
embeddings_np = np.array(embeddings)
print(embeddings_np.shape)


In [None]:
### Analyze Semantic Associations

# Compute the cosine similarity between two token embeddings
from sklearn.metrics.pairwise import cosine_similarity

def _word_embedding(word, sentence, tokenizer, token_embeddings):
    """Return the mean embedding of the token(s) that spell ``word`` inside ``sentence``.

    ``token_embeddings`` must hold one row per token of ``sentence`` (batch axis already
    removed). The word is looked up with a leading space, which is how a byte-level BPE
    tokenizer such as GPT-2's encodes a word that is not at the start of the text, so
    this helper is meant for mid-sentence words. Words split into several sub-tokens are
    averaged.

    Raises:
        ValueError: if the rows do not line up with the sentence tokens, or the word
            cannot be found.
    """
    sentence_ids = tokenizer.encode(sentence)
    if len(sentence_ids) != len(token_embeddings):
        raise ValueError(
            f"expected {len(sentence_ids)} token embeddings, got {len(token_embeddings)}"
        )
    word_ids = tokenizer.encode(' ' + word, add_special_tokens=False)
    span = len(word_ids)
    for start in range(len(sentence_ids) - span + 1):
        if sentence_ids[start:start + span] == word_ids:
            return token_embeddings[start:start + span].mean(axis=0)
    raise ValueError(f"{word!r} not found as a whole word in {sentence!r}")


# embeddings_np is already (num_tokens, hidden_size): index it by token position only.
# Positions are resolved through the tokenizer instead of being hard-coded, because
# sub-word tokenization does not map one-to-one onto whitespace-separated words.
word1_embedding = _word_embedding('children', text, fine_tuned_tokenizer, embeddings_np)
word2_embedding = _word_embedding('mirror', text, fine_tuned_tokenizer, embeddings_np)

# Calculate cosine similarity between the two words (sklearn expects 2-D inputs).
similarity = cosine_similarity([word1_embedding], [word2_embedding])
print(f"Cosine similarity between 'children' and 'mirror': {similarity[0][0]}")

In [None]:
### Analyze Syntactic Relationships
# Use attention weights to analyze syntactic relationships
from transformers import AutoModel

# Load the headless transformer and ask it to return per-layer attention maps.
# Eager attention is requested explicitly because fused attention kernels do not
# materialise the attention weights -- NOTE(review): confirm against the installed
# transformers version.
# NOTE(review): this rebinds the notebook-level name `model`, which the training cell
# also uses; re-running that cell after this one would save this headless model.
model = AutoModel.from_pretrained(
    './fine_tuned_gpt2',
    output_attentions=True,
    attn_implementation='eager',
)

inputs = fine_tuned_tokenizer(text, return_tensors='pt')
# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
attention = outputs.attentions  # One tensor per layer

# Print the number of layers and the shape of one layer's weights
# (batch, heads, tokens, tokens).
print(f"Attention weights shape: {len(attention)}, {attention[0].shape}")
