In [1]:
import os
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from sklearn.metrics import accuracy_score

In [2]:

class SentenceDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        encoded_input = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return encoded_input['input_ids'].squeeze(0), encoded_input['attention_mask'].squeeze(0)

# Load sentences from your text file
with open("C:/Users/prath/OneDrive/Desktop/200Sentences.txt", "r") as file:
    sentences = [line.strip() for line in file.readlines()]

In [6]:
# Tokenizer and model initialization
pretrained_model = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

# Load the pretrained model
model = BertForMaskedLM.from_pretrained(pretrained_model)

# Modify the model to use only 4 transformer layers
model.bert.encoder.layer = torch.nn.ModuleList(model.bert.encoder.layer[:4])
model.config.num_hidden_layers = 4

# Fine-tuning parameters
max_length = 128  # Adjust based on your sentence lengths
batch_size = 4    # Reduce batch size due to increased model size
epochs = 5
learning_rate = 5e-5

# Dataset and DataLoader
dataset = SentenceDataset(sentences, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Device configuration (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12

In [7]:
# Fine-tuning loop with accuracy
model.train()
for epoch in range(epochs):
    total_loss = 0
    total_correct = 0
    total_samples = 0

    for batch in dataloader:
        input_ids, attention_mask = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        optimizer.zero_grad()

        # Forward pass and compute loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()

        # Compute accuracy
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct = (predictions == input_ids).float().sum()
        total_correct += correct.item()
        total_samples += input_ids.numel()

        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    accuracy = total_correct / total_samples * 100

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert_large')


Epoch 1/5, Loss: 0.6391, Accuracy: 93.61%
Epoch 2/5, Loss: 0.0131, Accuracy: 99.83%
Epoch 3/5, Loss: 0.0040, Accuracy: 99.97%
Epoch 4/5, Loss: 0.0013, Accuracy: 100.00%
Epoch 5/5, Loss: 0.0009, Accuracy: 100.00%


In [8]:
# List of 40 target words for which you want to extract features
target_words = ['hearingimpaired', 'communication', 'meetings', 'primeMinister', 'namaskar', 'indetail', 'watching', 'both', 'activities', 'chaired', 'children', 'development', 'earlier',  'fourteen',  'india', 'instructed', 'interaction', 
'inthis', 'more', 'movingon', 'one', 'reviewed', 'situation', 'technological', 'terrorists', 'thanks', 'thatsit', 'there', 'today', 'tools', 'under', 'yesterday', 'youare', 'health', 'imprisonment', 'phone', 'training', 'krishna', 'wrong', 'train']

# Extracting Embedding Vectors for Multiple Words
def get_embeddings_for_words(sentences, words, save_dir):
    word_embeddings = {}
    model.eval()

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    for sentence in sentences:
        tokens = tokenizer(sentence, return_tensors='pt').to(device)
        
        # Get the embeddings from the fine-tuned BERT model
        with torch.no_grad():
            outputs = model.bert(**tokens)
        
        last_hidden_states = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

        for word in words:
            word_token_ids = tokenizer.encode(word, add_special_tokens=False)
            word_indices = [i for i, token_id in enumerate(tokens['input_ids'][0]) if token_id in word_token_ids]

            if word_indices:
                # Extract the embeddings of the selected word
                word_embeds = last_hidden_states[0, word_indices, :].mean(dim=0).cpu().numpy()

                if word not in word_embeddings:
                    word_embeddings[word] = []
                word_embeddings[word].append(word_embeds)

    # Average embeddings for each word across sentences and save to files
    for word, embeds in word_embeddings.items():
        averaged_word_embedding = np.mean(np.array(embeds), axis=0)
        np.save(os.path.join(save_dir, f"{word}.npy"), averaged_word_embedding)

    print(f"Word embeddings saved in {save_dir}")

# Directory to save the extracted embeddings
save_directory = "C:/Users/prath/OneDrive/Desktop/word_embeddings"

# Extract features for the 40 target words and save them
get_embeddings_for_words(sentences, target_words, save_directory)

Word embeddings saved in C:/Users/prath/OneDrive/Desktop/word_embeddings
