In [1]:
from conllu import parse_incr
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import pandas as pd
import gensim.downloader as api
import numpy as np
from collections import deque

In [2]:





class DependencyParsingDataset(Dataset):
    def __init__(self, file_path):
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for sentence in parse_incr(file):
                #print(sentence)
                transitions = self.generate_transitions(sentence)
                data.append(transitions)
        return data


    def oracle(self, stack, buffer, sentence):
        if len(stack)<2 :
            return 'SHIFT'  # This ensures we don't try to access buffer[0] when buffer is empty
        #print(stack,buffer)
        top_of_stack = stack[-1] if stack else None
        first_in_buffer = buffer[0] if buffer else None

        if top_of_stack is not None and first_in_buffer is not None:
            buffer_head_idx = sentence[first_in_buffer - 1]['head']  # Adjusting index for zero-based list access
            stack_head_idx = sentence[top_of_stack - 1]['head']      # Adjusting index for zero-based list access

            if buffer_head_idx == top_of_stack:
                return 'RIGHT-ARC'
            elif stack_head_idx == first_in_buffer:
                return 'LEFT-ARC'

        return 'SHIFT'
    

    def generate_transitions(self, sentence):
        transitions = []
        stack = [0]  # Start with ROOT at the stack


            # Initialize buffer to handle multi-word tokens and null tokens
        buffer = deque()
        for token in sentence:
            if isinstance(token['id'], tuple) and token['form'] == '-':
                continue  # Ignore null tokens if represented by '-'
            elif isinstance(token['id'], tuple):
                buffer.append(token['id'][0])  # Use the first index from the tuple for multi-word tokens
            else:
                buffer.append(token['id'])

    

        arcs = []  #(dep,head)

        while buffer:
            action = self.oracle(stack, buffer, sentence)

            features = self.extract_features(stack, buffer, sentence)
            transitions.append((features, action))
        
        
            if action == 'SHIFT':
                stack.append(buffer.popleft())
            elif action == 'LEFT-ARC':
                arcs.append((stack[-1], buffer[0]))
                stack.pop()
            elif action == 'RIGHT-ARC' :
                arcs.append((buffer[0], stack[-1]))
                buffer.popleft()

        return transitions

    def extract_features(self, stack, buffer, sentence):
    # Initialize default features
        features = {
        'stack_top_id': 0, 'buffer_first_id': 0,
        'stack_top_word': 'NULL', 'buffer_first_word': 'NULL',
        'stack_top_pos': 'NULL', 'buffer_first_pos': 'NULL'
        }

    # Check and assign the top of the stack features
        if stack:
            stack_top_idx = stack[-1]
            stack_top_token = sentence[stack_top_idx - 1]  # Adjust for zero indexing
            features['stack_top_id'] = stack_top_idx
            features['stack_top_word'] = stack_top_token['form'].lower()
            features['stack_top_pos'] = stack_top_token['upos']

    # Check and assign the first item in the buffer features
        if buffer:
            buffer_first_idx = buffer[0]
            buffer_first_token = sentence[buffer_first_idx - 1]  # Adjust for zero indexing
            features['buffer_first_id'] = buffer_first_idx
            features['buffer_first_word'] = buffer_first_token['form'].lower()
            features['buffer_first_pos'] = buffer_first_token['upos']

        return features





    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
    # Retrieve the sentence data (list of tuples)
        sentence_data = self.data[idx]
    
    # You might want to process each token in the sentence. 
    # Here's an example of how you could handle this:
        processed_data = []
        for token in sentence_data:
            if len(token) == 2:
                features, action = token
                processed_data.append((features, action))
            else:
                raise ValueError(f"Expected each token to be a tuple of 2 elements, got {len(token)} elements.")
    
    # Return the processed list of tokens
        return processed_data

# Example usage
train_dataset = DependencyParsingDataset('./UD_English-EWT-master/en_ewt-ud-train.conllu')
dev_dataset = DependencyParsingDataset('./UD_English-EWT-master/en_ewt-ud-dev.conllu')
test_dataset = DependencyParsingDataset('./UD_English-EWT-master/en_ewt-ud-test.conllu')

# train_dataset_Hindi = DependencyParsingDataset('./UD_Hindi-HDTB-master/hi_hdtb-ud-train.conllu')
# dev_dataset_Hindi = DependencyParsingDataset('./UD_Hindi-HDTB-master/hi_hdtb-ud-dev.conllu')



# Example to fetch and print a batch
# for features, actions in train_loader:
#     print(features, actions)
#     break


In [3]:
class DependencyParserModel(nn.Module):
    def __init__(self, pos_vocab_size, pos_embedding_dim, embedding_dim, hidden_dim, num_actions):
        super(DependencyParserModel, self).__init__()
        self.pos_embedding = nn.Embedding(pos_vocab_size, pos_embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim * 2 + pos_embedding_dim * 2, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_actions)

    def forward(self, stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices):
        # Embed POS tags and expand dimensions to match embeddings
        stack_top_pos_embeddings = self.pos_embedding(stack_top_pos_indices).squeeze(1)
        buffer_first_pos_embeddings = self.pos_embedding(buffer_first_pos_indices).squeeze(1)

        # Combine all embeddings
        combined_embeddings = torch.cat((stack_top_embeddings, buffer_first_embeddings, stack_top_pos_embeddings, buffer_first_pos_embeddings), dim=1)
        
        # Process with LSTM
        lstm_out, _ = self.lstm(combined_embeddings.unsqueeze(1))  # Unsqueeze to add a seq_length dimension
        logits = self.fc(lstm_out[:, -1, :])

        return logits


In [4]:
def build_pos_vocab(dataset):
    pos_tags = set()
    for sentence_data in dataset:  # Iterate over each sentence data in the dataset
        for token in sentence_data:  # Each token is a tuple (features, action)
            features, action = token
            # Assuming 'features' is a dictionary containing 'stack_top_pos' and 'buffer_first_pos'
            pos_tags.add(features['stack_top_pos'])
            pos_tags.add(features['buffer_first_pos'])

    # Map each POS tag to a unique index
    pos_to_index = {pos: idx for idx, pos in enumerate(pos_tags)}
    pos_to_index['<PAD>'] = len(pos_to_index)  # Adding a padding token for POS tags
    return pos_to_index

# Assuming 'DependencyParsingDataset' can be iterated and yields (features, action)
pos_vocab = build_pos_vocab(train_dataset)



In [5]:

# Load the model directly from Gensim's API
word2VecModel = api.load("word2vec-google-news-300")


In [6]:


def get_word_embedding(word, word2VecModel):
    try:
        return word2VecModel[word]
    except KeyError:
        # Attempt to remove apostrophes and retry
        word = word.replace("'", "")
        if word in word2VecModel:
            return word2VecModel[word]
        # Finally, return a zero vector if no suitable word is found
        return np.zeros(word2VecModel.vector_size)



In [7]:

action_to_index = {
    "SHIFT": 0,
    "RIGHT-ARC": 1,
    "LEFT-ARC": 2
}

In [8]:


def collate_fn(batch, word2VecModel, pos_vocab, action_to_index):
    # Containers for batch data
    stack_top_embeddings, buffer_first_embeddings = [], []
    stack_top_pos_indices, buffer_first_pos_indices = [], []
    actions_indices = []

    for sentence in batch:
        for features, action in sentence:
            # Convert words to embeddings
            stack_top_embedding = torch.tensor(get_word_embedding(features['stack_top_word'], word2VecModel), dtype=torch.float)
            buffer_first_embedding = torch.tensor(get_word_embedding(features['buffer_first_word'], word2VecModel), dtype=torch.float)

            # Convert POS tags to indices
            stack_top_pos_index = pos_vocab[features['stack_top_pos']]
            buffer_first_pos_index = pos_vocab[features['buffer_first_pos']]

            # Append embeddings and POS indices separately
            stack_top_embeddings.append(stack_top_embedding)
            buffer_first_embeddings.append(buffer_first_embedding)
            stack_top_pos_indices.append(torch.tensor([stack_top_pos_index], dtype=torch.long))
            buffer_first_pos_indices.append(torch.tensor([buffer_first_pos_index], dtype=torch.long))

            # Convert action to index and append
            actions_indices.append(action_to_index[action])

    # Pad sequences
    stack_top_embeddings = pad_sequence(stack_top_embeddings, batch_first=True, padding_value=0.0)
    buffer_first_embeddings = pad_sequence(buffer_first_embeddings, batch_first=True, padding_value=0.0)
    stack_top_pos_indices = pad_sequence(stack_top_pos_indices, batch_first=True, padding_value=pos_vocab['<PAD>'])
    buffer_first_pos_indices = pad_sequence(buffer_first_pos_indices, batch_first=True, padding_value=pos_vocab['<PAD>'])

    actions_indices = torch.tensor(actions_indices, dtype=torch.long)

    return (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices


In [9]:
# Example DataLoader usage
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, word2VecModel, pos_vocab, action_to_index))

val_loader = DataLoader(dev_dataset, batch_size=32, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, word2VecModel, pos_vocab, action_to_index))

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, 
                         collate_fn=lambda batch: collate_fn(batch, word2VecModel, pos_vocab, action_to_index))
for batch in train_loader:
    # Unpack the batch data
    (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices = batch
    
    # Print shapes and type of data in the batch
    print("Stack Top Embeddings Shape:", stack_top_embeddings.shape)
    print("Buffer First Embeddings Shape:", buffer_first_embeddings.shape)
    print("Stack Top POS Indices Shape:", stack_top_pos_indices.shape)
    print("Buffer First POS Indices Shape:", buffer_first_pos_indices.shape)
    print("Actions Indices Shape:", actions_indices.shape)
    print("Actions Indices:", actions_indices)

    # Optionally, break after the first batch to just see one example
    break

Stack Top Embeddings Shape: torch.Size([554, 300])
Buffer First Embeddings Shape: torch.Size([554, 300])
Stack Top POS Indices Shape: torch.Size([554, 1])
Buffer First POS Indices Shape: torch.Size([554, 1])
Actions Indices Shape: torch.Size([554])
Actions Indices: tensor([0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 2, 0, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 0, 2, 1, 0, 1, 0, 2, 1, 0, 2,
        0, 0, 2, 1, 1, 0, 0, 2, 0, 1, 1, 0, 0, 1, 2, 0, 0, 0, 2, 2, 1, 0, 2, 1,
        0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 1, 0, 2, 1, 1, 0, 0, 0,
        1, 1, 2, 2, 0, 0, 0, 2, 2, 1, 1, 0, 2, 0, 0, 2, 1, 0, 0, 2, 2, 2, 0, 0,
        2, 1, 1, 0, 0, 2, 1, 1, 0, 0, 0, 2, 2, 2, 0, 0, 0, 1, 0, 2, 2, 2, 1, 0,
        0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 1,
        1, 0, 1, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 2,
        0, 0, 2, 1, 0, 0, 2, 2, 1, 0, 2, 1, 0, 0, 0, 2, 1, 0, 0, 2, 2, 0, 1, 0,
        0, 2, 

In [10]:
# Assuming you've predefined the following variables
pos_vocab_size = len(pos_vocab)  # from your POS vocabulary
pos_embedding_dim = 50  # arbitrary choice, can be tuned
embedding_dim = 300  # assuming your word embeddings are of size 300
hidden_dim = 128  # hidden dimension of the LSTM
num_actions = 3  # "SHIFT", "RIGHT-ARC", "LEFT-ARC"

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = DependencyParserModel(pos_vocab_size, pos_embedding_dim, embedding_dim, hidden_dim, num_actions)
model.to(device)


DependencyParserModel(
  (pos_embedding): Embedding(19, 50)
  (lstm): LSTM(700, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [11]:

# Loss and optimizer
num_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Example training loop for one epoch
model.train()  # Set the model to training mode
for epoch in range(num_epochs):
    total_loss = 0
    for (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices in train_loader:
        # Move tensors to the appropriate device
        stack_top_embeddings = stack_top_embeddings.to(device)
        buffer_first_embeddings = buffer_first_embeddings.to(device)
        stack_top_pos_indices = stack_top_pos_indices.to(device)
        buffer_first_pos_indices = buffer_first_pos_indices.to(device)
        actions_indices = actions_indices.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Perform a forward pass through the model
        outputs = model(stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices)

        # Calculate loss
        loss = criterion(outputs, actions_indices)

        # Backpropagate the gradients
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()

    # Print average loss for the epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

print("Training complete!")

Epoch 1/10, Loss: 0.5706
Epoch 2/10, Loss: 0.5057
Epoch 3/10, Loss: 0.4894
Epoch 4/10, Loss: 0.4782
Epoch 5/10, Loss: 0.4679
Epoch 6/10, Loss: 0.4600
Epoch 7/10, Loss: 0.4526


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(predicted, true):
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    return precision, recall, f1

def validate_model(model, data_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predicted = []
    all_true = []

    with torch.no_grad():
        for (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices in data_loader:
            stack_top_embeddings = stack_top_embeddings.to(device)
            buffer_first_embeddings = buffer_first_embeddings.to(device)
            stack_top_pos_indices = stack_top_pos_indices.to(device)
            buffer_first_pos_indices = buffer_first_pos_indices.to(device)
            actions_indices = actions_indices.to(device)

            outputs = model(stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices)

            loss = criterion(outputs, actions_indices)
            total_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            all_predicted.extend(predicted.cpu().numpy())
            all_true.extend(actions_indices.cpu().numpy())

    avg_loss = total_loss / len(data_loader)
    precision, recall, f1 = calculate_metrics(all_predicted, all_true)
    return avg_loss, precision, recall, f1


In [None]:
# Validate the model on the validation data
val_loss, val_precision, val_recall, val_f1 = validate_model(model, val_loader, criterion, device)
print(f"Validation Loss: {val_loss:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1 Score: {val_f1:.4f}")

# Test the model on the test data
test_loss, test_precision, test_recall, test_f1 = validate_model(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, F1 Score: {test_f1:.4f}")


In [None]:
train_dataset_Hindi = DependencyParsingDataset('./UD_Hindi-HDTB-master/hi_hdtb-ud-train.conllu')
dev_dataset_Hindi = DependencyParsingDataset('./UD_Hindi-HDTB-master/hi_hdtb-ud-dev.conllu')

In [None]:
import gensim
from gensim.models import KeyedVectors

# Download the FastText Hindi word vectors from the FastText website
# Ensure the file 'cc.hi.300.vec.gz' is in your current working directory

# Load the FastText Hindi word vectors
hindi_model_path = "C:/Users/Rithin/Downloads/cc.hi.300.vec/cc.hi.300.vec" # Path to the downloaded file
hindi_word2vec_model = KeyedVectors.load_word2vec_format(hindi_model_path, binary=False)


In [None]:
pos_vocab = build_pos_vocab(train_dataset_Hindi)

In [None]:
model_hindi = DependencyParserModel(pos_vocab_size, pos_embedding_dim, embedding_dim, hidden_dim, num_actions)
model_hindi.to(device)

DependencyParserModel(
  (pos_embedding): Embedding(19, 50)
  (lstm): LSTM(700, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [None]:
# Example DataLoader usage
train_loader_hindi = DataLoader(train_dataset_Hindi, batch_size=32, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, hindi_word2vec_model, pos_vocab, action_to_index))

val_loader_hindi = DataLoader(dev_dataset_Hindi, batch_size=32, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, hindi_word2vec_model, pos_vocab, action_to_index))
for batch in train_loader_hindi:
    # Unpack the batch data
    (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices = batch
    
    # Print shapes and type of data in the batch
    print("Stack Top Embeddings Shape:", stack_top_embeddings.shape)
    print("Buffer First Embeddings Shape:", buffer_first_embeddings.shape)
    print("Stack Top POS Indices Shape:", stack_top_pos_indices.shape)
    print("Buffer First POS Indices Shape:", buffer_first_pos_indices.shape)
    print("Actions Indices Shape:", actions_indices.shape)
    print("Actions Indices:", actions_indices)

    # Optionally, break after the first batch to just see one example
    break

Stack Top Embeddings Shape: torch.Size([1042, 300])
Buffer First Embeddings Shape: torch.Size([1042, 300])
Stack Top POS Indices Shape: torch.Size([1042, 1])
Buffer First POS Indices Shape: torch.Size([1042, 1])
Actions Indices Shape: torch.Size([1042])
Actions Indices: tensor([0, 0, 0,  ..., 1, 0, 0])


In [None]:
num_epochs =10
model_hindi.train()  # Set the model to training mode
for epoch in range(num_epochs):
    total_loss = 0
    for (stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices), actions_indices in train_loader_hindi:
        # Move tensors to the appropriate device
        stack_top_embeddings = stack_top_embeddings.to(device)
        buffer_first_embeddings = buffer_first_embeddings.to(device)
        stack_top_pos_indices = stack_top_pos_indices.to(device)
        buffer_first_pos_indices = buffer_first_pos_indices.to(device)
        actions_indices = actions_indices.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Perform a forward pass through the model
        outputs = model(stack_top_embeddings, buffer_first_embeddings, stack_top_pos_indices, buffer_first_pos_indices)

        # Calculate loss
        loss = criterion(outputs, actions_indices)

        # Backpropagate the gradients
        loss.backward()

        # Update the parameters
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()

    # Print average loss for the epoch
    avg_loss = total_loss / len(train_loader_hindi)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

print("Training complete!")

Epoch 1/30, Loss: 0.4161
Epoch 2/30, Loss: 0.3534
Epoch 3/30, Loss: 0.3355
Epoch 4/30, Loss: 0.3245
Epoch 5/30, Loss: 0.3157
Epoch 6/30, Loss: 0.3093
Epoch 7/30, Loss: 0.3032
Epoch 8/30, Loss: 0.2983
Epoch 9/30, Loss: 0.2935
Epoch 10/30, Loss: 0.2882
Epoch 11/30, Loss: 0.2844
Epoch 12/30, Loss: 0.2806
Epoch 13/30, Loss: 0.2768
Epoch 14/30, Loss: 0.2727
Epoch 15/30, Loss: 0.2688
Epoch 16/30, Loss: 0.2652
Epoch 17/30, Loss: 0.2625
Epoch 18/30, Loss: 0.2589
Epoch 19/30, Loss: 0.2561
Epoch 20/30, Loss: 0.2526
Epoch 21/30, Loss: 0.2501
Epoch 22/30, Loss: 0.2464
Epoch 23/30, Loss: 0.2441
Epoch 24/30, Loss: 0.2422
Epoch 25/30, Loss: 0.2392
Epoch 26/30, Loss: 0.2364
Epoch 27/30, Loss: 0.2339


KeyboardInterrupt: 

In [None]:
# Validate the model
val_loss, val_accuracy = validate_model(model_hindi, val_loader_hindi, criterion, device)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")