In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.1 MB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m1.0/1.1 MB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


In [1]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
import random

# Download NLTK punkt tokenizer (one-time download)
nltk.download('punkt')

# Load and preprocess the dataset
def load_data(file_path):
    """Loads text data from a file."""
    with open(file_path, 'r') as f:
        text = f.read()
    return text

def preprocess(text):
    """Preprocesses text data into tokens, vocabulary, and index mappings."""
    tokens = word_tokenize(text)
    word_counts = Counter(tokens)
    vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}
    idx_to_word = {idx: word for word, idx in vocab.items()}
    return tokens, vocab, idx_to_word

# Define file path and load data
file_path = r'/content/drive/MyDrive/parah2.txt'
text = load_data(file_path)

# Preprocess the text
tokens, vocab, idx_to_word = preprocess(text)

# Get vocabulary size
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# Define window_size based on your design
window_size = 5  # Example value, adjust as needed

# Example input and target sequences creation
def create_input_target_sequences(tokens, window_size):
    input_seq = []
    target_seq = []
    for i in range(len(tokens) - window_size):
        input_seq.append(tokens[i:i+window_size])
        target_seq.append(tokens[i+window_size])
    return input_seq, target_seq

input_seq, target_seq = create_input_target_sequences(tokens, window_size)

# Data augmentation
def augment_data(input_seq, target_seq):
    augmented_input_seq = []
    augmented_target_seq = []
    for i in range(len(input_seq)):
        if random.random() > 0.5:
            seq = input_seq[i]
            random.shuffle(seq)
            augmented_input_seq.append(seq)
            augmented_target_seq.append(target_seq[i])
    return augmented_input_seq + input_seq, augmented_target_seq + target_seq

input_seq, target_seq = augment_data(input_seq, target_seq)

# Convert input and target sequences to tensors
input_tensor = torch.LongTensor([[vocab[word] for word in seq] for seq in input_seq])
target_tensor = torch.LongTensor([vocab[word] for word in target_seq])

# Define dataset and data loader
dataset = TensorDataset(input_tensor, target_tensor)

split_ratio = 0.8
split_index = int(len(input_seq) * split_ratio)

# Split the input and target sequences into training and validation sets
train_input_seq = input_seq[:split_index]
train_target_seq = target_seq[:split_index]
val_input_seq = input_seq[split_index:]
val_target_seq = target_seq[split_index:]

# Convert training and validation sequences to tensors
train_input_tensor = torch.LongTensor([[vocab[word] for word in seq] for seq in train_input_seq])
train_target_tensor = torch.LongTensor([vocab[word] for word in train_target_seq])
val_input_tensor = torch.LongTensor([[vocab[word] for word in seq] for seq in val_input_seq])
val_target_tensor = torch.LongTensor([vocab[word] for word in val_target_seq])

# Define DataLoader for training and validation data
train_dataset = TensorDataset(train_input_tensor, train_target_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataset = TensorDataset(val_input_tensor, val_target_tensor)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)  # No need to shuffle for validation

# Define the GNN model
class GNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, window_size, dropout_rate=0.5):
        super(GNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * window_size, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x = self.embedding(x)  # Lookup embeddings
        x = x.view(-1, x.size(1) * x.size(2))  # Flatten
        x = self.dropout(F.relu(self.bn1(self.fc1(x))))
        x = self.dropout(F.relu(self.bn2(self.fc2(x))))
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)  # Use log_softmax for prediction

# Instantiate the GNN model
embedding_dim = 300
hidden_dim = 256
output_dim = vocab_size
model = GNN(vocab_size, embedding_dim, hidden_dim, output_dim, window_size)

# Define the loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the GNN model
num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for batch_input, batch_target in train_dataloader:
        optimizer.zero_grad()
        output = model(batch_input)
        loss = criterion(output, batch_target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(train_dataloader):.4f}')

# Calculate perplexity for validation data using the GNN model
def calculate_perplexity(model, validation_dataloader, vocab):
    total_loss = 0
    total_words = 0

    model.eval()
    with torch.no_grad():
        for batch_input, batch_target in validation_dataloader:
            output = model(batch_input)
            loss = criterion(output, batch_target)
            total_loss += loss.item()
            total_words += batch_target.size(0)

    average_loss = total_loss / len(validation_dataloader)
    perplexity = torch.exp(torch.tensor(average_loss))

    return perplexity.item()

perplexity = calculate_perplexity(model, val_dataloader, vocab)
print("Perplexity (GNN Model):", perplexity)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Vocabulary size: 8443
Epoch [1/10], Loss: 7.3884
Epoch [2/10], Loss: 6.9186
Epoch [3/10], Loss: 6.7116
Epoch [4/10], Loss: 6.5608
Epoch [5/10], Loss: 6.4353
Epoch [6/10], Loss: 6.3130
Epoch [7/10], Loss: 6.1847
Epoch [8/10], Loss: 6.0769
Epoch [9/10], Loss: 5.9680
Epoch [10/10], Loss: 5.8547
Perplexity (GNN Model): 1553.2080078125
