# Training a Language Model with PyTorch

## Import Necessary Libraries

In this step, we import the necessary libraries for building and working with a neural language model using PyTorch.

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from collections import defaultdict
from random import random
import torch
import torch.nn as nn
import torch.optim as optim

## Load and Preprocess Data

In this step, we load the data that we will use to build the neural language model and preprocess it by lowercasing and tokenizing the text, then removing stop words and non-alphabetic tokens.

In [2]:
# Read the raw corpus from CSV into a Pandas dataframe
data_path = 'datasets/data.csv'
data = pd.read_csv(data_path)

# English stop words, kept in a set so membership tests are O(1)
# during preprocessing
stop_words = {*nltk.corpus.stopwords.words('english')}

def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping only alphabetic non-stop-words.

    Parameters:
    text (str): Raw text of one document. Any non-string value (for example
        the float NaN that ``pd.read_csv`` produces for an empty cell) is
        treated as an empty document.

    Returns:
    list[str]: The lowercase tokens that are purely alphabetic and are not
        in the module-level ``stop_words`` set.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower(); treat
    # them as empty documents instead of crashing the whole .apply().
    if not isinstance(text, str):
        return []

    return [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

data['tokens'] = data['text'].apply(preprocess_text)

## Build a Neural Language Model with PyTorch

In this step, we define a class for the neural language model using PyTorch. This class includes an embedding layer, an LSTM layer, and a fully connected layer.

In [3]:
class LanguageModel(nn.Module):
    """LSTM language model: embedding -> LSTM -> linear projection to vocab.

    Parameters:
    vocab_size (int): Number of distinct token ids (embedding rows and
        output logits).
    embedding_dim (int): Size of each token embedding.
    hidden_dim (int): Size of the LSTM hidden state.
    num_layers (int): Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the LSTM consumes and produces
        # (batch, sequence, feature) tensors.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, h=None):
        """Compute next-token logits for every position of ``x``.

        Parameters:
        x (torch.LongTensor): Token ids of shape (batch, sequence).
        h (tuple[torch.Tensor, torch.Tensor] | None): Initial ``(h_0, c_0)``
            LSTM state, each of shape (num_layers, batch, hidden_dim).
            ``None`` lets the LSTM start from an all-zero state.

        Returns:
        tuple: ``(logits, h)`` where ``logits`` has shape
            (batch * sequence, vocab_size), rows in batch-major order, and
            ``h`` is the final ``(h_n, c_n)`` LSTM state.
        """
        embedded = self.embedding(x)
        output, h = self.lstm(embedded, h)
        # Collapse batch and time so each row is one position's hidden state;
        # this is the layout CrossEntropyLoss expects with flattened targets.
        logits = self.fc(output.reshape(-1, output.size(-1)))
        return logits, h


## Train the Neural Language Model

In this step, we define a function to train the neural language model using the preprocessed data. This function loops over the data and performs backpropagation on each batch to update the model parameters.

In [4]:
def train(model, data, optimizer, criterion, epochs, batch_size, sequence_length, device):
    """Train ``model`` as a next-token predictor on a flat stream of token ids.

    The stream is cut into ``batch_size`` contiguous, equally long sub-streams
    that are processed in parallel. Each step feeds a window of
    ``sequence_length`` tokens per sub-stream and asks the model to predict
    the same window shifted by one token. The LSTM state is carried from one
    window to the next (truncated backpropagation through time) and reset at
    the start of every epoch.

    Parameters:
    model (torch.nn.Module): Called as ``model(x, h)`` with ``x`` of shape
        (batch, sequence_length). Must return ``(logits, h)`` with one row of
        logits per target token in batch-major order, and must expose
        ``model.lstm.num_layers`` and ``model.lstm.hidden_size``.
    data: One-dimensional sequence (list, pandas Series, tensor, ...) of
        integer token ids.
    optimizer (torch.optim.Optimizer): Optimizer over ``model``'s parameters.
    criterion: Loss called as ``criterion(logits, targets)``.
    epochs (int): Number of passes over the data.
    batch_size (int): Maximum number of parallel sub-streams. Fewer are used
        when the data is too short to give each one a full window.
    sequence_length (int): Number of tokens per training window.
    device (torch.device): Device on which the batches and LSTM state live.

    Returns:
    list[float]: Mean training loss of each epoch, in order.

    Raises:
    ValueError: If ``batch_size`` or ``sequence_length`` is not positive, if
        ``data`` cannot be converted to integer ids, or if it holds fewer
        than ``sequence_length + 1`` tokens.
    """
    if batch_size < 1 or sequence_length < 1:
        raise ValueError("batch_size and sequence_length must be positive integers.")

    # Convert the whole corpus once, up front, rather than once per window.
    if isinstance(data, torch.Tensor):
        token_ids = data.detach().to(torch.long).reshape(-1)
    else:
        try:
            token_ids = torch.tensor(list(data), dtype=torch.long).reshape(-1)
        except (TypeError, ValueError, RuntimeError) as e:
            # Fail loudly: silently returning here would let the caller go on
            # to save a model that was never trained.
            raise ValueError(
                f"Could not convert data to token ids ({e}). "
                "Check that your data only contains integers."
            ) from e

    # Every sub-stream needs sequence_length inputs plus one look-ahead target.
    n_tokens = token_ids.numel()
    n_streams = min(batch_size, n_tokens // (sequence_length + 1))
    if n_streams == 0:
        raise ValueError(
            f"Need at least {sequence_length + 1} tokens to train with "
            f"sequence_length={sequence_length}, got {n_tokens}."
        )
    stream_len = n_tokens // n_streams
    # Shape (n_streams, stream_len); trailing tokens that do not fill a
    # complete row are dropped.
    streams = token_ids[:n_streams * stream_len].reshape(n_streams, stream_len).to(device)

    lstm = model.lstm
    epoch_losses = []
    model.train()
    for _ in range(epochs):
        # Fresh zero (h_0, c_0) each epoch, sized for the actual batch.
        h = tuple(
            torch.zeros(lstm.num_layers, n_streams, lstm.hidden_size, device=device)
            for _ in range(2)
        )
        running_loss = 0.0
        n_batches = 0
        for start in range(0, stream_len - sequence_length, sequence_length):
            stop = start + sequence_length
            x = streams[:, start:stop]
            y = streams[:, start + 1:stop + 1]

            optimizer.zero_grad()
            output, h = model(x, h)
            loss = criterion(output, y.reshape(-1))
            loss.backward()
            optimizer.step()

            # Keep the state values but cut the graph so the next backward
            # pass does not reach into earlier windows.
            h = tuple(state.detach() for state in h)

            running_loss += loss.item()
            n_batches += 1

        epoch_losses.append(running_loss / n_batches)

    return epoch_losses


In [5]:
def save_model(model, model_path):
    """
    Saves the trained model's parameters (its state dict) to disk.

    The saved tensors are CPU copies, so the file can be loaded on a machine
    without a GPU. The model itself is left on whatever device it was on.

    Failures while writing are reported on stdout rather than raised, so a
    failed save does not abort the surrounding run.

    Parameters:
    model (torch.nn.Module): The trained model to save.
    model_path (str): The path to the file where the model should be saved.

    Returns:
    None

    Raises:
    ValueError: If model_path is not a non-empty string.
    """
    # Ensure the model_path is a string and not empty
    if not (isinstance(model_path, str) and model_path):
        raise ValueError("Please provide a valid model path as a string.")

    try:
        # state_dict() returns a fresh mapping, so replacing its values with
        # CPU copies does not touch the model. Calling model.cpu() instead
        # would move the caller's model off the GPU as a side effect.
        state = model.state_dict()
        for key in list(state):
            value = state[key]
            if isinstance(value, torch.Tensor):
                state[key] = value.cpu()
        torch.save(state, model_path)
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on.
        print(f"An error occurred while saving the model: {str(e)}")
    else:
        print(f"Model successfully saved to {model_path}")


## Train and Save the Language Model

In this step, we define the hyperparameters for the neural language model, initialize the model, define the loss and optimizer, and train the model using the train function we defined earlier. We then save the trained model to disk using the save_model function.

In [6]:
# Build the vocabulary and give every word an integer id. The embedding layer
# and the loss both work on ids, not strings. Sorting makes the mapping
# reproducible from run to run.
vocab = sorted({word for tokens in data['tokens'] for word in tokens})
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

# Encode the corpus as one flat stream of token ids, which is the input
# train() expects. Documents are simply concatenated in dataframe order.
token_ids = [word_to_idx[word] for tokens in data['tokens'] for word in tokens]

# Define hyperparameters
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
num_layers = 2
learning_rate = 0.001
epochs = 10
batch_size = 128
sequence_length = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = LanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train model on the integer-encoded corpus
train(model, token_ids, optimizer, criterion, epochs, batch_size, sequence_length, device)

# Save model
model_path = 'model.pt'
save_model(model, model_path)

Model successfully saved to model.pt


In [7]:
# Display the number of distinct tokens in the preprocessed corpus
vocab_size

257