<a href="https://colab.research.google.com/github/pranavsrinivas29/Language-Models/blob/main/Language_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk



# **Statistical Language Model (n-gram)**

In [None]:
import nltk
from nltk import bigrams
from collections import Counter, defaultdict

# Example corpus
corpus = "We are about to demonstrate a simple language model. We will show how statistical models can predict the next word."

# Tokenization and bigram model preparation
# word_tokenize needs Punkt data: newer NLTK releases load the 'punkt_tab' package,
# older ones load 'punkt'. Fetch both so the cell works with either.
nltk.download('punkt')
nltk.download('punkt_tab')
tokens = nltk.word_tokenize(corpus.lower())  # lowercased, so lookups must be lowercase too
bigram_counts = Counter(bigrams(tokens))     # (word, next_word) -> frequency
total_counts = len(tokens)                   # number of tokens in the corpus

# Predict next word
def predict_next_word_statistical(previous_word, counts=None):
    """Return the most frequent successor of ``previous_word`` in a bigram table.

    Args:
        previous_word: Word whose continuation is wanted. Strings are lowercased
            before the lookup because the notebook builds its bigram table from
            lowercased tokens.
        counts: Optional mapping of ``(first, second)`` word pairs to their
            frequency, with lowercase keys. Defaults to the module-level
            ``bigram_counts``.

    Returns:
        The successor with the highest count (the first one encountered in the
        table when several share that count), or the string
        "No prediction available" when the word has no recorded successor.
    """
    if counts is None:
        counts = bigram_counts
    if isinstance(previous_word, str):
        previous_word = previous_word.lower()

    # Successor -> frequency, restricted to bigrams that start with the query word
    possibilities = {
        second: count
        for (first, second), count in counts.items()
        if first == previous_word
    }
    if not possibilities:
        return "No prediction available"
    return max(possibilities, key=possibilities.get)

# Example usage
# "language" occurs once in the corpus and is followed by "model".
print("Statistical Model Prediction:", predict_next_word_statistical("language"))


Statistical Model Prediction: model


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# "we" is followed by "are" and by "will" once each; max() keeps the first
# maximal candidate it sees, so the tie resolves to "are".
print("Statistical Model Prediction:", predict_next_word_statistical("we"))


Statistical Model Prediction: are


In [None]:
from nltk import ngrams
from collections import Counter, defaultdict

# Example corpus and tokenization
# (same text as the bigram cell; still relies on `nltk` and its Punkt data from that cell)
corpus = "We are about to demonstrate a simple language model. We will show how statistical models can predict the next word."
tokens = nltk.word_tokenize(corpus.lower())
fourgram_counts = Counter(ngrams(tokens, 4))  # 4-token window -> frequency, read by the predictor below
threegram_counts = Counter(ngrams(tokens, 3))  # 3-token window -> frequency, not read anywhere in this cell

# Function to predict next word based on previous three words
def predict_next_word_statistical_4gram(word1, word2, word3, counts=None):
    """Return the most frequent word following the context ``word1 word2 word3``.

    Args:
        word1: First context word.
        word2: Second context word.
        word3: Third context word (the one immediately before the prediction).
        counts: Optional mapping of 4-token tuples to their frequency, with
            lowercase tokens. Defaults to the module-level ``fourgram_counts``.

    String context words are lowercased before the lookup because the notebook
    builds its 4-gram table from lowercased tokens.

    Returns:
        The fourth word of the most frequent matching 4-gram (the first one
        encountered in the table on a tie), or the string
        "No prediction available" when the context was never observed.
    """
    if counts is None:
        counts = fourgram_counts
    context = tuple(
        word.lower() if isinstance(word, str) else word
        for word in (word1, word2, word3)
    )

    # Fourth word -> frequency, restricted to 4-grams that begin with the context
    possibilities = {
        gram[3]: count
        for gram, count in counts.items()
        if tuple(gram[:3]) == context
    }
    if not possibilities:
        return "No prediction available"
    return max(possibilities, key=possibilities.get)

# Example usage
# The context "a simple language" occurs once in the corpus and is followed by "model".
print("Statistical 4-gram Model Prediction:", predict_next_word_statistical_4gram("a", "simple", "language"))


Statistical 4-gram Model Prediction: model


# **Neural Language Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# Seed the NumPy and TensorFlow RNGs so weight initialisation and batch shuffling
# (and with them the trained model's predictions) are as repeatable as the backend allows.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Prepare data: map each word of the corpus to an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts([corpus])
sequences = tokenizer.texts_to_sequences([corpus])[0]
vocab_size = len(tokenizer.word_index) + 1  # +1: word ids start at 1, id 0 is the padding value

# Create sequences: every prefix of the corpus that holds at least two tokens
input_sequences = [sequences[:end] for end in range(2, len(sequences) + 1)]

# Pad sequences on the left so the word to predict always sits in the last column
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Predictors and label: all columns but the last are the context, the last is the target
X, labels = input_sequences[:, :-1], input_sequences[:, -1]
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# Model. Embedding's deprecated `input_length` argument is left out; the input
# width is inferred from X when the model is first called.
model = Sequential([
    Embedding(vocab_size, 10),
    LSTM(150),
    Dense(vocab_size, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X, y, epochs=100, verbose=0)

def predict_next_word_neural(previous_text):
    """Predict the word that follows ``previous_text`` with the trained LSTM model.

    The text is encoded with the module-level ``tokenizer``, left-padded (or
    truncated) to ``max_sequence_len - 1`` ids and passed to the module-level
    ``model``.

    Args:
        previous_text: Context string to continue.

    Returns:
        The word whose output probability is highest, or the string
        "No prediction available" when the tokenizer produces no ids for
        ``previous_text`` or the arg-max index has no entry in
        ``tokenizer.index_word``.
    """
    token_list = tokenizer.texts_to_sequences([previous_text])[0]
    if not token_list:
        # Nothing but padding would reach the model, so its answer would be arbitrary
        return "No prediction available"

    padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predictions = model.predict(padded, verbose=0)
    predicted_index = int(np.argmax(predictions, axis=-1)[0])
    return tokenizer.index_word.get(predicted_index, "No prediction available")

# Example usage. NOTE: "we are not going" is NOT a prefix of the training corpus ("not" and
# "going" never occur in it), so this prediction is not expected to be meaningful. Use a
# prompt taken from the corpus, e.g. "we are about to", to see a meaningful prediction.
print("Neural Model Prediction:", predict_next_word_neural("we are not going"))



Neural Model Prediction: simple


#  **LLM**

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model and tokenizer
# NOTE: this rebinds the names `tokenizer` and `model`, which the neural-LM section bound to
# the Keras Tokenizer and the LSTM model. predict_next_word_neural reads those globals, so
# calls to it made after this cell has run pick up the GPT-2 objects instead.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Ensure the tokenizer's pad token is set
# (the end-of-sequence token doubles as the pad token)
tokenizer.pad_token = tokenizer.eos_token

def left_pad_sequence(sequence, max_length, pad_token):
    """
    Prepend ``pad_token`` to ``sequence`` until it holds ``max_length`` items.

    A sequence that is already ``max_length`` items long, or longer, is handed
    back untouched (the very same object, not a copy).
    """
    missing = max_length - len(sequence)
    if missing <= 0:
        return sequence
    return [pad_token] * missing + sequence

def generate_text_simple(prompt_text, max_length=100):
    """Continue ``prompt_text`` with GPT-2 and return the decoded text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt_text: Text to continue. An empty prompt starts generation from
            the tokenizer's beginning-of-text token.
        max_length: Upper bound on the total sequence length in tokens
            (prompt tokens plus generated tokens).

    Returns:
        The prompt followed by the generated continuation, decoded with special
        tokens removed.
    """
    # Tokenize the input text
    input_ids = tokenizer.encode(prompt_text, add_special_tokens=True)
    if not input_ids:
        # generate() needs at least one token to condition on
        input_ids = [tokenizer.bos_token_id]

    # The prompt is deliberately NOT padded up to max_length: max_length caps
    # prompt + new tokens, so a prompt padded to that length leaves no room
    # for generation. A single sequence needs no padding at all.
    input_ids_tensor = torch.tensor([input_ids])

    # Every position holds a real token, so the attention mask is all ones
    attention_mask_tensor = torch.ones_like(input_ids_tensor)

    # Fall back to the EOS id if no pad token has been configured on the tokenizer
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Generate text
    output = model.generate(
        input_ids_tensor,
        attention_mask=attention_mask_tensor,
        max_length=max_length,
        pad_token_id=pad_id,
    )

    # Decode and return the generated text
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
# max_length is forwarded unchanged to model.generate() by generate_text_simple.
generated_text = generate_text_simple("We are about to", max_length=100)
print("Generated Text:", generated_text)


Generated Text: We are about to launch


# **GPT**

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Loads the same 'gpt2' checkpoint as the LLM section again. The new tokenizer object
# replaces the earlier one, so the pad token assigned in that section is not carried over.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Function to predict next words
def predict_next_word_large_model(prompt_text):
    """Continue ``prompt_text`` with GPT-2 and return prompt plus continuation.

    Uses the module-level ``tokenizer`` and ``model``. Despite the name, the
    result is the whole generated text (at most 50 tokens including the
    prompt), not a single word.

    Args:
        prompt_text: Text to continue.

    Returns:
        The decoded sequence with special tokens removed.
    """
    # Calling the tokenizer returns the attention mask along with the ids
    encoded = tokenizer(prompt_text, return_tensors='pt')
    output_ids = model.generate(
        encoded['input_ids'],
        # Passing the mask and pad id explicitly avoids generate()'s
        # "attention mask and the pad token id were not set" warnings.
        attention_mask=encoded['attention_mask'],
        pad_token_id=tokenizer.eos_token_id,
        max_length=50,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # Only meaningful for beam search; no num_beams is passed, so decoding
        # is greedy and this flag has no effect on the result.
        early_stopping=True,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Example usage
# The function returns the prompt together with its continuation, not only the next word.
print("Large Model (GPT) Prediction:", predict_next_word_large_model("We are not going"))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Large Model (GPT) Prediction: We are not going to be able to do that. We are going through a period of time where we are trying to figure out how to get the best out of our players.

"We have to make sure we have the right players in


**GPT2LMHeadModel**: This is the GPT-2 model class that includes the language modeling head on top of the base GPT-2 architecture. The language modeling head allows the model to generate text.

**GPT2Tokenizer**: This class is responsible for converting text into a format that the GPT-2 model can understand (i.e., tokenizing the text into numbers).

**from_pretrained('gpt2')**: This method loads the pre-trained GPT-2 model and tokenizer. The 'gpt2' identifier refers to the base (smallest) GPT-2 checkpoint, which was trained on a large and varied collection of internet text.

**tokenizer.encode()**: Converts the prompt_text into a list of token IDs using the GPT-2 tokenizer. These token IDs represent the numerical values assigned to each piece of the text.

**return_tensors='pt'**: Specifies that the output should be a PyTorch tensor.


**model.generate()**: Generates text based on the provided input_ids. It uses several parameters to control the generation process:

**max_length=50**: The maximum length, in tokens, of the generated sequence, including the tokens of the given prompt.

**num_return_sequences=1**: The number of generated sequences to return. Here, it's set to 1, meaning only one sequence will be generated.

**no_repeat_ngram_size=2**: Ensures that no 2-gram (sequence of two tokens) repeats within the generated text, helping to increase the diversity of the generated text.

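**early_stopping=True**: A beam-search setting that ends the search as soon as enough complete candidate sequences (ones that have reached the end-of-sequence token, `<EOS>`) have been found. It only has an effect when beam search is enabled with `num_beams > 1`; the call above does not set `num_beams`, so decoding is greedy and this flag does not change the result.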


**tokenizer.decode()**: Converts the generated token IDs back into a string of text. The skip_special_tokens=True argument tells the decoder to ignore special tokens, such as padding tokens or other non-text tokens, making the output cleaner and more readable.

# **Perplexity**

In [None]:

from collections import Counter
import math

# Ensure nltk's tokenizer data is downloaded (newer NLTK releases load 'punkt_tab',
# older ones load 'punkt')
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the corpus (`corpus` is the example text defined in the n-gram section).
# This has to be a list of word tokens: a bare string would make ngrams() iterate
# over its characters and yield character n-grams instead of word n-grams.
tokens = nltk.word_tokenize(corpus.lower())

# Calculate N-gram and (N-1)-gram counts for 2-gram and 4-gram models
fourgram_counts = Counter(ngrams(tokens, 4))
threegram_counts = Counter(ngrams(tokens, 3))
twogram_counts = Counter(ngrams(tokens, 2))
unigram_counts = Counter(ngrams(tokens, 1))

def calculate_ngram_perplexity(ngram_counts, previous_ngram_counts, unseen_prob=1e-5):
    """Compute the perplexity of an n-gram model over its own n-gram counts.

    Each n-gram ``(w1, ..., wn)`` gets the maximum-likelihood probability
    ``count(w1..wn) / count(w1..wn-1)``. The base-2 log probabilities are
    weighted by the n-gram's count, averaged, and turned into
    ``2 ** (-average)``.

    Args:
        ngram_counts: Mapping of n-gram tuples to their frequency.
        previous_ngram_counts: Mapping of (n-1)-gram tuples (the contexts) to
            their frequency. Any mapping works; missing contexts count as 0.
        unseen_prob: Probability assigned when the context has a count of 0.

    Returns:
        The perplexity as a float, or ``float("inf")`` when there is nothing to
        score (empty table, or only n-grams of length 1, which have no context).
    """
    log_prob = 0.0
    scored = 0
    for ngram, count in ngram_counts.items():
        if len(ngram) <= 1:
            continue  # a unigram has no context to condition on
        context_count = previous_ngram_counts.get(ngram[:-1], 0)
        prob = count / context_count if context_count > 0 else unseen_prob
        log_prob += math.log2(prob) * count
        scored += count

    if scored == 0:
        # Perplexity is undefined without scored n-grams; avoid dividing by zero
        return float("inf")
    return math.pow(2, -log_prob / scored)

# Calculate and print perplexities
# Each call scores the same n-gram table that the probabilities are estimated from (no
# held-out text is involved), so these are training-set perplexities.
fourgram_perplexity = calculate_ngram_perplexity(fourgram_counts, threegram_counts)
twogram_perplexity = calculate_ngram_perplexity(twogram_counts, unigram_counts)

print(f"2-gram Model Perplexity: {twogram_perplexity}")
print(f"4-gram Model Perplexity: {fourgram_perplexity}")


2-gram Model Perplexity: 1.4859942891369484
4-gram Model Perplexity: 1.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
