# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [27]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer

import numpy as np
import pandas as pd
import sklearn as sk

In [28]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score']

In [29]:
# FOR TESTING : only select first 500 samples
X, y = X[:500], y[:500]

In [30]:
tokenized_documents = tokenizer(X)
X, vect = vectorizer(tokenized_documents)



In [31]:
# most frequent words
print("Top 10 most frequent words in the dataset")
print(vect.get_feature_names_out()[:10])

# least frequent words
print("Top 10 least frequent words in the dataset")
print(vect.get_feature_names_out()[-10:])

Top 10 most frequent words in the dataset
['0' '00' '1' '10' '100' '1000' '100ml' '10lb' '10lbs' '11']
Top 10 least frequent words in the dataset
['yucky' 'yum' 'yummy' 'zack' 'zen' 'zero' 'zest' 'zing' 'zip' 'ît']


In [32]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

print(X_train[0])

(400, 3977) (100, 3977) (400,) (100,)
  (0, 2619)	0.32608001899182526
  (0, 1692)	0.32608001899182526
  (0, 1628)	0.30581253945938824
  (0, 350)	0.30581253945938824
  (0, 954)	0.2914325338838896
  (0, 3508)	0.28027853501045363
  (0, 737)	0.23651756924351694
  (0, 3747)	0.1738972664967719
  (0, 1461)	0.19416474602920894
  (0, 1929)	0.20621941350316783
  (0, 3888)	0.14168859457871105
  (0, 2429)	0.11230763438727291
  (0, 400)	0.2711650543514526
  (0, 3964)	0.3303641616264214
  (0, 3959)	0.10830663921640983
  (0, 1795)	0.13416209657346923
  (0, 80)	0.06554353390434704
  (0, 201)	0.12198063892331497
  (0, 2406)	0.07835926214328101


# N-grams
### Model starts here

In [34]:
#Import all libraries
from collections import defaultdict
from collections import  defaultdict
import math
import random

In [35]:
def Create_And_Count_ngrams(tokens, n):
    # Creates n-grams and counts their occurrences
    ngram_counts = defaultdict(int)
    
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngram_counts[ngram] += 1
    
    return ngram_counts

def add_laplace(ngram, k):
    # Applies Laplace smoothing
    for key in ngram:
        ngram[key] += k

def Create_ngram(tokenized_documents, n, k=0.00001):
    # Creates the needed n-gram
    token_set = set(token for doc in tokenized_documents for token in doc)  # To not have duplicate tokens
    len_token_set = len(token_set)

    # Step 1: Count n-grams and (n-1)-grams
    tmp_ng_with_count = defaultdict(int)
    prefix_count = defaultdict(int)

    for doc in tokenized_documents:
        for i in range(len(doc) - n + 1):
            ngram = tuple(doc[i:i+n])
            tmp_ng_with_count[ngram] += 1
            prefix_count[ngram[:-1]] += 1

    final_ng = defaultdict(float)

    # Step 2: Apply Laplace smoothing
    add_laplace(tmp_ng_with_count, k)
    add_laplace(prefix_count, k)

    # Step 3: Calculate n-gram probabilities
    for ng in tmp_ng_with_count:
        final_ng[ng] = (tmp_ng_with_count[ng] + k) / (prefix_count[ng[:-1]] + k * len_token_set)

    return final_ng

def calculate_perplexity(tokenized_documents, ngram, n):
    # Calculates perplexity
    log_probability_sum = 0
    ngram_count = 0

    for doc in tokenized_documents:
        doc_tokens = doc.nonzero()[1]  # Get the non-zero token indices
        for i in range(len(doc_tokens) - n + 1):
            ngram_tuple = tuple(doc_tokens[i:i+n])
            if ngram_tuple in ngram:
                log_probability_sum += math.log2(ngram[ngram_tuple])
                ngram_count += 1

    if ngram_count == 0:
        return float('inf')

    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)

    return perplexity


def greedy_sampling(context, ngram_probabilities, n, max_length=50):
    # Take the most probable option only
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check is context toot small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # For greedy algorithm
        best_token = max(probs, key=probs.get)  

        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

import random

def top_k(context, ngram_probabilities, n, max_length=50, k=1):
    # Top k version. if k = 1, same result as greedy
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check if context too small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # Selecting top k tokens probabilistically
        top_k_tokens = sorted(probs, key=probs.get, reverse=True)[:k]

        token_weights = [probs[token] for token in top_k_tokens]
        best_token = random.choices(top_k_tokens, weights=token_weights, k=1)[0]


        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

In [36]:
n = 1  # Choose the desired n-gram size
k = 0.00001  # Choose the desired smoothing factor

# Convert X_train from csr_matrix to dense matrix
X_train_dense = X_train.toarray()

# Create n-gram probabilities using the dense matrix
ngram_probabilities = Create_ngram(X_train_dense, n, k)

# Use ngram_probabilities for further processing, such as perplexity calculation or sampling

In [37]:
# Calculate perplexity for the training data
train_perplexity = calculate_perplexity(X_train, ngram_probabilities, n)
print(f"Training Perplexity: {train_perplexity}")

# Calculate perplexity for the test data
test_perplexity = calculate_perplexity(X_test, ngram_probabilities, n)
print(f"Test Perplexity: {test_perplexity}")

Training Perplexity: 1.0127002288886564
Test Perplexity: inf


In [44]:
context = ['I', 'am', 'a', 'good']
print(top_k(context, ngram_probabilities, 4, 2, 1))


No token possible in context
['I', 'am', 'a', 'good']
