# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import sklearn as sk
from collections import Counter
import re

In [2]:
data = pd.read_csv('../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score'] # Separating the data into input and output

# Function to tokenize each review in X_train
# puts them in a list
def tokenize_by_space(X_train):
    token_list = [review.split() for review in X_train]
    flattened_X_train = [word for review in token_list for word in review]
    return flattened_X_train

def tokenize_by_space_and_lowercase(X_train):
    token_list = [review.lower().split() for review in X_train]
    flattened_X_train = [word for review in token_list for word in review]
    return flattened_X_train

def tokenize_by_space_lowercase_and_punctuation(X_train):
    tokens_with_punctuation = []
    for token in X_train:
        token_lower = token.lower()
        tokens_with_punctuation.extend(re.findall(r"[\w']+|[^\w\s]", token_lower))
    return tokens_with_punctuation

smaller_X = X[:100000] # Add [:100] to get only the first 100 rows of the dataset
X_train, X_test = train_test_split(smaller_X, test_size=0.2, random_state=42) #split into X_train and X_test 




In [3]:
test_config_1 = tokenize_by_space(X_test)
train_config_1 = tokenize_by_space(X_train)
test_config_2 = tokenize_by_space_and_lowercase(X_test)
train_config_2 = tokenize_by_space_and_lowercase(X_train)
test_config_3 = tokenize_by_space_lowercase_and_punctuation(X_test)
train_config_3 = tokenize_by_space_lowercase_and_punctuation(X_train)

# N-grams
### Model starts here

In [4]:
#Import all libraries
from collections import defaultdict
from collections import  defaultdict
import math
import random

In [5]:
def Create_And_Count_ngrams(tokens, n):
    # Creates n-grams and counts their occurrences
    ngram_counts = defaultdict(int)
    
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngram_counts[ngram] += 1
    
    return ngram_counts
    

def add_laplace(ngram, k):
    # Applies Laplace smoothing 
    for key in ngram:
        ngram[key] += k

def Create_ngram(train, test, n, k=0.00001):
# Creates the needed n-gram
    
    token_set = set(train)  # To not have duplicate tokens
    len_token_set = len(token_set)
    
    # Step 1: Count n-grams and (n-1)-grams 
    tmp_ng_with_count = Create_And_Count_ngrams(train, n)
    prefix_count = Create_And_Count_ngrams(train, n - 1)
    final_ng = defaultdict(float)

    # Step 2: Apply Laplace smoothing
    add_laplace(tmp_ng_with_count, k)
    add_laplace(prefix_count, k)
   
    # Step 3: Calculate n-gram probabilities
    for ng in tmp_ng_with_count:
        final_ng[ng] = (tmp_ng_with_count[ng] + k) / (prefix_count[ng[:-1]] + k * len_token_set)

    # Step 4: Fixing unseen data that exists in the test set
    for i in range(len(test) - n + 1):
        ng = tuple(test[i:i + n])
        if ng not in tmp_ng_with_count:
            final_ng[ng] = k / (prefix_count[ng[:-1]] + k * len_token_set)
    return final_ng

def greedy_sampling(context, ngram_probabilities, n, max_length=50):
    # Take the most probable option only
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check is context toot small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # For greedy algorithm
        best_token = max(probs, key=probs.get)  

        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

import random

def top_k(context, ngram_probabilities, n, max_length=50, k=1):
    # Top k version. if k = 1, same result as greedy
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check if context too small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # Selecting top k tokens probabilistically
        top_k_tokens = sorted(probs, key=probs.get, reverse=True)[:k]

        token_weights = [probs[token] for token in top_k_tokens]
        best_token = random.choices(top_k_tokens, weights=token_weights, k=1)[0]


        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

def calculate_perplexity(test, ngram, n):
    # Calculates perplexity
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test)-n+1):
        log_probability_sum += math.log2(ngram[tuple(test[i:i+n])])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

def calculate_coverage(test_data, ngram_probabilities):
    # Count the number of n-grams in the test data
    total_ngrams = len(test_data) - (n - 1) * len(test_data)
    
    # Check if there are n-grams in the test data
    if total_ngrams == 0:
        return 0
    
    # Count the number of n-grams covered by the n-gram model
    covered_ngrams = sum(1 for ng in test_data if ng in ngram_probabilities)
    
    # Calculate coverage percentage
    coverage_percentage = (covered_ngrams / total_ngrams) * 100
    
    return coverage_percentage

def create_and_test(train, test, n, k):
    # Create n-gram model
    ngram = Create_ngram(train, test, n, k)
    # Calculate train and test perplexity
    train_perplexity = calculate_perplexity(train, ngram, n)
    test_perplexity = calculate_perplexity(test, ngram, n)
    # print
    print(f"Train perplexity: {train_perplexity} with k={k} and n={n}") 
    print(f"Test perplexity: {test_perplexity} with k={k} and n={n}")
    return ngram


In [None]:
print("Tokenize by space")
create_and_test(train_config_1, test_config_1, 2, 1)
create_and_test(train_config_1, test_config_1, 2, 0.0001)
create_and_test(train_config_1, test_config_1, 3, 1)
create_and_test(train_config_1, test_config_1, 3, 0.0001)
create_and_test(train_config_1, test_config_1, 5, 1)
conf1_6 = create_and_test(train_config_1, test_config_1, 5, 0.0001)

Tokenize by space
Train perplexity: 4016.246389511696 with k=1 and n=2
Test perplexity: 5010.116266825135 with k=1 and n=2
Train perplexity: 108.89481486689421 with k=0.0001 and n=2
Test perplexity: 512.7857387543766 with k=0.0001 and n=2
Train perplexity: 25739.50823889081 with k=1 and n=3
Test perplexity: 44527.349299523026 with k=1 and n=3
Train perplexity: 29.134200733167308 with k=0.0001 and n=3
Test perplexity: 1811.7597535961013 with k=0.0001 and n=3


: 

Train perplexity: 17.45639959537635 with k=0.0001 and n=5
Test perplexity: 23722.516059924772 with k=0.0001 and n=5


In [6]:
print("Tokenize by space")
conf1_1 = create_and_test(train_config_1, test_config_1, 2, 1)
conf1_2 = create_and_test(train_config_1, test_config_1, 2, 0.0001)
conf1_3 = create_and_test(train_config_1, test_config_1, 3, 1)
conf1_4 = create_and_test(train_config_1, test_config_1, 3, 0.0001)
conf1_5 = create_and_test(train_config_1, test_config_1, 5, 1)
conf1_6 = create_and_test(train_config_1, test_config_1, 5, 0.0001)


Tokenize by space
Train perplexity: 4016.246389511696 with k=1 and n=2
Test perplexity: 5010.116266825135 with k=1 and n=2
Train perplexity: 108.89481486689421 with k=0.0001 and n=2
Test perplexity: 512.7857387543766 with k=0.0001 and n=2
Train perplexity: 25739.50823889081 with k=1 and n=3
Test perplexity: 44527.349299523026 with k=1 and n=3
Train perplexity: 29.134200733167308 with k=0.0001 and n=3
Test perplexity: 1811.7597535961013 with k=0.0001 and n=3
Train perplexity: 57298.18756943105 with k=1 and n=5
Test perplexity: 140665.09498627184 with k=1 and n=5


: 

In [None]:
print("Tokenize by space and lowercase all capital letters")
conf2_1 = create_and_test(train_config_2, test_config_2, 2, 1)
conf2_2 = create_and_test(train_config_2, test_config_2, 2, 0.0001)
conf2_3 = create_and_test(train_config_2, test_config_2, 3, 1)
conf2_4 = create_and_test(train_config_2, test_config_2, 3, 0.0001)
conf2_5 = create_and_test(train_config_2, test_config_2, 5, 1)
conf2_6 = create_and_test(train_config_2, test_config_2, 5, 0.0001)

In [None]:
print("Tokenize by space and punctuation and lowercase all capital letter")
conf3_1 = create_and_test(train_config_3, test_config_3, 2, 1)
conf3_2 = create_and_test(train_config_3, test_config_3, 2, 0.0001)
conf3_3 = create_and_test(train_config_3, test_config_3, 3, 1)
conf3_4 = create_and_test(train_config_3, test_config_3, 3, 0.0001)
conf3_5 = create_and_test(train_config_3, test_config_3, 5, 1)
conf3_6 = create_and_test(train_config_3, test_config_3, 5, 0.0001)

In [52]:
# More explanation on the effect of different tokenizer and other parameters have on the n-gram model text generation can be found in the README

context = ['I','am','very','satisfied','with']
print(top_k(context, conf1_6, 5, 50, 5))


['I', 'am', 'very', 'satisfied', 'with', 'the', 'product', 'and', 'the', 'lightning', "delivery...I've", 'shopped', 'thru', 'Amazon', 'for', 'years', 'and', 'this', 'is', 'the', 'only', 'one', 'I', 'use.', 'Not', 'as', 'tasty', 'as', 'the', 'other', 'reviewers', 'have', 'said,', 'is', 'great', 'to', 'mix', 'into', 'other', 'foods.', "I've", 'tried', 'to', 'reconstitute', 'it', 'in', 'order', 'to', 'really', 'enjoy', 'the', 'depth.', 'I', 'like', 'the']


In [54]:
context = ['I','am','very','satisfied','with']
print(greedy_sampling(context, conf1_6, 5))

['I', 'am', 'very', 'satisfied', 'with', 'the', 'taste', 'and', 'texture', 'of', 'this', 'pasta', 'is', 'beyond', 'compare.', 'I', 'have', 'not', 'bought', 'store', 'pasta', 'since', 'and', 'will', 'never', 'buy', 'from', 'a', 'grocery', 'store', 'again.<br', '/>THE', 'BEST', 'PASTA', 'EVER!', "Lucy's", 'is', 'one', 'of', 'the', 'best', 'tasting', 'bars', "I've", 'eaten...', 'ever.', 'I', 'love', 'the', 'tangy', 'taste', 'of', 'the', 'cherry', 'and']
