# NLTK TOKENIZER AND TF-IDF VECTORIZER

In [1]:
import sys
sys.path.append('../')
from tokenizer import tokenizer
from vectorizer import vectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import sklearn as sk
from collections import Counter
import re

[nltk_data] Downloading package punkt to /home/alexandre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/alexandre/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
data = pd.read_csv('../../_data/Reviews.csv') # Loading the dataset
X, y = data['Text'], data['Score'] # Separating the data into input and output

# Function to tokenize each review in X_train
# puts them in a list
def tokenize_by_space(X_train):
    token_list = [review.split() for review in X_train]
    flattened_X_train = [word for review in token_list for word in review]
    return flattened_X_train

def tokenize_by_space_and_lowercase(X_train):
    token_list = [review.lower().split() for review in X_train]
    flattened_X_train = [word for review in token_list for word in review]
    return flattened_X_train

def tokenize_by_space_lowercase_and_punctuation(X_train):
    tokens_with_punctuation = []
    for token in X_train:
        token_lower = token.lower()
        tokens_with_punctuation.extend(re.findall(r"[\w']+|[^\w\s]", token_lower))
    return tokens_with_punctuation

smaller_X = X[:100000] # Add [:100] to get only the first 100 rows of the dataset
X_train, X_test = train_test_split(smaller_X, test_size=0.2, random_state=42) #split into X_train and X_test 




In [35]:
test = tokenize_by_space(X_test)
train = tokenize_by_space(X_train)


In [36]:
print(train[:100])
print(len(train))

['We', 'have', 'two', 'St.', 'Bernards,', 'one', 'of', 'whom', 'is', 'close', 'to', 'ancient,', 'and', 'last', 'year', 'she', 'developed', 'a', 'dental', 'problem.', 'Someone', 'mentioned', 'GREENIES', 'and', 'off', 'we', 'went', 'to', 'find', 'our', 'old', 'girl', 'something', 'to', 'help', 'her', 'age', 'gracefully', 'and', 'hopefully', 'without', 'pain.', 'Well,', 'she', 'just', 'loved', 'them', '-', 'as', 'did', 'the', 'younger', 'Saint', '-', 'and', 'here', 'we', 'are', 'a', 'couple', 'of', 'thousand', 'dollars', 'later', 'wondering', 'how', 'to', 'wean', 'at', 'least', 'the', 'younger', 'dog', 'off', 'these', 'obviously', 'addictive', 'green', 'hard', 'gel', 'bones', 'that', "don't", 'have', 'a', 'single', 'expensive', 'ingredient', 'in', 'them.', 'The', 'packaging,', 'however,', 'is', 'nicer', 'than', 'that', 'of', 'most', 'department']
6496499


# N-grams
### Model starts here

In [37]:
#Import all libraries
from collections import defaultdict
from collections import  defaultdict
import math
import random

In [38]:
def Create_And_Count_ngrams(tokens, n):
    # Creates n-grams and counts their occurrences
    ngram_counts = defaultdict(int)
    
    for i in range(len(tokens) - n + 1):
        ngram = tuple(tokens[i:i+n])
        ngram_counts[ngram] += 1
    
    return ngram_counts
    

def add_laplace(ngram, k):
    # Applies Laplace smoothing 
    for key in ngram:
        ngram[key] += k

def Create_ngram(train, test, n, k=0.00001):
# Creates the needed n-gram
    
    token_set = set(train)  # To not have duplicate tokens
    len_token_set = len(token_set)
    
    # Step 1: Count n-grams and (n-1)-grams 
    tmp_ng_with_count = Create_And_Count_ngrams(train, n)
    prefix_count = Create_And_Count_ngrams(train, n - 1)
    final_ng = defaultdict(float)

    # Step 2: Apply Laplace smoothing
    add_laplace(tmp_ng_with_count, k)
    add_laplace(prefix_count, k)
   
    # Step 3: Calculate n-gram probabilities
    for ng in tmp_ng_with_count:
        final_ng[ng] = (tmp_ng_with_count[ng] + k) / (prefix_count[ng[:-1]] + k * len_token_set)

    # Step 4: Fixing unseen data that exists in the test set
    for i in range(len(test) - n + 1):
        ng = tuple(test[i:i + n])
        if ng not in tmp_ng_with_count:
            final_ng[ng] = k / (prefix_count[ng[:-1]] + k * len_token_set)
    return final_ng

def greedy_sampling(context, ngram_probabilities, n, max_length=50):
    # Take the most probable option only
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check is context toot small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # For greedy algorithm
        best_token = max(probs, key=probs.get)  

        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

import random

def top_k(context, ngram_probabilities, n, max_length=50, k=1):
    # Top k version. if k = 1, same result as greedy
    sentence = []
    sentence.extend(tuple(context[-(n):]))

    # Check if context too small
    if len(context) < (n-1):
        print("context too small")
        return sentence

    context = tuple(context[-(n-1):])  
    
    for _ in range(max_length):
        probs = {}
        
        for token in ngram_probabilities:
            if token[:n-1] == context:
                probs[token[-1]] = ngram_probabilities[token]

        if not probs:
            print("No token possible in context")
            return sentence

        # Selecting top k tokens probabilistically
        top_k_tokens = sorted(probs, key=probs.get, reverse=True)[:k]

        token_weights = [probs[token] for token in top_k_tokens]
        best_token = random.choices(top_k_tokens, weights=token_weights, k=1)[0]


        if best_token not in probs:
            print("No best token possible")
            return sentence

        sentence.append(best_token)
        context = context[1:] + (best_token,) 

    return sentence

def calculate_perplexity(test, ngram, n):
    # Calculates perplexity
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test)-n+1):
        log_probability_sum += math.log2(ngram[tuple(test[i:i+n])])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

def calculate_coverage(test_data, ngram_probabilities):
    # Count the number of n-grams in the test data
    total_ngrams = len(test_data) - (n - 1) * len(test_data)
    
    # Check if there are n-grams in the test data
    if total_ngrams == 0:
        return 0
    
    # Count the number of n-grams covered by the n-gram model
    covered_ngrams = sum(1 for ng in test_data if ng in ngram_probabilities)
    
    # Calculate coverage percentage
    coverage_percentage = (covered_ngrams / total_ngrams) * 100
    
    return coverage_percentage



In [50]:
n = 5 # Choose the desired n-gram size
k = 0.0001  # Choose the desired smoothing factor
k2 = 5  # Choose the desired top k value


# Create n-gram probabilities using the dense matrix
ngram_probabilities = Create_ngram(train,test, n, k)

# Use ngram_probabilities for further processing, such as perplexity calculation or sampling

In [51]:
from collections import Counter

# Calculate perplexity for the training data
train_perplexity = calculate_perplexity(train, ngram_probabilities, n)
print(f"Training Perplexity: {train_perplexity}")

# Calculate perplexity for the test data
test_perplexity = calculate_perplexity(test, ngram_probabilities, n)
print(f"Test Perplexity: {test_perplexity}")

Training Perplexity: 17.45639959537635
Test Perplexity: 23722.516059924772


In [52]:
context = ['I','am','very','satisfied','with']
print(top_k(context, ngram_probabilities, n, 50, k2))


['I', 'am', 'very', 'satisfied', 'with', 'the', 'product', 'and', 'the', 'lightning', "delivery...I've", 'shopped', 'thru', 'Amazon', 'for', 'years', 'and', 'this', 'is', 'the', 'only', 'one', 'I', 'use.', 'Not', 'as', 'tasty', 'as', 'the', 'other', 'reviewers', 'have', 'said,', 'is', 'great', 'to', 'mix', 'into', 'other', 'foods.', "I've", 'tried', 'to', 'reconstitute', 'it', 'in', 'order', 'to', 'really', 'enjoy', 'the', 'depth.', 'I', 'like', 'the']


In [54]:
context = ['I','am','very','satisfied','with']
print(greedy_sampling(context, ngram_probabilities, n))

['I', 'am', 'very', 'satisfied', 'with', 'the', 'taste', 'and', 'texture', 'of', 'this', 'pasta', 'is', 'beyond', 'compare.', 'I', 'have', 'not', 'bought', 'store', 'pasta', 'since', 'and', 'will', 'never', 'buy', 'from', 'a', 'grocery', 'store', 'again.<br', '/>THE', 'BEST', 'PASTA', 'EVER!', "Lucy's", 'is', 'one', 'of', 'the', 'best', 'tasting', 'bars', "I've", 'eaten...', 'ever.', 'I', 'love', 'the', 'tangy', 'taste', 'of', 'the', 'cherry', 'and']
