In [12]:
import io
import pandas as pd
import re
import os
import nltk
from nltk.corpus import cmudict

# Download CMUdict if not already available
nltk.download('cmudict')
cmu_dict = cmudict.dict()


corpus = '/content/sample_data/corpus.csv'
output_dir = '/content/Data'
os.makedirs(output_dir, exist_ok=True)

# Function to read the CSV file line by line
def read_csv_line_by_line(file_path):
    punctuation_removal = re.compile(r'[^\w\s]')

    # Reading the CSV with pandas, assuming 'content' is the column containing text
    for chunk in pd.read_csv(file_path, chunksize=1):
        if 'content' in chunk:
            content = chunk['content'].values[0]
            if isinstance(content, str):
                # Remove punctuation
                cleaned_content = punctuation_removal.sub('', content).lower()
                yield cleaned_content

# Tokenizer definitions

#Word-Based tokenizer

def word_tokenize(text):
    # Tokenizes text into words and punctuation
    return re.findall(r'\b\w+\b|\S', text)

# Character-based tokenizer
def character_tokenize(text):
    return list(text)

# Simplified sub-word tokenizer (manual split without external libraries)
def subword_tokenize(text):
    words = text.split()
    subwords = []
    for word in words:
        if len(word) > 5:  # Arbitrarily split longer words into parts
            mid = len(word) // 2
            subwords.extend([word[:mid], word[mid:]])
        else:
            subwords.append(word)
    return subwords

# Syllable-based tokenizer (very simplified version)
def syllable_tokenize(text):
    # Function to extract syllables based on CMUdict
    def extract_syllables(word):
        # Normalize to lowercase
        word = word.lower()
        if word in cmu_dict:
            # Extract the phoneme sequence for the first pronunciation
            syllables = [''.join([char for char in phon if not char.isdigit()]) for phon in cmu_dict[word][0]]
            # Return a flat list of syllables
            return ' '.join(syllables)
        else:
            # Fallback to a rough heuristic if the word is not in the dictionary
            return ' '.join([char for char in word if char.isalpha()])

    # Split the text into words and process each word for syllables
    words = re.findall(r'\b\w+\b', text)
    syllable_list = [extract_syllables(word) for word in words]
    return ' '.join(syllable_list)

# Buffered writing function
def buffered_write(file_path, generator, mode='w'):
    with io.open(file_path, mode, buffering=1024) as f:
        for data in generator:
            # Ensure data is a string before writing
            if isinstance(data, list):
                data = ' '.join(data)
            f.write(data + '\n')

# File paths for output
word_output_path = os.path.join(output_dir, 'word_tokens.txt')
char_output_path = os.path.join(output_dir, 'char_tokens.txt')
subword_output_path = os.path.join(output_dir, 'subword_tokens.txt')
syllable_output_path = os.path.join(output_dir, 'syllable_tokens.txt')



# Process and write tokens to files
buffered_write(word_output_path, (word_tokenize(line) for line in read_csv_line_by_line(corpus)))
buffered_write(char_output_path, (character_tokenize(line) for line in read_csv_line_by_line(corpus)))
buffered_write(subword_output_path, (subword_tokenize(line) for line in read_csv_line_by_line(corpus)))
buffered_write(syllable_output_path, (syllable_tokenize(line) for line in read_csv_line_by_line(corpus)))



[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [13]:
# Confirm file paths
print(word_output_path, char_output_path, subword_output_path, syllable_output_path)

/content/Data/word_tokens.txt /content/Data/char_tokens.txt /content/Data/subword_tokens.txt /content/Data/syllable_tokens.txt


In [14]:
from collections import defaultdict
import pickle
word_model_output_dir = '/content/Models/Word'
char_model_output_dir = '/content/Models/Char'
sw_model_output_dir = '/content/Models/Sw'
sy_model_output_dir = '/content/Models/Sy'
os.makedirs(word_model_output_dir, exist_ok=True)
os.makedirs(char_model_output_dir, exist_ok=True)
os.makedirs(sw_model_output_dir, exist_ok=True)
os.makedirs(sy_model_output_dir, exist_ok=True)
# Function to generate n-grams from tokenized data
def generate_ngrams(tokens, n):
    return (tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1))

# Function to build the n-gram model
def build_ngram_model(file_path, n):
    ngram_model = defaultdict(lambda: defaultdict(int))
    with io.open(file_path, 'r', buffering=1024) as file:
        for line in file:
            tokens = line.strip().split()
            for i in range(len(tokens) - n + 1):
                context = tuple(tokens[i:i+n-1])
                word = tokens[i+n-1]
                ngram_model[context][word] += 1

    # Convert counts to probabilities
    for context, words in ngram_model.items():
        total = sum(words.values())
        for word in words:
            words[word] /= total

    return ngram_model

# Build and test the model with different N values using the different tokenization tokens
nList = [2,3,4,5]

w2gram_model_path = os.path.join(word_model_output_dir, 'word-based2Gram.pkl')
w3gram_model_path = os.path.join(word_model_output_dir, 'word-based3Gram.pkl')
w4gram_model_path = os.path.join(word_model_output_dir, 'word-based4Gram.pkl')
w5gram_model_path = os.path.join(word_model_output_dir, 'word-based5Gram.pkl')
w_List = [w2gram_model_path, w3gram_model_path, w4gram_model_path, w5gram_model_path]

c2gram_model_path = os.path.join(char_model_output_dir, 'char-based2Gram.pkl')
c3gram_model_path = os.path.join(char_model_output_dir, 'char-based3Gram.pkl')
c4gram_model_path = os.path.join(char_model_output_dir, 'char-based4Gram.pkl')
c5gram_model_path = os.path.join(char_model_output_dir, 'char-based5Gram.pkl')
c_List = [c2gram_model_path, c3gram_model_path, c4gram_model_path, c5gram_model_path]

sw2gram_model_path = os.path.join(sw_model_output_dir, 'subWord-based2Gram.pkl')
sw3gram_model_path = os.path.join(sw_model_output_dir, 'subWord-based3Gram.pkl')
sw4gram_model_path = os.path.join(sw_model_output_dir, 'subWord-based4Gram.pkl')
sw5gram_model_path = os.path.join(sw_model_output_dir, 'subWord-based5Gram.pkl')
sw_List = [sw2gram_model_path, sw3gram_model_path, sw4gram_model_path, sw5gram_model_path]

sy2gram_model_path = os.path.join(sy_model_output_dir, 'syllable-based2Gram.pkl')
sy3gram_model_path = os.path.join(sy_model_output_dir, 'syllable-based3Gram.pkl')
sy4gram_model_path = os.path.join(sy_model_output_dir, 'syllable-based4Gram.pkl')
sy5gram_model_path = os.path.join(sy_model_output_dir, 'syllable-based5Gram.pkl')
sy_List = [sy2gram_model_path, sy3gram_model_path, sy4gram_model_path, sy5gram_model_path]

def process_tokenizations(nList, tokenization_lists, tokenization_paths):
    for tokenization_list, tokenization_path in zip(tokenization_lists, tokenization_paths):
        for n, model_path in zip(nList, tokenization_list):
            model = build_ngram_model(tokenization_path, n)
            with open(model_path, 'wb') as f:
                pickle.dump(dict(model), f)

# List of tokenization output paths
tokenization_paths = [word_output_path,char_output_path, subword_output_path, syllable_output_path]
tokenization_lists = [w_List,c_List, sw_List, sy_List]

# Process each tokenization method
process_tokenizations(nList, tokenization_lists, tokenization_paths)




In [15]:
model_path = '/content/Models/Word/word-based5Gram.pkl'
with open(model_path, 'rb') as file:
    model = pickle.load(file)


# Print the first few entries of the model to see examples of stored n-grams
print(list(model.keys())[:10])

def print_model_statistics(model):
    total_entries = len(model)
    sample_entries = list(model.items())[:10]

    print(f"Total number of unique contexts in model: {total_entries}")
    print("Sample entries from model:")
    for context, continuations in sample_entries:
        print(f"{context} -> {continuations}")

    # Check average number of continuations per context
    average_continuations = sum(len(continuations) for continuations in model.values()) / total_entries
    print(f"Average number of continuations per context: {average_continuations:.2f}")

# Assuming 'model' is already loaded
print_model_statistics(model)

[('turn', 'passion', 'into', 'purpose'), ('passion', 'into', 'purpose', 'at'), ('into', 'purpose', 'at', 'st'), ('purpose', 'at', 'st', 'johns'), ('at', 'st', 'johns', 'university'), ('st', 'johns', 'university', 'new'), ('johns', 'university', 'new', 'york'), ('university', 'new', 'york', 'ny'), ('new', 'york', 'ny', 'hi'), ('york', 'ny', 'hi', 'im')]
Total number of unique contexts in model: 253926
Sample entries from model:
('turn', 'passion', 'into', 'purpose') -> defaultdict(<class 'int'>, {'at': 0.3333333333333333, 'explore': 0.3333333333333333, 'plan': 0.3333333333333333})
('passion', 'into', 'purpose', 'at') -> defaultdict(<class 'int'>, {'st': 1.0})
('into', 'purpose', 'at', 'st') -> defaultdict(<class 'int'>, {'johns': 1.0})
('purpose', 'at', 'st', 'johns') -> defaultdict(<class 'int'>, {'university': 1.0})
('at', 'st', 'johns', 'university') -> defaultdict(<class 'int'>, {'new': 0.011396011396011397, 'we': 0.011396011396011397, 'who': 0.002849002849002849, 'why': 0.002849002

In [16]:
from collections import deque
import random
# Define test prompts


def predict_text(model, prompt, num_words=2, max_context_length=5):
    words = prompt.split()
    output = prompt
    for _ in range(num_words):
        found = False
        # Try from the longest context down to two words
        for length in range(min(max_context_length, len(words)), 1, -1):
            context = tuple(words[-length:])
            if context in model:
                # Selecting the continuation probabilistically
                continuations = list(model[context].items())
                next_word = max(continuations, key=lambda x: x[1])[0]
                output += ' ' + next_word
                words.append(next_word)
                found = True
                break  # Exit the loop if a continuation is found
        if not found:
            output += " [No suitable continuation found.]"
            break  # Stop trying if no suitable continuation is found at any level

    return output



In [17]:
def test_models(directory, prompts):
    results = {}
    model_files = [f for f in os.listdir(directory) if f.endswith('.pkl')]

    for model_file in model_files:
        model_path = os.path.join(directory, model_file)
        try:
            with open(model_path, 'rb') as file:
                model = pickle.load(file)
        except Exception as e:
            print(f"Failed to load model {model_file}: {e}")
            continue

        model_results = {}
        for prompt in prompts:
            completion = predict_text(model, prompt, num_words=20)
            model_results[prompt] = completion
        results[model_file] = model_results

    return results

# Testing usage with Word based Models
models_directory = '/content/Models/Word'
prompts = [
    "in the fall semester the business department offers courses such as",
    "to apply for financial aid and scholarships, students must first",
    "the process for undergraduate admissions includes steps like",
    "graduate students have research opportunities in areas like",
    "on campus housing options provide amenities such as",
    "the universitys policies on academic integrity include guidelines like",
    "students can engage in extracurricular activities such as",
    "support for international students includes services like",
    "submitting a thesis or dissertation requires steps such as",
    "the university's career center offers services like"
]
model_test_results = test_models(models_directory, prompts)

for model_name, results in model_test_results.items():
    print(f"Results for {model_name}:")
    for prompt, completion in results.items():
        print(f"Prompt: {prompt}\nCompletion: {completion}\n")

Results for word-based5Gram.pkl:
Prompt: in the fall semester the business department offers courses such as
Completion: in the fall semester the business department offers courses such as [No suitable continuation found.]

Prompt: to apply for financial aid and scholarships, students must first
Completion: to apply for financial aid and scholarships, students must first [No suitable continuation found.]

Prompt: the process for undergraduate admissions includes steps like
Completion: the process for undergraduate admissions includes steps like [No suitable continuation found.]

Prompt: graduate students have research opportunities in areas like
Completion: graduate students have research opportunities in areas like [No suitable continuation found.]

Prompt: on campus housing options provide amenities such as
Completion: on campus housing options provide amenities such as [No suitable continuation found.]

Prompt: the universitys policies on academic integrity include guidelines like
C

# Char Based Testing #

In [18]:
    # Testing usage with Word based Models
models_directory = '/content/Models/Char'
prompts = [
   "i n   t h e   f a l l   s e m e s t e r   t h e   b u s i n e s s   d e p a r t m e n t   o f f e r s   c o u r s e s   s u c h   a s",
    "t o   a p p l y   f o r   f i n a n c i a l   a i d   a n d   s c h o l a r s h i p s ,   s t u d e n t s   m u s t   f i r s t",
    "t h e   p r o c e s s   f o r   u n d e r g r a d u a t e   a d m i s s i o n s   i n c l u d e s   s t e p s   l i k e",
    "g r a d u a t e   s t u d e n t s   h a v e   r e s e a r c h   o p p o r t u n i t i e s   i n   a r e a s   l i k e",
    "o n  c a m p u s   h o u s i n g   o p t i o n s   p r o v i d e   a m e n i t i e s   s u c h   a s",
    "t h e   u n i v e r s i t y ' s   p o l i c i e s   o n   a c a d e m i c   i n t e g r i t y   i n c l u d e   g u i d e l i n e s   l i k e",
    "s t u d e n t s   c a n   e n g a g e   i n   e x t r a c u r r i c u l a r   a c t i v i t i e s   s u c h   a s",
    "s u p p o r t   f o r   i n t e r n a t i o n a l   s t u d e n t s   i n c l u d e s   s e r v i c e s   l i k e",
    "s u b m i t t i n g   a   t h e s i s   o r   d i s s e r t a t i o n   r e q u i r e s   s t e p s   s u c h   a s",
    "t h e   u n i v e r s i t y ' s   c a r e e r   c e n t e r   o f f e r s   s e r v i c e s   l i k e"

]
model_test_results = test_models(models_directory, prompts)

for model_name, results in model_test_results.items():
  print(f"Results for {model_name}:")
  for prompt, completion in results.items():
        print(f"Prompt: {prompt}\nCompletion: {completion}\n")

Results for char-based4Gram.pkl:
Prompt: i n   t h e   f a l l   s e m e s t e r   t h e   b u s i n e s s   d e p a r t m e n t   o f f e r s   c o u r s e s   s u c h   a s
Completion: i n   t h e   f a l l   s e m e s t e r   t h e   b u s i n e s s   d e p a r t m e n t   o f f e r s   c o u r s e s   s u c h   a s a m e l a t i o n a l s t u d e n t s a

Prompt: t o   a p p l y   f o r   f i n a n c i a l   a i d   a n d   s c h o l a r s h i p s ,   s t u d e n t s   m u s t   f i r s t
Completion: t o   a p p l y   f o r   f i n a n c i a l   a i d   a n d   s c h o l a r s h i p s ,   s t u d e n t s   m u s t   f i r s t u d e n t s a n d s c i e n c e s s i o

Prompt: t h e   p r o c e s s   f o r   u n d e r g r a d u a t e   a d m i s s i o n s   i n c l u d e s   s t e p s   l i k e
Completion: t h e   p r o c e s s   f o r   u n d e r g r a d u a t e   a d m i s s i o n s   i n c l u d e s   s t e p s   l i k e h a v e a n d s c i e n c e s s i o n a

Prompt: g r a d u a 

# Subword Based Testing #

In [19]:


models_directory = '/content/Models/Sw'
prompts = [
   "in the fall sem ester the busi ness depart ment off ers cour ses such as",
    "to apply for finan cial aid and schol ar ships, stu dents must first",
    "the proc ess for under gradu ate admi ssions inclu des steps like",
    "gradu ate stu dents have rese arch oppor tunit ies in areas like",
    "on campus hous ing opt ions prov ide amen ities such as",
    "the univer sity's poli cies on acad emic integ rity inclu de guide lines like",
    "stu dents can eng age in extra curri cular activ ities such as",
    "sup port for intern ational stu dents inclu des serv ices like",
    "sub mit ting a thesis or disser tation requ ires steps such as",
    "the univer sity's car eer cent er off ers serv ices like"
]
model_test_results = test_models(models_directory, prompts)

for model_name, results in model_test_results.items():
  print(f"Results for {model_name}:")
  for prompt, completion in results.items():
        print(f"Prompt: {prompt}\nCompletion: {completion}\n")

Results for subWord-based4Gram.pkl:
Prompt: in the fall sem ester the busi ness depart ment off ers cour ses such as
Completion: in the fall sem ester the busi ness depart ment off ers cour ses such as [No suitable continuation found.]

Prompt: to apply for finan cial aid and schol ar ships, stu dents must first
Completion: to apply for finan cial aid and schol ar ships, stu dents must first [No suitable continuation found.]

Prompt: the proc ess for under gradu ate admi ssions inclu des steps like
Completion: the proc ess for under gradu ate admi ssions inclu des steps like [No suitable continuation found.]

Prompt: gradu ate stu dents have rese arch oppor tunit ies in areas like
Completion: gradu ate stu dents have rese arch oppor tunit ies in areas like [No suitable continuation found.]

Prompt: on campus hous ing opt ions prov ide amen ities such as
Completion: on campus hous ing opt ions prov ide amen ities such as hon ors athl etic awa rds recog nition for exem plary partic ipati

# Syllable Based Testing #


In [20]:
   # Testing usage with Syllable based Models
models_directory = '/content/Models/Sy'
prompts = [
   "in the fall sem es ter the busi ness de part ment of fers cours es such as",
    "to ap ply for fi nan cial aid and schol ar ships stu dents must first",
    "the proc ess for un der grad u ate ad mis sions in cludes steps like",
    "grad u ate stu dents have re search op por tu ni ties in ar eas like",
    "on cam pus hous ing op tions pro vide am en i ties such as",
    "the u ni ver si ty's pol i cies on a ca dem ic in teg ri ty in clude guide lines like",
    "stu dents can en gage in ex tra cur ric u lar ac tiv i ties such as",
    "sup port for in ter na tion al stu dents in cludes serv ices like",
    "sub mit ting a the sis or dis ser ta tion re quires steps such as",
    "the u ni ver si ty's ca reer cen ter of fers serv ices like"
]
model_test_results = test_models(models_directory, prompts)

for model_name, results in model_test_results.items():
    print(f"Results for {model_name}:")
    for prompt, completion in results.items():
        print(f"Prompt: {prompt}\nCompletion: {completion}\n")

Results for syllable-based3Gram.pkl:
Prompt: in the fall sem es ter the busi ness de part ment of fers cours es such as
Completion: in the fall sem es ter the busi ness de part ment of fers cours es such as [No suitable continuation found.]

Prompt: to ap ply for fi nan cial aid and schol ar ships stu dents must first
Completion: to ap ply for fi nan cial aid and schol ar ships stu dents must first [No suitable continuation found.]

Prompt: the proc ess for un der grad u ate ad mis sions in cludes steps like
Completion: the proc ess for un der grad u ate ad mis sions in cludes steps like [No suitable continuation found.]

Prompt: grad u ate stu dents have re search op por tu ni ties in ar eas like
Completion: grad u ate stu dents have re search op por tu ni ties in ar eas like [No suitable continuation found.]

Prompt: on cam pus hous ing op tions pro vide am en i ties such as
Completion: on cam pus hous ing op tions pro vide am en i ties such as [No suitable continuation found.]

Prom

# Results #
These indicates syllable based tokenization may be innefective for this particular range of N values.  The word based tokenization technique works most effectively, particularly at the 3Gram level. The character and subword based approaches also yielded some suitable predictions across the current range of n values. Ultimately, a more in depth corpus would yield more successful results.