In [1]:
import os
import re

In [2]:
DATASET_PATH = '/Users/srivatsasinha/Desktop/assignment/anlp-monsoon-24/dataset/processed/Auguste_Maquet.txt'

In [3]:
def remove_non_alphanumeric(text):
    # Use regular expression to keep alphanumeric characters, spaces, tabs, and periods, and remove newlines
    cleaned_text = re.sub(r'[^\w\s\t.]', '', text)
    # Remove newline characters
    cleaned_text = cleaned_text.replace('\n', '')
    if cleaned_text == '':
        cleaned_text = ' '
    return cleaned_text

def find_chapter_indices(list_of_lists):
    indices = []
    for i, sublist in enumerate(list_of_lists):
        # Check if 'chapter' is in any of the elements of the sublist
        if any("chapter" in str(element).lower() for element in sublist):
            indices.append(i)
    return indices

def delete_elements_at_indices(list_of_lists, indices):
    return [sublist for i, sublist in enumerate(list_of_lists) if i not in indices]

def delete_elements_of_len_less_than_k(list_of_lists, k):
    return [sublist for i, sublist in enumerate(list_of_lists) if len(sublist)>k]

def clean_data(dataset):
    dataset_cleaned = list(map(lambda x: remove_non_alphanumeric(x),dataset))
    dataset_cleaned = (' ').join(dataset_cleaned)
    dataset_cleaned = dataset_cleaned.split('.')
    dataset_cleaned = list(map(lambda x: x.strip(),dataset_cleaned))
    dataset_cleaned = list(map(lambda x: x.split(' '),dataset_cleaned))
    dataset_cleaned = delete_elements_at_indices(dataset_cleaned,find_chapter_indices(dataset_cleaned))
    dataset_cleaned = delete_elements_of_len_less_than_k(dataset_cleaned,6)
    return dataset_cleaned 

def generate_labeled_data(dataset_cleaned,train_size,test_size,context_size,left_pad=False):
    dataset_cleaned = ' '.join(list(map(lambda x: ' '.join(['<SOS>'] + x + ['<EOS>']),dataset_cleaned)))
    

In [4]:
import random

def generate_labeled_data(dataset_cleaned, train_size, test_size, context_size, left_pad=False):
    vocab_set = set()
    def create_context_target_pair(sequence, context_size, left_pad):
        vocab_set.update(sequence)
        context_target_pairs = []
        for i in range(len(sequence)):
            if left_pad:
                context = sequence[max(0, i - context_size):i]
                context = ['<UNK>'] * (context_size - len(context)) + context  # Left padding with 0s
            else:
                context = sequence[i:i+context_size+1]
            if(len(context)==6):
                context_target_pairs.append((context[:-1] + ['<UNK>'] * (context_size - len(context) + 1), context[-1]))
        
        return context_target_pairs
    
    # Shuffle the dataset to randomize the training and test sets
    # random.shuffle(dataset_cleaned)
    
    # Create labeled data pairs (context, target) for each sequence in the dataset
    labeled_data = []
    for sequence in dataset_cleaned:
        labeled_data.extend(create_context_target_pair(sequence, context_size, left_pad))
    
    # Split the data into training and testing sets
#     train_data = labeled_data[:train_size]
#     test_data = labeled_data[train_size:train_size + test_size]
    
    return labeled_data,vocab_set

In [5]:
with open(DATASET_PATH, 'r') as f:
    dataset = f.readlines()

In [6]:
dataset_cl = clean_data(dataset)

In [7]:
dataset_cl

[['In',
  'a',
  'splendid',
  'chamber',
  'of',
  'the',
  'Palais',
  'Royal',
  'formerly',
  'styled',
  'the',
  'Palais',
  'Cardinal',
  'a',
  'man',
  'was',
  'sitting',
  'in',
  'deep',
  'reverie',
  'his',
  'head',
  'supported',
  'on',
  'his',
  'hands',
  'leaning',
  'over',
  'a',
  'gilt',
  'and',
  'inlaid',
  'table',
  'which',
  'was',
  'covered',
  'with',
  'letters',
  'and',
  'papers'],
 ['Behind',
  'this',
  'figure',
  'glowed',
  'a',
  'vast',
  'fireplace',
  'alive',
  'with',
  'leaping',
  'flames',
  'great',
  'logs',
  'of',
  'oak',
  'blazed',
  'and',
  'crackled',
  'on',
  'the',
  'polished',
  'brass',
  'andirons',
  'whose',
  'flicker',
  'shone',
  'upon',
  'the',
  'superb',
  'habiliments',
  'of',
  'the',
  'lonely',
  'tenant',
  'of',
  'the',
  'room',
  'which',
  'was',
  'illumined',
  'grandly',
  'by',
  'twin',
  'candelabra',
  'rich',
  'with',
  'waxlights'],
 ['Any',
  'one',
  'who',
  'happened',
  'at',
  'th

In [8]:
train,vocab = generate_labeled_data(dataset_cl,20000,10000,5)

In [9]:
train

[(['In', 'a', 'splendid', 'chamber', 'of'], 'the'),
 (['a', 'splendid', 'chamber', 'of', 'the'], 'Palais'),
 (['splendid', 'chamber', 'of', 'the', 'Palais'], 'Royal'),
 (['chamber', 'of', 'the', 'Palais', 'Royal'], 'formerly'),
 (['of', 'the', 'Palais', 'Royal', 'formerly'], 'styled'),
 (['the', 'Palais', 'Royal', 'formerly', 'styled'], 'the'),
 (['Palais', 'Royal', 'formerly', 'styled', 'the'], 'Palais'),
 (['Royal', 'formerly', 'styled', 'the', 'Palais'], 'Cardinal'),
 (['formerly', 'styled', 'the', 'Palais', 'Cardinal'], 'a'),
 (['styled', 'the', 'Palais', 'Cardinal', 'a'], 'man'),
 (['the', 'Palais', 'Cardinal', 'a', 'man'], 'was'),
 (['Palais', 'Cardinal', 'a', 'man', 'was'], 'sitting'),
 (['Cardinal', 'a', 'man', 'was', 'sitting'], 'in'),
 (['a', 'man', 'was', 'sitting', 'in'], 'deep'),
 (['man', 'was', 'sitting', 'in', 'deep'], 'reverie'),
 (['was', 'sitting', 'in', 'deep', 'reverie'], 'his'),
 (['sitting', 'in', 'deep', 'reverie', 'his'], 'head'),
 (['in', 'deep', 'reverie', 'h

In [54]:
len(vocab)

27719

In [10]:
import tiktoken

In [21]:
enc = tiktoken.get_encoding("gpt2")

In [22]:
assert enc.decode(enc.encode("Hello world")) == "Hello world"

AttributeError: 'Encoding' object has no attribute 'vocab'

In [27]:
enc.max_token_value

50256