In [1]:
import numpy as np
import pandas as pd
import os
import sys
import re
import stanza
import nltk
from nltk.corpus import stopwords
from joblib import Parallel, delayed  # Parallel processing

from tqdm import tqdm

# Fetch the NLTK resources this notebook asks for. nltk.download() only logs a
# message when the package is already present and up to date.
nltk.download('stopwords')
nltk.download('punkt')

# Make the project's sibling `src` directory importable. The membership guard
# keeps the cell idempotent: re-running it no longer appends duplicate entries
# to sys.path.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

  backends.update(_get_backends("networkx.backends"))
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ghorbas1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ghorbas1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load the dataset and build the Stanza pipeline

In [4]:
# Resolve the data directory relative to the parent of the current working
# directory, so no absolute machine-specific path is hard-coded.
root_path = os.path.dirname(os.getcwd())
data_path = os.path.join(root_path, 'Data')

# Training data: the first CSV row is the header and the `id` column becomes
# the DataFrame index.
train_set = pd.read_csv(
    os.path.join(data_path, 'train_2025.csv'),
    header=0,
    index_col='id',
)

# English Stanza pipeline limited to the tokenizer and lemmatizer.
# use_gpu=True requests GPU execution; verbose=False silences the loading logs.
nlp = stanza.Pipeline(lang='en', processors='tokenize,lemma', use_gpu=True, verbose=False)

### Tokenize and lemmatize

In [5]:
 

def tokenize_and_normalize(text, stopwords, pipeline=None):
    """Tokenize, lemmatize and lowercase a text, then drop stop words.

    Every character that is neither a word character nor whitespace is
    removed before the text is handed to the NLP pipeline. Note that this
    also removes apostrophes, so a contraction such as "isn't" reaches the
    pipeline as "isnt".

    Parameters
    ----------
    text : str
        The raw text to normalize. Non-string input (for example the NaN
        pandas uses for a missing CSV cell) is treated as empty.
    stopwords : collection of str
        Lowercased lemmas to exclude from the result. A set gives the
        fastest membership test.
    pipeline : callable, optional
        Callable that takes a string and returns an object exposing
        ``iter_tokens()``, whose tokens expose ``words`` with a ``lemma``
        attribute. Defaults to the notebook-level Stanza pipeline ``nlp``.

    Returns
    -------
    list of str
        The lowercased lemmas of ``text`` in their original order, with
        stop words removed. Empty when there is nothing to tokenize.
    """
    # Missing values carry no tokens; avoid a TypeError inside re.sub.
    if not isinstance(text, str):
        return []

    text = re.sub(r'[^\w\s]', '', text)

    # Nothing left after stripping punctuation: skip the (expensive) pipeline
    # call instead of feeding it an empty string.
    if not text.strip():
        return []

    if pipeline is None:
        pipeline = nlp

    lemmas = []
    for token in pipeline(text).iter_tokens():
        for word in token.words:
            # Defensive: skip words for which the pipeline produced no lemma,
            # which would otherwise raise AttributeError on .lower().
            if word.lemma is None:
                continue
            lemma = word.lemma.lower()
            if lemma not in stopwords:
                lemmas.append(lemma)
    return lemmas
    
 


In [6]:
# English stop words as a set, so each membership test in the tokenizer is O(1).
stop_words = set(stopwords.words('english'))

# One space-joined lemma string per input text. " ".join([]) already yields ''
# for texts whose tokens were all removed, so no empty-list special case is
# needed.
completed_preprocessed_text = [
    " ".join(tokenize_and_normalize(sentence, stop_words))
    for sentence in tqdm(train_set.text)
]

# Put the new column at position 1, directly after `text`. DataFrame.insert
# raises ValueError when the column already exists, so overwrite it in place on
# a re-run to keep this cell idempotent.
if 'preprocessed_text' in train_set.columns:
    train_set['preprocessed_text'] = completed_preprocessed_text
else:
    train_set.insert(1, 'preprocessed_text', completed_preprocessed_text)


100%|██████████| 98637/98637 [36:42<00:00, 44.79it/s]  


In [8]:
# Persist the frame (with its `preprocessed_text` column) as a pickle so later
# sessions can load it instead of repeating the normalization step.
tokenized_path = os.path.join(data_path, 'train_2025_tokenized.pkl')
train_set.to_pickle(tokenized_path)
train_set

Unnamed: 0_level_0,text,preprocessed_text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
eng_train0,I supported Barack Obama. I thought it was abs...,support barack obama think absurdity harassmen...,0
eng_train1,what to hell with that!,hell,1
eng_train2,"and the stupidity of the haters continues, thi...",stupidity hater continue usual nothing make ra...,1
eng_train3,Alberta has been in debt under the Conservativ...,alberta debt conservatives ndp canadian federa...,0
eng_train4,"The TV is in Channel Search mode, and I have p...",tv channel search mode put antenna wall right ...,0
...,...,...,...
eng_train98995,My bad for thinking you could get off your nea...,bad think could get neato gun soapbox waste ti...,1
eng_train98996,It's fixed now. Jackman Wilson Editorial page...,'s fix jackman wilson editorial page editor re...,0
eng_train98997,Could certainly be inconvenient for consumers ...,could certainly inconvenient consumer hey isnt...,0
eng_train98998,It is sad that Hawaii has the lowest turnout. ...,sad hawaii low turnout hawaii first become sta...,0


### 