In [2]:
import numpy as np
import pandas as pd
import os
import sys

import re
import stanza
import nltk
from nltk.corpus import stopwords

from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load the dataset

In [25]:
root_path = os.path.dirname(os.getcwd())
data_path = os.path.join(root_path, 'data')
train_set = pd.read_csv(os.path.join(data_path, 'train_2025.csv'), header=0, index_col='id')
train_set

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
eng_train0,I supported Barack Obama. I thought it was abs...,0
eng_train1,what to hell with that!,1
eng_train2,"and the stupidity of the haters continues, thi...",1
eng_train3,Alberta has been in debt under the Conservativ...,0
eng_train4,"The TV is in Channel Search mode, and I have p...",0
...,...,...
eng_train98995,My bad for thinking you could get off your nea...,1
eng_train98996,It's fixed now. Jackman Wilson Editorial page...,0
eng_train98997,Could certainly be inconvenient for consumers ...,0
eng_train98998,It is sad that Hawaii has the lowest turnout. ...,0


# Tokenize and Lemmatize 

In [26]:
def tokenize_and_normalize(text, stopwords):
    """Tokenizes, lemmatizes, lowercases and removes stop words.
    
    this function takes in a path to a song, reads the song file,
    tokenizes it into words, then lemmatizes and lowercases these words.
    finally, stopwords given to the function are removed from the list of song lemmas
    
    Parameters
    ----------
    file_name : str
        a path to a text file
    stopwords : list of strings
        stopwords that should be removed
    
    Returns
    -------
    normalized_song : list of strings
        a song represented as a list of its lemmas
    """
    
    nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False)
    
    # YOUR CODE HERE
    text = re.sub(r'[^\w\s]', '', text)
    result = [word.lemma.lower()
               for token in nlp(text).iter_tokens()
               for word in token.words
               if word.lemma.lower() not in stopwords]
    #raise NotImplementedError()
    
    return result

In [None]:
stop_words = set(stopwords.words('english'))
completed_preprocessed_text = []
for sentence in tqdm(train_set.text):
    token_list = tokenize_and_normalize(sentence, stop_words)
    if len(token_list) > 0:
        preprocessed_sentence = token_list[0]
        for token in token_list[1:]:
            preprocessed_sentence += (' ' + token)
    else:
        preprocessed_sentence = ''
    completed_preprocessed_text.append(preprocessed_sentence)
train_set.insert(1, 'preprocessed_text', completed_preprocessed_text)

  1%|          | 652/98637 [18:55<36:03:44,  1.32s/it]

In [None]:
# Save result as a new file to avoid re-normalize
train_set.to_pickle(os.path.join(data_path, 'train_2024_tokenized.pkl'))
train_set

Unnamed: 0_level_0,text,preprocessed_text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Except that Desmond played first base last nig...,except desmond play first base last night tapi...,0
1,What i find funny is the loyalty and blindness...,find funny loyalty blindness english community...,0
2,Read the article not just the headline & you ...,read article headline find,0
3,Speaking of a horses backside is that where y...,speak horse backside head,1
4,Michael Barone- gee are you dumb. No other wo...,michael barone gee dumb word need,1
...,...,...,...
98995,the libs could just pass a law that pulls them...,lib could pass law pull treaty easily exite ball,1
98996,Really? How does this post in any way relate t...,really post way relate article article take pa...,0
98997,Hey illegals if your reading this get the hel...,hey illegal read get hell country,1
98998,"Excellent description ""he playground bully ol...",excellent description playground bully old lit...,1


### 