In [1]:
import numpy as np
import pandas as pd
import os
import sys

import re
import stanza
import nltk
from nltk.corpus import stopwords

from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load the dataset

In [2]:
root_path = os.path.dirname(os.getcwd())
data_path = os.path.join(root_path, 'data')
train_set = pd.read_csv(os.path.join(data_path, 'train_2025.csv'), quoting=3, header=0, index_col='id')
train_set

Unnamed: 0_level_0,text,label,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
eng_train0,I supported Barack Obama. I thought it was abs...,medical records,tax returns,school transcripts,family tree,DNA,dental records,fingerprints,etc. It was proof of their desire to strip Ob...,stamp collections,third grade report card,etc,and I don't care about Trump's tax returns,third grade report card,or stamp collection,either. There was already enough public infor...,public policy positions,campaign style,etc. for me to vote for Obama and against Tru...,0.0
eng_train1,what to hell with that!,1,,,,,,,,,,,,,,,,,,
eng_train2,and the stupidity of the haters continues,this as usual is nothing but made up rage bas...,,,,you will be forced yet again to eat crow!!,1,,,,,,,,,,,,,
eng_train3,Alberta has been in debt under the Conservatives,and the NDP. Canadian Federal debt just kept ...,with Kenney in the side seat. Kenney will ch...,0,,,,,,,,,,,,,,,,
eng_train4,The TV is in Channel Search mode,and I have put the antenna up on the wall rig...,which I taped it,because the suction cups don't stick to the p...,0,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eng_train69,If Canada had the guts they would shut all exp...,Let the clowns south of the border buy thei...,1,,,,,,,,,,,,,,,,,
eng_train70,Really? Does everyone really want to round up...,1,,,,,,,,,,,,,,,,,,
eng_train71,There are over a million of these 'immigrants'...,and that's the problem. They concentrate the...,Sweden? What will the future hold for Canada...,0,,,,,,,,,,,,,,,,
eng_train72,Bishop,whose real name was Head,married a half-sister of Williams'. Williams ...,or body-snatcher,,0,,,,,,,,,,,,,,


# Tokenize and Lemmatize 

In [None]:
def tokenize_and_normalize(text, stopwords):
    """Tokenizes, lemmatizes, lowercases and removes stop words.
    
    this function takes in a path to a song, reads the song file,
    tokenizes it into words, then lemmatizes and lowercases these words.
    finally, stopwords given to the function are removed from the list of song lemmas
    
    Parameters
    ----------
    file_name : str
        a path to a text file
    stopwords : list of strings
        stopwords that should be removed
    
    Returns
    -------
    normalized_song : list of strings
        a song represented as a list of its lemmas
    """
    
    nlp = stanza.Pipeline(lang='en', processors='tokenize, lemma',  verbose=False)
    
    # YOUR CODE HERE
    text = re.sub(r'[^\w\s]', '', text)
    result = [word.lemma.lower()
               for token in nlp(text).iter_tokens()
               for word in token.words
               if word.lemma.lower() not in stopwords]
    #raise NotImplementedError()
    
    return result

In [None]:
stop_words = set(stopwords.words('english'))
completed_preprocessed_text = []
for sentence in tqdm(train_set.text):
    token_list = tokenize_and_normalize(sentence, stop_words)
    if len(token_list) > 0:
        preprocessed_sentence = token_list[0]
        for token in token_list[1:]:
            preprocessed_sentence += (' ' + token)
    else:
        preprocessed_sentence = ''
    completed_preprocessed_text.append(preprocessed_sentence)
train_set.insert(1, 'preprocessed_text', completed_preprocessed_text)

100%|███████████████████████████████████| 99000/99000 [9:22:26<00:00,  2.93it/s]


In [None]:
# Save result as a new file to avoid re-normalize
train_set.to_pickle(os.path.join(data_path, 'train_2024_tokenized.pkl'))
train_set

Unnamed: 0_level_0,text,preprocessed_text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Except that Desmond played first base last nig...,except desmond play first base last night tapi...,0
1,What i find funny is the loyalty and blindness...,find funny loyalty blindness english community...,0
2,Read the article not just the headline & you ...,read article headline find,0
3,Speaking of a horses backside is that where y...,speak horse backside head,1
4,Michael Barone- gee are you dumb. No other wo...,michael barone gee dumb word need,1
...,...,...,...
98995,the libs could just pass a law that pulls them...,lib could pass law pull treaty easily exite ball,1
98996,Really? How does this post in any way relate t...,really post way relate article article take pa...,0
98997,Hey illegals if your reading this get the hel...,hey illegal read get hell country,1
98998,"Excellent description ""he playground bully ol...",excellent description playground bully old lit...,1


### 