In [57]:
from src.cleaning import clean_text_basic, clean_text_advanced

# quick smoke test
s = "I'm thrilled! Shipping was great — 10/10 would buy again. https://x.y"
print("BASIC :", clean_text_basic(s))
print("ADV   :", clean_text_advanced(s))


BASIC : im thrill shipping great 1010 would buy httpsxy
ADV   : thrilled shipping great 1010 would buy


In [2]:
import random
import pandas as pd
import nltk

In [3]:
nltk.download('twitter_samples')
# 'twitter_samples' ---> a dataset of sample tweets provided by NLTK
# comes with two files ---> "positive_tweets.json" and "negative_tweets.json"
nltk.download('punkt')
#  tokenizer model for sentence splitting and word tokenization.
nltk.download('stopwords')
# list of common words (stopwords) in multiple languages.
nltk.download('averaged_perceptron_tagger')
# "averaged_perceptron_tagger" --> Part-of_Speech (POS) tagger model based on the averaged perceptron algorithm
nltk.download('wordnet')
#  "wordnet"  -----> WordNet lexical database for English
# supports lemmatization (WordNetLemmatizer) and synonym/antonym
nltk.download('omw-1.4')
# Open Multilingual WordNet
# Extends WordNet with multilingual support and richer semantic relations.

# Example: Translating synonyms into other languages.

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already

True

In [4]:
from nltk.corpus import twitter_samples

In [5]:
# Load positive and negative tweets (small, clen English sample)
pos = twitter_samples.strings('positive_tweets.json')
neg = twitter_samples.strings('negative_tweets.json')

In [8]:
# Build a DataFrame
df_raw = pd.DataFrame(
    {"label": ["pos"] * len(pos) + ["neg"] * len(neg),
     "text": pos + neg
     }
)

"""
    "label": ["pos"] * len(pos) + ["neg"] * len(neg)
        ["pos"] * len(pos) → creates a list filled with "pos" repeated as many times as the number of positive samples.

                Example: If len(pos) = 2, then → ["pos", "pos"]

        ["neg"] * len(neg) → creates a list filled with "neg" repeated for each negative sample.

                Example: If len(neg) = 2, then → ["neg", "neg"]

+ combines these two lists.

Example: ["pos", "pos"] + ["neg", "neg"] → ["pos", "pos", "neg", "neg"]

pos + neg simply concatenates the two lists of text.
"""

'\n    "label": ["pos"] * len(pos) + ["neg"] * len(neg)\n        ["pos"] * len(pos) → creates a list filled with "pos" repeated as many times as the number of positive samples.\n\n                Example: If len(pos) = 2, then → ["pos", "pos"]\n\n        ["neg"] * len(neg) → creates a list filled with "neg" repeated for each negative sample.\n\n                Example: If len(neg) = 2, then → ["neg", "neg"]\n\n+ combines these two lists.\n\nExample: ["pos", "pos"] + ["neg", "neg"] → ["pos", "pos", "neg", "neg"]\n\n\n'

In [10]:
# Shuffle for variety
df_raw = df_raw.sample(frac=1.0, random_state=42).reset_index(drop=True)
# “Shuffle all rows randomly in a reproducible way, then reset the index to start from zero.”
# farc = 1.0 return all rows
# optionally subsample to 2000 rows
# People just pick 42 in random_state because it’s a running inside joke in programming circles. wny other number will work
df_raw = df_raw.head(2000)

In [15]:
print("Loaded dataset Shape: ", df_raw.shape)
print("\nFirst 5 rows: ")
print(df_raw.head(5).to_string(index=True))

print("\nRandom 5 examples: ")
sample_df = df_raw.sample(5, random_state=11)[["label", "text"]]
print(sample_df.to_string(index=True))

Loaded dataset Shape:  (2000, 2)

First 5 rows: 
  label                                                                                 text
0   pos                              Will you be my happy ending? @IanPrasetya insyaAllah :)
1   pos       "@divarh15: @GraceGithakwa Seems like you go out alot" something like that..:)
2   pos  What was your favorite subject in school? — PHYSICS :))))))) http://t.co/h8wqtuoP8T
3   pos                                                           @Omar_Omark thanks omar :)
4   pos                                                                           Thanks :))

Random 5 examples: 
     label                                                                                                       text
41     pos  i was so anxious i was shaking and my dad was like calm down and then well apparently only 10:30 right :)
1457   neg                                                     @tv3midday Aw no.... was just about to switch over :-(
1373   pos        

## Step 2-- Quick Data Exploration

In [16]:
print("Rows:", len(df_raw))

Rows: 2000


In [19]:
print("Columns: ", df_raw.columns.tolist())
# convert it to list type by default it is object type

Columns:  ['label', 'text']


In [20]:
print("\nClass Balance: ")
print(df_raw["label"].value_counts())


Class Balance: 
label
pos    1012
neg     988
Name: count, dtype: int64


In [None]:
# Quick length probes
# creating a cloumn that hold total characters
df_raw["len_chars"] = df_raw["text"].str.len()
# creating a column that hold total words
df_raw["len_tokens"] = df_raw["text"].str.split().apply(len)

In [23]:
df_raw.head(5)

Unnamed: 0,label,text,len_chars,len_tokens
0,pos,Will you be my happy ending? @IanPrasetya insy...,55,9
1,pos,"""@divarh15: @GraceGithakwa Seems like you go o...",78,11
2,pos,What was your favorite subject in school? — PH...,83,11
3,pos,@Omar_Omark thanks omar :),26,4
4,pos,Thanks :)),10,2


In [25]:
print("\nLength (chars) - min/mean/median/max: ")
min_char = df_raw["len_chars"].min()
max_char = df_raw["len_chars"].max()
mean_char = df_raw["len_chars"].mean()
median_char = df_raw["len_chars"].median()
print(min_char, mean_char, median_char, max_char)


Length (chars) - min/mean/median/max: 
7 68.862 62.0 147


In [26]:
print("\nLength (tokens) - min/mean/median/max: ")
min_token = df_raw["len_tokens"].min()
max_token = df_raw["len_tokens"].max()
mean_token = df_raw["len_tokens"].mean()
median_token = df_raw["len_tokens"].median()
print(min_token, mean_token, median_token, max_token)


Length (tokens) - min/mean/median/max: 
2 11.6705 10.0 31


In [30]:
print("\nRandom 5 raw examples:")
print(df_raw.sample(5, random_state=101)[["label","text","len_chars", "len_tokens"]].to_string(index=False))


Random 5 raw examples:
label                                                                                                                                            text  len_chars  len_tokens
  neg                                                                                                                         @lostboxuk Very sad! :(         23           4
  neg                     @_orrhettofrappe they don't know how to make linis kasi :((( so sad. that's why im sweating kanina and it's so init pa huhu        123          23
  neg All is fair in love and war kapan update :(\n\nOh ya udah dihapus. Hilang dari muka bumi.\n\nI want to read it once more someone give me link 😢        139          30
  pos                                        There are startup community in the tropics too! Geeks on the beach :) #startupPH https://t.co/Bg4SxKN3tg        104          15
  neg    @Michael5SOS @_8bitsenpai_  can someone send me a screenshot of this conversation i want to see what i

### Step 3 — Function 1: Lowercasing

In [31]:
# lowercasing
def to_lower(text: str) -> str:
    return text.lower()

# keep a working copy to add columns step by step

df_work = df_raw[["label", "text"]].copy()
df_work["text_lower"] = df_work["text"].apply(to_lower)

print("LowerCasing preview: ")
print(df_work.sample(5, random_state=2025)[["text", "text_lower"]].to_string(index=True))

LowerCasing preview: 
                                                                                                                                            text                                                                                                                                  text_lower
1746                  It's my last day working with the munchkin today...:(...bought her a little parting gift...so far… https://t.co/0xSWksXs2t                  it's my last day working with the munchkin today...:(...bought her a little parting gift...so far… https://t.co/0xswksxs2t
844                                                                                               @nattan23 hahahaha i remember it so clearly :p                                                                                              @nattan23 hahahaha i remember it so clearly :p
1520                                                                          Wft.. can't watch the awesome replay!! :-( ht

### Step 4 — Function 2: Punctuation removal

In [36]:
import string

# Include smart quotes/dashes/ellipsis beyond ASCII punctuation
SMART_PUNCT = "“”‘’—–…"

PUNCT_TABLE = str.maketrans("", "", string.punctuation + SMART_PUNCT)

def remove_punct(text: str) -> str:
    return text.translate(PUNCT_TABLE)

# Now applying this to the lowercased text from Step 3
df_work["text_nopunct"] = df_work["text_lower"].apply(remove_punct)


print("Punctuation removal preview: ")
rand = df_work.sample(6, random_state=4445)[["text_lower", "text_nopunct"]]
print(rand.to_string())


# checking how many rows changed

changed = (df_work["text_lower"] != df_work["text_nopunct"]).sum()
print(f"\nRows altered by punctuation removal: {changed} / {len(df_work)}")


Punctuation removal preview: 
                                                                                                         text_lower                                                                                    text_nopunct
745                  come fly with me baby! :) http://t.co/jjmrvoblzl #retweet #marine #navy #airforce #battlefield               come fly with me baby  httptcojjmrvoblzl retweet marine navy airforce battlefield
852                           ive got so much things to do in 3 days. :( what is syawal now. http://t.co/qz4k9f36bs                    ive got so much things to do in 3 days  what is syawal now httptcoqz4k9f36bs
366   guys add my kik : taknottem477 #kik #kikgirl #skype #booty #nudes #mpoints #oralsex :( http://t.co/egplp1egr9  guys add my kik  taknottem477 kik kikgirl skype booty nudes mpoints oralsex  httptcoegplp1egr9
1076                            rly sad that i had to rush off when that was the last time i would see everyone :-(       

###  Step 5: Tokenization + Stopword removal

#### Step 5A — Tokenize

In [38]:
# Step 5A -- Tokenize the punctuation -stripped text
from nltk.tokenize import word_tokenize

def tokenize(text: str):
    return word_tokenize(text)

df_work["tokens"] = df_work["text_nopunct"].apply(tokenize)

print("Tokenization preview: ")
print(df_work.sample(5, random_state=555)[["text_nopunct", "tokens"]].to_string())

print("\nToken count stats (before stopword removal): ")
lens = df_work["tokens"].apply(len)

Tokenization preview: 
                                                                                                   text_nopunct                                                                                                                     tokens
239                                                                       day in lifevideo uppe om 60 minuter d                                                                             [day, in, lifevideo, uppe, om, 60, minuter, d]
409          syedihusain polite izzat  \nwese does she trust him khawateen k sath selfies say to mana kar deya            [syedihusain, polite, izzat, wese, does, she, trust, him, khawateen, k, sath, selfies, say, to, mana, kar, deya]
923                                                                         sinsalem this is a very sad moment                                                                                  [sinsalem, this, is, a, very, sad, moment]
50    pret  wkwkwwlkjhope verfied wlk

### Step 5B — Remove stopwords

In [39]:
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [t for t in tokens if t.lower() not in STOPWORDS]

df_work["tokens_nostop"] = df_work["tokens"].apply(remove_stopwords)

print("Stopword removal preview: ")
print(df_work.sample(6, random_state= 777)[["tokens", "tokens_nostop"]].to_string())

Stopword removal preview: 
                                                                                                      tokens                                                         tokens_nostop
563                                                                               [realliampayne, and, zayn]                                                 [realliampayne, zayn]
892                                         [i, love, you, too, and, now, i, want, corn, chips, soldhersoul]                                [love, want, corn, chips, soldhersoul]
827                                                                                  [joiredve, follback, d]                                                  [joiredve, follback]
316                            [notjagath, are, you, a, member, of, හෙල, හවුල, by, any, chance, d, chevindu]                      [notjagath, member, හෙල, හවුල, chance, chevindu]
1968  [parentingwt, well, good, luck, anne, you, can, always, go, the, indie, 

### Step 6: POS-aware lemmatization

In [40]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

##### 6.1 Map NLTK POS (Treebank) -> WordNet POS

In [41]:
def get_wordnet_pos(tag:str):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default

#### 6.2 POS tag and lemma

In [42]:
lemmatizer = WordNetLemmatizer()


def pos_lemmatize(tokens):
    # POS tag the token list
    pos_tags = nltk.pos_tag(tokens)
    
    # Lemmatize using POS mapping
    return [lemmatizer.lemmatize(w, get_wordnet_pos(tag)) for w, tag in pos_tags]

In [44]:
df_work["tokens_lemma"] = df_work["tokens_nostop"].apply(pos_lemmatize)

print("Lemmatization preview: ")
print(
    df_work.sample(6, random_state=888)[
        ["tokens_nostop", "tokens_lemma"]
    ].to_string()
)

Lemmatization preview: 
                                                                                                                 tokens_nostop                                                                                                             tokens_lemma
1423                                                            [periplusstore, youre, welcome, thanks, exploring, retweeting]                                                             [periplusstore, youre, welcome, thanks, explore, retweeting]
479                     [made, stuff, tonight, streamer, felt, really, nice, getting, creative, juices, flowing, havent, done]                             [make, stuff, tonight, streamer, felt, really, nice, get, creative, juice, flow, havent, do]
275                                                                                     [usually, happens, httpstco6o3zgnonvh]                                                                                    [usually, happen, http

In [45]:
# Quick delta check: how many tokens changed by lemmatization

changed_counts = [
    sum(1 for a, b in zip(a_list, b_list) if a != b)
    for a_list, b_list in zip(df_work["tokens_nostop"], df_work["tokens_lemma"])
]
print("\nAvg tokens changed by lemmatization (per row):", round(sum(changed_counts)/len(changed_counts), 3))


Avg tokens changed by lemmatization (per row): 0.846


### Step 7 — Whitespace cleanup & assemble clean_text

In [46]:
def detokenize(tokens):
    # join with single spac3es and strip ends
    return " ".join(tokens).strip()

# Build the final clean_text column from tokens_lemma
df_work["clean_text"] = df_work["tokens_lemma"].apply(detokenize)

# Preview: before vs after for 5 random rows
import pandas as pd
pd.set_option("display.max_colwidth", 160)

preview = df_work.sample(5, random_state=777)[["text", "clean_text"]]
print("Before vs After (5 samples):")
print(preview.to_string(index=False))

# Quick sanity checks
num_empty = (df_work["clean_text"].str.len() == 0).sum()
print(f"\nEmpty cleaned rows: {num_empty} / {len(df_work)}")



Before vs After (5 samples):
                                                                                      text                                                clean_text
                                                       @Real_Liam_Payne :))) and zayn :)))                                        realliampayne zayn
                              I love you too :) And now I want corn chips :)\n@SoldHerSoul                           love want corn chip soldhersoul
                                                                     @joiredve follback :D                                         joiredve follback
                       @NotJagath are you a member of හෙල හවුල by any chance? :D @Chevindu                 notjagath member හෙල හවුල chance chevindu
@ParentingWT Well good luck Anne. You can always go the Indie route if you have no joy. :) parentingwt well good luck anne always go indie route joy

Empty cleaned rows: 3 / 2000


### Step 8 — Wrap into clean_text_basic and apply to all rows

In [47]:
# STEP 8 — Wrap into a single function + apply

import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk

# 8.1 POS mapper (same as Step 6)
def get_wordnet_pos(tag: str):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# 8.2 Punctuation table (ASCII + smart punctuation)
SMART_PUNCT = "“”‘’—–…"
PUNCT_TABLE = str.maketrans("", "", string.punctuation + SMART_PUNCT)

STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text_basic(text: str) -> str:
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(PUNCT_TABLE)
    # Tokenize
    tokens = word_tokenize(text)
    # Drop stopwords
    tokens = [t for t in tokens if t not in STOPWORDS]
    # POS-aware lemmatize
    pos_tags = nltk.pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(tag)) for w, tag in pos_tags]
    # Join
    return " ".join(lemmas).strip()

# 8.3 Apply to full dataset (keeping a clean final DataFrame)
df_final = df_raw[["label", "text"]].copy()
df_final["clean_text"] = df_final["text"].apply(clean_text_basic)

print("Applied clean_text_basic to all rows.")
print("\nBefore vs After (5 random rows):")
print(df_final.sample(5, random_state=1312)[["text","clean_text"]].to_string(index=False))

# Sanity: no empties ideally
empty_count = (df_final["clean_text"].str.len() == 0).sum()
print(f"\nEmpty cleaned rows: {empty_count} / {len(df_final)}")


Applied clean_text_basic to all rows.

Before vs After (5 random rows):
                                                                                                                         text                                                                                  clean_text
@donnaledesma_ from PHL to Abu Dhabi, with love and blessing hihihi. :) #TeamJanuaryCLaims100 #GoDonna http://t.co/ayIMB6aITD donnaledesma phl abu dhabi love bless hihihi teamjanuaryclaims100 godonna httptcoayimb6aitd
                                                            @EMPERYtech supposedly one of the worst kernels for any device :(                                                 emperytech supposedly one bad kernel device
                                                                                    @adamhulme86 @QPRFC @IJTaylor81 Enjoy. :)                                                          adamhulme86 qprfc ijtaylor81 enjoy
                                                        

#### Step 9 — Side-by-side comparison (at least 5 samples)

In [48]:
# STEP 9 — Side-by-side comparison (at least 5 samples)
import pandas as pd
pd.set_option("display.max_colwidth", 200)

compare = df_final.sample(8, random_state=2025)[["text","clean_text"]].reset_index(drop=True)
print("Before vs After (8 samples):")
print(compare.to_string(index=False))


Before vs After (8 samples):
                                                                                                                                      text                                                                                            clean_text
                It's my last day working with the munchkin today...:(...bought her a little parting gift...so far… https://t.co/0xSWksXs2t                          last day work munchkin todaybought little part giftso far httpstco0xswksxs2t
                                                                                            @nattan23 hahahaha i remember it so clearly :p                                                                  nattan23 hahahaha remember clearly p
                                                                        Wft.. can't watch the awesome replay!! :-( https://t.co/ChzrqtelPh                                                      wft cant watch awesome replay httpstcochzrqtelph
       

#### Step 10 — Visualizations (matplotlib only) + save PNGs

In [50]:
# STEP 10 — Visualizations (matplotlib only)

import matplotlib.pyplot as plt
from collections import Counter

# 10A) Histograms of text lengths (characters) BEFORE vs AFTER
df_final["len_before_chars"] = df_final["text"].str.len()
df_final["len_after_chars"]  = df_final["clean_text"].str.len()

# BEFORE histogram
plt.figure()
plt.hist(df_final["len_before_chars"], bins=30)
plt.title("Histogram of Text Lengths (Chars) — BEFORE Cleaning")
plt.xlabel("Length (characters)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("hist_len_chars_before.png", bbox_inches="tight")
plt.close()

# AFTER histogram
plt.figure()
plt.hist(df_final["len_after_chars"], bins=30)
plt.title("Histogram of Text Lengths (Chars) — AFTER Cleaning")
plt.xlabel("Length (characters)")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("hist_len_chars_after.png", bbox_inches="tight")
plt.close()

print("Saved: hist_len_chars_before.png, hist_len_chars_after.png")

# 10B) Top-20 most frequent tokens (AFTER cleaning)
all_tokens = []
for s in df_final["clean_text"]:
    if s:
        all_tokens.extend(s.split())

token_counts = Counter(all_tokens)
top20 = token_counts.most_common(20)

tokens_20 = [t for t, c in top20]
counts_20 = [c for t, c in top20]

plt.figure(figsize=(10,5))
plt.bar(range(len(tokens_20)), counts_20)
plt.xticks(range(len(tokens_20)), tokens_20, rotation=45, ha="right")
plt.title("Top 20 Tokens — AFTER Cleaning")
plt.xlabel("Token")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig("top20_tokens_after.png", bbox_inches="tight")
plt.close()

print("Saved: top20_tokens_after.png")

# 10C) (Optional) Top-15 bigrams (AFTER cleaning)
from itertools import tee

def bigrams(tokens):
    a, b = tee(tokens)
    next(b, None)
    return list(zip(a, b))

all_bigrams = []
for s in df_final["clean_text"]:
    toks = s.split() if s else []
    all_bigrams.extend(bigrams(toks))

from collections import Counter
bigram_counts = Counter(all_bigrams)
top15_bigrams = bigram_counts.most_common(15)

if top15_bigrams:
    bigram_labels = [f"{a} {b}" for (a,b), _ in top15_bigrams]
    bigram_vals   = [c for _, c in top15_bigrams]

    plt.figure(figsize=(10,5))
    plt.bar(range(len(bigram_labels)), bigram_vals)
    plt.xticks(range(len(bigram_labels)), bigram_labels, rotation=45, ha="right")
    plt.title("Top 15 Bigrams — AFTER Cleaning")
    plt.xlabel("Bigram")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig("top15_bigrams_after.png", bbox_inches="tight")
    plt.close()
    print("Saved: top15_bigrams_after.png")
else:
    print("No bigrams to plot (dataset too small or too sparse).")


Matplotlib is building the font cache; this may take a moment.


Saved: hist_len_chars_before.png, hist_len_chars_after.png
Saved: top20_tokens_after.png
Saved: top15_bigrams_after.png


#### Step 11 — Display cleaned DataFrame & save CSV

In [51]:
# STEP 11 — Display cleaned DataFrame & save CSV

print("Cleaned DataFrame preview (first 10 rows):")
print(df_final.head(10).to_string(index=False))

output_csv = "cleaned_twitter_samples.csv"
df_final.to_csv(output_csv, index=False, encoding="utf-8")
print(f"\nSaved cleaned CSV to: {output_csv}")


Cleaned DataFrame preview (first 10 rows):
label                                                                                                             text                                                            clean_text  len_before_chars  len_after_chars
  pos                                                          Will you be my happy ending? @IanPrasetya insyaAllah :)                                      happy end ianprasetya insyaallah                55               32
  pos                                   "@divarh15: @GraceGithakwa Seems like you go out alot" something like that..:)               divarh15 gracegithakwa seem like go alot something like                78               55
  pos                              What was your favorite subject in school? — PHYSICS :))))))) http://t.co/h8wqtuoP8T                      favorite subject school physic httptcoh8wqtuop8t                83               48
  pos                                                        

### Step 12 — Contractions expansion (e.g., “i’m → i am”)

Why: improves lemmatization and downstream meaning.
Where in pipeline: right after lowercasing, before punctuation removal.

In [52]:
# STEP 12 — Contractions expansion
import re

# Minimal, production-friendly set (extend anytime)
_CONTRACTIONS = {
    "ain't":"am not", "aren't":"are not", "can't":"can not", "can't've":"can not have",
    "could've":"could have", "couldn't":"could not", "couldn't've":"could not have",
    "didn't":"did not", "doesn't":"does not", "don't":"do not",
    "hadn't":"had not", "hasn't":"has not", "haven't":"have not",
    "he'd":"he would", "he'll":"he will", "he's":"he is",
    "i'd":"i would", "i'll":"i will", "i'm":"i am", "i've":"i have",
    "isn't":"is not", "it'd":"it would", "it'll":"it will", "it's":"it is",
    "let's":"let us", "ma'am":"madam", "might've":"might have", "mightn't":"might not",
    "must've":"must have", "mustn't":"must not",
    "needn't":"need not", "o'clock":"of the clock",
    "shan't":"shall not", "she'd":"she would", "she'll":"she will", "she's":"she is",
    "should've":"should have", "shouldn't":"should not",
    "that'd":"that would", "that's":"that is", "there'd":"there would", "there's":"there is",
    "they'd":"they would", "they'll":"they will", "they're":"they are", "they've":"they have",
    "wasn't":"was not", "we'd":"we would", "we'll":"we will", "we're":"we are", "we've":"we have",
    "weren't":"were not", "what's":"what is", "when's":"when is", "where's":"where is",
    "who's":"who is", "why's":"why is", "won't":"will not", "would've":"would have",
    "wouldn't":"would not", "y'all":"you all", "you'd":"you would", "you'll":"you will",
    "you're":"you are", "you've":"you have"
}

# also handle curly apostrophes ’
_APOS_VARIANTS = ("'", "’")

# build a single regex that matches any contraction (case-insensitive)
_contr_keys = sorted(_CONTRACTIONS.keys(), key=len, reverse=True)
pattern = r"\b(" + "|".join(map(re.escape, _contr_keys)) + r")\b"
_CONTR_RE = re.compile(pattern, flags=re.IGNORECASE)

def expand_contractions(text: str) -> str:
    def _repl(m):
        key = m.group(0).lower()
        # normalize curly apostrophes to straight for lookup
        for apos in _APOS_VARIANTS[1:]:
            key = key.replace(apos, _APOS_VARIANTS[0])
        return _CONTRACTIONS.get(key, key)
    # normalize curly to straight before matching
    norm = text
    for apos in _APOS_VARIANTS[1:]:
        norm = norm.replace(apos, _APOS_VARIANTS[0])
    return _CONTR_RE.sub(_repl, norm)

# Quick smoke test:
print(expand_contractions("i’m happy but i can't go, it's late."))


i am happy but i can not go, it is late.


#### Step 13 — URLs & HTML normalization

In [53]:
# STEP 13 — URL and HTML normalization
import re

_URL_RE = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
_HTML_TAG_RE = re.compile(r"<[^>]+>")  # simple tag stripper

def normalize_urls_and_html(text: str, replace_url_with="<URL>") -> str:
    # replace URLs with a placeholder token (or set to "" to drop entirely)
    text = _URL_RE.sub(replace_url_with, text)
    # strip basic HTML tags
    text = _HTML_TAG_RE.sub(" ", text)
    return text

# Quick smoke test:
print(normalize_urls_and_html("New post: <b>Great tips</b> https://example.com/x?y=1"))


New post:  Great tips   


#### Step 14 — Unicode normalization (accents, smart dashes) & tiny emoji mapping

In [54]:
# STEP 14 — Unicode normalization
import unicodedata

# tiny emoji mapping you can extend
_EMOJI_MAP = {
    "😀":"<POS_EMOJI>", "😄":"<POS_EMOJI>", "🙂":"<POS_EMOJI>", "😍":"<POS_EMOJI>", "👍":"<POS_EMOJI>",
    "😢":"<NEG_EMOJI>", "😤":"<NEG_EMOJI>", "👎":"<NEG_EMOJI>",
    "🚚":"<DELIVERY>", "🔋":"<BATTERY>"
}

def normalize_unicode(text: str, remove_accents: bool = True) -> str:
    # fix common smart punctuation
    text = (text.replace("“", '"').replace("”", '"')
                .replace("‘", "'").replace("’", "'")
                .replace("–", "-").replace("—", "-")
                .replace("…", "..."))
    # map a few emojis to tags
    text = "".join(_EMOJI_MAP.get(ch, ch) for ch in text)
    # NFC first, then optionally strip accents via NFKD
    text = unicodedata.normalize("NFC", text)
    if remove_accents:
        text = "".join(
            c for c in unicodedata.normalize("NFKD", text)
            if not unicodedata.combining(c)
        )
    # squeeze whitespace from any replacements
    return " ".join(text.split())


#### Step 15 — Compose the advanced cleaner

In [55]:
# STEP 15 — Compose advanced cleaner (lower -> unicode -> contractions -> URL/HTML -> punct -> tokenize -> stopwords -> POS-lemma -> join)
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import nltk

SMART_PUNCT = "“”‘’—–…"
PUNCT_TABLE = str.maketrans("", "", string.punctuation + SMART_PUNCT)
STOPWORDS = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag: str):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def clean_text_advanced(text: str) -> str:
    text = text.lower()
    text = normalize_unicode(text, remove_accents=True)
    text = expand_contractions(text)
    text = normalize_urls_and_html(text, replace_url_with="<URL>")
    text = text.translate(PUNCT_TABLE)
    tokens = word_tokenize(text)
    # optional: keep negations by removing them from stopwords
    # NEGATIONS = {"no","nor","not","n't"}; tokens = [t for t in tokens if (t not in STOPWORDS or t in NEGATIONS)]
    tokens = [t for t in tokens if t not in STOPWORDS]
    pos_tags = nltk.pos_tag(tokens)
    lemmas = [lemmatizer.lemmatize(w, get_wordnet_pos(tag)) for w, tag in pos_tags]
    return " ".join(lemmas).strip()

# Apply to a copy for comparison
df_compare = df_final.copy()
df_compare["clean_text_adv"] = df_compare["text"].apply(clean_text_advanced)

print("Advanced cleaner preview (6 rows):")
print(df_compare.sample(6, random_state=909)[["text", "clean_text", "clean_text_adv"]].to_string(index=False))


Advanced cleaner preview (6 rows):
                                                                                                        text                                                   clean_text                                          clean_text_adv
                                                                               My house scary AF at night :(                                         house scary af night                                    house scary af night
                                                                          i want pretzels now :( #bb17 #bblf                                       want pretzel bb17 bblf                                  want pretzel bb17 bblf
I told myself I can survive living alone for the rest of my life but I can't even be left alone for a day :(  told survive live alone rest life cant even leave alone day  told survive live alone rest life even leave alone day
                                         @BellissimaEx @Marri

#### Step 16 — Save advanced CSV (+ optional re-plots)

In [56]:
# STEP 16 — Save advanced CSV
adv_csv = "cleaned_twitter_samples_advanced.csv"
df_compare[["label","text","clean_text_adv"]].to_csv(adv_csv, index=False, encoding="utf-8")
print("Saved:", adv_csv)

# Optional:  replot with advanced lengths:
import matplotlib.pyplot as plt
df_compare["len_after_chars_adv"] = df_compare["clean_text_adv"].str.len()
plt.figure(); plt.hist(df_compare["len_after_chars_adv"], bins=30); plt.title("Histogram — AFTER (Advanced)"); plt.xlabel("Length"); plt.ylabel("Count"); plt.tight_layout(); plt.savefig("hist_len_chars_after_advanced.png"); plt.close()


Saved: cleaned_twitter_samples_advanced.csv
