In [2]:
import random
import pandas as pd
import nltk

In [None]:
nltk.download('twitter_samples')
# 'twitter_samples' ---> a dataset of sample tweets provided by NLTK
# comes with two files ---> "positive_tweets.json" and "negative_tweets.json"
nltk.download('punkt')
#  tokenizer model for sentence splitting and word tokenization.
nltk.download('stopwords')
# list of common words (stopwords) in multiple languages.
nltk.download('averaged_perceptron_tagger')
# "averaged_perceptron_tagger" --> Part-of_Speech (POS) tagger model based on the averaged perceptron algorithm
nltk.download('wordnet')
#  "wordnet"  -----> WordNet lexical database for English
# supports lemmatization (WordNetLemmatizer) and synonym/antonym
nltk.download('omw-1.4')
# Open Multilingual WordNet
# Extends WordNet with multilingual support and richer semantic relations.

# Example: Translating synonyms into other languages.

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\SkyTech\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already

True

In [4]:
from nltk.corpus import twitter_samples

In [5]:
# Load positive and negative tweets (small, clen English sample)
pos = twitter_samples.strings('positive_tweets.json')
neg = twitter_samples.strings('negative_tweets.json')

In [None]:
# Build a DataFrame
df_raw = pd.DataFrame(
    {"label": ["pos"] * len(pos) + ["neg"] * len(neg),
     "text": pos + neg
     }
)

"""
    "label": ["pos"] * len(pos) + ["neg"] * len(neg)
        ["pos"] * len(pos) → creates a list filled with "pos" repeated as many times as the number of positive samples.

                Example: If len(pos) = 2, then → ["pos", "pos"]

        ["neg"] * len(neg) → creates a list filled with "neg" repeated for each negative sample.

                Example: If len(neg) = 2, then → ["neg", "neg"]

+ combines these two lists.

Example: ["pos", "pos"] + ["neg", "neg"] → ["pos", "pos", "neg", "neg"]

pos + neg simply concatenates the two lists of text.
"""

'\n    "label": ["pos"] * len(pos) + ["neg"] * len(neg)\n        ["pos"] * len(pos) → creates a list filled with "pos" repeated as many times as the number of positive samples.\n\n                Example: If len(pos) = 2, then → ["pos", "pos"]\n\n        ["neg"] * len(neg) → creates a list filled with "neg" repeated for each negative sample.\n\n                Example: If len(neg) = 2, then → ["neg", "neg"]\n\n+ combines these two lists.\n\nExample: ["pos", "pos"] + ["neg", "neg"] → ["pos", "pos", "neg", "neg"]\n\n\n'

In [10]:
# Shuffle for variety
df_raw = df_raw.sample(frac=1.0, random_state=42).reset_index(drop=True)
# “Shuffle all rows randomly in a reproducible way, then reset the index to start from zero.”
# farc = 1.0 return all rows
# optionally subsample to 2000 rows
# People just pick 42 in random_state because it’s a running inside joke in programming circles. wny other number will work
df_raw = df_raw.head(2000)

In [15]:
print("Loaded dataset Shape: ", df_raw.shape)
print("\nFirst 5 rows: ")
print(df_raw.head(5).to_string(index=True))

print("\nRandom 5 examples: ")
sample_df = df_raw.sample(5, random_state=11)[["label", "text"]]
print(sample_df.to_string(index=True))

Loaded dataset Shape:  (2000, 2)

First 5 rows: 
  label                                                                                 text
0   pos                              Will you be my happy ending? @IanPrasetya insyaAllah :)
1   pos       "@divarh15: @GraceGithakwa Seems like you go out alot" something like that..:)
2   pos  What was your favorite subject in school? — PHYSICS :))))))) http://t.co/h8wqtuoP8T
3   pos                                                           @Omar_Omark thanks omar :)
4   pos                                                                           Thanks :))

Random 5 examples: 
     label                                                                                                       text
41     pos  i was so anxious i was shaking and my dad was like calm down and then well apparently only 10:30 right :)
1457   neg                                                     @tv3midday Aw no.... was just about to switch over :-(
1373   pos        

## Step 2-- Quick Data Exploration

In [16]:
print("Rows:", len(df_raw))

Rows: 2000


In [19]:
print("Columns: ", df_raw.columns.tolist())
# convert it to list type by default it is object type

Columns:  ['label', 'text']


In [20]:
print("\nClass Balance: ")
print(df_raw["label"].value_counts())


Class Balance: 
label
pos    1012
neg     988
Name: count, dtype: int64


In [None]:
# Quick length probes
# creating a cloumn that hold total characters
df_raw["len_chars"] = df_raw["text"].str.len()
# creating a column that hold total words
df_raw["len_tokens"] = df_raw["text"].str.split().apply(len)

In [23]:
df_raw.head(5)

Unnamed: 0,label,text,len_chars,len_tokens
0,pos,Will you be my happy ending? @IanPrasetya insy...,55,9
1,pos,"""@divarh15: @GraceGithakwa Seems like you go o...",78,11
2,pos,What was your favorite subject in school? — PH...,83,11
3,pos,@Omar_Omark thanks omar :),26,4
4,pos,Thanks :)),10,2


In [25]:
print("\nLength (chars) - min/mean/median/max: ")
min_char = df_raw["len_chars"].min()
max_char = df_raw["len_chars"].max()
mean_char = df_raw["len_chars"].mean()
median_char = df_raw["len_chars"].median()
print(min_char, mean_char, median_char, max_char)


Length (chars) - min/mean/median/max: 
7 68.862 62.0 147


In [26]:
print("\nLength (tokens) - min/mean/median/max: ")
min_token = df_raw["len_tokens"].min()
max_token = df_raw["len_tokens"].max()
mean_token = df_raw["len_tokens"].mean()
median_token = df_raw["len_tokens"].median()
print(min_token, mean_token, median_token, max_token)


Length (tokens) - min/mean/median/max: 
2 11.6705 10.0 31


In [30]:
print("\nRandom 5 raw examples:")
print(df_raw.sample(5, random_state=101)[["label","text","len_chars", "len_tokens"]].to_string(index=False))


Random 5 raw examples:
label                                                                                                                                            text  len_chars  len_tokens
  neg                                                                                                                         @lostboxuk Very sad! :(         23           4
  neg                     @_orrhettofrappe they don't know how to make linis kasi :((( so sad. that's why im sweating kanina and it's so init pa huhu        123          23
  neg All is fair in love and war kapan update :(\n\nOh ya udah dihapus. Hilang dari muka bumi.\n\nI want to read it once more someone give me link 😢        139          30
  pos                                        There are startup community in the tropics too! Geeks on the beach :) #startupPH https://t.co/Bg4SxKN3tg        104          15
  neg    @Michael5SOS @_8bitsenpai_  can someone send me a screenshot of this conversation i want to see what i

### Step 3 — Function 1: Lowercasing

In [31]:
# lowercasing
def to_lower(text: str) -> str:
    return text.lower()

# keep a working copy to add columns step by step

df_work = df_raw[["label", "text"]].copy()
df_work["text_lower"] = df_work["text"].apply(to_lower)

print("LowerCasing preview: ")
print(df_work.sample(5, random_state=2025)[["text", "text_lower"]].to_string(index=True))

LowerCasing preview: 
                                                                                                                                            text                                                                                                                                  text_lower
1746                  It's my last day working with the munchkin today...:(...bought her a little parting gift...so far… https://t.co/0xSWksXs2t                  it's my last day working with the munchkin today...:(...bought her a little parting gift...so far… https://t.co/0xswksxs2t
844                                                                                               @nattan23 hahahaha i remember it so clearly :p                                                                                              @nattan23 hahahaha i remember it so clearly :p
1520                                                                          Wft.. can't watch the awesome replay!! :-( ht

### Step 4 — Function 2: Punctuation removal

In [36]:
import string

# Include smart quotes/dashes/ellipsis beyond ASCII punctuation
SMART_PUNCT = "“”‘’—–…"

PUNCT_TABLE = str.maketrans("", "", string.punctuation + SMART_PUNCT)

def remove_punct(text: str) -> str:
    return text.translate(PUNCT_TABLE)

# Now applying this to the lowercased text from Step 3
df_work["text_nopunct"] = df_work["text_lower"].apply(remove_punct)


print("Punctuation removal preview: ")
rand = df_work.sample(6, random_state=4445)[["text_lower", "text_nopunct"]]
print(rand.to_string())


# checking how many rows changed

changed = (df_work["text_lower"] != df_work["text_nopunct"]).sum()
print(f"\nRows altered by punctuation removal: {changed} / {len(df_work)}")


Punctuation removal preview: 
                                                                                                         text_lower                                                                                    text_nopunct
745                  come fly with me baby! :) http://t.co/jjmrvoblzl #retweet #marine #navy #airforce #battlefield               come fly with me baby  httptcojjmrvoblzl retweet marine navy airforce battlefield
852                           ive got so much things to do in 3 days. :( what is syawal now. http://t.co/qz4k9f36bs                    ive got so much things to do in 3 days  what is syawal now httptcoqz4k9f36bs
366   guys add my kik : taknottem477 #kik #kikgirl #skype #booty #nudes #mpoints #oralsex :( http://t.co/egplp1egr9  guys add my kik  taknottem477 kik kikgirl skype booty nudes mpoints oralsex  httptcoegplp1egr9
1076                            rly sad that i had to rush off when that was the last time i would see everyone :-(       

###  Step 5: Tokenization + Stopword removal

#### Step 5A — Tokenize

In [None]:
# Step 5A -- Tokenize the punctuation -stripped text
from nltk.tokenize import word_tokenize

def tokenize(text: str):
    return word_tokenize(text)

df_work["tokens"] = df_work["text_nopunct"].apply(tokenize)

print("Tokenization preview: ")
print(df_work.sample(5, random_state=555)[["text_nopunct", "tokens"]].to_string())

print("\nToken count stats (before stopword removal): ")
lens = df_work["tokens"].apply(len)

Tokenization preview: 
                                                                                                   text_nopunct                                                                                                                     tokens
239                                                                       day in lifevideo uppe om 60 minuter d                                                                             [day, in, lifevideo, uppe, om, 60, minuter, d]
409          syedihusain polite izzat  \nwese does she trust him khawateen k sath selfies say to mana kar deya            [syedihusain, polite, izzat, wese, does, she, trust, him, khawateen, k, sath, selfies, say, to, mana, kar, deya]
923                                                                         sinsalem this is a very sad moment                                                                                  [sinsalem, this, is, a, very, sad, moment]
50    pret  wkwkwwlkjhope verfied wlk

### Step 5B — Remove stopwords

In [39]:
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [t for t in tokens if t.lower() not in STOPWORDS]

df_work["tokens_nostop"] = df_work["tokens"].apply(remove_stopwords)

print("Stopword removal preview: ")
print(df_work.sample(6, random_state= 777)[["tokens", "tokens_nostop"]].to_string())

Stopword removal preview: 
                                                                                                      tokens                                                         tokens_nostop
563                                                                               [realliampayne, and, zayn]                                                 [realliampayne, zayn]
892                                         [i, love, you, too, and, now, i, want, corn, chips, soldhersoul]                                [love, want, corn, chips, soldhersoul]
827                                                                                  [joiredve, follback, d]                                                  [joiredve, follback]
316                            [notjagath, are, you, a, member, of, හෙල, හවුල, by, any, chance, d, chevindu]                      [notjagath, member, හෙල, හවුල, chance, chevindu]
1968  [parentingwt, well, good, luck, anne, you, can, always, go, the, indie, 