# Import Packages

In [4]:
import pandas as pd
import nltk
import re
import emoji
import contractions
import html
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK data is available.
# Both the legacy and the current resource names are requested: newer NLTK
# releases look up 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead of
# 'punkt' / 'averaged_perceptron_tagger', and some releases need 'omw-1.4' for
# WordNet. A name unknown to the installed release is presumably just reported
# by the downloader rather than raised (raise_on_error is left at its default)
# -- TODO confirm on the pinned NLTK version.
for pkg in [
    'punkt', 'punkt_tab',
    'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
    'wordnet', 'omw-1.4',
    'stopwords',
]:
    nltk.download(pkg, quiet=True)

# Initialize tools
lemmatizer = WordNetLemmatizer()

# Sample Data
We draw a reproducible 20% random sample of the data (fixed seed) to use for training.

In [12]:
# Read the full tweet table ('utf-8-sig' drops a leading BOM if the file has one),
# then keep a fixed-seed 20% random sample of its rows.
df = pd.read_csv(
    'Tweets.csv',
    encoding='utf-8-sig',
)
df_sample = df.sample(random_state=42, frac=0.2)

In [13]:
# Preview the first ten sampled rows.
df_sample.head(n=10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text
4793,positive,1.0,@SouthwestAir you're my early frontrunner for ...
4802,negative,1.0,@SouthwestAir luggage delivery between 1-4am? ...
12427,positive,0.6593,@AmericanAir is rising like the sun at DCA thi...
8879,neutral,1.0,@JetBlue do they have to depart from Washingto...
8291,negative,0.6625,@JetBlue I can probably find some of them. Are...
927,neutral,0.6612,@united I would love if someone could get me b...
13470,negative,1.0,@AmericanAir Why did I have to stand at baggag...
2816,negative,1.0,@united this means within one week i will have...
4504,negative,0.6442,@SouthwestAir darn! I bought it on the wrong d...
6853,neutral,1.0,"@JetBlue Flight 1447 (N351JB) ""JBLU"" arrives a..."


# Fully Cleaned Sample Data 
## (Traditional Feature Extraction, TF-IDF)

In [6]:
# Negation vocabulary: single-word forms plus expanded "<auxiliary> not" forms.
NEGATIONS = {
    # single-token forms
    "no", "nor", "not", "never", "cannot", "cant", "can't", "won't", "wont",
    # auxiliary + "not"
    "do not", "does not", "did not",
    "is not", "are not", "was not", "were not",
    "will not", "would not", "should not", "could not",
    "have not", "has not", "had not",
    "can not",  # sometimes appears split
}

# Short emotion tokens and intensifiers that are always kept by the token filter.
KEEP_SHORT = {"ok", "ugh", "wtf", "meh", "yay", "lol", "omg"}
KEEP_INTENSIFIERS = {
    "very", "so", "too", "really", "extremely",
    "quite", "super", "incredibly", "totally", "absolutely",
}

# Domain stopwords that we want to KEEP (they carry meaning here):
#   down -> wifi down (negative)      out  -> system out / sold out
#   off  -> turned off / took off     up   -> back up / upgrade
#   back -> got back / baggage back   over -> overbooked
PROTECTED_STOPWORDS = {"down", "out", "off", "up", "back", "over"}

# Start from NLTK's English stopword list and subtract everything we want to
# keep: negations, intensifiers, and the protected domain words.
base_stop = set(stopwords.words('english'))
custom_stop = base_stop - NEGATIONS - KEEP_INTENSIFIERS - PROTECTED_STOPWORDS

def _to_wordnet_pos(tb_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag's leading letter decides the result: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if tb_tag.startswith(prefix):
            return wn_pos
    return wordnet.NOUN

def clean_tweet(text):
    """Normalize a raw tweet into lowercase text made of letters, '_', '!' and '?'.

    Steps, in order: decode HTML entities, expand contractions, turn emoji into
    word tokens, lowercase, drop URLs and @mentions, drop the '#' sign (the
    hashtag word itself stays), blank out every other character, and collapse
    whitespace.

    Args:
        text: The raw tweet; it is coerced with ``str()`` first.

    Returns:
        The cleaned string, stripped of leading/trailing whitespace.
    """
    # 0) decode entities first: otherwise '&amp;' loses its '&' and ';' in the
    #    character filter below and leaves a stray 'amp' token behind, and an
    #    escaped apostrophe would hide a contraction from the next step.
    text = html.unescape(str(text))
    # 1) normalize & basic noise removal (preserve tone signals)
    text = contractions.fix(text)                              # "don't" -> "do not"
    text = emoji.demojize(text, delimiters=(" ", " "))        # 🙂 -> slightly_smiling_face
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)             # remove URLs
    text = re.sub(r'@\w+', ' ', text)                         # remove mentions
    text = text.replace('#', ' ')                             # keep hashtag token
    # keep letters, underscores, spaces, and sentiment punctuation ! ?
    text = re.sub(r'[^a-z_!\?\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# A negation (single word or "<auxiliary> not") followed by whitespace and the
# next lowercase/underscore token. Alternation order matters: shorter forms are
# tried first and the regex backtracks to longer ones when the following
# whitespace is missing (so "nothing" is not treated as "no" + "thing").
_NEGATION_SPAN_RE = re.compile(
    r"\b(?:no|nor|not|never|cannot|cant|can't|"
    r"do\s+not|does\s+not|did\s+not|is\s+not|are\s+not|was\s+not|were\s+not|"
    r"will\s+not|won't|wont|would\s+not|should\s+not|could\s+not|"
    r"have\s+not|has\s+not|had\s+not|can\s+not)"
    r"\s+[a-z_]+"
)


def protect_negations(text):
    """
    Glue common (expanded + contracted) negations to the next token:
    e.g., 'do not like' -> 'do_not_like', 'won't board' -> 'won't_board'

    Every whitespace run inside a matched span (tabs, newlines, repeated
    spaces) becomes exactly one underscore, so 'not\\tgood' and 'no   way'
    glue to 'not_good' and 'no_way'. Matching is case-sensitive and expects
    lowercase input.

    Args:
        text: Lowercased text to scan.

    Returns:
        The text with each [negation + next token] span joined by underscores;
        everything outside a match is returned unchanged.
    """
    def repl(m):
        # Collapse whitespace runs rather than replacing literal spaces only,
        # so the rewrite agrees with what the pattern is allowed to match.
        return re.sub(r"\s+", "_", m.group(0))

    return _NEGATION_SPAN_RE.sub(repl, text)

def pos_lemmatize_with_stop(text):
    """
    Tokenize, drop stopwords, POS-tag, and lemmatize the surviving tokens.

    A token survives filtering when it is a negation, contains an underscore
    (glued negations & emoji words), is a short emotion token or intensifier,
    or is simply not in ``custom_stop``. Survivors are POS-tagged; tokens tagged
    as adjectives/adverbs (tags starting with J or R) are kept as-is to preserve
    tone, and every other token is lemmatized with its mapped WordNet POS.

    Returns the resulting tokens joined by single spaces.
    """
    def _survives(tok):
        # Explicit keep-lists win; anything else is kept unless it is a stopword.
        always_keep = (
            tok in NEGATIONS
            or '_' in tok
            or tok in KEEP_SHORT
            or tok in KEEP_INTENSIFIERS
        )
        return always_keep or tok not in custom_stop

    survivors = [tok for tok in word_tokenize(text) if _survives(tok)]

    return ' '.join(
        word if pos.startswith(('J', 'R'))
        else lemmatizer.lemmatize(word, _to_wordnet_pos(pos))
        for word, pos in pos_tag(survivors)
    )

def clean_pipeline(text):
    """Run the full cleaning chain on one tweet and return the cleaned string.

    Stages run in order: ``clean_tweet`` -> ``protect_negations`` ->
    ``pos_lemmatize_with_stop``, followed by a final whitespace collapse/strip.
    """
    for stage in (clean_tweet, protect_negations, pos_lemmatize_with_stop):
        text = stage(text)
    return re.sub(r'\s+', ' ', text).strip()

In [7]:
# Build a copy of the sample with a fully cleaned 'clean_text' column and export it.
df_cleaned_sample = df_sample.assign(
    clean_text=df_sample['text'].apply(clean_pipeline)
)
# NOTE(review): written with pandas' default encoding, whereas the
# minimally-cleaned export uses 'utf-8-sig' -- confirm what downstream readers expect.
df_cleaned_sample.to_csv('cleaned_sample_tweets.csv', index=False)

In [8]:
# Preview the first ten rows, raw text next to its fully cleaned version.
df_cleaned_sample.head(n=10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text,clean_text
4793,positive,1.0,@SouthwestAir you're my early frontrunner for ...,early frontrunner best airline ! oscar
4802,negative,1.0,@SouthwestAir luggage delivery between 1-4am? ...,luggage delivery ? really ? tell midnight mult...
12427,positive,0.6593,@AmericanAir is rising like the sun at DCA thi...,rise like sun dca morning member best view avgeek
8879,neutral,1.0,@JetBlue do they have to depart from Washingto...,depart washington c ? ?
8291,negative,0.6625,@JetBlue I can probably find some of them. Are...,probably find ticket ?
927,neutral,0.6612,@united I would love if someone could get me b...,would love someone could get back austin tonig...
13470,negative,1.0,@AmericanAir Why did I have to stand at baggag...,stand baggage claim hour wait bag know never_m...
2816,negative,1.0,@united this means within one week i will have...,mean within one week file compensation complai...
4504,negative,0.6442,@SouthwestAir darn! I bought it on the wrong d...,darn ! buy wrong device ! no_way switch sure ?
6853,neutral,1.0,"@JetBlue Flight 1447 (N351JB) ""JBLU"" arrives a...",flight n jb jblu arrive follow flight westches...


# Minimally Cleaned Sample Data
## (GloVe, SBERT, BERTweet, CardiffNLP)

In [9]:
def min_clean_pipeline(text):
    """Lightly clean a tweet, keeping case, punctuation, digits and emoji.

    Decodes HTML entities, removes URLs and @mentions, and collapses whitespace.

    Args:
        text: The raw tweet. ``None`` and float NaN (a missing cell) yield an
            empty string; any other non-string value is coerced with ``str()``.

    Returns:
        The cleaned string, stripped of leading/trailing whitespace.
    """
    if not isinstance(text, str):
        # html.unescape() raises TypeError on non-strings, so a single missing
        # cell would otherwise abort the whole .apply().
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = html.unescape(text)                      # Decode &amp;, &gt;, etc.
    text = re.sub(r"http\S+|www\.\S+", "", text)    # Remove URLs (incl. scheme-less www.)
    text = re.sub(r"@\w+", "", text)                # Remove mentions
    text = re.sub(r"\s+", " ", text)                # Normalize whitespace
    return text.strip()

In [10]:
# Build a copy of the sample with a minimally cleaned 'clean_text' column and
# export it as UTF-8 with a BOM.
df_min_cleaned_sample = df_sample.assign(
    clean_text=df_sample['text'].apply(min_clean_pipeline)
)
df_min_cleaned_sample.to_csv(
    'min_cleaned_sample_tweets.csv',
    index=False,
    encoding='utf-8-sig',
)

In [11]:
# Preview the first ten rows, raw text next to its minimally cleaned version.
df_min_cleaned_sample.head(n=10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text,clean_text
4793,positive,1.0,@SouthwestAir you're my early frontrunner for ...,you're my early frontrunner for best airline! ...
4802,negative,1.0,@SouthwestAir luggage delivery between 1-4am? ...,luggage delivery between 1-4am? Really? After ...
12427,positive,0.6593,@AmericanAir is rising like the sun at DCA thi...,is rising like the sun at DCA this morning. me...
8879,neutral,1.0,@JetBlue do they have to depart from Washingto...,"do they have to depart from Washington, D.C.??"
8291,negative,0.6625,@JetBlue I can probably find some of them. Are...,I can probably find some of them. Are the tick...
927,neutral,0.6612,@united I would love if someone could get me b...,I would love if someone could get me back to A...
13470,negative,1.0,@AmericanAir Why did I have to stand at baggag...,Why did I have to stand at baggage claim for a...
2816,negative,1.0,@united this means within one week i will have...,this means within one week i will have filed 2...
4504,negative,0.6442,@SouthwestAir darn! I bought it on the wrong d...,darn! I bought it on the wrong device! No way ...
6853,neutral,1.0,"@JetBlue Flight 1447 (N351JB) ""JBLU"" arrives a...","Flight 1447 (N351JB) ""JBLU"" arrives at followi..."
