# Import Packages

In [11]:
import pandas as pd
import nltk
import re
import emoji
import contractions
import html
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary NLTK data is available
for pkg in ['punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords']:
    nltk.download(pkg, quiet=True)

# Initialize tools
lemmatizer = WordNetLemmatizer()

# Original Data

In [5]:
df = pd.read_csv('Tweets.csv', encoding='utf-8-sig')
df.head(10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0,@VirginAmerica What @dhepburn said.
1,positive,0.3486,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,@VirginAmerica and it's a really big bad thing...
5,negative,1.0,@VirginAmerica seriously would pay $30 a fligh...
6,positive,0.6745,"@VirginAmerica yes, nearly every time I fly VX..."
7,neutral,0.634,@VirginAmerica Really missed a prime opportuni...
8,positive,0.6559,"@virginamerica Well, I didn'tâ€¦but NOW I DO! :-D"
9,positive,1.0,"@VirginAmerica it was amazing, and arrived an ..."


# Fully Cleaned Data 
## (Traditional Features Extraction, TF-IDF)

In [6]:
NEGATIONS = {
    "no","nor","not","never","cannot","cant","can't",
    "do not","does not","did not",
    "is not","are not","was not","were not",
    "will not","won't","wont",
    "would not","should not","could not",
    "have not","has not","had not",
    "can not"  # sometimes appears split
}

KEEP_SHORT = {"ok","ugh","wtf","meh","yay","lol","omg"}
KEEP_INTENSIFIERS = {"very","so","too","really","extremely","quite","super","incredibly","totally","absolutely"}

# Domain stopwords that we want to KEEP (they carry meaning here)
PROTECTED_STOPWORDS = {
    "down",   # wifi down (negative)
    "out",    # system out / sold out
    "off",    # turned off / took off
    "up",     # back up / upgrade
    "back",   # got back / baggage back
    "over"    # overbooked
}

base_stop = set(stopwords.words('english'))
# keep negations, intensifiers, and protected domain words
custom_stop = {
    w for w in base_stop
    if (w not in NEGATIONS)
    and (w not in KEEP_INTENSIFIERS)
    and (w not in PROTECTED_STOPWORDS)
}

def _to_wordnet_pos(tb_tag):
    if tb_tag.startswith('J'): return wordnet.ADJ
    if tb_tag.startswith('V'): return wordnet.VERB
    if tb_tag.startswith('N'): return wordnet.NOUN
    if tb_tag.startswith('R'): return wordnet.ADV
    return wordnet.NOUN

def clean_tweet(text):
    # 1) normalize & basic noise removal (preserve tone signals)
    text = contractions.fix(str(text))                         # "don't" -> "do not"
    text = emoji.demojize(text, delimiters=(" ", " "))        # ðŸ™‚ -> slightly_smiling_face
    text = text.lower()
    text = re.sub(r'http\S+|www\.\S+', ' ', text)             # remove URLs
    text = re.sub(r'@\w+', ' ', text)                         # remove mentions
    text = text.replace('#', ' ')                             # keep hashtag token
    # keep letters, underscores, spaces, and sentiment punctuation ! ?
    text = re.sub(r'[^a-z_!\?\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def protect_negations(text):
    """
    Glue common (expanded + contracted) negations to the next token:
    e.g., 'do not like' -> 'do_not_like', 'won't board' -> 'won't_board'
    """
    neg_re = (
        r"(?:no|nor|not|never|cannot|cant|can't|"
        r"do not|does not|did not|is not|are not|was not|were not|"
        r"will not|won't|wont|would not|should not|could not|"
        r"have not|has not|had not|can not)"
    )
    # replace spaces inside [NEGATION + next_token] span with underscores
    def repl(m): return m.group(0).replace(' ', '_')
    return re.sub(rf"\b{neg_re}\s+[a-z_]+", repl, text)

def pos_lemmatize_with_stop(text):
    """
    Keep negations/emoji tokens; keep intensifiers and short emotion tokens;
    lemmatize nouns/verbs; avoid lemmatizing ADJ/ADV to preserve tone.
    """
    toks = word_tokenize(text)
    kept = []
    for t in toks:
        if t in NEGATIONS or '_' in t:                  # glued negations & emoji words
            kept.append(t)
        elif t in KEEP_SHORT or t in KEEP_INTENSIFIERS:
            kept.append(t)
        elif t in custom_stop:
            continue
        else:
            kept.append(t)

    tagged = pos_tag(kept)
    lemmas = []
    for w, tag in tagged:
        if tag.startswith('J') or tag.startswith('R'):  # keep adjectives/adverbs as-is
            lemmas.append(w)
        else:
            lemmas.append(lemmatizer.lemmatize(w, _to_wordnet_pos(tag)))
    return ' '.join(lemmas)

def clean_pipeline(text):
    text = clean_tweet(text)
    text = protect_negations(text)
    text = pos_lemmatize_with_stop(text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
df_cleaned = df.copy()
df_cleaned['clean_text'] = df_cleaned['text'].apply(clean_pipeline)
df_cleaned.to_csv('cleaned_tweets.csv', index=False)

In [8]:
df_cleaned.head(10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text,clean_text
0,neutral,1.0,@VirginAmerica What @dhepburn said.,say
1,positive,0.3486,@VirginAmerica plus you've added commercials t...,plus added commercial experience tacky
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...,did_not_today must mean need take another trip !
3,negative,1.0,@VirginAmerica it's really aggressive to blast...,really aggressive blast obnoxious entertainmen...
4,negative,1.0,@VirginAmerica and it's a really big bad thing...,really big bad thing
5,negative,1.0,@VirginAmerica seriously would pay $30 a fligh...,seriously would pay flight seat did_not_have p...
6,positive,0.6745,"@VirginAmerica yes, nearly every time I fly VX...",yes nearly every time fly vx ear worm will_not...
7,neutral,0.634,@VirginAmerica Really missed a prime opportuni...,really missed prime opportunity men without ha...
8,positive,0.6559,"@virginamerica Well, I didn'tâ€¦but NOW I DO! :-D",well did_not_but !
9,positive,1.0,"@VirginAmerica it was amazing, and arrived an ...",amaze arrive hour early too good


# Minimally Cleaned Data
## (GloVe, SBERT, BERTweet, CardiffNLP)

In [9]:
def min_clean_pipeline(text):
    text = html.unescape(text)             # Decode &amp;, &gt;, etc.
    text = re.sub(r"http\S+", "", text)    # Remove URLs
    text = re.sub(r"@\w+", "", text)       # Remove mentions
    text = re.sub(r"\s+", " ", text)       # Normalize whitespace
    return text.strip()

In [12]:
df_min_cleaned = df.copy()
df_min_cleaned['clean_text'] = df_min_cleaned['text'].apply(min_clean_pipeline)
df_min_cleaned.to_csv('min_cleaned_tweets.csv', index=False, encoding='utf-8-sig')

In [13]:
df_min_cleaned.head(10)

Unnamed: 0,airline_sentiment,sentiment_confidence,text,clean_text
0,neutral,1.0,@VirginAmerica What @dhepburn said.,What said.
1,positive,0.3486,@VirginAmerica plus you've added commercials t...,plus you've added commercials to the experienc...
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...,I didn't today... Must mean I need to take ano...
3,negative,1.0,@VirginAmerica it's really aggressive to blast...,"it's really aggressive to blast obnoxious ""ent..."
4,negative,1.0,@VirginAmerica and it's a really big bad thing...,and it's a really big bad thing about it
5,negative,1.0,@VirginAmerica seriously would pay $30 a fligh...,seriously would pay $30 a flight for seats tha...
6,positive,0.6745,"@VirginAmerica yes, nearly every time I fly VX...","yes, nearly every time I fly VX this â€œear worm..."
7,neutral,0.634,@VirginAmerica Really missed a prime opportuni...,Really missed a prime opportunity for Men With...
8,positive,0.6559,"@virginamerica Well, I didn'tâ€¦but NOW I DO! :-D","Well, I didn'tâ€¦but NOW I DO! :-D"
9,positive,1.0,"@VirginAmerica it was amazing, and arrived an ...","it was amazing, and arrived an hour early. You..."
