In [4]:
# Imports & setup

import os, re, numpy as np, pandas as pd, tensorflow as tf
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

'''
Sets the seed for TensorFlow’s internal random number generator
TensorFlow uses randomness in many places: Initializing weights, Shuffling data, Dropout layers;
Setting this makes sure that every time running the notebook, we'll get the same model initialization and training behavior
'''
tf.random.set_seed(42) # common placeholder seed value
np.random.seed(42) # helps ensure NumPy-based random behavior is the same across runs.

nltk_download("punkt")
nltk_download("stopwords")
nltk_download("wordnet")    
nltk_download("omw-1.4") 


[nltk_data] Downloading package punkt to /Users/yifanchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yifanchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yifanchen/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yifanchen/nltk_data...


True

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

STOP_WORDS = set(stopwords.words("english"))
NEGATION_WORDS = {"no", "not", "nor", "cannot", "can't", "won't", "n't", "never"}
CUSTOM_STOPWORDS = STOP_WORDS - NEGATION_WORDS

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"@\w+|#", "", text)
    text = re.sub(r"[^a-zA-Z']", " ", text)
    tokens = word_tokenize(text.strip())
    tokens = [word for word in tokens if word not in CUSTOM_STOPWORDS]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized_tokens)



In [6]:
'''Load & clean'''

CSV_PATH = "Tweets.csv" # dataset file to be tested

# Read the raw dataset into a DataFrame
df = pd.read_csv(CSV_PATH)
# Apply the text cleaner to every tweet
df["cleaned_text"] = df["text"].apply(preprocess_text)

# Split out features and labels
texts  = df["cleaned_text"].values
labels = df["airline_sentiment"].values
print("Dataset size:", len(df))


Dataset size: 14640


In [7]:
'''Train/Test split + label encoding'''

# Split so every model sees the exact same data partitions
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.20, random_state=42, stratify=labels
)

# Map string labels to integer IDs
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc  = label_encoder.transform(y_test)

num_classes  = len(label_encoder.classes_)
y_train_cat  = to_categorical(y_train_enc, num_classes)
y_test_cat   = to_categorical(y_test_enc,  num_classes)

print("Classes →", label_encoder.classes_)


Classes → ['negative' 'neutral' 'positive']


In [8]:
'''Tokeniser & padding'''

# Hyper‑parameters for the tokeniser and sequences
MAX_VOCAB, MAX_SEQ_LEN, EMBED_DIM = 10_000, 100, 64 # max_vocab limits the size of the vocabulary that the model is allowed to learn

# Build and fit the tokeniser on train texts only 
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert texts to integer sequences and pad sequences to uniform length
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train),
                            maxlen=MAX_SEQ_LEN, padding="post")
X_test_pad  = pad_sequences(tokenizer.texts_to_sequences(X_test),
                            maxlen=MAX_SEQ_LEN, padding="post")

print("Vocab size (training):", len(tokenizer.word_index))


Vocab size (training): 9088


In [9]:
'''Build Bidirectional-LSTM architecture '''

# Define the model sequentially; add input_shape so Keras builds immediately
model = Sequential([
    Embedding(input_dim=MAX_VOCAB, # vocabulary cap
              output_dim=EMBED_DIM, # embedding vector length
              input_shape=(MAX_SEQ_LEN,)),  # one integer per time‑step
    Bidirectional(LSTM(64)),  # 64 units forward + 64 backward
    Dropout(0.5), # regularisation
    Dense(num_classes, activation="softmax") # 3‑way sentiment output
])

# Compile with categorical cross‑entropy & Adam optimiser
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary() 


  super().__init__(**kwargs)


In [10]:
'''Compute balanced weights for each class'''

# Balanced weighting: each class contributes equally to loss
weights = compute_class_weight("balanced",
                               classes=np.unique(y_train_enc),
                               y=y_train_enc)
class_weights = dict(enumerate(weights))
print("Class weights:", class_weights)


Class weights: {0: 0.5316628081165736, 1: 1.5748285599031868, 2: 2.0656084656084657}


In [11]:
# Train with early stopping

early_stop = EarlyStopping(
	monitor="val_loss",  # watch validation loss
	patience=3, # stop after 3 epochs w/o improvement
    restore_best_weights=True) # roll back to the best epoch

history = model.fit(
    X_train_pad, y_train_cat,
    epochs=15, batch_size=32,
    validation_split=0.2, # hold out 20 % of train as val
    class_weight=class_weights, # handle class imbalance
    callbacks=[early_stop],
    verbose=2 # 1 line/epoch
)


Epoch 1/15
293/293 - 11s - 36ms/step - accuracy: 0.6374 - loss: 0.8351 - val_accuracy: 0.7759 - val_loss: 0.5657
Epoch 2/15
293/293 - 11s - 37ms/step - accuracy: 0.8082 - loss: 0.4980 - val_accuracy: 0.7840 - val_loss: 0.5452
Epoch 3/15
293/293 - 11s - 38ms/step - accuracy: 0.8720 - loss: 0.3496 - val_accuracy: 0.7708 - val_loss: 0.6155
Epoch 4/15
293/293 - 11s - 38ms/step - accuracy: 0.9056 - loss: 0.2615 - val_accuracy: 0.7589 - val_loss: 0.7028
Epoch 5/15
293/293 - 11s - 39ms/step - accuracy: 0.9273 - loss: 0.2109 - val_accuracy: 0.7554 - val_loss: 0.8055


In [12]:
'''Evaluation & classification report'''

# Evaluate on the reserved test partition
test_loss, test_acc = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")

# Predict class indices on test set
y_pred = model.predict(X_test_pad, verbose=0).argmax(axis=1)

# Report
print("\nClassification report:\n")
print(classification_report(y_test_enc, y_pred,
                            target_names=label_encoder.classes_))
print("Macro-F1:", f1_score(y_test_enc, y_pred, average="macro"))


Test accuracy: 0.7760 | Test loss: 0.5550

Classification report:

              precision    recall  f1-score   support

    negative       0.88      0.84      0.86      1835
     neutral       0.59      0.63      0.61       620
    positive       0.68      0.70      0.69       473

    accuracy                           0.78      2928
   macro avg       0.71      0.73      0.72      2928
weighted avg       0.78      0.78      0.78      2928

Macro-F1: 0.7185641204283146


In [None]:
'''Save predictions to CSV'''

pred_df = pd.DataFrame({
    "text"           : X_test,
    "airline_sentiment"     : label_encoder.inverse_transform(y_test_enc),
    "predicted_sentiment": label_encoder.inverse_transform(y_pred)
})
pred_df.to_csv("tf_predictions_lemmatization.csv", index=False)
print("Predictions saved → tf_predictions_lemmatization.csv")


Predictions saved → tf_predictions.csv
