In [2]:
# Imports & setup

import os, re, numpy as np, pandas as pd, tensorflow as tf
from nltk import download as nltk_download
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

tf.random.set_seed(42);  np.random.seed(42)

nltk_download("punkt");  nltk_download("stopwords")


[nltk_data] Downloading package punkt to /Users/yifanchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yifanchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

# Pre‑processing

STOP_WORDS = set(stopwords.words("english"))
# keep negation words so sentiment isn’t flipped
NEGATION_WORDS = {"no", "not", "nor", "cannot", "can't", "won't", "n't"}
CUSTOM_STOPWORDS = STOP_WORDS - NEGATION_WORDS

def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"@\w+|#", "", text)                # strip mentions & hashtags
    text = re.sub(r"[^a-zA-Z]", " ", text)            # keep letters only
    tokens = word_tokenize(text.strip())
    tokens = [tok for tok in tokens if tok not in CUSTOM_STOPWORDS]
    return " ".join(tokens)


In [4]:
# Load & clean

CSV_PATH = "Tweets.csv"   # ← ensure this file is present
df = pd.read_csv(CSV_PATH)
df["cleaned_text"] = df["text"].apply(preprocess_text)

texts  = df["cleaned_text"].values
labels = df["airline_sentiment"].values
print("Dataset size:", len(df))


Dataset size: 14640


In [5]:
# Train/Test split + label encoding

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.20, random_state=42, stratify=labels
)

label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc  = label_encoder.transform(y_test)

num_classes  = len(label_encoder.classes_)
y_train_cat  = to_categorical(y_train_enc, num_classes)
y_test_cat   = to_categorical(y_test_enc,  num_classes)

print("Classes →", label_encoder.classes_)


Classes → ['negative' 'neutral' 'positive']


In [6]:
# Tokeniser & padding

MAX_VOCAB, MAX_SEQ_LEN, EMBED_DIM = 10_000, 100, 64

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train),
                            maxlen=MAX_SEQ_LEN, padding="post")
X_test_pad  = pad_sequences(tokenizer.texts_to_sequences(X_test),
                            maxlen=MAX_SEQ_LEN, padding="post")

print("Vocab size (training):", len(tokenizer.word_index))


Vocab size (training): 9824


In [7]:
# Build model


model = Sequential([
    Embedding(input_dim=MAX_VOCAB,
              output_dim=EMBED_DIM,
              input_shape=(MAX_SEQ_LEN,)),   # ← change here
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(num_classes, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()          # now shows non‑zero params


  super().__init__(**kwargs)


In [8]:
# Compute class weights

weights = compute_class_weight("balanced",
                               classes=np.unique(y_train_enc),
                               y=y_train_enc)
class_weights = dict(enumerate(weights))
print("Class weights:", class_weights)


Class weights: {0: 0.5316628081165736, 1: 1.5748285599031868, 2: 2.0656084656084657}


In [9]:
# Train

early_stop = EarlyStopping(monitor="val_loss", patience=3,
                           restore_best_weights=True)

history = model.fit(
    X_train_pad, y_train_cat,
    epochs=15, batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=2
)


Epoch 1/15
293/293 - 10s - 36ms/step - accuracy: 0.6304 - loss: 0.8414 - val_accuracy: 0.7405 - val_loss: 0.6255
Epoch 2/15
293/293 - 11s - 38ms/step - accuracy: 0.8067 - loss: 0.4967 - val_accuracy: 0.7764 - val_loss: 0.5570
Epoch 3/15
293/293 - 11s - 38ms/step - accuracy: 0.8711 - loss: 0.3434 - val_accuracy: 0.7682 - val_loss: 0.6313
Epoch 4/15
293/293 - 11s - 38ms/step - accuracy: 0.9095 - loss: 0.2507 - val_accuracy: 0.7554 - val_loss: 0.7721
Epoch 5/15
293/293 - 11s - 38ms/step - accuracy: 0.9305 - loss: 0.1972 - val_accuracy: 0.7490 - val_loss: 0.8650


In [10]:
#  Evaluate & report

test_loss, test_acc = model.evaluate(X_test_pad, y_test_cat, verbose=0)
print(f"Test accuracy: {test_acc:.4f} | Test loss: {test_loss:.4f}")

y_pred = model.predict(X_test_pad, verbose=0).argmax(axis=1)

print("\nClassification report:\n")
print(classification_report(y_test_enc, y_pred,
                            target_names=label_encoder.classes_))
print("Macro‑F1:", f1_score(y_test_enc, y_pred, average="macro"))


Test accuracy: 0.7770 | Test loss: 0.5572

Classification report:

              precision    recall  f1-score   support

    negative       0.88      0.83      0.85      1835
     neutral       0.59      0.68      0.63       620
    positive       0.68      0.71      0.69       473

    accuracy                           0.78      2928
   macro avg       0.72      0.74      0.73      2928
weighted avg       0.79      0.78      0.78      2928

Macro‑F1: 0.7265062695390565


In [11]:
# Save per‑tweet predictions

pred_df = pd.DataFrame({
    "text"           : X_test,
    "true_label"     : label_encoder.inverse_transform(y_test_enc),
    "predicted_label": label_encoder.inverse_transform(y_pred)
})
pred_df.to_csv("tf_predictions.csv", index=False)
print("Predictions saved → tf_predictions.csv")


Predictions saved → tf_predictions.csv
