<a href="https://colab.research.google.com/github/prashanth741/NLP-LAB/blob/main/12_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Example tweets dataset
tweets = [
    "I love NLP! #AI",
    "Deep learning is amazing. @user",
    "Worst product ever, I hate it!!!",
    "Good service and friendly staff :)"
]

# Cleaning function
def clean_tweet(text):
    text = text.lower()                             # lowercase
    text = re.sub(r'@\w+', '', text)                # remove mentions
    text = re.sub(r'#\w+', '', text)                # remove hashtags
    text = re.sub(r'http\S+', '', text)             # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)            # remove numbers/punct
    text = ' '.join([w for w in text.split() if w not in stop_words])  # remove stopwords
    return text

# Apply cleaning
cleaned_tweets = [clean_tweet(t) for t in tweets]
print("Cleaned Tweets:", cleaned_tweets)

# Tokenization
max_words = 5000   # vocabulary size
tokenizer = Tokenizer(num_words=max_words, oov_token="")
tokenizer.fit_on_texts(cleaned_tweets)

# Convert to sequences
sequences = tokenizer.texts_to_sequences(cleaned_tweets)
print("Tokenized Sequences:", sequences)

# Padding
max_len = 10   # maximum length for padding
padded = pad_sequences(sequences, maxlen=max_len, padding="post")
print("Padded Sequences:\n", padded)

Cleaned Tweets: ['love nlp', 'deep learning amazing', 'worst product ever hate', 'good service friendly staff']
Tokenized Sequences: [[2, 3], [4, 5, 6], [7, 8, 9, 10], [11, 12, 13, 14]]
Padded Sequences:
 [[ 2  3  0  0  0  0  0  0  0  0]
 [ 4  5  6  0  0  0  0  0  0  0]
 [ 7  8  9 10  0  0  0  0  0  0]
 [11 12 13 14  0  0  0  0  0  0]]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Use the cleaned tweets from Task 1
cleaned_tweets = [
    'love nlp',
    'deep learning amazing',
    'worst product ever hate',
    'good service friendly staff'
]

# ----- CountVectorizer -----
count_vectorizer = CountVectorizer()
X_count = count_vectorizer.fit_transform(cleaned_tweets)

print("CountVectorizer Vocabulary:\n", count_vectorizer.vocabulary_)
print("\nCountVectorizer Feature Matrix:\n", X_count.toarray())

# ----- TF-IDF Vectorizer -----
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(cleaned_tweets)

print("\nTF-IDF Vocabulary:\n", tfidf_vectorizer.vocabulary_)
print("\nTF-IDF Feature Matrix:\n", X_tfidf.toarray())

CountVectorizer Vocabulary:
 {'love': 7, 'nlp': 8, 'deep': 1, 'learning': 6, 'amazing': 0, 'worst': 12, 'product': 9, 'ever': 2, 'hate': 5, 'good': 4, 'service': 10, 'friendly': 3, 'staff': 11}

CountVectorizer Feature Matrix:
 [[0 0 0 0 0 0 0 1 1 0 0 0 0]
 [1 1 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 1 0 0 1 0 0 0 1 0 0 1]
 [0 0 0 1 1 0 0 0 0 0 1 1 0]]

TF-IDF Vocabulary:
 {'love': 7, 'nlp': 8, 'deep': 1, 'learning': 6, 'amazing': 0, 'worst': 12, 'product': 9, 'ever': 2, 'hate': 5, 'good': 4, 'service': 10, 'friendly': 3, 'staff': 11}

TF-IDF Feature Matrix:
 [[0.         0.         0.         0.         0.         0.
  0.         0.70710678 0.70710678 0.         0.         0.
  0.        ]
 [0.57735027 0.57735027 0.         0.         0.         0.
  0.57735027 0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.5        0.         0.         0.5
  0.         0.         0.         0.5        0.         0.
  0.5       ]
 [0.         0.         0.         0.5   

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Conv1D, LSTM, Dropout

# Example labels for demo (replace with your dataset labels)
y = np.array([1, 1, 0, 1])   # 1=positive, 0=negative

# Use padded sequences from Task 1
# (re-using "padded" from preprocessing step)
X = padded

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Common parameters
vocab_size = 5000   # same as tokenizer
embedding_dim = 50
max_len = X.shape[1]

# ----- 1. MLP (Averaged Embeddings) -----
mlp_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
mlp_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print("\nTraining MLP...")
mlp_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16)

# ----- 2. CNN (1D) -----
cnn_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    Conv1D(128, 5, activation="relu"),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print("\nTraining CNN...")
cnn_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16)

# ----- 3. LSTM -----
lstm_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(1, activation="sigmoid")
])
lstm_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print("\nTraining LSTM...")
lstm_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=16)



Training MLP...
Epoch 1/5




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.3333 - loss: 0.6940 - val_accuracy: 1.0000 - val_loss: 0.6843
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.6667 - loss: 0.6889 - val_accuracy: 1.0000 - val_loss: 0.6781
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step - accuracy: 0.6667 - loss: 0.6895 - val_accuracy: 1.0000 - val_loss: 0.6730
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 0.6667 - loss: 0.6900 - val_accuracy: 1.0000 - val_loss: 0.6684
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step - accuracy: 0.6667 - loss: 0.6812 - val_accuracy: 1.0000 - val_loss: 0.6636

Training CNN...
Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.3333 - loss: 0.6929 - val_accuracy: 1.0000 - val_loss: 0.6757
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7abf204ff6b0>

In [5]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pandas as pd

# --- Helper function ---
def evaluate_model(model, X_test, y_test, deep=True):
    if deep:  # for Keras models
        y_pred = (model.predict(X_test) > 0.5).astype("int32")
    else:     # for scikit-learn models
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")
    return acc, precision, recall, f1

# --- Evaluate Deep Models ---
mlp_metrics = evaluate_model(mlp_model, X_test, y_test, deep=True)
cnn_metrics = evaluate_model(cnn_model, X_test, y_test, deep=True)
lstm_metrics = evaluate_model(lstm_model, X_test, y_test, deep=True)

# --- Classical ML with TF-IDF ---
# Split TF-IDF features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train_tfidf)
log_reg_metrics = evaluate_model(log_reg, X_test_tfidf, y_test_tfidf, deep=False)

# SVM
svm = SVC(kernel="linear")
svm.fit(X_train_tfidf, y_train_tfidf)
svm_metrics = evaluate_model(svm, X_test_tfidf, y_test_tfidf, deep=False)

# --- Collect Results ---
results = pd.DataFrame({
    "Model": ["MLP", "CNN", "LSTM", "Logistic Regression", "SVM"],
    "Accuracy": [mlp_metrics[0], cnn_metrics[0], lstm_metrics[0], log_reg_metrics[0], svm_metrics[0]],
    "Precision": [mlp_metrics[1], cnn_metrics[1], lstm_metrics[1], log_reg_metrics[1], svm_metrics[1]],
    "Recall": [mlp_metrics[2], cnn_metrics[2], lstm_metrics[2], log_reg_metrics[2], svm_metrics[2]],
    "F1-Score": [mlp_metrics[3], cnn_metrics[3], lstm_metrics[3], log_reg_metrics[3], svm_metrics[3]]
})

print("\n=== Evaluation Results ===")
print(results)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step

=== Evaluation Results ===
                 Model  Accuracy  Precision  Recall  F1-Score
0                  MLP       1.0        1.0     1.0       1.0
1                  CNN       1.0        1.0     1.0       1.0
2                 LSTM       1.0        1.0     1.0       1.0
3  Logistic Regression       1.0        1.0     1.0       1.0
4                  SVM       1.0        1.0     1.0       1.0


In [6]:
# Assumes "results" DataFrame is already created from Task 4
print("\n=== Final Results Table ===")
print(results)

# Analysis function
def analyze_results(results):
    # Get best model by F1-score
    best_model = results.loc[results["F1-Score"].idxmax()]

    print("\n=== Brief Analysis ===")

    # 1. Embeddings vs TF-IDF
    avg_deep_f1 = results.loc[results["Model"].isin(["MLP","CNN","LSTM"]), "F1-Score"].mean()
    avg_classical_f1 = results.loc[results["Model"].isin(["Logistic Regression","SVM"]), "F1-Score"].mean()
    if avg_deep_f1 > avg_classical_f1:
        print(f"- Embeddings improved performance over TF-IDF "
              f"({avg_deep_f1:.3f} vs {avg_classical_f1:.3f} F1-score).")
    else:
        print(f"- TF-IDF performed better or comparable to embeddings "
              f"({avg_classical_f1:.3f} vs {avg_deep_f1:.3f} F1-score).")

    # 2. Best neural network
    best_nn = results.loc[results["Model"].isin(["MLP","CNN","LSTM"])].sort_values("F1-Score", ascending=False).iloc[0]
    print(f"- Among neural networks, {best_nn['Model']} benefited most from embeddings "
          f"(F1={best_nn['F1-Score']:.3f}).")

    # 3. Sequential models (LSTM) vs CNN/MLP
    lstm_f1 = results.loc[results["Model"]=="LSTM","F1-Score"].values[0]
    cnn_f1 = results.loc[results["Model"]=="CNN","F1-Score"].values[0]
    mlp_f1 = results.loc[results["Model"]=="MLP","F1-Score"].values[0]

    if lstm_f1 > max(cnn_f1, mlp_f1):
        print(f"- LSTM (F1={lstm_f1:.3f}) outperformed CNN (F1={cnn_f1:.3f}) and MLP (F1={mlp_f1:.3f}), "
              "suggesting sequential models capture tweet context better.")
    else:
        print(f"- LSTM (F1={lstm_f1:.3f}) did not clearly outperform CNN (F1={cnn_f1:.3f}) or MLP (F1={mlp_f1:.3f}).")

    print(f"\n=> Best overall model: {best_model['Model']} with "
          f"F1={best_model['F1-Score']:.3f}, Accuracy={best_model['Accuracy']:.3f}")

# Run analysis
analyze_results(results)


=== Final Results Table ===
                 Model  Accuracy  Precision  Recall  F1-Score
0                  MLP       1.0        1.0     1.0       1.0
1                  CNN       1.0        1.0     1.0       1.0
2                 LSTM       1.0        1.0     1.0       1.0
3  Logistic Regression       1.0        1.0     1.0       1.0
4                  SVM       1.0        1.0     1.0       1.0

=== Brief Analysis ===
- TF-IDF performed better or comparable to embeddings (1.000 vs 1.000 F1-score).
- Among neural networks, MLP benefited most from embeddings (F1=1.000).
- LSTM (F1=1.000) did not clearly outperform CNN (F1=1.000) or MLP (F1=1.000).

=> Best overall model: MLP with F1=1.000, Accuracy=1.000
