<a href="https://colab.research.google.com/github/saikirankesoju/NLP/blob/main/25-09-2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
"""
end_to_end_sentiment_with_datasets.py

- Downloads a public tweet sentiment dataset (Sentiment140) automatically (or uses a small built-in sample if download fails).
- Prepares data (maps 4->1 positive, 0->0 negative, drops neutral 2).
- Optionally downloads GloVe 6B 300d (if you want embeddings) or uses random embeddings fallback.
- Builds embedding matrix (no gensim required).
- Trains and evaluates: LSTM, CNN, Bi-LSTM (Keras/TensorFlow).
- Trains and evaluates: TF-IDF + LogisticRegression and LinearSVC.
- Saves comparison CSV and error analysis CSV (5 misclassified positive and negative examples).
- Designed to run on a typical laptop; by default it samples a manageable subset from Sentiment140 (you can increase SAMPLE_SIZE if you have GPU/time).

USAGE:
- Ensure Python packages installed: pandas, numpy, scikit-learn, requests, tqdm, tensorflow, nltk
  e.g. pip install pandas numpy scikit-learn requests tqdm tensorflow nltk
- Run: python end_to_end_sentiment_with_datasets.py
- If you want to use full Sentiment140, increase SAMPLE_SIZE or set SAMPLE_SIZE = None.

DATA SOURCES (auto-download attempted):
- Sentiment140 training/test zip: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
- GloVe (manual): download glove.6B.300d.txt from https://nlp.stanford.edu/projects/glove/ and place it in the working dir or set GLOVE_PATH below.
"""

import os
import io
import zipfile
import requests
from tqdm import tqdm
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Input
from tensorflow.keras.callbacks import EarlyStopping
import nltk
nltk.download('punkt', quiet=True)

# -------------------------
# CONFIG - edit as needed
# -------------------------
SENT140_URL = "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip"
WORK_DIR = "data"
SENT140_ZIP = os.path.join(WORK_DIR, "trainingandtestdata.zip")
SENT140_CSV = os.path.join(WORK_DIR, "training.1600000.processed.noemoticon.csv")
SAMPLE_SIZE = 50000     # set to None to use entire dataset (may be very slow/large)
RANDOM_STATE = 42

GLOVE_PATH = "glove.6B.300d.txt"   # place here if you downloaded; otherwise fallback to random init
EMBEDDING_DIM = 300
MAX_NUM_WORDS = 30000
MAX_SEQ_LEN = 50
BATCH_SIZE = 128
EPOCHS = 5    # increase if you have GPU/time
VERBOSE = 1
TEST_SIZE = 0.2
VAL_RATIO_FROM_TRAIN = 0.1
# -------------------------

random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

os.makedirs(WORK_DIR, exist_ok=True)

def download_file(url, target_path, chunk_size=32768):
    if os.path.exists(target_path):
        print(f"[download] already exists: {target_path}")
        return target_path
    print(f"[download] fetching {url} -> {target_path}")
    resp = requests.get(url, stream=True, timeout=60)
    resp.raise_for_status()
    total = int(resp.headers.get('content-length', 0))
    with open(target_path, 'wb') as f:
        with tqdm(total=total, unit='B', unit_scale=True) as pbar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    return target_path

def extract_sent140(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as z:
        # the zip contains training.1600000.processed.noemoticon.csv and testdata.manual.2009.06.14.csv
        z.extractall(path=extract_to)
    print("[extract] done.")

def load_sentiment140(sample_size=SAMPLE_SIZE, random_state=RANDOM_STATE):
    # download and extract if not present
    try:
        if not os.path.exists(SENT140_CSV):
            download_file(SENT140_URL, SENT140_ZIP)
            extract_sent140(SENT140_ZIP, WORK_DIR)
        # file format: sentiment,id,date,query,user,text  (no header)
        # sentiment: 0 = negative, 2 = neutral, 4 = positive
        print("[load] reading Sentiment140 CSV (this may take a while)...")
        df = pd.read_csv(SENT140_CSV, encoding='latin-1', header=None)
        df = df[[0,5]]
        df.columns = ['sentiment', 'text']
        # map 4->1, 0->0; drop neutral 2
        df = df[df['sentiment'] != 2].copy()
        df['label'] = df['sentiment'].map({0:0, 4:1})
        df = df[['text','label']].reset_index(drop=True)
        if sample_size is not None and sample_size > 0 and sample_size < len(df):
            df = df.sample(n=sample_size, random_state=random_state).reset_index(drop=True)
        print(f"[load] loaded Sentiment140: {len(df)} examples (labels counts: {df['label'].value_counts().to_dict()})")
        return df
    except Exception as e:
        print(f"[load] failed to load Sentiment140: {e}")
        return None

def make_small_sample():
    # fallback small dataset (balanced)
    print("[fallback] creating small built-in sample dataset")
    positive = [
        "I love this product! Totally recommend it :)",
        "What a great experience, I'm so happy",
        "Absolutely fantastic service and friendly people",
        "Best purchase ever, very satisfied",
        "This made my day, wonderful!"
    ]
    negative = [
        "I hate this. Worst ever!",
        "Terrible experience, will never come back",
        "Very disappointed, broke after one use",
        "Waste of money and time",
        "This ruined my day, awful service"
    ]
    texts = positive + negative
    labels = [1]*len(positive) + [0]*len(negative)
    df = pd.DataFrame({'text': texts, 'label': labels})
    return df

# -------------------------
# Load dataset (attempt Sentiment140, else fallback)
# -------------------------
df = load_sentiment140()
if df is None or len(df) < 100:
    df = make_small_sample()

# basic cleaning - you can expand (lowercase, remove URLs, mentions, etc.)
def simple_clean(text):
    txt = str(text)
    # minimal cleaning: strip
    return txt.strip()

df['text_clean'] = df['text'].map(simple_clean)
print("[data] sample rows:\n", df.head().to_dict(orient='records')[:3])

# -------------------------
# Train / Val / Test split
# -------------------------
train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df['label'])
train_df, val_df = train_test_split(train_df, test_size=VAL_RATIO_FROM_TRAIN, random_state=RANDOM_STATE, stratify=train_df['label'])

print(f"[split] train={len(train_df)} val={len(val_df)} test={len(test_df)}")

# -------------------------
# Tokenize and sequences
# -------------------------
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['text_clean'].tolist())

def texts_to_padded(texts):
    seq = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seq, maxlen=MAX_SEQ_LEN, padding='post', truncating='post')

X_train = texts_to_padded(train_df['text_clean'])
X_val   = texts_to_padded(val_df['text_clean'])
X_test  = texts_to_padded(test_df['text_clean'])

y_train = train_df['label'].values
y_val   = val_df['label'].values
y_test  = test_df['label'].values

word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
print(f"[tokenizer] vocab_size={len(word_index)}, using num_words={num_words}")

# -------------------------
# Load GloVe (if present) and build embedding matrix (no gensim)
# -------------------------
def load_glove(glove_path, dim=EMBEDDING_DIM):
    if not os.path.exists(glove_path):
        print(f"[glove] not found at {glove_path}. Skipping GloVe load.")
        return None
    print(f"[glove] loading {glove_path} ...")
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) <= dim:
                continue
            word = parts[0]
            try:
                coefs = np.asarray(parts[1:], dtype='float32')
            except:
                continue
            if coefs.shape[0] != dim:
                continue
            embeddings_index[word] = coefs
    print(f"[glove] loaded vectors for {len(embeddings_index)} words.")
    return embeddings_index

glove = load_glove(GLOVE_PATH, EMBEDDING_DIM)

def build_embedding_matrix(word_index, embeddings_index, num_words, dim):
    rng = np.random.default_rng(RANDOM_STATE)
    embedding_matrix = rng.normal(loc=0.0, scale=0.01, size=(num_words, dim)).astype('float32')
    if embeddings_index is None:
        print("[embed] no pre-trained embeddings, using random init")
        return embedding_matrix
    oov = 0
    for word, i in word_index.items():
        if i >= num_words:
            continue
        vec = embeddings_index.get(word)
        if vec is None:
            vec = embeddings_index.get(word.lower())
        if vec is not None:
            embedding_matrix[i] = vec
        else:
            oov += 1
    print(f"[embed] built matrix: {num_words} words, approx OOV={oov}")
    return embedding_matrix

embedding_matrix = build_embedding_matrix(word_index, glove, num_words, EMBEDDING_DIM)

# -------------------------
# Model builders (fresh Embedding per model)
# -------------------------
def make_embedding_layer(embedding_matrix, num_words, dim, input_length, trainable=False):
    return Embedding(input_dim=num_words, output_dim=dim, weights=[embedding_matrix], input_length=input_length, trainable=trainable)

def build_lstm_model():
    emb = make_embedding_layer(embedding_matrix, num_words, EMBEDDING_DIM, MAX_SEQ_LEN, trainable=False)
    model = Sequential([
        emb,
        Bidirectional(LSTM(128)),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_cnn_model():
    emb = make_embedding_layer(embedding_matrix, num_words, EMBEDDING_DIM, MAX_SEQ_LEN, trainable=False)
    model = Sequential([
        emb,
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(3),
        Conv1D(128, 3, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_bilstm_model():
    inp = Input(shape=(MAX_SEQ_LEN,))
    emb = Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQ_LEN, trainable=False)(inp)
    x = Bidirectional(LSTM(128, return_sequences=True))(emb)
    x = Bidirectional(LSTM(64))(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# -------------------------
# Train & evaluate deep models
# -------------------------
deep_models = {
    "LSTM": build_lstm_model,
    "CNN": build_cnn_model,
    "BiLSTM": build_bilstm_model
}

deep_results = []

for name, builder in deep_models.items():
    try:
        print(f"\n[train] Building & training {name}")
        model = builder()
        model.summary(print_fn=lambda s: print("[model]", s))
        es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=0)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, callbacks=[es], verbose=VERBOSE)
        probs = model.predict(X_test, batch_size=BATCH_SIZE, verbose=0).ravel()
        preds = (probs >= 0.5).astype(int)
        acc = accuracy_score(y_test, preds)
        f1m = f1_score(y_test, preds, average='macro')
        print(f"[eval] {name} | Acc={acc:.4f} | F1-macro={f1m:.4f}")
        deep_results.append({'name': name, 'model': model, 'preds': preds, 'probs': probs, 'accuracy': acc, 'f1_macro': f1m})
    except Exception as e:
        print(f"[error] Training {name} failed: {e}")

# -------------------------
# Traditional ML baselines (TF-IDF + LR, SVM)
# -------------------------
print("\n[baseline] Training TF-IDF + LogisticRegression and SVM")
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(train_df['text_clean'])
X_val_tfidf = tfidf.transform(val_df['text_clean'])
X_test_tfidf = tfidf.transform(test_df['text_clean'])

clf_lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
clf_lr.fit(X_train_tfidf, y_train)
preds_lr = clf_lr.predict(X_test_tfidf)
acc_lr = accuracy_score(y_test, preds_lr)
f1_lr = f1_score(y_test, preds_lr, average='macro')
print(f"[eval] LogisticRegression | Acc={acc_lr:.4f} | F1-macro={f1_lr:.4f}")

clf_svm = LinearSVC(max_iter=5000, random_state=RANDOM_STATE)
clf_svm.fit(X_train_tfidf, y_train)
preds_svm = clf_svm.predict(X_test_tfidf)
acc_svm = accuracy_score(y_test, preds_svm)
f1_svm = f1_score(y_test, preds_svm, average='macro')
print(f"[eval] SVM | Acc={acc_svm:.4f} | F1-macro={f1_svm:.4f}")

# -------------------------
# Summary table
# -------------------------
rows = []
for r in deep_results:
    rows.append({'Model': r['name'], 'Accuracy': r['accuracy'], 'F1-macro': r['f1_macro']})
rows.extend([
    {'Model': 'LogisticRegression', 'Accuracy': acc_lr, 'F1-macro': f1_lr},
    {'Model': 'SVM', 'Accuracy': acc_svm, 'F1-macro': f1_svm}
])
summary_df = pd.DataFrame(rows).sort_values(by='F1-macro', ascending=False).reset_index(drop=True)
print("\n[summary]\n", summary_df)
summary_df.to_csv("model_comparison_summary.csv", index=False)
print("[saved] model_comparison_summary.csv")

# -------------------------
# Error analysis: choose best model by F1-macro (deep preferred if available)
# -------------------------
chosen = None
if deep_results:
    best_deep = max(deep_results, key=lambda x: x['f1_macro'])
    chosen = best_deep
    print(f"[choose] best deep model = {best_deep['name']}")
else:
    # choose best traditional
    if f1_lr >= f1_svm:
        chosen = {'name': 'LogisticRegression', 'preds': preds_lr, 'probs': None}
    else:
        chosen = {'name': 'SVM', 'preds': preds_svm, 'probs': None}
    print(f"[choose] best traditional = {chosen['name']}")

preds = chosen['preds']
probs = chosen.get('probs', None)

test_texts = test_df['text_clean'].tolist()
test_labels = test_df['label'].tolist()

mis_pos_idx = [i for i, (t, p) in enumerate(zip(test_labels, preds)) if t == 1 and p == 0]
mis_neg_idx = [i for i, (t, p) in enumerate(zip(test_labels, preds)) if t == 0 and p == 1]

# collect 5 examples each (or fewer if not available)
mis_pos_examples = []
mis_neg_examples = []

for idx in mis_pos_idx[:5]:
    mis_pos_examples.append({'index': int(idx), 'text': test_texts[idx], 'true': int(test_labels[idx]), 'pred': int(preds[idx]), 'prob_pos': float(probs[idx]) if probs is not None else None})
for idx in mis_neg_idx[:5]:
    mis_neg_examples.append({'index': int(idx), 'text': test_texts[idx], 'true': int(test_labels[idx]), 'pred': int(preds[idx]), 'prob_pos': float(probs[idx]) if probs is not None else None})

ea_df = pd.DataFrame(mis_pos_examples + mis_neg_examples)
ea_df.to_csv("error_analysis_samples.csv", index=False)
print("[saved] error_analysis_samples.csv (contains up to 5 misclassified positives and 5 misclassified negatives)")

print("\nDone. Files produced:\n - model_comparison_summary.csv\n - error_analysis_samples.csv\nIf you want me to (a) increase SAMPLE_SIZE, (b) switch to full Sentiment140, (c) fine-tune embeddings (trainable=True), or (d) add a transformer (BERT) baseline, tell me which and I'll give updated code.")

[download] fetching http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip -> data/trainingandtestdata.zip


100%|██████████| 81.4M/81.4M [00:01<00:00, 74.9MB/s]


[extract] done.
[load] reading Sentiment140 CSV (this may take a while)...
[load] loaded Sentiment140: 50000 examples (labels counts: {1: 25014, 0: 24986})
[data] sample rows:
 [{'text': '@chrishasboobs AHHH I HOPE YOUR OK!!! ', 'label': 0, 'text_clean': '@chrishasboobs AHHH I HOPE YOUR OK!!!'}, {'text': '@misstoriblack cool , i have no tweet apps  for my razr 2', 'label': 0, 'text_clean': '@misstoriblack cool , i have no tweet apps  for my razr 2'}, {'text': '@TiannaChaos i know  just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u', 'label': 0, 'text_clean': '@TiannaChaos i know  just family drama. its lame.hey next time u hang out with kim n u guys like have a sleepover or whatever, ill call u'}]
[split] train=36000 val=4000 test=10000
[tokenizer] vocab_size=47828, using num_words=30000
[glove] not found at glove.6B.300d.txt. Skipping GloVe load.
[embed] no pre-trained embeddings, using random init

[train] Building & t

[model] Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ embedding (Embedding)           │ ?                      │     9,000,000 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional (Bidirectional)   │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout (Dropout)               │ ?                      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense (Dense)                   │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_1 (Dropout)             │ ?                      │             0 │
├─────────────────────────────────┼─────────────

[model] Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ embedding_1 (Embedding)         │ ?                      │     9,000,000 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d (Conv1D)                 │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling1d (MaxPooling1D)    │ ?                      │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv1d_1 (Conv1D)               │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ global_max_pooling1d            │ ?                      │             0 │
│ (GlobalMaxPooling1D)            │           

[model] Model: "functional_2"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer_2 (InputLayer)      │ (None, 50)             │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding_2 (Embedding)         │ (None, 50, 300)        │     9,000,000 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional_1 (Bidirectional) │ (None, 50, 256)        │       439,296 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bidirectional_2 (Bidirectional) │ (None, 128)            │       164,352 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dropout_3 (Dropout)             │ (None, 128)            │             0 │
├─────────────────────────────────┼───────────