In [None]:
"""
CNN for Text Classification (Colab-ready mini-project)
File: CNN_Text_Classification_Colab.py

How to use:
1. Open Google Colab (https://colab.research.google.com)
2. New Python 3 notebook -> Runtime -> Change runtime type -> GPU (optional)
3. Copy-paste the contents of this file into a single code cell (or upload as .py and run)

This script provides two dataset options:
- Option A (default, recommended): use TensorFlow / Keras built-in IMDB dataset (no Kaggle credentials required)
- Option B: use Kaggle "IMDB Dataset of 50k" (shows commands to set up `kaggle.json`) — uncomment if you want to use Kaggle

The model is a compact, well-documented 1D-CNN for binary sentiment classification.

"""

# ---------- 0. Environment check (Colab-friendly) ----------
# If running in Colab, you already have most libs. Below installs are safe to run locally/Colab.
try:
    import tensorflow as tf
except Exception:
    !pip install -q tensorflow
    import tensorflow as tf

import os
import re
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

print("TensorFlow version:", tf.__version__)

# Optional: use GPU if available (Colab -> Runtime -> Change runtime type -> GPU)
# Memory growth is enabled on every visible GPU, not just the first one, so that
# all devices share the same setting. Failure here is non-fatal: training can
# still proceed with TensorFlow's default allocation behaviour.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print('GPU found and configured')
    except Exception as e:
        print('GPU found but could not set memory growth:', e)
else:
    print('No GPU found. CPU will be used.')

# ---------- 1. Select dataset source ----------
# OPTION A (recommended): built-in Keras IMDB dataset (already tokenized as integer sequences)
# OPTION B (more realistic): use raw text CSV (Kaggle IMDB 50k). If you want Kaggle, follow the instructions below.
USE_KAGGLE_CSV = False

# If you set USE_KAGGLE_CSV = True, follow these steps in Colab (example):
# 1) Upload your kaggle.json (from your Kaggle account) to the Colab session (Files -> Upload)
# 2) Run the commands listed in the block below in a Colab cell to move kaggle.json,
#    set permissions, and download/unzip the dataset into the working directory

# Only stop when Kaggle was requested but the CSV is not there yet. Once
# 'IMDB Dataset.csv' exists in the working directory the script continues and
# the loading step picks the file up.
if USE_KAGGLE_CSV and not os.path.exists('IMDB Dataset.csv'):
    # !mkdir -p ~/.kaggle
    # !cp kaggle.json ~/.kaggle/
    # !chmod 600 ~/.kaggle/kaggle.json
    # !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
    # !unzip -o imdb-dataset-of-50k-movie-reviews.zip
    # df = pd.read_csv('IMDB Dataset.csv')
    # df.head()
    raise RuntimeError('Set USE_KAGGLE_CSV = False or follow the Kaggle setup commands in a cell in Colab.')

# ---------- 2. Load dataset (Option A) ----------
# Keras provides an IMDB dataset where words are mapped to integers. But for a CNN on raw text we often prefer raw text + tokenizer.
# We'll use the tensorflow_datasets load for the raw text, but keep things simple and use a small custom load based on keras's raw_text_dataset_from_directory.

# Simpler: download a small raw-text version using tensorflow datasets (tfds)
USE_TFDS = False
if USE_TFDS:
    !pip install -q tensorflow-datasets
    import tensorflow_datasets as tfds
    ds_train, ds_test = tfds.load('imdb_reviews', split=['train', 'test'], as_supervised=True)
    # Convert to pandas DataFrame for consistency
    def ds_to_df(ds):
        texts = []
        labels = []
        for text, label in tfds.as_numpy(ds):
            texts.append(text.decode('utf-8'))
            labels.append(int(label))
        return pd.DataFrame({'review': texts, 'sentiment': labels})
    df_train = ds_to_df(ds_train)
    df_test = ds_to_df(ds_test)
    df = pd.concat([df_train, df_test], ignore_index=True)
else:
    # We'll construct a small example dataset by downloading the IMDB dataset via `tensorflow.keras.datasets` and mapping back to words
    # But the Keras dataset is integer-encoded; thus for a realistic pipeline we'll instead fetch the 'aclImdb' raw dataset if online access allowed.
    # For reliability, let's use a fallback: load a small demo dataset or the Kaggle CSV if the user uploads it.

    # Fallback: try to read IMDB CSV if uploaded to Colab session local filesystem
    if os.path.exists('IMDB Dataset.csv'):
        df = pd.read_csv('IMDB Dataset.csv')
    else:
        # If no CSV available, download a lightweight sample from tf.keras (preprocessed integers) and create toy raw text from it.
        # For real project, upload 'IMDB Dataset.csv' to Colab or set USE_KAGGLE_CSV=True and follow instructions.
        from tensorflow.keras.datasets import imdb
        print('No IMDB CSV found locally. Loading Keras IMDB integer dataset and converting to simple text (toy example).')
        (X_train_i, y_train_i), (X_test_i, y_test_i) = imdb.load_data(num_words=10000)
        word_index = imdb.get_word_index()
        index_word = {index+3: word for word, index in word_index.items()}  # Keras reserves indices
        index_word[0] = '<PAD>'
        index_word[1] = '<START>'
        index_word[2] = '<UNK>'
        index_word[3] = '<UNUSED>'

        def seqs_to_texts(seqs):
            texts = []
            for seq in seqs:
                words = [index_word.get(i, '?') for i in seq]
                texts.append(' '.join(words))
            return texts

        texts = seqs_to_texts(np.concatenate([X_train_i, X_test_i]))
        labels = np.concatenate([y_train_i, y_test_i])
        df = pd.DataFrame({'review': texts, 'sentiment': labels})

print('Dataset size:', len(df))
print(df.head())

# ---------- 3. Basic EDA ----------
# Show the class balance and a preview of the first review. The preview reads
# the 'review' column, i.e. the text as loaded, so it is labelled as raw.
print('\nLabel distribution:')
print(df['sentiment'].value_counts())
print('\nSample review (raw):')
print(df['review'].iloc[0][:300])

# ---------- 4. Text cleaning function ----------
def clean_text(text):
    """Normalize a review string before tokenization.

    Steps, in order:
      1. Replace anything that looks like a tag (``<...>`` on a single line)
         with a space.
      2. Replace every character other than ASCII letters, digits, the
         apostrophe and the space with a space.
      3. Collapse runs of spaces, trim both ends, and lowercase.

    Returns the cleaned string (possibly empty).
    """
    without_tags = re.sub(r'<.*?>', ' ', text)
    allowed_only = re.sub(r"[^a-zA-Z0-9' ]", ' ', without_tags)
    # After step 2 the only whitespace left is the plain space, so split/join
    # both collapses repeated spaces and strips the ends.
    return ' '.join(allowed_only.split()).lower()

# Apply cleaning (this can take time on 50k rows; it's fine in Colab)
print('\nCleaning text... (this may take a few seconds)')
df['clean_review'] = df['review'].astype(str).apply(clean_text)

# ---------- 5. Tokenization & Padding ----------
# Hyperparameters — tune these for the mini-project
VOCAB_SIZE = 15000    # number of words to keep
MAX_LEN = 200         # max sequence length
EMBED_DIM = 128       # embedding dimension

y = df['sentiment'].astype(int).values

# Decide which rows belong to train / validation / test BEFORE fitting the
# tokenizer, so the vocabulary is learned from training reviews only and no
# information from the held-out rows leaks into preprocessing.
# Splits: 80/20 train+val vs. test, then 90/10 train vs. val, both stratified.
row_ids = np.arange(len(df))
train_val_ids, test_ids = train_test_split(
    row_ids, test_size=0.2, random_state=42, stratify=y
)
train_ids, val_ids = train_test_split(
    train_val_ids, test_size=0.1, random_state=42, stratify=y[train_val_ids]
)

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_review'].iloc[train_ids])

# Encode every review with the training-only vocabulary; words outside it map
# to the '<OOV>' token. Sequences are padded/truncated at the end to MAX_LEN.
sequences = tokenizer.texts_to_sequences(df['clean_review'])
X = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

print('X shape:', X.shape)
print('Example sequence (first review tokens):', X[0][:20])

# Train/test split
X_test, y_test = X[test_ids], y[test_ids]
print('Train shape:', X[train_val_ids].shape, 'Test shape:', X_test.shape)

# Further split a validation set from training set
X_train, y_train = X[train_ids], y[train_ids]
X_val, y_val = X[val_ids], y[val_ids]
print('After val split -> Train:', X_train.shape, 'Val:', X_val.shape)

# ---------- 6. Build the CNN model ----------
# Small 1D-CNN for binary text classification:
#   Embedding -> Conv1D -> BatchNorm -> Conv1D -> global max pool -> Dense -> Dropout -> sigmoid
# The input shape is declared with an explicit Input layer instead of
# Embedding(input_length=...): that argument is deprecated in recent Keras
# releases, and without a declared input the model is not built when
# summary() is called.
model = Sequential([
    tf.keras.Input(shape=(MAX_LEN,), dtype='int32'),
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM),
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='valid'),
    BatchNormalization(),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ---------- 7. Callbacks ----------
# All three callbacks watch the validation loss.
callbacks = [
    # Stop early (patience 3) and restore the best weights.
    EarlyStopping(
        monitor='val_loss',
        restore_best_weights=True,
        patience=3,
    ),
    # Halve the learning rate when progress stalls (patience 2).
    ReduceLROnPlateau(
        monitor='val_loss',
        patience=2,
        factor=0.5,
        verbose=1,
    ),
    # Keep only the best model on disk.
    ModelCheckpoint(
        filepath='best_cnn_model.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

# ---------- 8. Train the model ----------
# Training budget; the callbacks defined above are passed to fit().
EPOCHS = 8
BATCH_SIZE = 128

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=2,
)

# ---------- 9. Evaluation ----------
# Report loss/accuracy on the held-out test split, then per-class metrics.
print('\nEvaluating on test set:')
loss, acc = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Loss: {loss:.4f} | Test Accuracy: {acc*100:.2f}%')

# Turn predicted probabilities into hard 0/1 labels using a 0.5 threshold.
y_pred_prob = model.predict(X_test, verbose=0)
y_pred = (y_pred_prob.reshape(-1) >= 0.5).astype(int)

print('\nClassification report:')
print(classification_report(y_test, y_pred, digits=4))

print('Confusion matrix:')
print(confusion_matrix(y_test, y_pred))

# ---------- 10. Save tokenizer and model ----------
# Persist both artifacts: inference needs the fitted tokenizer as well as the
# trained network.
import json
with open('tokenizer.json', mode='w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())

model.save('cnn_text_model.h5')
print('\nSaved model to cnn_text_model.h5 and tokenizer.json')

# ---------- 11. Inference helper ----------

def predict_sentiment(texts, model, tokenizer, maxlen=MAX_LEN):
    """Classify raw review texts as 'Positive' or 'Negative'.

    Args:
        texts: A single string or an iterable of strings with raw review text.
        model: Trained model exposing ``predict`` on padded integer sequences.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        maxlen: Sequence length to pad/truncate to; should equal the length
            used when the model was trained.

    Returns:
        A list of ``(text, probability, label)`` tuples in input order, where
        ``label`` is 'Positive' when the probability is >= 0.5 and 'Negative'
        otherwise. An empty input yields an empty list.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]
    # Materialize once: the texts are walked twice (cleaning and the final zip).
    texts = list(texts)
    if not texts:
        return []

    cleaned = [clean_text(t) for t in texts]
    seq = tokenizer.texts_to_sequences(cleaned)
    # Truncate at the end, exactly like the training data, so long reviews
    # keep the same part of the text the model was trained on.
    pad = pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post')
    probs = model.predict(pad).reshape(-1)
    labels = ['Positive' if p >= 0.5 else 'Negative' for p in probs]
    return list(zip(texts, probs, labels))

# Quick smoke test: one clearly positive and one clearly negative review.
examples = [
    "This movie was fantastic! I loved the story and the acting.",
    "I hated this film. It was boring and too long.",
]
print('\nExample predictions:')
example_results = predict_sentiment(examples, model, tokenizer)
for txt, prob, lbl in example_results:
    print(f"{lbl} (prob={prob:.3f}) -> {txt}")

# ---------- 12. Next steps & tips ----------
# - Try varying VOCAB_SIZE, MAX_LEN, EMBED_DIM and number of Conv filters/kernel sizes
# - Try adding multiple Conv1D with different kernel sizes (3,5,7) and concatenate (like a multi-channel CNN)
# - Try pretrained embeddings (GloVe) by building an embedding matrix and setting Embedding(weights=[..], trainable=False or True)
# - For multi-class tasks, change final Dense to units=num_classes and activation='softmax' with categorical_crossentropy
# - For deployment: convert model to SavedModel, or export tokenizer + model and load in Flask/FastAPI

print('\nAll done — notebook finished.\n')


TensorFlow version: 2.19.0
No GPU found. CPU will be used.
No IMDB CSV found locally. Loading Keras IMDB integer dataset and converting to simple text (toy example).
Dataset size: 50000
                                              review  sentiment
0  <START> this film was just brilliant casting l...          1
1  <START> big hair big boobs bad music and a gia...          0
2  <START> this has to be one of the worst films ...          0
3  <START> the <UNK> <UNK> at storytelling the tr...          1
4  <START> worst mistake of my life br br i picke...          0

Label distribution:
sentiment
1    25000
0    25000
Name: count, dtype: int64

Sample review (cleaned):
<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the

Cleaning text... (this 



Epoch 1/8
