# Undercomplete Autoencoder on IMDb Reviews
Load the Keras IMDb dataset (top 10,000 words, first 200 tokens per review), build an undercomplete autoencoder to learn a compact representation, and reconstruct the sequences.

Pipeline
1) Load & pad reviews (top 10k; 200 tokens)
2) Sequence autoencoder (Embedding → Encoder LSTM → Code → Repeat → Decoder LSTM → token softmax)
3) Sweep small code sizes; pick smallest that reaches a validation-loss target
4) Plot training/validation loss
5) Show 5 test samples: original vs reconstructed (indices → words)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
import matplotlib.pyplot as plt

# Log the TensorFlow build in use so results can be tied to an environment.
print('TensorFlow:', tf.__version__)

# Seed both NumPy and TensorFlow with the same fixed value so that repeated
# runs start from the same random state.
np.random.seed(123)
tf.random.set_seed(123)

## 1. Load & Prepare IMDb (top 10,000 words; first 200 tokens)

In [None]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_features = 10000  # vocabulary cap passed to imdb.load_data(num_words=...)
maxlen = 200          # every review is padded/truncated to this many tokens

(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)

# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# keeps the last `maxlen` tokens of a long review. The notebook's stated goal
# is the first 200 tokens, so drop the tail instead. Padding is left at the
# default ('pre'), so short reviews are still left-padded with zeros.
X_train = pad_sequences(X_train, maxlen=maxlen, truncating='post')
X_test  = pad_sequences(X_test,  maxlen=maxlen, truncating='post')

print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)


## 2. Index-Word Mapping and Text Rendering

In [None]:
# Offset between the ranks stored in the word index and the ids that appear in
# the loaded sequences (presumably mirrors imdb.load_data's default
# index_from=3 -- confirm if load_data is ever called with other arguments).
index_from = 3
word_index = imdb.get_word_index()

# Invert word -> rank into shifted-id -> word.
reverse_index = dict(
    zip((rank + index_from for rank in word_index.values()), word_index)
)

# Ids below the offset are labelled explicitly; id 3 is left unmapped.
reverse_index.update({0: '[PAD]', 1: '[START]', 2: '[OOV]'})

def decode_indices(indices, skip_pad=True):
    """Render a sequence of token ids as a space-separated string.

    Args:
        indices: Iterable of integer-like token ids.
        skip_pad: When true, ids equal to 0 are left out of the output.

    Returns:
        The looked-up words joined by single spaces. Ids that are missing
        from ``reverse_index`` are rendered as ``'[UNK]'``.
    """
    kept = (tok for tok in indices if not (skip_pad and tok == 0))
    return ' '.join(reverse_index.get(int(tok), '[UNK]') for tok in kept)

## 3. Sequence Autoencoder

In [None]:
# Size of the token id space (embedding rows and softmax width) and the width
# of each token embedding.
vocab_size = max_features
embed_dim  = 64

def build_seq_autoencoder(code_dim):
    """Build and compile a sequence autoencoder with a ``code_dim`` bottleneck.

    The encoder embeds the ``maxlen`` input token ids, runs them through an
    LSTM that returns only its final output, and projects that to a linear
    ``code_dim``-wide code. The decoder repeats the code ``maxlen`` times,
    runs a sequence-returning LSTM over it and applies a per-timestep softmax
    over ``vocab_size`` tokens.

    Args:
        code_dim: Number of units in the linear bottleneck ('code') layer.

    Returns:
        A compiled Keras ``Model`` named ``seq_autoencoder_latent<code_dim>``
        using the Adam optimizer and sparse categorical cross-entropy.
    """
    tokens_in = Input(shape=(maxlen,), dtype='int32')

    # --- Encoder: token ids -> single fixed-size code vector ---
    embedded = Embedding(vocab_size, embed_dim, mask_zero=True, name='embed')(tokens_in)
    summary = LSTM(128, return_sequences=False, name='encoder_lstm')(embedded)
    bottleneck = Dense(code_dim, activation=None, name='code')(summary)

    # --- Decoder: code vector -> one token distribution per timestep ---
    repeated = RepeatVector(maxlen)(bottleneck)
    decoded = LSTM(128, return_sequences=True, name='decoder_lstm')(repeated)
    # Despite the layer name, this head emits softmax probabilities, not raw
    # logits; the string loss below is used with that probability output.
    token_head = TimeDistributed(Dense(vocab_size, activation='softmax'), name='token_logits')
    token_probs = token_head(decoded)

    autoencoder = Model(tokens_in, token_probs, name=f'seq_autoencoder_latent{code_dim}')
    autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return autoencoder

## 4. Train: Sweep Small Code Sizes and Select the Smallest

In [None]:
# Candidate bottleneck sizes, tried from smallest to largest, and the
# validation loss a candidate must reach to be accepted.
code_candidates = [8, 16, 32, 64]
val_loss_threshold = 3.0

histories = {}
chosen_model = None
chosen_code_dim = None

# Fallback bookkeeping: remember the best trained candidate so that, if none
# reaches the threshold, it can be reused directly instead of being retrained
# from scratch (a retrain would not match the history recorded for it).
best_val = float('inf')
best_model = None
best_code_dim = None

# Targets are original sequences (identity reconstruction)
y_train_targets = np.expand_dims(X_train, -1)

for cd in code_candidates:
    model = build_seq_autoencoder(cd)
    es = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    h = model.fit(
        X_train, y_train_targets,
        validation_split=0.1,
        epochs=9,
        batch_size=256,
        verbose=1,
        callbacks=[es]
    )
    histories[cd] = h.history

    # Score the candidate by its best epoch rather than its last one: the
    # early-stopping callback is configured to keep the best-epoch weights, so
    # the last entry of the history can be worse than the retained model.
    # NOTE(review): some tf.keras versions only restore the best weights when
    # early stopping actually triggers -- confirm for the installed version.
    cand_val = min(h.history['val_loss'])
    print(f'[code_dim={cd}] best val_loss: {cand_val:.4f}')

    if cand_val < best_val:
        best_val, best_model, best_code_dim = cand_val, model, cd

    # Candidates are visited in increasing size, so the first one that meets
    # the target is the smallest acceptable code size.
    if cand_val <= val_loss_threshold:
        chosen_model = model
        chosen_code_dim = cd
        break

if chosen_model is None:
    # Nothing met the target: fall back to the best model already trained.
    chosen_model = best_model
    chosen_code_dim = best_code_dim
    print(f'No code_dim met threshold; using best by val_loss: {chosen_code_dim}')

## Plot Train/Validation Loss

In [None]:
# Look up the recorded training curves for the selected bottleneck size.
h = histories.get(chosen_code_dim)
if h is None:
    print('History not found; model was retrained in fallback.')
else:
    # One figure with the training and validation loss per epoch.
    plt.figure()
    plt.plot(h['loss'], label='train_loss')
    plt.plot(h['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Sparse CE')
    plt.title(f'Loss (code_dim={chosen_code_dim})')
    plt.legend()
    plt.show()

## 5. Five Random Test Samples: Original vs Reconstructed (Text)

In [None]:
# Draw five distinct test rows and run them through the selected autoencoder.
idx = np.random.choice(len(X_test), size=5, replace=False)
samples = X_test[idx]
pred = chosen_model.predict(samples, verbose=0)

# Greedy decode: take the highest-scoring token id at every timestep.
recon_ids = pred.argmax(axis=-1).astype('int32')

# Print each original next to its reconstruction, capped at 1000 characters.
for sample_no, (orig_row, recon_row) in enumerate(zip(samples, recon_ids), start=1):
    print('\n=== Sample', sample_no, '(code_dim =', chosen_code_dim, ') ===')
    print('Original :\n', decode_indices(orig_row, skip_pad=True)[:1000])
    print('\nReconst.:\n', decode_indices(recon_row, skip_pad=True)[:1000])

## Report

In [None]:
# Report the bottleneck size selected by the sweep.
# NOTE: when no candidate met the validation-loss threshold, chosen_code_dim is
# the candidate with the lowest val_loss, which is not necessarily the smallest.
print(f'Smallest number of codings (latent code_dim) selected: {chosen_code_dim}')