In [1]:
import os
import tarfile
import pandas as pd
from functools import wraps
from typing import Callable

def ensure_directory_exists(path: str) -> None:
    """
    Ensures that the specified directory exists; creates it if it does not.

    Parameters:
    - path: str - Directory path to check and create if necessary.
    """
    if not os.path.exists(path):
        os.makedirs(path)

def download_and_extract(url: str, extract_to: str) -> None:
    """
    Download and extract the dataset from the given URL.

    Parameters:
    - url: str - URL of the dataset archive.
    - extract_to: str - Directory path where the archive will be extracted.
    """
    import requests
    from tqdm import tqdm

    ensure_directory_exists(extract_to)  # Ensure the directory exists

    # Download the file
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 KB
    filename = url.split('/')[-1]
    filepath = os.path.join(extract_to, filename)

    with open(filepath, 'wb') as f, tqdm(
        desc=filename,
        total=total_size,
        unit='iB',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for chunk in response.iter_content(block_size):
            f.write(chunk)
            bar.update(len(chunk))

    # Extract the archive
    with tarfile.open(filepath, 'r:gz') as tar:
        tar.extractall(path=extract_to)
    
    # Remove the downloaded archive
    os.remove(filepath)

def log_execution_time(func: Callable) -> Callable:
    """
    Decorator to log the execution time of a function.

    Parameters:
    - func: Callable - Function to be decorated.
    
    Returns:
    - Callable - Decorated function.
    """
    import time

    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Execution time for {func.__name__}: {end_time - start_time:.2f} seconds")
        return result
    
    return wrapper

@log_execution_time
def load_imdb_data(data_dir: str, dataset_type: str = 'train') -> pd.DataFrame:
    """
    Load the IMDB dataset from the extracted directory into a pandas DataFrame.

    Parameters:
    - data_dir: str - Path to the directory containing the extracted data.
    - dataset_type: str - The type of dataset to load ('train' or 'test').

    Returns:
    - pd.DataFrame - DataFrame containing the reviews and sentiments.
    """
    def read_files_from_dir(directory: str, sentiment: str) -> pd.DataFrame:
        """
        Read files from a directory and create a DataFrame for the specified sentiment.

        Parameters:
        - directory: str - Path to the directory with the files.
        - sentiment: str - Sentiment label (positive/negative).

        Returns:
        - pd.DataFrame - DataFrame with the reviews and sentiment.
        """
        reviews = []
        for subdir, _, files in os.walk(directory):
            for file in files:
                with open(os.path.join(subdir, file), 'r', encoding='utf-8') as f:
                    reviews.append((f.read(), sentiment))
        return pd.DataFrame(reviews, columns=['review', 'sentiment'])

    # Validate dataset_type
    if dataset_type not in ['train', 'test']:
        raise ValueError("dataset_type must be 'train' or 'test'")

    # Paths to directories
    pos_dir = os.path.join(data_dir, 'aclImdb', dataset_type, 'pos')
    neg_dir = os.path.join(data_dir, 'aclImdb', dataset_type, 'neg')

    # Load data
    pos_df = read_files_from_dir(pos_dir, 'positive')
    neg_df = read_files_from_dir(neg_dir, 'negative')

    # Combine data
    data = pd.concat([pos_df, neg_df], ignore_index=True)

    return data

url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
extract_to = './imdb_data'

  from pandas.core import (


In [3]:
download_and_extract(url, extract_to)

aclImdb_v1.tar.gz: 100%|█████████████████████████████████████████████████████████| 80.2M/80.2M [00:28<00:00, 2.99MiB/s]


In [5]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from keras.layers import Embedding, SimpleRNN, Dense, Dropout

# Load the IMDB dataset (assuming you have already downloaded and extracted it)
data_dir = './imdb_data'
df_train = load_imdb_data(data_dir, dataset_type='train')
df_test = load_imdb_data(data_dir, dataset_type='test')

# Convert sentiment to binary labels (1 for positive, 0 for negative)
df_train['sentiment'] = df_train['sentiment'].map({'positive': 1, 'negative': 0})
df_test['sentiment'] = df_test['sentiment'].map({'positive': 1, 'negative': 0})

# Combine train and test data for tokenizer fitting
df = pd.concat([df_train, df_test], ignore_index=True)

# Tokenize the reviews
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(df['review'])
X = tokenizer.texts_to_sequences(df['review'])

# Pad sequences
X = pad_sequences(X, maxlen=200)

# Split back into train and test
X_train, X_test = X[:len(df_train)], X[len(df_train):]
y_train, y_test = df_train['sentiment'].values, df_test['sentiment'].values

# Ensure the data is in the correct format
X_train = X_train.astype(np.int32)
X_test = X_test.astype(np.int32)
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)

# Define the RNN model
# model = Sequential()
# model.add(Embedding(input_dim=20000, output_dim=128, input_length=200))
# model.add(SimpleRNN(128, return_sequences=False))
# model.add(Dense(1, activation='sigmoid'))


model = Sequential()
model.add(Embedding(input_dim=20000, output_dim=128, input_length=200))
model.add(SimpleRNN(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(SimpleRNN(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))



# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()


X_valid, X_final_test, y_valid, y_final_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
# Train the RNN model
history = model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_valid, y_valid))

# Predict on test data
y_pred = (model.predict(X_final_test) > 0.5).astype("int32")

model.summary()
# Evaluate the model
accuracy = accuracy_score(y_final_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}") # Test Accuracy: 0.7833

# Print classification report
print(classification_report(y_final_test, y_pred, target_names=['negative', 'positive']))


Execution time for load_imdb_data: 137.83 seconds
Execution time for load_imdb_data: 150.11 seconds




Epoch 1/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 123ms/step - accuracy: 0.5053 - loss: 0.7640 - val_accuracy: 0.7044 - val_loss: 0.5725
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 132ms/step - accuracy: 0.7338 - loss: 0.5396 - val_accuracy: 0.8080 - val_loss: 0.4549
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 125ms/step - accuracy: 0.8425 - loss: 0.3882 - val_accuracy: 0.8206 - val_loss: 0.4482
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 116ms/step - accuracy: 0.8312 - loss: 0.3991 - val_accuracy: 0.6978 - val_loss: 0.5847
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 110ms/step - accuracy: 0.8409 - loss: 0.3897 - val_accuracy: 0.8226 - val_loss: 0.4517
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 106ms/step - accuracy: 0.8387 - loss: 0.3834 - val_accuracy: 0.8269 - val_loss: 0.4658
Epoch 7/30

Test Accuracy: 0.7286
              precision    recall  f1-score   support

    negative       0.74      0.69      0.72      6192
    positive       0.72      0.77      0.74      6308

    accuracy                           0.73     12500
   macro avg       0.73      0.73      0.73     12500
weighted avg       0.73      0.73      0.73     12500



In [7]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Function to create different models
def create_model(model_type='SimpleRNN', units=128, dropout_rate=0.5):
    model = Sequential()
    model.add(Embedding(input_dim=20000, output_dim=128, input_length=200))
    if model_type == 'SimpleRNN':
        model.add(SimpleRNN(units, return_sequences=True))
    elif model_type == 'LSTM':
        model.add(LSTM(units, return_sequences=True))
    elif model_type == 'GRU':
        model.add(GRU(units, return_sequences=True))
    
    model.add(Dropout(dropout_rate))
    
    if model_type == 'SimpleRNN':
        model.add(SimpleRNN(units))
    elif model_type == 'LSTM':
        model.add(LSTM(units))
    elif model_type == 'GRU':
        model.add(GRU(units))
    
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# List of model types to test
model_types = ['SimpleRNN', 'LSTM', 'GRU']
best_accuracy = 0
best_model_type = None
best_model = None

# Train and evaluate each model
for model_type in model_types:
    print(f"Training {model_type} model...")
    model = create_model(model_type=model_type)
    model.fit(X_train, y_train, epochs=30, batch_size=64, validation_data=(X_valid, y_valid))
    
    y_pred = (model.predict(X_final_test) > 0.5).astype("int32")
    
    accuracy = accuracy_score(y_final_test, y_pred)
    print(f"{model_type} Test Accuracy: {accuracy:.4f}")
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_type = model_type
        best_model = model

# Output the best model and its accuracy
print(f"\nBest Model: {best_model_type} with Test Accuracy: {best_accuracy:.4f}")

# Print classification report of the best model
y_pred_best = (best_model.predict(X_final_test) > 0.5).astype("int32")
print(classification_report(y_final_test, y_pred_best, target_names=['negative', 'positive']))

Training SimpleRNN model...
Epoch 1/30




[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 117ms/step - accuracy: 0.5053 - loss: 0.7657 - val_accuracy: 0.5039 - val_loss: 0.6952
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 113ms/step - accuracy: 0.5051 - loss: 0.7132 - val_accuracy: 0.5051 - val_loss: 0.6961
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 114ms/step - accuracy: 0.5004 - loss: 0.6980 - val_accuracy: 0.5055 - val_loss: 0.6935
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 115ms/step - accuracy: 0.5062 - loss: 0.6956 - val_accuracy: 0.4952 - val_loss: 0.6932
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 113ms/step - accuracy: 0.5006 - loss: 0.6954 - val_accuracy: 0.5138 - val_loss: 0.6934
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 112ms/step - accuracy: 0.5201 - loss: 0.6906 - val_accuracy: 0.7027 - val_loss: 0.5807
Epoch 7/30
[1m391/39



[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 426ms/step - accuracy: 0.7241 - loss: 0.5119 - val_accuracy: 0.8570 - val_loss: 0.3368
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 453ms/step - accuracy: 0.9185 - loss: 0.2187 - val_accuracy: 0.8593 - val_loss: 0.3309
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 420ms/step - accuracy: 0.9470 - loss: 0.1519 - val_accuracy: 0.8420 - val_loss: 0.4039
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 401ms/step - accuracy: 0.9666 - loss: 0.0973 - val_accuracy: 0.8604 - val_loss: 0.5107
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 399ms/step - accuracy: 0.9787 - loss: 0.0628 - val_accuracy: 0.8503 - val_loss: 0.5425
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 405ms/step - accuracy: 0.9807 - loss: 0.0609 - val_accuracy: 0.8274 - val_loss: 0.5339
Epoch 7/30
[1m



[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 701ms/step - accuracy: 0.7032 - loss: 0.5467 - val_accuracy: 0.8665 - val_loss: 0.3178
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 589ms/step - accuracy: 0.9135 - loss: 0.2278 - val_accuracy: 0.8635 - val_loss: 0.3204
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 589ms/step - accuracy: 0.9516 - loss: 0.1388 - val_accuracy: 0.8626 - val_loss: 0.3473
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 590ms/step - accuracy: 0.9806 - loss: 0.0650 - val_accuracy: 0.8439 - val_loss: 0.4108
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 594ms/step - accuracy: 0.9863 - loss: 0.0445 - val_accuracy: 0.8631 - val_loss: 0.5510
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 596ms/step - accuracy: 0.9909 - loss: 0.0283 - val_accuracy: 0.8589 - val_loss: 0.7195
Epoch 7/30
[1m

In [9]:
best_model.summary()