In [1]:
import os
import pandas as pd
import random
import re
from gensim import corpora, models
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np

In [2]:
# Download required data for text processing
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Set random seed for consistent results
def initialize_seed(seed_value=123):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

initialize_seed()

In [4]:
# Text preprocessing function
def clean_text(input_text):
    """
    Cleans text by removing non-alphabet characters, converting to lowercase,
    tokenizing, and filtering stopwords.

    Args:
        input_text (str): The text to process.

    Returns:
        List[str]: A list of processed words.
    """
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', input_text)
    tokens = word_tokenize(cleaned_text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return filtered_tokens


In [5]:
# Dataset class for IMDB reviews
class ReviewDataset(Dataset):
    def __init__(self, reviews, sentiments, tokenizer, max_len=512):
        self.reviews = reviews
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        tokens = self.tokenizer(
            self.reviews[index],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        tokens = {key: val.squeeze() for key, val in tokens.items()}
        tokens['labels'] = torch.tensor(self.sentiments[index], dtype=torch.long)
        return tokens

In [6]:
# Function to load and split IMDB dataset
def prepare_review_data(file_path, tokenizer, split_ratio=0.2):
    """
    Loads IMDB reviews and splits them into training and testing sets.

    Args:
        file_path (str): Path to the dataset file.
        tokenizer (BertTokenizer): Tokenizer for processing reviews.
        split_ratio (float): Proportion of data for testing.

    Returns:
        Tuple[ReviewDataset, ReviewDataset]: Training and testing datasets.
    """
    data = pd.read_csv(file_path)
    sentiment_map = {'positive': 1, 'negative': 0}
    data['sentiment_label'] = data['sentiment'].map(sentiment_map)
    reviews = data['review'].tolist()
    sentiments = data['sentiment_label'].tolist()

    train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
        reviews, sentiments, test_size=split_ratio, stratify=sentiments, random_state=42
    )

    train_data = ReviewDataset(train_reviews, train_sentiments, tokenizer)
    test_data = ReviewDataset(test_reviews, test_sentiments, tokenizer)

    return train_data, test_data

In [7]:
# Function to fine-tune a BERT model
def train_bert_model(train_data, test_data, save_dir, model_name='bert-base-uncased', epochs=3, batch_size=16):
    """
    Fine-tunes a BERT model for text classification.

    Args:
        train_data (ReviewDataset): Training dataset.
        test_data (ReviewDataset): Testing dataset.
        save_dir (str): Directory to save the trained model.
        model_name (str): Name of the pretrained BERT model.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.

    Returns:
        Trainer: The trained model.
    """
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    training_args = TrainingArguments(
        output_dir=save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True
    )

    def calculate_metrics(predictions):
        true_labels = predictions.label_ids
        predicted_labels = np.argmax(predictions.predictions, axis=1)
        acc = accuracy_score(true_labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
        return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

    data_collator = DataCollatorWithPadding(tokenizer=train_data.tokenizer)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=train_data.tokenizer,
        data_collator=data_collator,
        compute_metrics=calculate_metrics
    )

    trainer.train()
    return trainer

In [8]:
# Function to perform LDA topic modeling
def perform_lda_analysis(tokenized_docs, num_topics=10):
    """
    Conducts LDA topic modeling on tokenized documents.

    Args:
        tokenized_docs (List[List[str]]): Preprocessed documents as token lists.
        num_topics (int): Number of topics to extract.

    Returns:
        Tuple[models.LdaModel, float]: LDA model and coherence score.
    """
    vocab_dictionary = corpora.Dictionary(tokenized_docs)
    document_corpus = [vocab_dictionary.doc2bow(doc) for doc in tokenized_docs]
    lda_model = models.LdaModel(document_corpus, num_topics=num_topics, id2word=vocab_dictionary, passes=10, random_state=42)
    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=vocab_dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return lda_model, coherence_score


In [9]:
# Main execution pipeline
def execute_pipeline():
    imdb_file = 'data/IMDB_Dataset.csv'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    print("Preparing IMDB dataset for training and testing...")
    train_data, test_data = prepare_review_data(imdb_file, tokenizer)

    print("Fine-tuning the BERT model...")
    train_bert_model(train_data, test_data, save_dir='models/bert_sentiment_model')

    print("Performing LDA topic modeling on example documents...")
    example_docs = [clean_text("Example document about machine learning.") for _ in range(10)]
    lda_model, coherence = perform_lda_analysis(example_docs)
    print(f"LDA Coherence Score: {coherence}")


In [12]:
# Testing Code
def test_clean_text():
    test_input = "This is a TEST sentence, with punctuation!"
    expected_output = ["test", "sentence", "punctuation"]
    assert clean_text(test_input) == expected_output, "Text cleaning test failed."
    print("Text cleaning test passed.")

def test_prepare_review_data():
    test_data = pd.DataFrame({
        'review': ["Good movie!", "Bad movie!"],
        'sentiment': ["positive", "negative"]
    })
    test_data.to_csv("test_data.csv", index=False)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    train_data, test_data = prepare_review_data("test_data.csv", tokenizer, split_ratio=0.5)
    assert len(train_data) == 1 and len(test_data) == 1, "Data preparation test failed."
    print("Data preparation test passed.")

def test_perform_lda_analysis():
    tokenized_docs = [["machine", "learning", "example"], ["artificial", "intelligence"]]
    lda_model, coherence = perform_lda_analysis(tokenized_docs, num_topics=2)
    assert coherence > 0, "LDA analysis test failed."
    print("LDA analysis test passed.")

# Run tests
def run_tests():
    print("Running tests...")
    test_clean_text()
    test_prepare_review_data()
    test_perform_lda_analysis()
    print("All tests passed.")

run_tests()


Running tests...


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [11]:
# Set random seed for consistent results
def initialize_seed(seed_value=123):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

initialize_seed()

# Text preprocessing function
def clean_text(input_text):
    """
    Cleans text by removing non-alphabet characters, converting to lowercase,
    tokenizing, and filtering stopwords.

    Args:
        input_text (str): The text to process.

    Returns:
        List[str]: A list of processed words.
    """
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', input_text)
    tokens = word_tokenize(cleaned_text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    return filtered_tokens

# Dataset class for IMDB reviews
class ReviewDataset(Dataset):
    def __init__(self, reviews, sentiments, tokenizer, max_len=512):
        self.reviews = reviews
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, index):
        tokens = self.tokenizer(
            self.reviews[index],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        tokens = {key: val.squeeze() for key, val in tokens.items()}
        tokens['labels'] = torch.tensor(self.sentiments[index], dtype=torch.long)
        return tokens

# Function to load and split IMDB dataset
def prepare_review_data(file_path, tokenizer, split_ratio=0.2):
    """
    Loads IMDB reviews and splits them into training and testing sets.

    Args:
        file_path (str): Path to the dataset file.
        tokenizer (BertTokenizer): Tokenizer for processing reviews.
        split_ratio (float): Proportion of data for testing.

    Returns:
        Tuple[ReviewDataset, ReviewDataset]: Training and testing datasets.
    """
    data = pd.read_csv(file_path)
    sentiment_map = {'positive': 1, 'negative': 0}
    data['sentiment_label'] = data['sentiment'].map(sentiment_map)
    reviews = data['review'].tolist()
    sentiments = data['sentiment_label'].tolist()

    train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
        reviews, sentiments, test_size=split_ratio, stratify=sentiments, random_state=42
    )

    train_data = ReviewDataset(train_reviews, train_sentiments, tokenizer)
    test_data = ReviewDataset(test_reviews, test_sentiments, tokenizer)

    return train_data, test_data

# Function to fine-tune a BERT model
def train_bert_model(train_data, test_data, save_dir, model_name='bert-base-uncased', epochs=3, batch_size=16):
    """
    Fine-tunes a BERT model for text classification.

    Args:
        train_data (ReviewDataset): Training dataset.
        test_data (ReviewDataset): Testing dataset.
        save_dir (str): Directory to save the trained model.
        model_name (str): Name of the pretrained BERT model.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.

    Returns:
        Trainer: The trained model.
    """
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    training_args = TrainingArguments(
        output_dir=save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        greater_is_better=True
    )

    def calculate_metrics(predictions):
        true_labels = predictions.label_ids
        predicted_labels = np.argmax(predictions.predictions, axis=1)
        acc = accuracy_score(true_labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')
        return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

    data_collator = DataCollatorWithPadding(tokenizer=train_data.tokenizer)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data,
        tokenizer=train_data.tokenizer,
        data_collator=data_collator,
        compute_metrics=calculate_metrics
    )

    trainer.train()
    return trainer

# Function to perform LDA topic modeling
def perform_lda_analysis(tokenized_docs, num_topics=10):
    """
    Conducts LDA topic modeling on tokenized documents.

    Args:
        tokenized_docs (List[List[str]]): Preprocessed documents as token lists.
        num_topics (int): Number of topics to extract.

    Returns:
        Tuple[models.LdaModel, float]: LDA model and coherence score.
    """
    vocab_dictionary = corpora.Dictionary(tokenized_docs)
    document_corpus = [vocab_dictionary.doc2bow(doc) for doc in tokenized_docs]
    lda_model = models.LdaModel(document_corpus, num_topics=num_topics, id2word=vocab_dictionary, passes=10, random_state=42)
    coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=vocab_dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    return lda_model, coherence_score

# Main execution pipeline
def execute_pipeline():
    imdb_file = 'data/IMDB_Dataset.csv'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    print("Preparing IMDB dataset for training and testing...")
    train_data, test_data = prepare_review_data(imdb_file, tokenizer)

    print("Fine-tuning the BERT model...")
    train_bert_model(train_data, test_data, save_dir='models/bert_sentiment_model')

    print("Performing LDA topic modeling on example documents...")
    example_docs = [clean_text("Example document about machine learning.") for _ in range(10)]
    lda_model, coherence = perform_lda_analysis(example_docs)
    print(f"LDA Coherence Score: {coherence}")

if __name__ == "__main__":
    execute_pipeline()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Preparing IMDB dataset for training and testing...


FileNotFoundError: [Errno 2] No such file or directory: 'data/IMDB_Dataset.csv'