In [1]:
!ls

sample_data  Tweets.csv


In [2]:
!pip install pandas numpy matplotlib seaborn tqdm
!pip install nltk gensim scikit-learn
!pip install transformers datasets torch



In [3]:
!pip install evaluate



In [4]:


import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re
import string
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Set random seeds for reproducibility
np.random.seed(42)

print("="*80)
print("Week 2 Assignment - VQA Challenge")
print("="*80)

# Check and install required packages
def check_and_install_packages():

    required_packages = {
        'tqdm': 'tqdm',
        'nltk': 'nltk',
        'gensim': 'gensim',
        'sklearn': 'scikit-learn',
        'transformers': 'transformers',
        'datasets': 'datasets',
        'evaluate': 'evaluate',
        'torch': 'torch',
        'accelerate': 'accelerate'
    }

    missing_packages = []
    for import_name, package_name in required_packages.items():
        try:
            __import__(import_name)
        except ImportError:
            missing_packages.append(package_name)

    if missing_packages:
        print(f"\n Missing packages: {', '.join(missing_packages)}")
        print("Installing missing packages...")
        for package in missing_packages:
            os.system(f'pip install {package} --break-system-packages -q')
        print(" Installation complete\n")

check_and_install_packages()


from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


print("Downloading NLTK data...")
nltk_packages = ['punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger']
for package in nltk_packages:
    try:
        nltk.download(package, quiet=True)
    except:
        pass
print(" NLTK data ready\n")

import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

try:
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        TrainingArguments,
        Trainer,
        EarlyStoppingCallback
    )
    from datasets import load_dataset
    import evaluate
    import torch

    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}\n")
    TRANSFORMERS_AVAILABLE = True
except ImportError as e:
    print(f" Transformers not available: {e}")
    print("Problem 2 will be skipped. Install transformers, datasets, torch, and evaluate to run it.\n")
    TRANSFORMERS_AVAILABLE = False
    device = None





Week 2 Assignment - VQA Challenge
Downloading NLTK data...
 NLTK data ready

Using device: cuda



In [5]:
# PROBLEM 1: Twitter Sentiment Analysis with Word2Vec


print("\n" + "="*80)
print("PROBLEM 1: Twitter Sentiment Analysis with Word2Vec")
print("="*80 + "\n")

# 1.1 Load Dataset
# ----------------
print("Step 1.1: Loading dataset...")

try:
    df = pd.read_csv('Tweets.csv')
    print(f" Dataset loaded: {len(df)} tweets")
except FileNotFoundError:
    print(" Tweets.csv not found. Creating sample data for demonstration.")

    # Create more realistic sample data
    sample_tweets = [
        ("@VirginAmerica plus you've added commercials to the experience... tacky.", "negative"),
        ("@VirginAmerica I didn't today... Must mean I need to take another trip!", "positive"),
        ("@VirginAmerica it's really aggressive to blast obnoxious entertainment", "negative"),
        ("@VirginAmerica Really missed a prime opportunity for parody!", "neutral"),
        ("@VirginAmerica well, I didn't…but NOW I DO! :-D", "positive"),
        ("@VirginAmerica amazing flight experience! Best airline ever!", "positive"),
        ("@VirginAmerica worst customer service. Never flying again.", "negative"),
        ("@VirginAmerica the flight was okay, nothing special", "neutral"),
        ("@VirginAmerica love the new seats and entertainment system!", "positive"),
        ("@VirginAmerica delayed again! This is unacceptable!", "negative"),
    ]


    tweets_list = []
    sentiments_list = []
    for _ in range(50):
        for tweet, sentiment in sample_tweets:
            tweets_list.append(tweet)
            sentiments_list.append(sentiment)

    df = pd.DataFrame({
        'text': tweets_list,
        'airline_sentiment': sentiments_list
    })
    print(f" Sample dataset created: {len(df)} tweets")

print(f"\nSentiment distribution:")
print(df['airline_sentiment'].value_counts())
print()




PROBLEM 1: Twitter Sentiment Analysis with Word2Vec

Step 1.1: Loading dataset...
 Dataset loaded: 14640 tweets

Sentiment distribution:
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64



In [6]:
# 1.2 Text Preprocessing
# ----------------------
print("Step 1.2: Text Preprocessing...")

# Contraction mapping
CONTRACTION_MAP = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot",
    "can't've": "cannot have", "could've": "could have", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "i've": "I have",
    "isn't": "is not", "it'd": "it would", "it'll": "it will", "it's": "it is",
    "let's": "let us", "shouldn't": "should not", "that's": "that is",
    "there's": "there is", "they'd": "they would", "they'll": "they will",
    "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we'll": "we will", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what's": "what is", "where's": "where is",
    "won't": "will not", "wouldn't": "would not", "you'd": "you would",
    "you'll": "you will", "you're": "you are", "you've": "you have"
}

# Initialize lemmatizer and stopwords
try:
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
except LookupError:
    # If stopwords not available, create a basic set
    stop_words = {'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
                  'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
                  'to', 'was', 'will', 'with'}
    lemmatizer = None
    print(" Using basic stopwords set (NLTK stopwords not fully available)")

def preprocess_tweet(text):
    """
    Complete preprocessing pipeline for tweets

    Args:
        text: Raw tweet text

    Returns:
        Cleaned and preprocessed text
    """
    if pd.isna(text) or text == "":
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags (keeping the text)
    text = re.sub(r'#', '', text)

    # Remove HTML entities (e.g., &amp;)
    text = re.sub(r'&\w+;', '', text)

    # Expand contractions
    for contraction, expansion in CONTRACTION_MAP.items():
        text = re.sub(r'\b' + contraction + r'\b', expansion, text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove emojis and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    # Tokenize
    try:
        tokens = word_tokenize(text)
    except:
        # Fallback: simple split if word_tokenize fails
        tokens = text.split()

    # Lemmatize and remove stopwords
    if lemmatizer:
        tokens = [lemmatizer.lemmatize(word) for word in tokens
                  if word not in stop_words and len(word) > 2]
    else:
        # Just remove stopwords if lemmatizer not available
        tokens = [word for word in tokens
                  if word not in stop_words and len(word) > 2]

    return ' '.join(tokens)

# Apply preprocessing with progress bar
tqdm.pandas(desc="Processing tweets")
df['cleaned_text'] = df['text'].progress_apply(preprocess_tweet)

print("✓ Preprocessing complete")
print(f"\nExample preprocessing:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned:  {df['cleaned_text'].iloc[0]}\n")



Step 1.2: Text Preprocessing...


Processing tweets: 100%|██████████| 14640/14640 [00:14<00:00, 1042.85it/s]

✓ Preprocessing complete

Example preprocessing:
Original: @VirginAmerica What @dhepburn said.
Cleaned:  said






In [7]:
# 1.3 Load Word2Vec Model
# -----------------------
print("Step 1.3: Loading Word2Vec model...")
print("(This may take a few minutes on first run)...")

try:
    # Try to load the full Google News model
    w2v_model = api.load('word2vec-google-news-300')
    print(" Word2Vec model loaded: word2vec-google-news-300 (300-dimensional vectors)")
    vector_size = 300
except Exception as e:
    print(f" Could not load Google News model: {e}")
    try:
        # Fallback to smaller GloVe model
        print("Trying smaller alternative: glove-wiki-gigaword-100...")
        w2v_model = api.load('glove-wiki-gigaword-100')
        print(" Word2Vec model loaded: glove-wiki-gigaword-100 (100-dimensional vectors)")
        vector_size = 100
    except Exception as e2:
        print(f" Could not load any embedding model: {e2}")
        print("Please ensure you have internet connection for first-time download.")
        sys.exit(1)

print()



Step 1.3: Loading Word2Vec model...
(This may take a few minutes on first run)...
 Word2Vec model loaded: word2vec-google-news-300 (300-dimensional vectors)



In [8]:
# 1.4 Convert Tweets to Vectors
# -----------------------------
print("Step 1.4: Converting tweets to vectors using average Word2Vec...")

def tweet_to_vector(text, model):
    """
    Convert tweet to vector by averaging Word2Vec embeddings

    Args:
        text: Preprocessed tweet text
        model: Word2Vec model

    Returns:
        Average vector representation
    """
    if not text or text == "":
        return np.zeros(model.vector_size)

    words = text.split()
    valid_vectors = []

    for word in words:
        if word in model:
            valid_vectors.append(model[word])

    if not valid_vectors:
        # Return zero vector if no words found
        return np.zeros(model.vector_size)

    return np.mean(valid_vectors, axis=0)

# Vectorize all tweets
X_vectors = []
for text in tqdm(df['cleaned_text'], desc="Vectorizing"):
    X_vectors.append(tweet_to_vector(text, w2v_model))

X = np.array(X_vectors)
y = df['airline_sentiment'].values

print(f" Vectorization complete")
print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}\n")



Step 1.4: Converting tweets to vectors using average Word2Vec...


Vectorizing: 100%|██████████| 14640/14640 [00:01<00:00, 12944.07it/s]

 Vectorization complete
Feature matrix shape: (14640, 300)
Target shape: (14640,)






In [9]:
# 1.5 Train-Test Split
# --------------------
print("Step 1.5: Splitting dataset (80% train, 20% test)...")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}\n")



Step 1.5: Splitting dataset (80% train, 20% test)...
Training samples: 11712
Testing samples: 2928



In [10]:
# 1.6 Train Logistic Regression
# -----------------------------
print("Step 1.6: Training Multiclass Logistic Regression...")

clf = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    verbose=0
)

clf.fit(X_train, y_train)
print(" Training complete\n")



Step 1.6: Training Multiclass Logistic Regression...
 Training complete



In [11]:
# 1.7 Evaluate Model
# -----------------
print("Step 1.7: Evaluating model...")

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\n" + "="*80)
print(f"PROBLEM 1 RESULTS")
print(f"="*80)
print(f"Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\n")

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=clf.classes_, yticklabels=clf.classes_,
            cbar_kws={'label': 'Count'})
plt.title('Problem 1: Confusion Matrix - Twitter Sentiment Classification',
          fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
output_path = 'problem1_confusion_matrix.png'
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f" Confusion matrix saved to '{output_path}'\n")

# 1.8 Prediction Function
# -----------------------
print("Step 1.8: Creating prediction function...")

def predict_tweet_sentiment(model, w2v_model, tweet):
    """
    Predict sentiment for a single tweet

    Args:
        model: Trained classifier
        w2v_model: Word2Vec model
        tweet: Raw tweet text (string)

    Returns:
        tuple: (predicted_sentiment, confidence)
    """
    # Preprocess tweet
    cleaned = preprocess_tweet(tweet)

    # Convert to vector
    vector = tweet_to_vector(cleaned, w2v_model)

    # Reshape for prediction
    vector = vector.reshape(1, -1)

    # Predict
    prediction = model.predict(vector)[0]
    probabilities = model.predict_proba(vector)[0]
    confidence = probabilities.max()

    return prediction, confidence

print(" Prediction function ready\n")

# Test the prediction function
print("="*80)
print("Testing Prediction Function:")
print("="*80 + "\n")

test_tweets = [
    "@airline Great service! Will definitely fly again! Best experience ever!",
    "@airline Terrible experience. Lost my baggage and rude staff.",
    "@airline Flight was on time. Nothing special but okay."
]

for i, tweet in enumerate(test_tweets, 1):
    sentiment, confidence = predict_tweet_sentiment(clf, w2v_model, tweet)
    print(f"Test {i}:")
    print(f"  Tweet: {tweet}")
    print(f"  Predicted: {sentiment} (confidence: {confidence:.2%})\n")

print("="*80)
print(" Problem 1 Complete!")
print("="*80 + "\n")

Step 1.7: Evaluating model...

PROBLEM 1 RESULTS
Test Accuracy: 0.7695 (76.95%)

Classification Report:
              precision    recall  f1-score   support

    negative       0.80      0.92      0.86      1835
     neutral       0.63      0.47      0.54       620
    positive       0.77      0.57      0.66       473

    accuracy                           0.77      2928
   macro avg       0.73      0.66      0.68      2928
weighted avg       0.76      0.77      0.76      2928

 Confusion matrix saved to 'problem1_confusion_matrix.png'

Step 1.8: Creating prediction function...
 Prediction function ready

Testing Prediction Function:

Test 1:
  Tweet: @airline Great service! Will definitely fly again! Best experience ever!
  Predicted: positive (confidence: 95.94%)

Test 2:
  Tweet: @airline Terrible experience. Lost my baggage and rude staff.
  Predicted: negative (confidence: 99.42%)

Test 3:
  Tweet: @airline Flight was on time. Nothing special but okay.
  Predicted: negative (con

In [12]:

# PROBLEM 2: BERT Fine-tuning with Hugging Face


if not TRANSFORMERS_AVAILABLE:
    print("\n" + "="*80)
    print("PROBLEM 2: SKIPPED")
    print("="*80)
    print("Transformers library not available. Please install:")
    print("  pip install transformers datasets torch evaluate accelerate")
    print("\nScript completed successfully (Problem 1 only).")
    sys.exit(0)

print("\n\n" + "="*80)
print("PROBLEM 2: BERT Fine-tuning with Hugging Face")
print("="*80 + "\n")

# 2.1 Load IMDb Dataset
# --------------------
print("Step 2.1: Loading IMDb dataset from Hugging Face...")

try:
    imdb_dataset = load_dataset("imdb")
    print(f"✓ Dataset loaded")
    print(f"Train samples: {len(imdb_dataset['train'])}")
    print(f"Test samples: {len(imdb_dataset['test'])}\n")
except Exception as e:
    print(f"✗ Could not load IMDb dataset: {e}")
    print("Please ensure you have internet connection.")
    sys.exit(1)

# For faster training, use a subset
print("Using subset for faster training (2000 train, 500 test samples)")
print("For full training, remove the subset selection below.\n")

imdb_dataset['train'] = imdb_dataset['train'].shuffle(seed=42).select(range(2000))
imdb_dataset['test'] = imdb_dataset['test'].shuffle(seed=42).select(range(500))

# 2.2 Load Tokenizer and Model
# ----------------------------
print("Step 2.2: Loading BERT model and tokenizer...")

model_name = "bert-base-uncased"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2
    ).to(device)

    print(f"✓ Model loaded: {model_name}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
    print(f"Device: {device}\n")
except Exception as e:
    print(f"✗ Could not load BERT model: {e}")
    sys.exit(1)

# 2.3 Tokenize Dataset
# -------------------
print("Step 2.3: Tokenizing dataset...")

def tokenize_function(examples):
    """Tokenize text for BERT"""
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

try:
    tokenized_datasets = imdb_dataset.map(
        tokenize_function,
        batched=True,
        desc="Tokenizing"
    )
    print("✓ Tokenization complete\n")
except Exception as e:
    print(f"✗ Tokenization failed: {e}")
    sys.exit(1)

# 2.4 Prepare for Training
# ------------------------
print("Step 2.4: Preparing datasets for training...")

# Rename 'label' column to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# Set format for PyTorch
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels']
)

print("✓ Datasets prepared\n")

# 2.5 Define Training Arguments
# -----------------------------
print("Step 2.5: Setting up training configuration...")

training_args = TrainingArguments(
    output_dir='/home/claude/results_bert',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    logging_dir='/home/claude/logs',
    logging_steps=50,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to='none',  # Disable wandb/tensorboard
    disable_tqdm=False  # Show progress bars
)

print("✓ Training configuration set\n")

# 2.6 Define Metrics
# -----------------
def compute_metrics(eval_pred):
    """Compute accuracy and F1 score"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary')

    return {
        'accuracy': accuracy,
        'f1': f1
    }

# 2.7 Create Trainer
# -----------------
print("Step 2.6-2.7: Creating Trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("✓ Trainer ready\n")

# 2.8 Train Model
# --------------
print("Step 2.8: Starting fine-tuning...")
print("This may take 10-30 minutes depending on your hardware...")
print("(Using CPU will be significantly slower)\n")

try:
    train_result = trainer.train()
    print("\n✓ Training complete!")

    # Save training metrics
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    print("✓ Training metrics saved\n")

except KeyboardInterrupt:
    print("\n⚠ Training interrupted by user")
    print("Proceeding with current model state...\n")
except Exception as e:
    print(f"\n✗ Training failed: {e}")
    sys.exit(1)

# 2.9 Evaluate Model
# -----------------
print("Step 2.9: Evaluating model on test set...")

try:
    eval_results = trainer.evaluate()

    print(f"\n" + "="*80)
    print(f"PROBLEM 2 RESULTS")
    print(f"="*80)
    print(f"Test Accuracy: {eval_results['eval_accuracy']:.4f} ({eval_results['eval_accuracy']*100:.2f}%)")
    print(f"Test F1 Score: {eval_results['eval_f1']:.4f}")
    print(f"="*80 + "\n")
except Exception as e:
    print(f"✗ Evaluation failed: {e}\n")

# 2.10 Save Model
# --------------
print("Step 2.10: Saving fine-tuned model...")

model_save_path = "/home/fine_tuned_bert_imdb"

try:
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"✓ Model saved to: {model_save_path}\n")
except Exception as e:
    print(f"⚠ Could not save model: {e}\n")

# 2.11 Inference Demo
# ------------------
print("Step 2.11: Demonstrating inference on sample texts...\n")

try:
    # Load saved model for inference
    inference_model = AutoModelForSequenceClassification.from_pretrained(model_save_path).to(device)
    inference_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    inference_model.eval()  # Set to evaluation mode

    def predict_sentiment_bert(text):
        """Predict sentiment using fine-tuned BERT"""
        # Tokenize
        inputs = inference_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=512,
            padding=True
        ).to(device)

        # Predict
        with torch.no_grad():
            outputs = inference_model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(predictions, dim=1).item()
            confidence = predictions[0][predicted_class].item()

        sentiment = "Positive" if predicted_class == 1 else "Negative"
        return sentiment, confidence

    # Test samples
    test_samples = [
        "This movie was absolutely fantastic! Best film I've seen this year.",
        "Terrible movie. Waste of time and money. Do not watch.",
        "It was okay, nothing special but not terrible either.",
        "An emotional masterpiece with brilliant performances throughout.",
        "Boring and predictable. I fell asleep halfway through."
    ]

    print("="*80)
    print("Sample Predictions:")
    print("="*80 + "\n")

    for i, text in enumerate(test_samples, 1):
        sentiment, confidence = predict_sentiment_bert(text)
        print(f"Sample {i}:")
        print(f"  Text: {text}")
        print(f"  Prediction: {sentiment} (confidence: {confidence:.2%})\n")

    print("="*80)
    print("✓ Problem 2 Complete!")
    print("="*80)

except Exception as e:
    print(f"⚠ Inference demo failed: {e}")

# Final Summary

print("="*80)
print("\nFiles generated:")
print("  1. problem1_confusion_matrix.png")
print("  2. /home/claude/fine_tuned_bert_imdb/ (fine-tuned model)")
print("="*80)



PROBLEM 2: BERT Fine-tuning with Hugging Face

Step 2.1: Loading IMDb dataset from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

✓ Dataset loaded
Train samples: 25000
Test samples: 25000

Using subset for faster training (2000 train, 500 test samples)
For full training, remove the subset selection below.

Step 2.2: Loading BERT model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded: bert-base-uncased
Parameters: 109,483,778
Device: cuda

Step 2.3: Tokenizing dataset...


Tokenizing:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/50000 [00:00<?, ? examples/s]

✓ Tokenization complete

Step 2.4: Preparing datasets for training...
✓ Datasets prepared

Step 2.5: Setting up training configuration...
✓ Training configuration set

Step 2.6-2.7: Creating Trainer...
✓ Trainer ready

Step 2.8: Starting fine-tuning...
This may take 10-30 minutes depending on your hardware...
(Using CPU will be significantly slower)



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.317,0.369992,0.876,0.884758
2,0.1294,0.420561,0.894,0.897087
3,0.0808,0.397782,0.91,0.909091



✓ Training complete!
***** train metrics *****
  epoch                    =        3.0
  total_flos               =  1470247GF
  train_loss               =     0.2388
  train_runtime            = 0:05:50.99
  train_samples_per_second =     17.094
  train_steps_per_second   =      2.137
✓ Training metrics saved

Step 2.9: Evaluating model on test set...



PROBLEM 2 RESULTS
Test Accuracy: 0.9100 (91.00%)
Test F1 Score: 0.9091

Step 2.10: Saving fine-tuned model...
✓ Model saved to: /home/fine_tuned_bert_imdb

Step 2.11: Demonstrating inference on sample texts...

Sample Predictions:

Sample 1:
  Text: This movie was absolutely fantastic! Best film I've seen this year.
  Prediction: Positive (confidence: 99.75%)

Sample 2:
  Text: Terrible movie. Waste of time and money. Do not watch.
  Prediction: Negative (confidence: 99.83%)

Sample 3:
  Text: It was okay, nothing special but not terrible either.
  Prediction: Negative (confidence: 85.87%)

Sample 4:
  Text: An emotional masterpiece with brilliant performances throughout.
  Prediction: Positive (confidence: 99.74%)

Sample 5:
  Text: Boring and predictable. I fell asleep halfway through.
  Prediction: Negative (confidence: 98.96%)

✓ Problem 2 Complete!

Files generated:
  1. problem1_confusion_matrix.png
  2. /home/claude/fine_tuned_bert_imdb/ (fine-tuned model)
