# Training Emotion Classification Model

This notebook will guide you through training the emotion classification model for student engagement analysis.

## Step 1: Setup Environment

First, make sure you have all required packages installed. Run this cell to install them:

In [25]:
# Install required packages
!pip install tensorflow pandas numpy matplotlib seaborn scikit-learn opencv-python albumentations

Collecting albumentations
  Downloading albumentations-2.0.2-py3-none-any.whl.metadata (38 kB)
Collecting pydantic>=2.9.2 (from albumentations)
  Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting albucore==0.0.23 (from albumentations)
  Downloading albucore-0.0.23-py3-none-any.whl.metadata (5.3 kB)
Collecting opencv-python-headless>=4.9.0.80 (from albumentations)
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Collecting stringzilla>=3.10.4 (from albucore==0.0.23->albumentations)
  Downloading stringzilla-3.11.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (80 kB)
Collecting simsimd>=5.9.2 (from albucore==0.0.23->albumentations)
  Downloading simsimd-6.2.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (66 kB)
Collecting pydantic-core==2.27.2 (from pydantic>=2.9.2->albumentations)
  Downloading pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting typing-extensions>=3.6.6 (from tensorflow)
  

## Step 2: Import Required Libraries

Now let's import all the libraries we'll need:

In [28]:
import os
import glob
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from PIL import Image, ImageFile
import time
import albumentations as A  # Powerful augmentation library

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')


# Seed NumPy's and TensorFlow's global generators with one shared value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Step 3: Load and Prepare Data

Download the [Emotion Recognition Dataset](https://www.kaggle.com/datasets/sujaykapadnis/emotion-recognition-dataset/data) from Kaggle and extract it. 

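The cells below expect the extracted files to be arranged like this, relative to the directory the notebook is run from:
```
data/
    data.csv
    images/
        Angry/
        Happy/
        Neutral/
        Sad/
        Surprise/
```

`data.csv` lists each image's relative `path` (for example `Surprise/<file>.png`) and its `label`. Rows whose label contains `Ahegao` are dropped by the loading cell.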

Now let's load the data:

In [30]:
# Absolute location of ./data/data.csv, resolved from the current working directory
csv_path = os.path.abspath(os.path.join(".", "data", "data.csv"))

# Read the CSV and keep only rows whose label does not mention 'Ahegao'
# (case-insensitive; rows with a missing label are kept)
df = (
    pd.read_csv(csv_path)
    .loc[lambda frame: ~frame['label'].str.contains('Ahegao', case=False, na=False)]
)
print(df.head())

   Unnamed: 0                                               path     label
0           0  Surprise/1bd930d6a1c717c11be33db74823f661cb53f...  Surprise
1           1       Surprise/cropped_emotions.100096~12fffff.png  Surprise
2           2  Surprise/0df0e470e33093f5b72a8197fa209d684032c...  Surprise
3           3       Surprise/cropped_emotions.260779~12fffff.png  Surprise
4           4       Surprise/cropped_emotions.263616~12fffff.png  Surprise


## Step 3.1: Get rid of bad rows

In [34]:
# Let PIL-based loaders open truncated image files instead of raising.
# This is a PIL setting; the validity check below uses cv2.imread, not PIL.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Resolve the images directory. When the notebook is started from inside the
# project folder, "StudentEngagementAnalyzer" is already part of the working
# directory, so it must not be appended a second time.
base_images_path = os.path.abspath("StudentEngagementAnalyzer/data/images")
cwd = os.getcwd()
if "StudentEngagementAnalyzer" in cwd:
    base_images_path = os.path.abspath(os.path.join(cwd, "data/images"))

print(f"✅ Base Image Path Set To: {base_images_path}")

print(f"Initial number of rows: {df.shape[0]}")

# Rows are validated in chunks so progress can be reported periodically
chunk_size = 400
total_rows = len(df)

# Positions (not index labels) of the rows that passed validation, and the
# absolute image path resolved for each of them, kept in the same order.
valid_positions = []
valid_paths = []

for start in range(0, total_rows, chunk_size):
    # Clamp so the last progress message does not run past the end of the data
    end = min(start + chunk_size, total_rows)
    print(f"Processing rows {start} to {end}...")

    for offset, rel_path in enumerate(df["path"].iloc[start:end].tolist()):
        # A missing path (NaN) cannot be joined to a directory; treat it as a bad row
        if not isinstance(rel_path, str):
            continue

        # abspath also normalises the joined path
        img_path = os.path.abspath(os.path.join(base_images_path, rel_path))

        # Rows whose file is absent are dropped silently
        if not os.path.isfile(img_path):
            continue

        # cv2.imread returns None when it cannot decode the file
        if cv2.imread(img_path) is None:
            print(f"⚠️ CORRUPT IMAGE (cv2 check failed): {img_path}")
            continue

        valid_positions.append(start + offset)
        valid_paths.append(img_path)

    # Pause between chunks (deliberate throttle kept from the original workflow)
    time.sleep(2)

# Select the surviving rows in one step. Compared with rebuilding the frame
# from per-row Series, this keeps the original column dtypes and still yields
# the full set of columns (including full_path) when no row is valid.
df_cleaned = df.iloc[valid_positions].assign(full_path=valid_paths)

print(f"Total rows after cleaning: {df_cleaned.shape[0]}")

# Save cleaned dataset.
# NOTE(review): unlike base_images_path, this path is always resolved relative
# to the working directory, so running from inside the project folder writes to
# a nested StudentEngagementAnalyzer/StudentEngagementAnalyzer/data directory.
# It is left unchanged because the later cells read the same literal path.
cleaned_csv_path = os.path.abspath("StudentEngagementAnalyzer/data/cleaned_data.csv")
# to_csv does not create missing directories
os.makedirs(os.path.dirname(cleaned_csv_path), exist_ok=True)
df_cleaned.to_csv(cleaned_csv_path, index=False)
print(f"✅ Cleaned dataset saved as {cleaned_csv_path}")


✅ Base Image Path Set To: /Users/parhamhajzavar/Documents/Seattle/UW/Classes/Winter 2024/522/Project/StudentEngagementAnalyzer/data/images
Initial number of rows: 14248
Processing rows 0 to 400...
Processing rows 400 to 800...
Processing rows 800 to 1200...
Processing rows 1200 to 1600...
Processing rows 1600 to 2000...
Processing rows 2000 to 2400...


Premature end of JPEG file


Processing rows 2400 to 2800...
Processing rows 2800 to 3200...
Processing rows 3200 to 3600...
Processing rows 3600 to 4000...
Processing rows 4000 to 4400...
Processing rows 4400 to 4800...
Processing rows 4800 to 5200...
Processing rows 5200 to 5600...
Processing rows 5600 to 6000...
Processing rows 6000 to 6400...
Processing rows 6400 to 6800...
Processing rows 6800 to 7200...
Processing rows 7200 to 7600...
Processing rows 7600 to 8000...
Processing rows 8000 to 8400...
Processing rows 8400 to 8800...
Processing rows 8800 to 9200...
Processing rows 9200 to 9600...
Processing rows 9600 to 10000...
Processing rows 10000 to 10400...
Processing rows 10400 to 10800...
Processing rows 10800 to 11200...
Processing rows 11200 to 11600...
Processing rows 11600 to 12000...
Processing rows 12000 to 12400...
Processing rows 12400 to 12800...
Processing rows 12800 to 13200...
Processing rows 13200 to 13600...
Processing rows 13600 to 14000...
Processing rows 14000 to 14400...
Total rows after 

## Step 3.2: Augment the dataset

In [39]:
# Save cleaned dataset
cleaned_csv_path = os.path.abspath("StudentEngagementAnalyzer/data/cleaned_data.csv")
# to_csv raises OSError if the parent directory is missing, so create it first
os.makedirs(os.path.dirname(cleaned_csv_path), exist_ok=True)
df_cleaned.to_csv(cleaned_csv_path, index=False)
print(f"✅ Cleaned dataset saved as {cleaned_csv_path}")


✅ Cleaned dataset saved as /Users/parhamhajzavar/Documents/Seattle/UW/Classes/Winter 2024/522/Project/StudentEngagementAnalyzer/StudentEngagementAnalyzer/data/cleaned_data.csv


In [41]:
# Augment
import os
import pandas as pd
import cv2
import random
import numpy as np
from PIL import Image
import albumentations as A  # Powerful augmentation library

# Define paths (resolved relative to the current working directory)
base_images_path = os.path.abspath("StudentEngagementAnalyzer/data/images")
augmented_images_path = os.path.abspath("StudentEngagementAnalyzer/data/augmented_images")

# Ensure the augmented directory exists
os.makedirs(augmented_images_path, exist_ok=True)

# Load the cleaned dataset
df_cleaned_path = os.path.abspath("StudentEngagementAnalyzer/data/cleaned_data.csv")
df = pd.read_csv(df_cleaned_path)

# Augmentation pipeline; each transform is applied independently with probability p
augmentations = A.Compose([
    A.HorizontalFlip(p=0.5),  # 50% chance to flip horizontally
    A.Rotate(limit=15, p=0.5),  # Random rotation within +/- 15 degrees
    A.RandomBrightnessContrast(p=0.5),  # Adjust brightness & contrast
    A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.1, rotate_limit=10, p=0.5),  # Small shifts & zoom
    A.GaussianBlur(blur_limit=(3, 5), p=0.2)  # Light blur
])

# One record per augmented image that was actually written to disk
augmented_rows = []

print(f"Starting augmentation for {len(df)} images...")

# NOTE(review): augmented copies are created before any train/validation split.
# If df_final is split later, variants of one source image can land on both
# sides of the split - confirm how this file is consumed.
for position, (_, row) in enumerate(df.iterrows()):
    img_path = row["full_path"]

    # Read image
    img = cv2.imread(img_path)
    if img is None:
        print(f"⚠️ Skipping {img_path} (Could not load image)")
        continue  # Skip invalid images

    # Convert from BGR (OpenCV) to RGB (for augmentation)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    orig_filename = os.path.basename(img_path)

    # Generate multiple augmented versions
    for i in range(3):  # Create 3 augmented versions per image
        augmented = augmentations(image=img)["image"]

        # Convert back to BGR for saving
        augmented = cv2.cvtColor(augmented, cv2.COLOR_RGB2BGR)

        # The row position is part of the name so that two source images with
        # the same basename (e.g. from different label folders) cannot
        # overwrite each other's augmented files.
        new_filename = f"aug_{i}_{position}_{orig_filename}"
        new_img_path = os.path.join(augmented_images_path, new_filename)

        # cv2.imwrite reports failure through its return value; do not record
        # a row for a file that was not written.
        if not cv2.imwrite(new_img_path, augmented):
            print(f"⚠️ Could not write {new_img_path}")
            continue

        # Store new entry for CSV. full_path is included so augmented rows
        # carry the same absolute-path column as the original rows.
        augmented_rows.append({
            "path": f"augmented_images/{new_filename}",
            "label": row["label"],
            "full_path": new_img_path,
        })

    # Progress update, based on the row position rather than the index label
    if position % 100 == 0:
        print(f"Processed {position}/{len(df)} images...")

# Create augmented DataFrame
df_augmented = pd.DataFrame(augmented_rows)

# Combine original and augmented data
df_final = pd.concat([df, df_augmented], ignore_index=True)

# Save the new dataset
augmented_csv_path = os.path.abspath("StudentEngagementAnalyzer/data/augmented_data.csv")
df_final.to_csv(augmented_csv_path, index=False)

print(f"✅ Augmentation complete! Augmented dataset saved as {augmented_csv_path}")




Starting augmentation for 10977 images...
Processed 0/10977 images...
Processed 100/10977 images...
Processed 200/10977 images...
Processed 300/10977 images...
Processed 400/10977 images...
Processed 500/10977 images...
Processed 600/10977 images...
Processed 700/10977 images...
Processed 800/10977 images...
Processed 900/10977 images...
Processed 1000/10977 images...
Processed 1100/10977 images...
Processed 1200/10977 images...
Processed 1300/10977 images...
Processed 1400/10977 images...
Processed 1500/10977 images...
Processed 1600/10977 images...
Processed 1700/10977 images...
Processed 1800/10977 images...
Processed 1900/10977 images...


Premature end of JPEG file


Processed 2000/10977 images...
Processed 2100/10977 images...
Processed 2200/10977 images...
Processed 2300/10977 images...
Processed 2400/10977 images...
Processed 2500/10977 images...
Processed 2600/10977 images...
Processed 2700/10977 images...
Processed 2800/10977 images...
Processed 2900/10977 images...
Processed 3000/10977 images...
Processed 3100/10977 images...
Processed 3200/10977 images...
Processed 3300/10977 images...
Processed 3400/10977 images...
Processed 3500/10977 images...
Processed 3600/10977 images...
Processed 3700/10977 images...
Processed 3800/10977 images...
Processed 3900/10977 images...
Processed 4000/10977 images...
Processed 4100/10977 images...
Processed 4200/10977 images...
Processed 4300/10977 images...
Processed 4400/10977 images...
Processed 4500/10977 images...
Processed 4600/10977 images...
Processed 4700/10977 images...
Processed 4800/10977 images...
Processed 4900/10977 images...
Processed 5000/10977 images...
Processed 5100/10977 images...
Processe

## Step 4: Create and Compile Model

We'll use MobileNetV2 as our base model and add custom layers for emotion classification:

In [25]:
def create_model(input_shape=(224, 224, 3), num_classes=5):
    """Build and compile a MobileNetV2-based image classifier.

    A frozen, ImageNet-pretrained MobileNetV2 (without its classification
    head) is followed by global average pooling, dropout, a 256-unit ReLU
    layer, batch normalisation, dropout and a softmax output layer.

    Args:
        input_shape: (height, width, channels) of the input images. The
            default of 224x224 matches the default image size produced by
            preprocess_data in this notebook and is one of the resolutions
            for which Keras ships ImageNet weights.
        num_classes: Number of units in the softmax output layer
            (defaults to the 5 emotion classes).

    Returns:
        A compiled Keras Model using Adam (learning rate 0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    # Base model - MobileNetV2
    base_model = MobileNetV2(
        weights='imagenet',  # Use pre-trained weights
        include_top=False,   # Remove original classification head
        input_shape=input_shape
    )

    # Freeze base model layers so only the new head is trained
    base_model.trainable = False

    # Add classification head
    x = GlobalAveragePooling2D()(base_model.output)
    x = Dropout(0.5)(x)
    x = Dense(256, activation='relu')(x)
    x = BatchNormalization()(x)  # Improve stability
    x = Dropout(0.3)(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=outputs)

    # Compile model; categorical cross-entropy expects one-hot encoded labels
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Create model
model = create_model()
model.summary()


## Step 5: Prepare Training Data

We'll load every image into memory as a NumPy array, apply MobileNetV2 preprocessing, one-hot encode the labels, and split the result into training and validation sets:

In [None]:
def _first_present_column(df, candidates):
    """Return the first name in `candidates` that is a column of `df`."""
    for name in candidates:
        if name in df.columns:
            return name
    raise KeyError(f"None of the columns {list(candidates)} found in DataFrame")


def preprocess_data(df, input_shape=(224, 224)):
    """Load every image listed in `df` and one-hot encode its label.

    Image paths are read from the `full_path` column (the column added by the
    cleaning step of this notebook); `image_path` is accepted as a fallback.
    Labels are read from `label`, with `emotion` as a fallback.

    Args:
        df: DataFrame with one row per image.
        input_shape: (height, width) each image is resized to on load.

    Returns:
        X: float array of shape (n_images, height, width, 3), passed through
            MobileNetV2's preprocess_input.
        y: one-hot label matrix. Integer labels are used as class indices
            directly; any other labels are converted to strings and numbered
            in sorted order of the distinct values.

    Raises:
        KeyError: if no path column or no label column is found.
    """
    path_col = _first_present_column(df, ("full_path", "image_path"))
    label_col = _first_present_column(df, ("label", "emotion"))

    X = []

    # Count rows by position: index labels are not contiguous once rows have
    # been filtered, so they are not a reliable progress counter.
    for count, img_path in enumerate(df[path_col], start=1):
        # Load and preprocess image
        img = tf.keras.preprocessing.image.load_img(
            img_path,
            target_size=input_shape
        )
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        img_array = tf.keras.applications.mobilenet_v2.preprocess_input(img_array)
        X.append(img_array)

        # Print progress
        if count % 1000 == 0:
            print(f"Processed {count} images")

    # Convert to numpy arrays
    X = np.array(X)

    labels = df[label_col].to_numpy()
    if np.issubdtype(labels.dtype, np.integer):
        # Already class indices
        y = tf.keras.utils.to_categorical(labels)
    else:
        # to_categorical needs integer class indices, so map each distinct
        # label to its position in the sorted list of distinct labels.
        classes, codes = np.unique(labels.astype(str), return_inverse=True)
        y = tf.keras.utils.to_categorical(codes, num_classes=len(classes))

    return X, y

# Turn the DataFrame into image and label arrays
print("Preprocessing training data...")
X, y = preprocess_data(df)

# Hold out a fifth of the samples for validation, keeping the class
# proportions equal in both parts; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

## Step 6: Train the Model

Now let's train the model with callbacks for better training:

In [None]:
# Training callbacks: checkpointing, early stopping and learning-rate decay
callbacks = [
    # Write the model to disk whenever validation accuracy reaches a new maximum
    ModelCheckpoint(
        'best_model.h5',
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1,
    ),
    # End training after 10 epochs without a better validation loss and
    # roll the model back to the weights of the best epoch
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    ),
    # After 5 epochs without a better validation loss, multiply the learning
    # rate by 0.1, never going below 1e-6
    ReduceLROnPlateau(
        monitor='val_loss',
        patience=5,
        factor=0.1,
        min_lr=1e-6,
        verbose=1,
    ),
]

# Fit for at most 50 epochs in batches of 32, validating on the held-out split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=50,
    callbacks=callbacks,
    verbose=1,
)

## Step 7: Visualize Training Results

Let's plot the training history:

In [None]:
def plot_training_results(history):
    """Draw accuracy and loss curves side by side.

    `history` is expected to expose a `history` mapping with the keys
    'accuracy', 'val_accuracy', 'loss' and 'val_loss', each holding one
    value per epoch.
    """
    # Global seaborn look for the figure
    sns.set_style("whitegrid")
    sns.set_palette("husl")

    # One row, two panels: accuracy on the left, loss on the right
    fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (acc_ax, 'accuracy', 'val_accuracy',
         'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
        (loss_ax, 'loss', 'val_loss',
         'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    )
    for ax, train_key, val_key, train_label, val_label, title, y_label in panels:
        ax.plot(history.history[train_key], label=train_label)
        ax.plot(history.history[val_key], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()

    plt.tight_layout()
    plt.show()

# Show the curves for the training run above
plot_training_results(history)

## Step 8: Save the Model

The model has already been saved as 'best_model.h5' through the ModelCheckpoint callback during training. You can now copy this file to your Student Engagement Analysis project directory.

To verify the model was saved:

In [None]:
import os

# Report whether the checkpoint file exists and, if so, its size in MiB
if not os.path.exists('best_model.h5'):
    print("Error: Model file not found!")
else:
    print("Model saved successfully!")
    size_mb = os.path.getsize('best_model.h5') / (1024*1024)
    print(f"Model file size: {size_mb:.2f} MB")