In [59]:
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image
import os
from google.colab import drive
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import io
from tqdm import tqdm  # For progress bar


In [60]:
# ===========================
#    Mount Google Drive
# ===========================
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [61]:
class ParquetDataset(Dataset):
    def __init__(self, parquet_file_or_df, transform=None):
        if isinstance(parquet_file_or_df, str):
            self.df = pd.read_parquet(parquet_file_or_df)
        elif isinstance(parquet_file_or_df, pd.DataFrame):
            self.df = parquet_file_or_df
        else:
            raise ValueError("parquet_file_or_df must be either a file path (str) or a pandas DataFrame.")

        self.transform = transform

        # Identify genre columns by excluding known columns
        self.non_genre_columns = ['movie_id', 'movie_name', 'movie_poster']
        self.genre_columns = self.df.columns.drop(self.non_genre_columns)
        self.num_genres = len(self.genre_columns)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load image from binary data
        image_binary = row['movie_poster']
        image = self.load_image_from_binary(image_binary)

        if self.transform:
            image = self.transform(image)

        # Extract movie_id
        movie_id = row['movie_id']

        return movie_id, image

    @staticmethod
    def load_image_from_binary(image_binary):
        img_byte_arr = io.BytesIO(image_binary)
        image = Image.open(img_byte_arr).convert("RGB")
        return image


In [62]:
# ===========================
#      Model Definition
# ===========================
class CustomResNet(nn.Module):
    def __init__(self, num_classes):
        super(CustomResNet, self).__init__()
        base_model = models.resnet34(pretrained=True)
        self.base = nn.Sequential(*list(base_model.children())[:-1])  # Remove the final classification layer
        self.fc_layers = nn.Sequential(
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 512), nn.ReLU(), nn.Dropout(0.5),
            nn.Linear(512, 256), #Vector is this layer.
            nn.ReLU(), nn.Dropout(0.5),
        )
        self.output_layer = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.base(x).flatten(1)  # Extract features
        x = self.fc_layers(x)        # Vectorization layers
        return self.output_layer(x)  # Classification layer

    def get_vector(self, x):
      x = self.base(x).flatten(1)
      for layer in self.fc_layers:
          x = layer(x)
          if isinstance(layer, nn.Linear) and layer.out_features == 256:
              break  # Stop after the Linear(512, 256) layer
      return x

In [63]:
# ===========================
#      Model Loading
# ===========================
def load_model(checkpoint_path, num_classes, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Loads the trained CustomResNet model from the checkpoint.

    Parameters:
        checkpoint_path (str): Path to the saved model checkpoint.
        num_classes (int): Number of output classes.
        device (str): Device to load the model on ('cuda' or 'cpu').

    Returns:
        CustomResNet: Loaded model in evaluation mode.
    """
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint not found at '{checkpoint_path}'")

    # Initialize the model architecture
    model = CustomResNet(num_classes=num_classes)

    # Load the state_dict
    torch.load(checkpoint_path, map_location=device)
    model.to(device)
    model.eval()  # Set to evaluation mode
    return model

In [64]:
# ===========================
#          Transforms
# ===========================
def get_transform():
    """
    Returns the image transformations used during training.

    Returns:
        torchvision.transforms.Compose: Composed transformations.
    """
    return transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet mean
                             std=[0.229, 0.224, 0.225]),  # ImageNet std
    ])

In [65]:
# ===========================
#    Inference Function
# ===========================
def get_vector_from_image(image_input, model, transform, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Processes the input image and returns a 256-dimensional feature vector.

    Parameters:
        image_input (str or PIL.Image.Image): Path to the image file or a PIL Image object.
        model (CustomResNet): The trained model.
        transform (torchvision.transforms.Compose): Preprocessing transformations.
        device (str): Device to perform computation on.

    Returns:
        numpy.ndarray: 256-dimensional feature vector.
    """
    # Load the image
    if isinstance(image_input, str):
        if not os.path.exists(image_input):
            raise FileNotFoundError(f"Image file not found at '{image_input}'")
        image = Image.open(image_input).convert("RGB")
    elif isinstance(image_input, Image.Image):
        image = image_input.convert("RGB")
    else:
        raise ValueError("image_input must be a file path or a PIL.Image.Image object.")

    # Apply transformations
    image = transform(image).unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        vector = model.get_vector(image)  # Get the 256-dimensional vector
        vector = vector.cpu().numpy().flatten()

    return vector

In [66]:
# ===========================
#         Usage Example
# ===========================
if __name__ == "__main__":
    # Path to the saved model checkpoint
    checkpoint_path = '/content/drive/MyDrive/Projects/movie_posters/Training/Models/best_model.pth'

    # Number of classes (replace with your actual number of genre columns)
    num_classes = 20  # Example value; replace with your actual number

    # Initialize device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    # Load the trained model
    try:
        model = load_model(checkpoint_path, num_classes=num_classes, device=device)
        print("Model loaded successfully.")
    except Exception as e:
        print(f"Error loading model: {e}")
        exit(1)

    # Get the transformation pipeline
    transform = get_transform()

    # Path to the input image for inference
    image_path = '/content/drive/MyDrive/Projects/movie_posters/Vectorization/poster_image.jpg'  # Replace with your image path

    # Check if image exists
    if not os.path.exists(image_path):
        print(f"Image file not found at '{image_path}'")
        exit(1)

    # Get the 256-dimensional vector
    try:
        vector = get_vector_from_image(image_path, model, transform, device=device)
        print(f"Vector shape: {vector.shape}")
        print(f"Vector: {vector}")
    except Exception as e:
        print(f"Error during inference: {e}")


Using device: cuda


  torch.load(checkpoint_path, map_location=device)


Model loaded successfully.
Vector shape: (256,)
Vector: [ 0.05856618  0.03154264 -0.02973773  0.02552691  0.00126035  0.00360497
 -0.02882855  0.03062486 -0.02029199 -0.03287343  0.0408044   0.02266661
  0.0383857  -0.00882797  0.02130431 -0.00712389 -0.03790253 -0.01565099
 -0.01244076 -0.01609455  0.04103363 -0.03976864 -0.0060688  -0.04436138
 -0.00957141 -0.03470837  0.00668672 -0.02438618  0.00188775 -0.06013023
 -0.030621    0.0517728   0.04189692 -0.00377822 -0.01050789 -0.02665899
 -0.04136047 -0.00627694 -0.00513279 -0.03514675 -0.03070272 -0.03292549
  0.04396499  0.01870156 -0.03442657 -0.01046899  0.03248999 -0.03566707
  0.02777395 -0.00137688 -0.04080449 -0.01196069 -0.00493136  0.006618
 -0.02525345  0.03621399  0.04613086 -0.04903983  0.02785776 -0.02644226
 -0.03053804 -0.01671258 -0.02704752 -0.03511086  0.0260879  -0.02371722
 -0.0286771   0.03857456 -0.02903061 -0.03686766 -0.03618192 -0.00799718
  0.03343593  0.02720884 -0.03677804  0.001142    0.0043915   0.022676

In [67]:
def vectorize_dataset(parquet_path, model_checkpoint_path, output_csv_path, batch_size=64, num_workers=4, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Processes all images in the parquet dataset through the model's vectorization layer
    and saves the movie_id and corresponding 256-dimensional vectors to a CSV file.

    Parameters:
        parquet_path (str): Path to the parquet file containing the dataset.
        model_checkpoint_path (str): Path to the saved model state dictionary.
        output_csv_path (str): Path to save the output CSV file.
        batch_size (int): Number of samples per batch for DataLoader.
        num_workers (int): Number of subprocesses to use for data loading.
        device (str): Device to perform computation on ('cuda' or 'cpu').

    Returns:
        None
    """
    # Define transforms
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet mean
                             std=[0.229, 0.224, 0.225]),  # ImageNet std
    ])

    # Initialize the dataset
    print("Loading dataset...")
    dataset = ParquetDataset(parquet_path, transform=transform)
    print(f"Dataset loaded with {len(dataset)} samples.")

    # Initialize the model
    num_classes = dataset.num_genres
    model = CustomResNet(num_classes=num_classes)

    # Load the state dictionary
    print("Loading model state dictionary...")
    try:
        model = torch.load(model_checkpoint_path, map_location=device)
        print("Model state dictionary loaded successfully.")
    except Exception as e:
        print(f"Error loading model state dictionary: {e}")
        return

    model.to(device)
    model.eval()  # Set model to evaluation mode

    # Create DataLoader
    print("Creating DataLoader...")
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Initialize lists to store movie_ids and vectors
    all_movie_ids = []
    all_vectors = []

    # Iterate through DataLoader with a progress bar
    print("Starting vectorization process...")
    for batch_movie_ids, images in tqdm(data_loader, desc="Vectorizing Images"):
        images = images.to(device)

        with torch.no_grad():
            vectors = model.get_vector(images)  # Get the 256-dimensional vectors
            vectors = vectors.cpu().numpy()     # Move to CPU and convert to NumPy array

        all_movie_ids.extend(batch_movie_ids)
        all_vectors.extend(vectors)

    # Convert vectors to a DataFrame
    print("Converting vectors to DataFrame...")
    vectors_np = np.array(all_vectors)  # Shape: (num_samples, 256)
    vector_columns = [f'vec_{i}' for i in range(vectors_np.shape[1])]

    df_vectors = pd.DataFrame(vectors_np, columns=vector_columns)
    df_vectors.insert(0, 'movie_id', all_movie_ids)

    # Save to CSV
    print(f"Saving vectors to {output_csv_path}...")
    df_vectors.to_csv(output_csv_path, index=False)
    print(f"Saved vectors to {output_csv_path}")


In [69]:
# Define paths
parquet_file_path = '/content/drive/MyDrive/Projects/movie_posters/Training/dataset.parquet'  # Update as needed
checkpoint_path = '/content/drive/MyDrive/Projects/movie_posters/Training/Models/best_model.pth'  # Update as needed
output_csv = '/content/drive/MyDrive/Projects/movie_posters/Vectorization/Vectors/Vectors.csv'  # Desired output path

# Run the vectorization process
vectorize_dataset(
    parquet_path=parquet_file_path,
    model_checkpoint_path=checkpoint_path,
    output_csv_path=output_csv,
    batch_size=64,       # Adjust based on your GPU/CPU memory
    num_workers=4,       # Adjust based on your system
    device='cuda'        # Ensure CUDA is available, else use 'cpu'
)


Loading dataset...
Dataset loaded with 994 samples.


  model = torch.load(model_checkpoint_path, map_location=device)


Loading model state dictionary...
Model state dictionary loaded successfully.
Creating DataLoader...
Starting vectorization process...


Vectorizing Images: 100%|██████████| 16/16 [00:01<00:00, 13.72it/s]


Converting vectors to DataFrame...
Saving vectors to /content/drive/MyDrive/Projects/movie_posters/Vectorization/Vectors/Vectors.csv...
Saved vectors to /content/drive/MyDrive/Projects/movie_posters/Vectorization/Vectors/Vectors.csv
