### Import Necessary Libraries:

Imports all the required libraries for text processing, video processing, feature extraction, and machine learning.

In [2]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.6


In [3]:
# Import necessary libraries
import os
import cv2
import numpy as np
import torch
from torchvision import models, transforms
from transformers import RobertaTokenizer, RobertaModel
import umap
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Mount Google Drive:

In [4]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load and Preprocess Textual Data:

In [5]:
# Load textual data
textual_data_path = '/content/drive/MyDrive/Sample.csv'
text_data = pd.read_csv(textual_data_path)

# Drop unnecessary columns from textual data
text_data.drop(columns=[
    'creative_data_lifetime_spend_estimated',
    'creative_data_lifetime_airings_count',
    'creative_data_airing_date_first_et',
    'creative_data_airing_date_last_et'
], inplace=True)

# Display the first 5 rows of the dataframe
text_data.head()

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Remove stop words and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

# Example usage of text preprocessing
sample_text = "This is an example sentence, to demonstrate the preprocessing steps."
print(preprocess_text(sample_text))

example sentence demonstrate preprocessing step


### Extract Textual Features using RoBERTa:

In [6]:
# Initialize RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to extract RoBERTa features
def extract_roberta_features(texts):
    text_features = []
    for text in texts:
        cleaned_text = preprocess_text(text)
        inputs = tokenizer(cleaned_text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        with torch.no_grad():
            outputs = model(**inputs)
        feature_vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        text_features.append(feature_vector)
    text_features = np.vstack(text_features)
    return text_features

# Extract features from descriptions and speech columns
description_features = extract_roberta_features(text_data['creative_data_description'].tolist())
speech_features = extract_roberta_features(text_data['speech'].tolist())

# Combine text features
combined_text_features = np.hstack((description_features, speech_features))
print("Shape of combined text features:", combined_text_features.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Shape of combined text features: (150, 1536)


### Load and Preprocess Video Data:

In [7]:
# Function to get all video paths from a directory
def get_video_paths(directory_path):
    video_extensions = ('.mp4', '.avi', '.mov')
    video_paths = [os.path.join(directory_path, fname) for fname in os.listdir(directory_path) if fname.endswith(video_extensions)]
    return video_paths

# Specify the directory path for videos
directory_path = '/content/drive/MyDrive/sample'
video_paths = get_video_paths(directory_path)

# Function to extract frames from a video
def extract_frames(video_path, max_frames=30):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = max(1, frame_count // max_frames)
    for i in range(frame_count):
        ret, frame = cap.read()
        if not ret:
            break
        if i % frame_interval == 0:
            frames.append(frame)
        if len(frames) >= max_frames:
            break
    cap.release()
    return frames

# Initialize pre-trained VGG16 model
cnn = models.vgg16(pretrained=True)
cnn.eval()

# Transformation pipeline for input frames
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to extract and aggregate visual features from video frames
def extract_visual_features_aggregate(video_paths, max_frames=30):
    visual_features = []
    for video_path in video_paths:
        frames = extract_frames(video_path)
        if len(frames) == 0:
            continue
        frame_features = []
        for frame in frames:
            input_tensor = preprocess(frame).unsqueeze(0)
            with torch.no_grad():
                output = cnn(input_tensor)
            frame_features.append(output.numpy())
        if frame_features:
            aggregated_features = np.mean(frame_features, axis=0)  # Aggregate features by averaging
            visual_features.append(aggregated_features)
    visual_features = np.vstack(visual_features)
    return visual_features

# Extract and aggregate visual features from the videos
aggregated_visual_features = extract_visual_features_aggregate(video_paths, max_frames=30)
print("Shape of aggregated visual features:", aggregated_visual_features.shape)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:09<00:00, 58.0MB/s]


Shape of aggregated visual features: (150, 1000)


### PCA Dimensionality Reduction

In [8]:
# Function to reduce dimensions using PCA
from sklearn.decomposition import PCA

def reduce_dimensions_pca(features, target_dim):
    pca = PCA(n_components=target_dim)
    reduced_features = pca.fit_transform(features)
    return reduced_features

# Set target dimension for PCA
target_dim = 150

# Reduce dimensions of text features
reduced_text_features = reduce_dimensions_pca(combined_text_features, target_dim)
print("Shape of reduced text features:", reduced_text_features.shape)

# Reduce dimensions of visual features
reduced_visual_features = reduce_dimensions_pca(aggregated_visual_features, target_dim)
print("Shape of reduced visual features:", reduced_visual_features.shape)

Shape of reduced text features: (150, 150)
Shape of reduced visual features: (150, 150)


### Combine Text and Visual Features

In [50]:
# Ensure the number of samples match before combining
assert reduced_text_features.shape[0] == reduced_visual_features.shape[0], "Number of samples do not match!"

# Combine text and visual features
final_combined_features = np.hstack((reduced_text_features, reduced_visual_features))
print("Shape of final combined features:", final_combined_features.shape)

Shape of final combined features: (150, 300)


### Load and Process Labels:

In [51]:
# Load ground truth labels
ground_truth_path = '/content/drive/MyDrive/ground truth.xlsx'
ground_df = pd.read_excel(ground_truth_path)

# Fill missing values with a default value (e.g., 'No')
ground_df.fillna('No', inplace=True)

# Drop columns with any None values
ground_df = ground_df.dropna(axis=1, how='any')

# Define question columns
question_columns = [
    'Is there a call to go online (e.g., shop online, visit the Web)?',
    'Is there online contact information provided (e.g., URL, website)?',
    'Is there a visual or verbal call to purchase (e.g., buy now, order now)?',
    'Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?',
    'Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?',
    'Is there offline contact information provided (e.g., phone, mail, store location)?',
    'Is there mention of something free?',
    'Does the ad mention at least one specific product or service (e.g., model, type, item)?',
    'Is there any verbal or visual mention of the price?',
    'Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?\n\nFor example, Nike ads often have the "swoosh" logo prominently displayed on shoes and apparel worn by celebrity athletes. The "Just Do It" slogan is another Nike trademark frequently included.',
    'Does the ad show the brand or trademark exactly once at the end of the ad?',
    'Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)',
    'Does the ad give you a positive feeling about the brand?',
    'Does the ad have a story arc, with a beginning and an end?',
    'Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?',
    'Does the ad have relatable characters?',
    'Is the ad creative/clever?',
    'Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.)',
    'Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)?',
    'Is the ad visually pleasing?',
    'Does the ad have cute elements like animals, babies, animated, characters, etc?'
]

# Trim the spaces from the DataFrame column names
ground_df.columns = ground_df.columns.str.strip()

# Select the columns with trimmed spaces from the DataFrame
labels = ground_df[question_columns].values

# Ensure the labels also match the number of samples
labels = ground_df[question_columns].values[:final_combined_features.shape[0]]

In [52]:
# Convert ground truth labels to binary (yes/no to 1/0)
binary_labels = ground_df[question_columns].applymap(lambda x: 1 if x.lower() == 'yes' else 0)

# Ensure binary_labels is defined correctly
print("Shape of binary labels:", binary_labels.shape)

Shape of binary labels: (449, 21)


### Train-Test Split:

In [53]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_combined_features, labels, test_size=0.2, random_state=42)

# Convert 'Yes'/'No
y_train_binary = (y_train == 'Yes').astype(int)
y_test_binary = (y_test == 'Yes').astype(int)

### Convert Labels to Binary Format

In [54]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(final_combined_features, labels, test_size=0.2, random_state=42)

### Analyze Class Distribution

In [55]:
# Analyze class distribution for each question
for i, question in enumerate(question_columns):
    unique, counts = np.unique(binary_y_test[:, i], return_counts=True)
    print(f"Question: {question}")
    print(f"Class distribution: {dict(zip(unique, counts))}\n")

Question: Is there a call to go online (e.g., shop online, visit the Web)?
Class distribution: {0: 19, 1: 11}

Question: Is there online contact information provided (e.g., URL, website)?
Class distribution: {0: 15, 1: 15}

Question: Is there a visual or verbal call to purchase (e.g., buy now, order now)?
Class distribution: {0: 14, 1: 16}

Question: Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?
Class distribution: {0: 23, 1: 7}

Question: Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?
Class distribution: {0: 16, 1: 14}

Question: Is there offline contact information provided (e.g., phone, mail, store location)?
Class distribution: {0: 21, 1: 9}

Question: Is there mention of something free?
Class distribution: {0: 24, 1: 6}

Question: Does the ad mention at least one specific product or service (e.g., model, type, item)?
Class distribution: {0: 5, 1: 25}

Question: Is there any verbal or visu

### Train XGBoost Model with Class Weights

In [57]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, 2, 3]
}

# Train XGBoost classifier for each question with hyperparameter tuning and class weighting
classifiers = []
for i in range(binary_labels.shape[1]):
    clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    grid_search = GridSearchCV(clf, param_grid, cv=3, scoring='f1')
    binary_labels_train = np.where(y_train[:, i] == 'Yes', 1, 0)
    grid_search.fit(X_train, binary_labels_train)
    best_clf = grid_search.best_estimator_
    classifiers.append(best_clf)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

### Predict and Save Results

In [58]:
# Predict answers for each video
predicted_answers = []
predicted_ids = set()

for i in range(final_combined_features.shape[0]):
    video_answers = []
    current_id = text_data['creative_data_id'].values[i]
    if current_id in predicted_ids:
        continue
    predicted_ids.add(current_id)
    for clf in classifiers:
        pred = clf.predict(final_combined_features[i].reshape(1, -1))[0]
        video_answers.append(pred)
    predicted_answers.append(video_answers)

# Convert predictions back to 'Yes'/'No'
predicted_answers = np.where(np.array(predicted_answers) == 1, 'Yes', 'No')

# Create a DataFrame for the predicted answers
predicted_answers_df = pd.DataFrame(predicted_answers, columns=question_columns)

# Add video IDs to the DataFrame
video_ids = list(predicted_ids)
predicted_answers_df.insert(0, 'creative_data_id', video_ids)

# Save the predicted answers to a CSV file
predicted_answers_df.to_csv('guhan.p_answers.csv', index=False)

### Evaluate the Model

In [59]:
# Predict answers for the test data
y_pred = np.zeros_like(y_test)
for i, clf in enumerate(classifiers):
    y_pred[:, i] = clf.predict(X_test)

# Convert y_test to binary for evaluation
binary_y_test = np.where(y_test == 'Yes', 1, 0)

# Convert y_pred to binary (this is the fix)
binary_y_pred = np.where(y_pred == 1, 1, 0)

# Initialize lists to store the metrics for each question
precision_scores = []
recall_scores = []
f1_scores = []
agreement_percentages = []

# Calculate metrics for each question
for i in range(y_test.shape[1]):
    # Use binary_y_pred here
    precision = precision_score(binary_y_test[:, i], binary_y_pred[:, i], average='binary', zero_division=0)
    recall = recall_score(binary_y_test[:, i], binary_y_pred[:, i], average='binary', zero_division=0)
    f1 = f1_score(binary_y_test[:, i], binary_y_pred[:, i], average='binary', zero_division=0)
    agreement_percentage = np.mean(binary_y_test[:, i] == binary_y_pred[:, i]) * 100

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    agreement_percentages.append(agreement_percentage)

# Print the metrics for each question
for i, question in enumerate(question_columns):
    print(f'Question: {question}')
    print(f'  Precision: {precision_scores[i]:.2f}')
    print(f'  Recall: {recall_scores[i]:.2f}')
    print(f'  F1 Score: {f1_scores[i]:.2f}')
    print(f'  Agreement Percentage: {agreement_percentages[i]:.2f}%\n')

Question: Is there a call to go online (e.g., shop online, visit the Web)?
  Precision: 0.17
  Recall: 0.09
  F1 Score: 0.12
  Agreement Percentage: 50.00%

Question: Is there online contact information provided (e.g., URL, website)?
  Precision: 0.50
  Recall: 1.00
  F1 Score: 0.67
  Agreement Percentage: 50.00%

Question: Is there a visual or verbal call to purchase (e.g., buy now, order now)?
  Precision: 0.50
  Recall: 0.81
  F1 Score: 0.62
  Agreement Percentage: 46.67%

Question: Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)?
  Precision: 0.33
  Recall: 0.71
  F1 Score: 0.45
  Agreement Percentage: 60.00%

Question: Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")?
  Precision: 0.12
  Recall: 0.07
  F1 Score: 0.09
  Agreement Percentage: 33.33%

Question: Is there offline contact information provided (e.g., phone, mail, store location)?
  Precision: 0.39
  Recall: 0.78
  F1 Score: 0.52
  Agr

In [60]:
# Calculate average metrics
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)
average_f1_score = np.mean(f1_scores)
average_agreement_percentage = np.mean(agreement_percentages)

# Print the average metrics
print(f'Average Precision: {average_precision:.2f}')
print(f'Average Recall: {average_recall:.2f}')
print(f'Average F1 Score: {average_f1_score:.2f}')
print(f'Average Agreement Percentage: {average_agreement_percentage:.2f}%')

Average Precision: 0.45
Average Recall: 0.66
Average F1 Score: 0.52
Average Agreement Percentage: 63.17%
