#### Loading necessary libraries

In [67]:
# `np` must refer to NumPy: binding the stdlib `numbers` module to this alias
# would make any later `np.<function>` call fail with AttributeError.
import numpy as np
import pandas as pd

# Raise the display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)


#### Loading Data

In [68]:
# Read the annotation workbook into a DataFrame using the openpyxl engine.
# NOTE(review): this file is read from the working directory while the other
# inputs in this notebook live under data/ -- confirm the path is intended.
ground_truth_path = 'ground-truth.xlsx'
df = pd.read_excel(ground_truth_path, engine='openpyxl')
df.shape

(449, 36)

In [69]:
# df.head()

In [70]:
# Names of every column that contains at least one missing value.
cols = df.columns[df.isna().any()].tolist()
# Drop those columns entirely.
# NOTE(review): presumably none of them is a questionnaire question, so no
# label is lost here -- confirm against the workbook.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run and avoids
# mutating the frame object that earlier cells displayed.
df = df.drop(columns=cols)
df.shape

(449, 29)

In [71]:
def majority_vote(series):
    """
    Performs majority voting on a pandas Series.

    Missing values are ignored, because ``value_counts`` drops them.

    Parameters:
    - series (pd.Series): A pandas Series containing categorical values.

    Returns:
    - The most frequent value in the series. When several values share the
      highest count and one of them is 'yes' (compared case-insensitively),
      the string 'yes' is returned. When the tie does not involve 'yes', the
      first value holding the highest count in ``value_counts`` order is
      returned. Returns 'no' when the series is empty or holds only missing
      values.

    Example:
    series = pd.Series(['yes', 'no', 'yes', 'yes', 'no'])
    result = majority_vote(series)  # 'yes'
    """
    # Count the occurrences of each unique (non-missing) value.
    counts = series.value_counts()
    # Covers both an empty series and an all-NaN series; without this guard
    # idxmax() below would raise on the empty counts.
    if counts.empty:
        return 'no'

    # Every value that shares the highest count.
    top_count = counts.max()
    tied_values = counts.index[(counts == top_count).to_numpy()]

    if len(tied_values) > 1:
        # Favor 'yes' only when it is actually one of the tied candidates, so
        # non-answer columns (ids, timestamps, free text) never turn into 'yes'.
        if any(isinstance(value, str) and value.lower() == 'yes' for value in tied_values):
            return 'yes'

    # Clear winner, or a tie that does not involve 'yes'.
    return counts.idxmax()

In [72]:
# Collapse df to one row per creative_data_id by applying majority_vote to
# every remaining column, then turn the group key back into a regular column.
votes_by_creative = df.groupby('creative_data_id')
majority_df = votes_by_creative.agg(majority_vote).reset_index()


In [73]:
# Row count here equals the number of distinct creative_data_id groups.
majority_df.shape

(150, 29)

In [74]:
# Discard the last six columns, which are not needed for further analysis.
# NOTE(review): the slice is positional and the cell reassigns majority_df, so
# re-running it removes six more columns each time -- run it exactly once.
n_trailing_unused = 6
majority_df = majority_df.iloc[:, :-n_trailing_unused]
majority_df = majority_df.reset_index(drop=True)
majority_df.shape

(150, 23)

In [75]:
# majority_df.to_excel("data/ground-truth_cleaned.xlsx")

In [76]:
# Load the per-creative sample file (CSV) into a DataFrame.
video_csv_path = "data/sample.csv"
video_df = pd.read_csv(video_csv_path)
video_df.shape

(150, 9)

In [77]:
# Preview the first two rows to see which columns the CSV provides.
video_df.head(2)

Unnamed: 0,creative_data_id,creative_data_title,creative_data_description,creative_data_duration,creative_data_lifetime_spend_estimated,creative_data_lifetime_airings_count,creative_data_airing_date_first_et,creative_data_airing_date_last_et,speech
0,2194673,30s Kim's Discount - 2194673,Kim is going for the State Farm Drive Safe & S...,30,29789808.73,13949,2019-04-06T22:19:06-04:00,2020-08-04T18:42:50-04:00,"So Kim, you going for a big drive safe and sav..."
1,2142915,30s New Flat - 2142915,Uncomfortable with her shabby apartment and ro...,30,5423001.7,10132,2019-03-04T06:49:02-05:00,2021-08-03T11:12:36-04:00,Check your credit scores for free and learn ho...


In [78]:
# Keep only the creative id plus the two text fields (description and speech).
text_columns = ['creative_data_id', 'creative_data_description', 'speech']
video_df = video_df.loc[:, text_columns].reset_index(drop=True)
video_df.shape

(150, 3)

In [102]:
# Join the majority-voted ground truth with the video text on creative_data_id.
# This is an inner join (the pandas default, spelled out here), so only ids
# present in both frames are kept.
merged_df = majority_df.merge(video_df, how='inner', on='creative_data_id')
merged_df.shape


(150, 25)

In [103]:
# Count missing values per column of the merged frame.
merged_df.isna().sum()

creative_data_id                                                                                                                                                                                                                                                                                                                 0
Timestamp                                                                                                                                                                                                                                                                                                                        0
Is there a call to go online (e.g., shop online, visit the Web)?                                                                                                                                                                                                                                                                 0
Is there online contact informa

In [104]:
# Replace the in-memory merge result with the contents of data/final_data.xlsx.
# NOTE(review): no cell shown in this notebook writes that file, so a fresh
# "Restart & Run All" depends on it already existing on disk -- confirm how it
# is produced and whether it matches the merge above.
merged_df = pd.read_excel("data/final_data.xlsx")

In [105]:
# Question columns are selected by position: everything except the first two
# and the last two columns of merged_df.
# NOTE(review): presumably the skipped columns are the id/timestamp and the
# description/speech text -- confirm against data/final_data.xlsx.
yes_no_columns = merged_df.columns[2:-2]

yes_no_encoding = {'yes': 1, 'no': 0}
for col in yes_no_columns:
    # Columns that are already numeric were encoded by a previous run of this
    # cell; skipping them makes the cell safe to re-run instead of failing on
    # the `.str` accessor.
    if pd.api.types.is_numeric_dtype(merged_df[col]):
        continue
    # Normalise surrounding whitespace and case before mapping. Any value
    # other than 'yes'/'no' becomes NaN.
    merged_df[col] = merged_df[col].str.strip().str.lower().map(yes_no_encoding)

In [132]:
# Build the long-form dataset: one record per (ad, question) pair, emitted ad
# by ad with the questions in yes_no_columns order.
long_form_data = [
    {
        'ID': ad['creative_data_id'],
        'speech': ad['speech'],
        'description': ad['creative_data_description'],
        'question': question_text,
        'label': ad[question_text],
    }
    for _, ad in merged_df.iterrows()
    for question_text in yes_no_columns
]

long_form_df = pd.DataFrame(long_form_data)

In [133]:
# Count missing values per column; a non-zero 'label' count would mean some
# answer was not mapped to 0/1.
long_form_df.isna().sum()

ID             0
speech         0
description    0
question       0
label          0
dtype: int64

In [121]:
# Spot check: list every label stored for one hard-coded creative (ID 1739116),
# in long-form row order.
long_form_df[long_form_df['ID']==1739116]['label'].to_list()

[1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0]

In [122]:
# Spot check: show the speech text of the first long-form row for creative
# ID 1739116 (hard-coded).
long_form_df[long_form_df['ID']==1739116]['speech'].to_list()[0]

"Introducing slim pants. The number one weight loss supplement at GNC with slim bands. I lost £19. I gained a size six body in eight weeks. Natural herbal ingredients. I've lost four dress sizes, no stimulants, no jitters. I've gained a healthier lifestyle that now my kids can see. Slim Vance is now available at your local G and C store or text slim 30 to 2468, 10 to get a complimentary 14 day supply of slim vs before they're gone."

In [136]:
# Preview the first 28 long-form rows.
# NOTE(review): 28 looks chosen to run past the first ad's block of questions
# into the next ad -- confirm, or replace with a named constant.
long_form_df.head(28)

Unnamed: 0,ID,speech,description,question,label
0,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...","Is there a call to go online (e.g., shop onlin...",0
1,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Is there online contact information provided (...,1
2,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Is there a visual or verbal call to purchase (...,0
3,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Does the ad portray a sense of urgency to act ...,0
4,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...","Is there an incentive to buy (e.g., a discount...",0
5,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Is there offline contact information provided ...,0
6,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Is there mention of something free?,0
7,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Does the ad mention at least one specific prod...,1
8,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...",Is there any verbal or visual mention of the p...,0
9,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...","Does the ad show the brand (logo, brand name) ...",1


In [137]:
# Collapse the long-form rows back to one row per ID.
per_id_aggregations = {
    "speech": "first",       # first non-null speech seen for the ID
    "description": "first",  # first non-null description seen for the ID
    "label": list,           # all labels for the ID, in row order
}

grouped_df = (
    long_form_df
    .groupby("ID")
    .agg(per_id_aggregations)
    .reset_index()
    # Final column names: ID, Speech, description, Labels
    .rename(columns={"speech": "Speech", "label": "Labels"})
)


In [138]:
# Preview the per-ID frame: one row per ID with its list of labels.
grouped_df.head()

Unnamed: 0,ID,Speech,description,Labels
0,1471363,It's another pure gray morning. Don't know wha...,"The new MINI Countryman is the largest yet, pr...","[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, ..."
1,1488315,The end of civilization is upon us. Hold your ...,What would you do if the end of the world was ...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, ..."
2,1526213,Audi presens can help prepare for and in some ...,As a man speeds down a country road in his Aud...,"[0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, ..."
3,1548815,The new Honda Odyssey has tons of available sm...,"On an otherwise peaceful day, two giant monste...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, ..."
4,1624211,Hi guys. So this is the all new Chevy Equinox....,Chevy's spokesperson lists off all the feature...,"[0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, ..."


In [139]:
# Row count here equals the number of distinct IDs in long_form_df.
grouped_df.shape

(150, 4)

In [140]:
# Write the per-ID frame to disk; the DataFrame index is written as the first
# column (pandas default, spelled out here).
# NOTE(review): the file name says "long_form" but the frame written is the
# grouped, one-row-per-ID version -- confirm the name is intended.
grouped_output_path = "data/long_form_data1.xlsx"
grouped_df.to_excel(grouped_output_path, index=True)