
### This notebook merges the test-set results of all models (baseline, zero-shot and fine-tuned) into one dataframe and exports it as a CSV file (which can be opened in Excel)

In [0]:
import pandas as pd
import numpy as np

In [0]:
# Read the baseline results table and convert it to a pandas DataFrame in one step.
results_baseline = (
    spark.read
    .table("dev_data_science.mpst_dataset.results_baseline")
    .toPandas()
)
results_baseline.head()

Unnamed: 0,PROGRAM_ID,TITLE,Generated_Tags,Actual_Tags,Matching_Tags_Count
0,tt0033045,The Shop Around the Corner,"[comedy, romantic, entertaining, flashback]",[romantic],1
1,tt1937113,Call of Duty: Modern Warfare 3,"[violence, flashback, murder, action, suspense...",[good versus evil],0
2,tt0102007,The Haunted,"[paranormal, horror]","[haunting, horror, paranormal]",2
3,tt2005374,The Frozen Ground,"[murder, violence]","[dramatic, murder]",1
4,tt1411238,No Strings Attached,"[romantic, pornographic]","[adult comedy, boring, cute, entertaining, fla...",1


In [0]:
# Read the pretrained-model results table and convert it to a pandas DataFrame in one step.
results_pretrained = (
    spark.read
    .table("dev_data_science.mpst_dataset.results_pretrained2")
    .toPandas()
)
results_pretrained.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,award_label,tag_count,Generated_Tags
0,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[romantic],test,imdb,No award,1,"[romantic, comedy, dramatic, mystery, sentimen..."
1,tt1937113,Call of Duty: Modern Warfare 3,Hours after the end of the previous game and t...,[good versus evil],test,imdb,No award,1,"[action, suspenseful, dramatic, revenge, viole..."
2,tt0102007,The Haunted,This creepy and scary story centers around The...,"[paranormal, horror, haunting]",test,imdb,No award,3,"[horror, paranormal, dark, haunting, mystery, ..."
3,tt2005374,The Frozen Ground,The film opens in an Anchorage motel room in 1...,"[dramatic, murder]",test,wikipedia,No award,2,"[suspenseful, dark, murder, psychological, mys..."
4,tt1411238,No Strings Attached,"15 years agoWe see two young kids, named Emma ...","[boring, adult comedy, cute, flashback, romant...",test,imdb,No award,6,"[comedy, romantic, adult comedy, feel-good, dr..."


In [0]:
# Read the fine-tuned-model results table and convert it to a pandas DataFrame in one step.
results_finetuned = (
    spark.read
    .table("dev_data_science.mpst_dataset.results_finetuned")
    .toPandas()
)
results_finetuned.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,award_label,tag_count,Generated_Tags
0,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",[romantic],test,imdb,No award,1,[romantic]
1,tt1937113,Call of Duty: Modern Warfare 3,Hours after the end of the previous game and t...,[good versus evil],test,imdb,No award,1,"[action, violence, revenge, suspenseful, dark]"
2,tt0102007,The Haunted,This creepy and scary story centers around The...,"[paranormal, horror, haunting]",test,imdb,No award,3,[horror]
3,tt2005374,The Frozen Ground,The film opens in an Anchorage motel room in 1...,"[dramatic, murder]",test,wikipedia,No award,2,"[murder, suspenseful, psychological, dark, vio..."
4,tt1411238,No Strings Attached,"15 years agoWe see two young kids, named Emma ...","[boring, adult comedy, cute, flashback, romant...",test,imdb,No award,6,[comedy]



Align the baseline column names with the other result tables and drop the columns that are not needed:

In [0]:
# Rename first, then select by the aligned names. rename() ignores labels that
# are not present (its default), so re-running this cell on the already-aligned
# frame is a no-op instead of raising KeyError on the old column names.
results_baseline = results_baseline.rename(columns={
    "PROGRAM_ID": "imdb_id",
    "TITLE": "title",
    "Actual_Tags": "tags",
})[['imdb_id', 'title', 'tags', 'Generated_Tags']]

In [0]:
# Display the baseline frame after the column selection and renaming above.
results_baseline

Unnamed: 0,imdb_id,title,tags,Generated_Tags
0,tt0033045,The Shop Around the Corner,[romantic],"[comedy, romantic, entertaining, flashback]"
1,tt1937113,Call of Duty: Modern Warfare 3,[good versus evil],"[violence, flashback, murder, action, suspense..."
2,tt0102007,The Haunted,"[haunting, horror, paranormal]","[paranormal, horror]"
3,tt2005374,The Frozen Ground,"[dramatic, murder]","[murder, violence]"
4,tt1411238,No Strings Attached,"[adult comedy, boring, cute, entertaining, fla...","[romantic, pornographic]"
...,...,...,...,...
2891,tt0105585,Thunderheart,"[flashback, murder, suspenseful]",[murder]
2892,tt1869716,The East,"[neo noir, revenge, sentimental, suspenseful, ...","[murder, flashback, comedy, suspenseful, psych..."
2893,tt0219952,Lucky Numbers,"[comedy, murder]","[comedy, cult]"
2894,tt0039464,High Wall,[murder],"[murder, flashback]"


In [0]:
import random

In [0]:
# Join the three result tables on imdb_id and give every tag column a name
# that says where it came from. The baseline/pretrained 'Generated_Tags'
# columns collide in the first merge and are told apart by the suffixes; the
# fine-tuned 'Generated_Tags' arrives unsuffixed in the second merge.
merged_df = (
    results_baseline
    .merge(
        results_pretrained[['imdb_id', 'plot_synopsis', 'Generated_Tags']],
        on='imdb_id',
        suffixes=('_baseline', '_zero_shot'),
    )
    .merge(
        results_finetuned[['imdb_id', 'Generated_Tags']],
        on='imdb_id',
    )
    .rename(columns={
        'tags': 'Ground_Truth_Tags',
        'Generated_Tags_baseline': 'Baseline_Tags',
        'Generated_Tags_zero_shot': 'ZeroShot_Tags',
        'Generated_Tags': 'Finetuned_Tags'
    })
)

#shuffle tag sets per row
def shuffle_tag_sets(row):
    """Return the row's four tag sets in random order.

    Reads the ground-truth, baseline, zero-shot and fine-tuned tag columns
    from ``row``, shuffles them with the module-level ``random`` generator and
    returns them as a Series labelled ``set_1`` .. ``set_4``.
    """
    source_columns = ('Ground_Truth_Tags', 'Baseline_Tags', 'ZeroShot_Tags', 'Finetuned_Tags')
    slot_labels = ['set_1', 'set_2', 'set_3', 'set_4']

    shuffled = [row[column] for column in source_columns]
    random.shuffle(shuffled)  # in-place shuffle of the four tag sets

    return pd.Series(shuffled, index=slot_labels)

#apply the shuffle to each row
# shuffle_tag_sets draws from the module-level `random` generator; seed it
# here so that re-running the notebook produces the same slot assignment
# (and therefore the same exported experiment file and source_map).
random.seed(42)
merged_df[['set_1', 'set_2', 'set_3', 'set_4']] = merged_df.apply(shuffle_tag_sets, axis=1)

#source mapping per row
def create_source_map(row):
    """Map each shuffled slot (set_1 .. set_4) back to the column it came from.

    Every source column is assigned to at most one slot. Without that, a row
    in which two sources hold the same tags (e.g. ground truth and fine-tuned
    both ``[romantic]``) would label both slots with the first matching
    source and lose the other one.

    Matching runs in two passes over the slots:
      1. a slot is paired with an unused source whose tags are equal in the
         same order;
      2. any slot still unpaired falls back to order-insensitive set equality.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must be indexable by the four source column names and the four slot
        names; each value is an iterable of tags.

    Returns
    -------
    dict
        ``{slot_name: source_column_name}`` with keys in set_1 .. set_4
        order. A slot that matches no unused source is omitted.
    """
    source_names = ['Ground_Truth_Tags', 'Baseline_Tags', 'ZeroShot_Tags', 'Finetuned_Tags']
    slot_names = ['set_1', 'set_2', 'set_3', 'set_4']

    # Materialise as lists so the comparison is element-wise list equality
    # regardless of the container type stored in the row.
    source_tags = {name: list(row[name]) for name in source_names}
    slot_tags = {slot: list(row[slot]) for slot in slot_names}

    unused = list(source_names)
    assigned = {}

    # Pass 1: exact (order-sensitive) matches.
    for slot in slot_names:
        for name in unused:
            if slot_tags[slot] == source_tags[name]:
                assigned[slot] = name
                unused.remove(name)  # safe: we break right after mutating
                break

    # Pass 2: order-insensitive matches for whatever is left.
    for slot in slot_names:
        if slot in assigned:
            continue
        for name in unused:
            if set(slot_tags[slot]) == set(source_tags[name]):
                assigned[slot] = name
                unused.remove(name)
                break

    # Emit keys in slot order, as the per-slot loop always did.
    return {slot: assigned[slot] for slot in slot_names if slot in assigned}

# Record, for every row, which source column each shuffled slot came from.
merged_df['source_map'] = merged_df.apply(create_source_map, axis=1)

# Keep only the identifying fields, the synopsis, the four shuffled slots and
# the slot -> source lookup.
experiment_columns = [
    'imdb_id',
    'title',
    'plot_synopsis',
    'set_1',
    'set_2',
    'set_3',
    'set_4',
    'source_map',
]
experiment_df = merged_df[experiment_columns]

experiment_df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,set_1,set_2,set_3,set_4,source_map
0,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...","[comedy, romantic, entertaining, flashback]",[romantic],[romantic],"[romantic, comedy, dramatic, mystery, sentimen...","{'set_1': 'Baseline_Tags', 'set_2': 'Ground_Tr..."
1,tt1937113,Call of Duty: Modern Warfare 3,Hours after the end of the previous game and t...,"[action, violence, revenge, suspenseful, dark]",[good versus evil],"[action, suspenseful, dramatic, revenge, viole...","[violence, flashback, murder, action, suspense...","{'set_1': 'Finetuned_Tags', 'set_2': 'Ground_T..."
2,tt0102007,The Haunted,This creepy and scary story centers around The...,"[haunting, horror, paranormal]","[horror, paranormal, dark, haunting, mystery, ...",[horror],"[paranormal, horror]","{'set_1': 'Ground_Truth_Tags', 'set_2': 'ZeroS..."
3,tt2005374,The Frozen Ground,The film opens in an Anchorage motel room in 1...,"[suspenseful, dark, murder, psychological, mys...","[dramatic, murder]","[murder, violence]","[murder, suspenseful, psychological, dark, vio...","{'set_1': 'ZeroShot_Tags', 'set_2': 'Ground_Tr..."
4,tt1411238,No Strings Attached,"15 years agoWe see two young kids, named Emma ...",[comedy],"[romantic, pornographic]","[comedy, romantic, adult comedy, feel-good, dr...","[adult comedy, boring, cute, entertaining, fla...","{'set_1': 'Finetuned_Tags', 'set_2': 'Baseline..."


In [0]:
# Write the experiment table to a CSV file in the working directory, without the index column.
experiment_df.to_csv("filtered_human_validation_experiment_MPST.csv", index=False)