In [2]:
import pickle
import pandas as pd

# Deserialize the metadata pickle so its structure can be inspected below.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open("data/meta.pkl", mode="rb") as meta_file:
    meta_data = pickle.load(meta_file)

# Print detailed structure analysis
def analyze_structure(data, prefix=""):
    """Print a one-branch summary of a nested dict/list structure.

    For a dict, prints its size and up to five keys, then recurses into the
    value stored under the first key. For a list, prints its length, then
    recurses into the first item. Anything else is reported by its type.

    Args:
        data: The object to describe.
        prefix: Text prepended to every printed line; each nesting level
            appends two spaces to it.
    """
    nested_prefix = prefix + "  "

    if isinstance(data, dict):
        keys = list(data)
        print(f"{prefix}Dictionary with {len(keys)} keys: {keys[:5]}")
        if not keys:
            return
        sample_key = keys[0]
        print(f"{prefix}Sample of '{sample_key}':")
        analyze_structure(data[sample_key], nested_prefix)
        return

    if isinstance(data, list):
        print(f"{prefix}List with {len(data)} items")
        if data:
            print(f"{prefix}First item:")
            analyze_structure(data[0], nested_prefix)
        return

    # Leaf: neither dict nor list, so only the type is reported
    print(f"{prefix}Value: {type(data)}")

# Summarise the loaded metadata: follows the first key / first item at each level
analyze_structure(meta_data)

Dictionary with 23086 keys: ['c6ierudrvo_0', 'c6ierudrvo_1', 'c6ierudrvo_2', 'c6ierudrvo_3', 'c6ierudrvo_5']
Sample of 'c6ierudrvo_0':
  Dictionary with 2 keys: ['text', 'label']
  Sample of 'text':
    Value: <class 'str'>


In [1]:
import pandas as pd

In [38]:
import pickle
import pandas as pd

# Load the metadata pickle that holds the utterance text.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
meta_path = "data/meta_and_splits/meta.pkl"
with open(meta_path, mode="rb") as meta_file:
    meta_data = pickle.load(meta_file)

In [None]:


def _read_split_ids(path):
    """Return the whitespace-stripped, non-empty lines of a split file."""
    with open(path, "r") as f:
        return [line.strip() for line in f if line.strip()]


# Split name -> text file listing the utterance IDs belonging to that split
split_files = {
    "train": "data/train_split_EN.txt",
    "valid": "data/valid_split_EN.txt",
    "test": "data/test_split_EN.txt",
}

# Read utterance IDs from each split file
train_ids = _read_split_ids(split_files["train"])
valid_ids = _read_split_ids(split_files["valid"])
test_ids = _read_split_ids(split_files["test"])

# One record per utterance ID that is present in meta_data, tagged with its
# split. IDs missing from meta_data are skipped. Order is train, valid, test.
split_ids = {"train": train_ids, "valid": valid_ids, "test": test_ids}
utterance_data = [
    {
        "utterance_id": utt_id,
        "text": meta_data[utt_id]["text"],
        "split": split_name,
    }
    for split_name, ids in split_ids.items()
    for utt_id in ids
    if utt_id in meta_data
]

# Convert to DataFrame. Explicit columns keep the schema stable even when no
# ID matched; otherwise the frame has no 'split' column and the stats below
# would raise KeyError.
df = pd.DataFrame(utterance_data, columns=["utterance_id", "text", "split"])

# Save as CSV
df.to_csv("samsemo_utterances.csv", index=False)

# Print some stats
split_counts = df["split"].value_counts()
print(f"Total utterances: {len(df)}")
for split_name in split_ids:
    print(f"{split_name.capitalize()} utterances: {split_counts.get(split_name, 0)}")

Total utterances: 3898
Train utterances: 2728
Valid utterances: 585
Test utterances: 585


In [44]:
# Dump the emotion label stored for every utterance in meta.pkl (entries
# without a "label" key are skipped)
print("\nEmotion labels in meta.pkl:")
for utt_id, entry in meta_data.items():
    if "label" not in entry:
        continue
    print(f"{utt_id}: {entry['label']}")




Emotion labels in meta.pkl:
c6ierudrvo_0: [0, 1, 0, 0, 0]
c6ierudrvo_1: [0, 0, 0, 0, 1]
c6ierudrvo_2: [0, 0, 0, 0, 1]
c6ierudrvo_3: [0, 0, 0, 0, 1]
c6ierudrvo_5: [0, 0, 0, 0, 1]
okowm8pfh6_0: [0, 0, 0, 0, 1]
okowm8pfh6_1: [1, 0, 0, 0, 0]
okowm8pfh6_2: [1, 0, 0, 0, 0]
okowm8pfh6_3: [0, 0, 0, 0, 1]
okowm8pfh6_4: [0, 0, 0, 0, 1]
okowm8pfh6_5: [0, 0, 0, 0, 1]
okowm8pfh6_6: [1, 0, 0, 0, 0]
okowm8pfh6_7: [1, 0, 0, 0, 0]
okowm8pfh6_8: [1, 0, 0, 0, 0]
okowm8pfh6_9: [1, 0, 0, 0, 0]
okowm8pfh6_10: [0, 1, 0, 0, 0]
okowm8pfh6_11: [0, 0, 0, 0, 1]
okowm8pfh6_12: [0, 0, 0, 0, 1]
okowm8pfh6_13: [1, 0, 0, 0, 0]
okowm8pfh6_14: [1, 0, 0, 0, 0]
okowm8pfh6_15: [0, 0, 0, 0, 0]
okowm8pfh6_16: [1, 0, 0, 0, 0]
okowm8pfh6_17: [1, 0, 0, 0, 0]
okowm8pfh6_18: [1, 0, 0, 0, 0]
okowm8pfh6_19: [1, 0, 0, 0, 0]
okowm8pfh6_20: [1, 0, 0, 0, 0]
okowm8pfh6_21: [1, 0, 0, 0, 0]
okowm8pfh6_22: [0, 0, 0, 0, 0]
okowm8pfh6_23: [0, 0, 0, 0, 0]
okowm8pfh6_24: [0, 0, 0, 0, 0]
okowm8pfh6_25: [1, 0, 0, 0, 0]
okowm8pfh6_26: [1, 0, 0, 

# Data Exploration for samsemo.tsv

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

In [19]:
# Read the TSV file
file_path = 'data/samsemo.tsv'
try:
    # The file is read as UTF-16; the codec uses the BOM to pick the byte order
    tsv_data = pd.read_csv(file_path,
                           sep='\t',
                           encoding='utf-16',  # Use UTF-16 encoding
                           low_memory=False)
except Exception as e:
    # Report the cause, then re-raise: swallowing the error would leave
    # `tsv_data` undefined and make every later cell fail with an unrelated
    # NameError instead of pointing at the real problem.
    print(f"UTF-16 approach failed: {e}")
    raise
else:
    print(f"Successfully read file with {len(tsv_data)} rows and {len(tsv_data.columns)} columns")

Successfully read file with 23086 rows and 32 columns


In [23]:
# Sanity check: preview the first two parsed rows
print("\nFirst 2 rows:", tsv_data.head(2), sep="\n")


First 2 rows:
   utterance_id                                        movie_title  \
0  04ftfyfznd_0  Das ABC des Freien Wissens - S=Sprachgewalt. I...   
1  04ftfyfznd_1  Das ABC des Freien Wissens - S=Sprachgewalt. I...   

                                          movie_link source_scene_start  \
0  https://upload.wikimedia.org/wikipedia/commons...        00:00:15.00   
1  https://upload.wikimedia.org/wikipedia/commons...        00:01:27.00   

  source_scene_stop language   sex    age   race covered_face  ...  \
0       00:00:27.99       DE  male  adult  white           no  ...   
1       00:01:34.99       DE  male  adult  white           no  ...   

                                      translation_de  \
0  Ein grosser deutscher dichter  rio reiser hat ...   
1  Nicht dass dort leute, die ganz viel hass empf...   

                                      translation_en  \
0  A big German denser once said the dream is ove...   
1  It is not that people who feel a great deal of...   


In [35]:
# Confirm the header row was parsed into the expected column names
print("\nColumn names:", tsv_data.columns.tolist(), sep="\n")
# Count missing values per column and report only columns that have any
null_counts = tsv_data.isna().sum()
print("\nNull values in each column:", null_counts[null_counts > 0], sep="\n")


Column names:
['utterance_id', 'movie_title', 'movie_link', 'source_scene_start', 'source_scene_stop', 'language', 'sex', 'age', 'race', 'covered_face', 'multiple_faces', 'emotion_1_annotator_1', 'emotion_2_annotator_1', 'emotion_1_annotator_2', 'emotion_2_annotator_2', 'emotion_1_annotator_3', 'emotion_2_annotator_3', 'aggregated_emotions', 'annotator_1', 'annotator_2', 'annotator_3', 'transcript', 'translation_de', 'translation_en', 'translation_es', 'translation_ko', 'translation_pl', 'duration', 'movie_type', 'license', 'author', 'download_date']

Null values in each column:
emotion_2_annotator_1    20539
emotion_2_annotator_2    22099
emotion_2_annotator_3    21683
aggregated_emotions       3076
movie_type                   4
author                       6
dtype: int64


In [27]:
# Show the rows whose aggregated_emotions is missing, restricted to the
# identifying columns and the per-annotator emotion columns
print("\nRows with null aggregated_emotions:")
annotation_cols = [
    'utterance_id', 'language', 'transcript',
    'emotion_1_annotator_1', 'emotion_2_annotator_1',
    'emotion_1_annotator_2', 'emotion_2_annotator_2',
    'emotion_1_annotator_3', 'emotion_2_annotator_3',
    'aggregated_emotions',
]
tsv_data.loc[tsv_data['aggregated_emotions'].isna(), annotation_cols]



Rows with null aggregated_emotions:


Unnamed: 0,utterance_id,language,transcript,emotion_1_annotator_1,emotion_2_annotator_1,emotion_1_annotator_2,emotion_2_annotator_2,emotion_1_annotator_3,emotion_2_annotator_3,aggregated_emotions
1,04ftfyfznd_1,DE,"Nicht dass dort leute, die ganz viel hass empf...",other emotions,,neutral,,disgust,,
17,0gatc9ixh4_12,DE,Sind zu haben,anger,surprise,other emotions,,disgust,,
20,0gatc9ixh4_15,DE,Ich habe mir hier in berlin diese zweite wohnu...,neutral,,other emotions,,surprise,disgust,
22,0gatc9ixh4_17,DE,"Ja, ich habe fotos vielleicht möchten sie mal...",fear,,surprise,,other emotions,,
34,0gatc9ixh4_3,DE,Ist doch schon so lange her bei dir,neutral,,sadness,,other emotions,,
...,...,...,...,...,...,...,...,...,...,...
22999,zr0ochjm4x_15,PL,Fakt że istnieje też broń psychotroniczna któr...,surprise,,neutral,,other emotions,,
23001,zr0ochjm4x_17,PL,"Łał, jajka są niezdrowe, pogadam o tym z tym k...",sadness,,neutral,,other emotions,,
23013,zr0ochjm4x_9,PL,"W ten oto sposób dochodzę do przekonania, że l...",fear,,other emotions,,neutral,,
23024,zuxp6odfgo_12,PL,Nie nie jest to wybieg marketingowy,happiness,,other emotions,,neutral,,


In [28]:
# Subset of the TSV columns carried forward into the filtered dataset
columns_to_keep = [
    'utterance_id',
    'movie_title',
    'language',
    'sex',
    'age',
    'race',
    'covered_face',
    'multiple_faces',
    'aggregated_emotions',
    'transcript',
    'movie_type',
    'duration',
]

In [29]:
# Keep only the selected columns, and only rows that have an aggregated
# emotion label
has_aggregated_label = tsv_data['aggregated_emotions'].notna()
filtered_df = tsv_data.loc[has_aggregated_label, columns_to_keep]
# Report the resulting (rows, columns)
print(f"\nFiltered DataFrame shape: {filtered_df.shape}")


Filtered DataFrame shape: (20010, 12)


In [45]:
# Restrict the filtered data to rows whose language code is 'EN'
is_english = filtered_df['language'].eq('EN')
english_df = filtered_df[is_english]
# Report the resulting (rows, columns)
print(f"\nEnglish DataFrame shape: {english_df.shape}")


English DataFrame shape: (5870, 12)


In [33]:
# Join the English rows with samsemo_utterances.csv on utterance_id so each
# row also carries the `text` and `split` columns from that CSV; the inner
# join keeps only utterances present in both tables.
utterances_df = pd.read_csv("samsemo_utterances.csv")
print(f"\nUtterances DataFrame shape: {utterances_df.shape}")

merged_df = english_df.merge(utterances_df, how='inner', on='utterance_id')
print(f"\nMerged DataFrame shape: {merged_df.shape}")

# Persist the merged table
merged_df.to_csv("samsemo_utterances_with_text.csv", index=False)

# Preview the first rows and list the resulting columns
print("\nMerged DataFrame preview:", merged_df.head(10), sep="\n")
print("\nMerged DataFrame columns:", merged_df.columns.tolist(), sep="\n")



Utterances DataFrame shape: (3898, 3)

Merged DataFrame shape: (3898, 14)

Merged DataFrame preview:
    utterance_id                                        movie_title language  \
0   024mu3zivb_0  116th United States Congress House Floor - 201...       EN   
1   024mu3zivb_1  116th United States Congress House Floor - 201...       EN   
2  024mu3zivb_10  116th United States Congress House Floor - 201...       EN   
3  024mu3zivb_11  116th United States Congress House Floor - 201...       EN   
4  024mu3zivb_16  116th United States Congress House Floor - 201...       EN   
5  024mu3zivb_17  116th United States Congress House Floor - 201...       EN   
6  024mu3zivb_18  116th United States Congress House Floor - 201...       EN   
7  024mu3zivb_19  116th United States Congress House Floor - 201...       EN   
8   024mu3zivb_2  116th United States Congress House Floor - 201...       EN   
9  024mu3zivb_20  116th United States Congress House Floor - 201...       EN   

      sex      ag

In [34]:
# Drop `text`, keeping `transcript` as the single text column.
# NOTE(review): the two are presumed identical -- this notebook never checks it.
# errors='ignore' keeps the cell idempotent: re-running it after `text` has
# already been dropped no longer raises KeyError.
merged_df = merged_df.drop(columns=['text'], errors='ignore')
# Overwrite the previously saved CSV with the final column set
merged_df.to_csv("samsemo_utterances_with_text.csv", index=False)