### Display dataset metadata

In [1]:
import os
import pandas as pd
from collections import defaultdict

# Specify the directory where your CSV files are located
directory = '/results/twoertwe/meta/'

# Create an empty DataFrame to store column information
column_info = pd.DataFrame(columns=['File', 'Columns', 'FeatNum', 'Unique Y Values'])

# Create a dictionary to group files by their prefixes
file_groups = defaultdict(list)

# Iterate through files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        file_path = os.path.join(directory, filename)
        
        prefix = filename.split('_')[0]
        
        # Add the file to the corresponding group
        file_groups[prefix].append(file_path)

# Iterate through each group and analyze one CSV file from each group
for prefix, files in file_groups.items():
    # Take the first file from the group (you can modify this logic if needed)
    file_path = files[0]
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # display(df.head())
    # Get the column names
    columns = df.columns.tolist()
    
    unique_y_values = df['y'].unique().tolist()
    
    # Add file and column information to the DataFrame
    column_info = pd.concat([column_info, pd.DataFrame({'File': [file_path.split('_')[0]], 'Columns': [columns], 'FeatNum': [len(columns)], 'Unique Y Values':[unique_y_values]})], ignore_index=True)

grouped_info_data = []

# Iterate through each row in column_info
for _, row in column_info.iterrows():
    file = row['File']
    columns = row['Columns']
    
    # Create a dictionary to store the grouped columns
    grouped_columns = {}
    
    # Iterate through each column and group by prefix
    for col in columns:
        prefix = col.split('_')[0]
        if prefix in grouped_columns:
            grouped_columns[prefix].append(col)
        else:
            grouped_columns[prefix] = [col]
            
    print(grouped_columns.items())
    
    # Add the grouped columns and prefix to the list
    grouped_info_data.append({'File': file, 'GroupedColumns': list(zip(grouped_columns.keys(), [len(i) for i in grouped_columns.values()]))})

grouped_info = pd.DataFrame(grouped_info_data)
merged_info = pd.merge(column_info, grouped_info, on='File')

display(merged_info)
merged_info.to_csv('metadata.csv', index=False)


KeyboardInterrupt: 

In [None]:
import pandas as pd


def process_row(row):
    # Initialize an empty dictionary to store the processed data for the row
    processed_data = {}
    
    # Iterate through each cell in the row
    for cell in row:
        if cell:  # Check if the cell is not empty
            column_name = cell[0]  # The first element is the column name
            column_value = cell[1]  # Combine the other elements
            processed_data[column_name] = column_value
    
    return processed_data

# Assuming 'df' is your DataFrame and 'col' is the column name
# Apply the function to each row
processed_rows = merged_info.GroupedColumns.apply(process_row)

# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(processed_rows.tolist())

# If necessary, you can fill NaN values with an appropriate value or method
result_df = result_df.fillna(0).astype(int)
print(result_df)

merged_result = merged_info.merge(
result_df.fillna(0), left_index=True, right_index=True)

display(merged_result)
# List of columns to remove
columns_to_remove = ['Columns', 'FeatNum','GroupedColumns']

# Remove the specified columns
merged_result = merged_result.drop(columns=columns_to_remove)


merged_result.to_csv('metadata.csv', index=False)



   y  meta  acoustic  language  vision  ecg  eda  mocap
0  1     4       140       457     125    0    0      0
1  1     3       140       457     125    0    0      0
2  1     8       140       457     125    0    0      0
3  1     9        52       457     125    0    0      0
4  1     3       140         0     125   54   62      0
5  1    19       140       457     125    0    0    330
6  1     1         0         0      49   18    8      0
7  1     3       140       457     125    0    0      0


Unnamed: 0,File,Columns,FeatNum,Unique Y Values,GroupedColumns,y,meta,acoustic,language,vision,ecg,eda,mocap
0,/results/twoertwe/meta/mosi,"[y, meta_clip, meta_begin, meta_end, meta_id, ...",727,"[-2.8, -2.6, -0.8, 1.6, -2.2, -3.0, -0.4, 0.8,...","[(y, 1), (meta, 4), (acoustic, 140), (language...",1,4,140,457,125,0,0,0
1,/results/twoertwe/meta/sewa,"[y, meta_begin, meta_end, meta_id, acoustic_op...",726,"[0.0055897776, 0.41614386, 0.3779384, 0.350838...","[(y, 1), (meta, 3), (acoustic, 140), (language...",1,3,140,457,125,0,0,0
2,/results/twoertwe/meta/tpot,"[y, meta_Evidence, meta_Visual, meta_Language,...",731,"[0, 3, 2, 1]","[(y, 1), (meta, 8), (acoustic, 140), (language...",1,8,140,457,125,0,0,0
3,/results/twoertwe/meta/umeme,"[y, meta_arousal_audio, meta_valence_audio, me...",644,"[4.7, 2.55, 3.06, 5.31, 3.31, 2.85, 3.5, 4.23,...","[(y, 1), (meta, 9), (acoustic, 52), (language,...",1,9,52,457,125,0,0,0
4,/results/twoertwe/meta/recola,"[y, meta_begin, meta_end, meta_id, acoustic_op...",385,"[0.024066666, -0.0049866666, 0.060946666, 0.10...","[(y, 1), (meta, 3), (acoustic, 140), (ecg, 54)...",1,3,140,0,125,54,62,0
5,/results/twoertwe/meta/iemocap,"[y, meta_begin, meta_end, meta_arousal_T, meta...",1072,"[2.0, 3.0, 3.5, 4.0, 2.5, 4.5, 1.5, 1.6667, 2....","[(y, 1), (meta, 19), (acoustic, 140), (languag...",1,19,140,457,125,0,0,330
6,/results/twoertwe/meta/vreed,"[y, meta_id, ecg_Bpm, ecg_HF, ecg_Ibi, ecg_LF,...",77,"[1, 0, 3, 2]","[(y, 1), (meta, 1), (ecg, 18), (eda, 8), (visi...",1,1,0,0,49,18,8,0
7,/results/twoertwe/meta/mosei,"[y, meta_begin, meta_end, meta_id, acoustic_op...",726,"[0.0, 1.6666666, 1.3333334, 0.33333334, 0.6666...","[(y, 1), (meta, 3), (acoustic, 140), (language...",1,3,140,457,125,0,0,0


In [1]:

# from dataset import *

# from torch.utils.data import DataLoader
# from dataset import *
# from mmidataset import custom_collate_fn


# dataset_rootdir = '/results/twoertwe/meta/'  # Path to your dataset directory

# batch_size = 32  # Set the batch size
# dataset_name = 'sewa_valence'
# data_type = 'test'

# non_text_features = DATASET_MODALITY[dataset_name]

# # Create an instance of MMIDataset
# mmi_dataset = MMIDataset(data_type=data_type, dataset_name=dataset_name, 
#                         dataset_rootdir=dataset_rootdir, feature_list=non_text_features)

# data_loader = DataLoader(mmi_dataset, batch_size=batch_size, collate_fn = custom_collate_fn)


from tests.test_dataset_split import *
test_dataset_train_val_test_overlap()

Loading Dataset recola_arousal, size: 660

Loading Dataset recola_arousal, size: 180

Loading Dataset recola_arousal, size: 240



KeyError: 'language'

### Feature Extraction

In [None]:
import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import pickle
import os

def video_processor(video_path):
    # Load pretrained ResNet-152 model
    model = models.resnet152(pretrained=True)
    model = torch.nn.Sequential(*(list(model.children())[:-1])) # Remove the last fully connected layer
    model.eval()

    # Video frame extraction and preprocessing
    cap = cv2.VideoCapture(video_path)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame = preprocess_frame(frame)
        frames.append(frame)
    cap.release()

    if len(frames) == 0:
        return torch.empty(0, 2048)  # Return an empty tensor if no frames are extracted

    # Convert list of frames to tensor
    frames_tensor = torch.stack(frames)
    print('frame', frames_tensor)

    # Feature extraction
    with torch.no_grad():
        features = model(frames_tensor)
    print('features', features)

    features_mean = torch.mean(features, dim=0).unsqueeze(0)

    # Flatten features from [N, 2048, 1, 1] to [N, 2048]
    features_flattened = torch.flatten(features_mean, start_dim=1)

    return features_flattened


def preprocess_frame(frame):
    preprocess = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return preprocess(frame)


def process_video_data(videos_folder, video_features_path):
    video_features = []

    for filename in os.listdir(videos_folder):
        video_path = os.path.join(videos_folder, filename)
        video_feature = video_processor(video_path)
        video_features.append(video_feature)
            
    with open(video_features_path, 'wb') as f:
        pickle.dump(video_features, f)

    print(f"Video features saved to {video_features_path}")


In [None]:
# !pip install librosa torch numpy
import os
import librosa
import numpy as np
import torch
from tqdm.auto import tqdm

def get_librosa_features(path: str) -> np.ndarray:
    y, sr = librosa.load(path)

    hop_length = 512  # Set the hop length; at 22050 Hz, 512 samples ~= 23ms

    # Remove vocals first
    D = librosa.stft(y, hop_length=hop_length)
    S_full, phase = librosa.magphase(D)

    S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric="cosine",
                                           width=int(librosa.time_to_frames(0.2, sr=sr)))

    S_filter = np.minimum(S_full, S_filter)

    margin_i, margin_v = 2, 4
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full

    # Recreate vocal_removal y
    new_D = S_foreground * phase
    y = librosa.istft(new_D)

    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # Compute MFCC features from the raw signal
    mfcc_delta = librosa.feature.delta(mfcc)  # And the first-order differences (delta features)

    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    S_delta = librosa.feature.delta(S)

    spectral_centroid = librosa.feature.spectral_centroid(S=S_full)

    audio_feature = np.vstack((mfcc, mfcc_delta, S, S_delta, spectral_centroid))  # combine features

    # binning data
    jump = int(audio_feature.shape[1] / 10)
    return librosa.util.sync(audio_feature, range(1, audio_feature.shape[1], jump))

def extract_audio_features(audio_file_path: str) -> torch.Tensor:
    # Extract audio seq features using librosa
    features = get_librosa_features(audio_file_path).T
    
    # avg
    tensor_features = torch.tensor(features).mean(dim=0).unsqueeze(0)
    return tensor_features

def process_audio_data(audio_folder, audio_features_path):
    audio_features = []

    for filename in tqdm(os.listdir(audio_folder), desc="Processing audio files"):
        audio_path = os.path.join(audio_folder, filename)
        if os.path.isfile(audio_path):
            audio_feature = extract_audio_features(audio_path)
            audio_features.append(audio_feature)

    # Save to a pickle file
    with open(audio_features_path, 'wb') as f:
        pickle.dump(audio_features, f)
    print(f"Audio features saved to {audio_features_path}")
    

In [None]:
# Predefined dataset paths configuration
DATASET_PATHS = {
    'datasets/MOSI_small': {
        'videos_folder': '/projects/dataset_original/datasets/MOSI_small/Base_data/Videos_Segmented',
        'audios_folder': '/projects/dataset_original/datasets/MOSI_small/Base_data/Audio_Segmented'
    }

    # Add more datasets here if needed
}
def prepare_dataset_paths(dataset_name):

    # Check if the dataset is defined in the configuration
    if dataset_name in DATASET_PATHS:
        paths = DATASET_PATHS[dataset_name]

        # Process video and audio data if their paths are available
        if 'videos_folder' in paths:
            video_features_path = f'{dataset_name}/video_features.pkl'
            process_video_data(paths['videos_folder'], video_features_path)

        if 'audios_folder' in paths:
            audio_features_path = f'{dataset_name}/audio_features.pkl'
            process_audio_data(paths['audios_folder'], audio_features_path)

    else:
        raise ValueError(f"Dataset {dataset_name} not found in dataset paths")

# Example usage
dataset_name = 'datasets/MOSI_small'
# prepare_dataset_paths(dataset_name)


In [None]:
# process text and label
# !pip install pandas --force-reinstall
import csv
import pandas as pd
import os
TEXT_FEATURE_PATH = f'{dataset_name}/text_n_label.csv'

csv_file_path = TEXT_FEATURE_PATH

def read_text_files(folder_path, df):
    text_list = []
    label_list = []

    filenames = [filename for filename in sorted(os.listdir(folder_path)) if filename.endswith('.txt')]
    
    print(filenames[:5])
    # Iterate over all files in the given folder
    for filename in filenames:
        file_path = os.path.join(folder_path, filename)

        # Check if it's a file and not a directory
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as file:
                # Read the content and add it to the list
                text_list.append(file.read())
                
        label_list.append(df[df['filename'] == filename.split('.')[0]]['label'][0])

    label_list = [f"{float(number):.2f}" for number in label_list]

    return text_list, label_list


def text_label_creation(text_list, label_list, csv_file_path = TEXT_FEATURE_PATH):

    assert len(text_list) == len(label_list), "Text and label lists must have the same length."

    # Writing to csv file
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)

        # Write the header
        csvwriter.writerow(['text_input', 'label'])

        # Write the data
        for text, label in zip(text_list, label_list):
            csvwriter.writerow([text, label])

    print(f"CSV file saved to {csv_file_path}")
    
csv_path ='/projects/dataset_original/datasets/MOSI_small/Base_data/Labels/boundaries_sentimentint_avg.csv'
text_folder = '/projects/dataset_original/datasets/MOSI_small/Base_data/Text_Per_Segment/Final'

df = pd.read_csv(csv_path, header=None)
headers = ['c1', 'c2', 'c3', 'filename', 'label']
df.columns = headers

text_list, label_list = read_text_files(text_folder, df)


print(text_list[:5], label_list[:5])
# text_label_creation(text_list, label_list, csv_file_path = TEXT_FEATURE_PATH)


    


### Result post processing

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

def calculate_metrics(true_values, predicted_values):
    """
    Calculate CCC, RMSE, and PCC.
    :param true_values: Array of true values
    :param predicted_values: Array of predicted values
    :return: Concordance Correlation Coefficient, Root Mean Squared Error, Pearson Correlation Coefficient
    """
    # Convert non-numeric values to NaN
    true_values = pd.to_numeric(true_values, errors='coerce')
    predicted_values = pd.to_numeric(predicted_values, errors='coerce')
    
    # Remove or impute NaNs (or use np.nanmean, np.nanvar, etc., to handle NaNs)
    valid_indices = ~np.isnan(true_values) & ~np.isnan(predicted_values)
    true_values = true_values[valid_indices]
    predicted_values = predicted_values[valid_indices]

    # Calculate CCC
    mean_true = np.mean(true_values)
    mean_predicted = np.mean(predicted_values)
    var_true = np.var(true_values)
    var_predicted = np.var(predicted_values)
    pearson_corr, _ = pearsonr(true_values, predicted_values)
    ccc = (2 * pearson_corr * np.sqrt(var_true) * np.sqrt(var_predicted)) / \
          (var_true + var_predicted + (mean_true - mean_predicted) ** 2)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(true_values, predicted_values))

    # PCC is the Pearson Correlation Coefficient
    pcc = pearson_corr

    return ccc, rmse, pcc


# Example usage


# Path to your CSV file
ALL_DATASETS = [
            'umeme_arousal', 
                # 'vreed_av',

                'iemocap_valence', 'iemocap_arousal',
                # 'recola_valence', 
                # 'recola_arousal', 
                # 'sewa_valence', 'sewa_arousal', 
                # 'mosi_sentiment',
                # 'mosei_sentiment', 'mosei_happiness',
                ]


# List to store results
results = []
num_epoch = 30

for task_name in ALL_DATASETS:
    best_ccc = -1
    best_rmse = float('inf')
    best_pcc = -1

    for i in range(num_epoch):
        csv_file = f'/work/jingyiz4/mustard-demo/results/{task_name}/gpt2_nopretrain_0.0001_2_42_0_unfreeze/predictions_actuals_{i}.csv'
        df = pd.read_csv(csv_file)

        # Assuming the columns are named 'Actual' and 'Prediction'
        true_values = df['Actual'].to_numpy()
        predicted_values = df['Prediction'].to_numpy()

        # Calculate metrics
        ccc, rmse, pcc = calculate_metrics(true_values, predicted_values)

        # Update best metrics if current epoch is better
        if ccc > best_ccc:
            best_ccc = ccc
        if rmse < best_rmse:
            best_rmse = rmse
        if pcc > best_pcc:
            best_pcc = pcc

    # Store best results for the task
    results.append({
        "Task": task_name,
        "Modeling": 'gpt2',
        "Best RMSE": best_rmse,
        "Best PCC": best_pcc,
        "Best CCC": best_ccc,
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Display the results DataFrame
print(results_df)

In [1]:

# Define the base directories
source_base = "/projects/dataset_processed"
target_base = "/work/jingyiz4/datasets/"
# for subdir in subdirs:
#     source_dir = os.path.join(source_base, subdir, "twoertwein")
#     target_dir = os.path.join(target_base, subdir, "twoertwein")

#     # Create the target directory if it doesn't exist
#     os.makedirs(target_dir, exist_ok=True)

#     # Copy all Python files
#     for file in os.listdir(source_dir):
#         # if file.endswith(".hdf"):

#         # if file.endswith(".py"):
#             shutil.copy2(os.path.join(source_dir, file), target_dir)


In [41]:
import pandas as pd
import os
import shutil

def read_df_from_hdf(task_name):
    file_path = f"/work/jingyiz4/datasets/{task_name}/twoertwein/all_minilm_l12_v2.hdf"

    # Read the dataframe from the HDF file
    try:
        stored_df = pd.read_hdf(file_path, key='df')
    except Exception as e:
        print(f"An error occurred while reading the HDF file: {e}")
    meta_strings_filtered = [
        string for string in stored_df.columns 
        if 'liwc_' not in string and 'all_minilm_' not in string 
    ]    

    # display(meta_strings_filtered)
    # display(stored_df[meta_strings_filtered].head(5))
    # print(len(stored_df))
    return stored_df[meta_strings_filtered]
    

# List of subdrectories
datasets = {
    # "UMEME": 'umeme', # done
    # "MOSI": 'mosi',
    # "MOSEI": 'mosei', # sentence merge issue
    # "AVEC16-RECOLA": 'recola', #done
    "SEWA": 'sewa',
    # "IEMOCAP": 'iemocap'  # Uncomment if needed
}

dataset_rootdir = '/results/twoertwe/meta/'  # Path to your dataset directory
new_dataset_rootdir = '/work/jingyiz4/cleaned_data/'

# Assuming the HDF file is "all_minilm_l12_v2.hdf" and the key for the dataframe is "df"
for dataset, dataset_abbr in datasets.items():
    ds_df = read_df_from_hdf(dataset)

    # List all files in dataset_dir
    all_files = os.listdir(dataset_rootdir)

    # Filter for CSV files that include 'dataset' in their name
    csv_files = [file for file in all_files if dataset_abbr in file and file.endswith('.csv')]

    print(csv_files)

    for csv_file in csv_files:
        csv_path = os.path.join(dataset_rootdir, csv_file)

        df = pd.read_csv(csv_path)

        # remove old text columns, now both dfs are cleaned
        non_emb_col = [
            string for string in df.columns 
            if 'liwc_' not in string and 'all_minilm_' not in string 
        ]    
        df = df[non_emb_col]

        # Append 'sentence' column from ds_df to df
        # Ensure the 'sentence' column exists in ds_df
        if 'sentence' in ds_df.columns:
            # Find shared columns
            shared_cols = list(set(ds_df.columns).intersection(set(df.columns)))
            print(f"Dataset: {dataset}")
            # shared_cols = ['meta_begin', 'meta_end']
            print(f"Columns in ds_df: {ds_df.columns.tolist()}")
            print(f"Columns in df: {df.columns.tolist()}")
            print(len(ds_df), len(df))
            
            print(f"Shared columns: {shared_cols}")
            display(ds_df[['meta_begin', 'meta_end','German']].head())   


            display(df[['meta_begin', 'meta_end','meta_id']].head())   

            print()

            ds_df['key'] = ds_df[shared_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
            df['key'] = df[shared_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
            display(ds_df['key'].head())
            display(df['key'].head())   

            new_df = df.merge(ds_df[['key', 'sentence']], on='key', how='left')
            new_df.drop('key', inplace=True, axis=1)
            display(ds_df['sentence'].head())
            display(new_df['sentence'].head())
        else:
            new_df = df.copy()

        assert len(new_df) == len(df)

        break
        # new_df.to_csv(os.path.join(new_dataset_rootdir, csv_file))



['sewa_valence_4_training.csv', 'sewa_arousal_0_test.csv', 'sewa_arousal_4_training.csv', 'sewa_valence_1_test.csv', 'sewa_arousal_2_validation.csv', 'sewa_valence_3_training.csv', 'sewa_arousal_0_training.csv', 'sewa_arousal_1_training.csv', 'sewa_valence_2_training.csv', 'sewa_arousal_3_training.csv', 'sewa_valence_2_validation.csv', 'sewa_valence_0_training.csv', 'sewa_valence_1_training.csv', 'sewa_arousal_2_training.csv', 'sewa_valence_0_test.csv', 'sewa_arousal_0_validation.csv', 'sewa_arousal_1_test.csv', 'sewa_valence_0_validation.csv', 'sewa_valence_1_validation.csv', 'sewa_valence_4_test.csv', 'sewa_arousal_2_test.csv', 'sewa_arousal_1_validation.csv', 'sewa_valence_3_test.csv', 'sewa_valence_4_validation.csv', 'sewa_arousal_4_test.csv', 'sewa_valence_3_validation.csv', 'sewa_valence_2_test.csv', 'sewa_arousal_4_validation.csv', 'sewa_arousal_3_test.csv', 'sewa_arousal_3_validation.csv']
Dataset: SEWA
Columns in ds_df: ['meta_begin', 'meta_end', 'German', 'arousal', 'valence'

Unnamed: 0,meta_begin,meta_end,German
1653,0.879274,2.665017,<laughter>
1654,2.665017,3.072928,Hallo
1655,3.072928,3.6239,<laughter>
1656,4.586731,6.299956,Haben wir jetzt nur eine bestimmte Zeit?
2468,6.299956,8.674903,<>


Unnamed: 0,meta_begin,meta_end,meta_id
0,1.120781,2.0414,14
1,2.6341,4.0125,14
2,4.6251,5.199047,14
3,5.199047,7.7186,14
4,7.7186,8.7186,14





1653    0.879274_2.665017
1654    2.665017_3.072928
1655      3.072928_3.6239
1656    4.586731_6.299956
2468    6.299956_8.674903
Name: key, dtype: object

0    1.120781_2.0414
1      2.6341_4.0125
2    4.6251_5.199047
3    5.199047_7.7186
4      7.7186_8.7186
Name: key, dtype: object

1653                                         laughter
1654                                            Hello
1655                                         laughter
1656    Do we only have a certain amount of time now?
2468                                                 
Name: sentence, dtype: object

0              laughter
1    Hi, do you see me?
2                  Okay
3                      
4             breathing
Name: sentence, dtype: object

AssertionError: 

In [1]:
import sys

import numpy as np
import pandas as pd

# sys.path.append("/results/twoertwe/emro")

from dataloader import get_partitions


for dataset, labels in (
    # regression
    # ("mosi", ("sentiment",)),
    # ("mosei", ("sentiment", "happiness")),
    # ("sewa", ("arousal", "valence")),
    # ("recola", ("arousal", "valence")),
    # ("iemocap", ("arousal", "valence")),
    ("umeme", ("arousal", "valence")),
    # # classification (4 classes)
    # ("tpot", ("constructs",)),
    # # classification (5 classes)
    # ("vreed", ("av",)),
):
    for label in labels:
        # get 5-fold
        for fold, partition in get_partitions(
            f"{dataset}/{label}", batch_size=-1
        ).items():
            # training, validation, test sets
            for name, data in partition.items():
                assert len(data.iterator) == 1
                features = pd.DataFrame(
                    data.iterator[0].pop("x")[0],
                    columns=data.properties["x_names"],
                )
                labels_metadata = pd.DataFrame(
                    {key: value[0][:, 0] for key, value in data.iterator[0].items()}
                )
                final_csv = pd.concat([labels_metadata, features], axis=1)
                display(final_csv.head())
                break
                # final_csv.to_csv(
                #     f"{dataset}_{label}_{fold}_{name}.csv", index=False
                # )


AttributeError: 'Series' object has no attribute 'numpy'

In [4]:
stored_df['meta_id'].head()

NameError: name 'stored_df' is not defined

In [19]:
from pathlib import Path

import pandas as pd
from python_tools import caching, features
from python_tools.generic import map_parallel

import sys

# sys.path.insert(0, "/projects/dataset_processed/TPOT/twoertwein")
from python_tools.extract import extract_liwc, load_liwc


def extract(file: Path) -> None:
    name = file.with_suffix(".hdf").name
    # openface+opensmile
    cache = {}
    if file.suffix == ".mp4":
        cache["openface"] = Path(f"openface/{name}")
    if file.suffix == ".mp3" or "Only" not in file.name:
        cache["opensmile_eGeMAPSv02"] = Path(f"opensmile_eGeMAPSv02/{name}")
        cache["opensmile_vad_opensource"] = Path(f"opensmile_vad_opensource/{name}")
    features.extract_features(video=file, audio=file, caches=cache)


if __name__ == "__main__":
    map_parallel(
        extract,
        Path("/projects/dataset_original/UMEME/media/").glob("*.mp*"),
        workers=7,
    )

    sentences = (
        "How can I not",
        "I’m quite sure that we will find some way or another",
        "Ella Jorgenson made the pudding",
        "The floor was completely covered",
        "They are just going to go ahead regardless",
        "It has all been scheduled since Wednesday",
        "I am going shopping",
        "A preliminary study shows rats to be more inquisitive than once thought",
        "That’s it the meeting is finished",
        "I don’t know how she could miss this opportunity",
        "It is raining outside",
        "Your dog is insane",
        "She told me what you did",
        "Your grandmother is on the phone",
        "Only I joined her in the ceremony",
    )
    data = {
        "file": [],
        "name": [],
        "sentence": [],
        "valence": [],
        "arousal": [],
        "dominance": [],
        "audio": [],
        "video": [],
    }
    for block in (
        Path("/projects/dataset_original/UMEME/evaluation.txt")
        .read_text()
        .split("\n\n")
    ):
        if not block:
            continue
        name, *details = block.split("\n")
        details = {
            detail.split(":", 1)[0].strip(): detail.split(":", 1)[1].strip()
            for detail in details
        }

        data["name"].append(details["speaker"])
        match details["modality"]:
            case "av":
                file = f"{name}_original.mp4"
            case "video":
                file = f"{name}_videoOnly.mp4"
            case "audio":
                file = f"{name}_audioOnly.mp3"
            case _:
                assert False, _
        assert (Path("/projects/dataset_original/UMEME/media") / file).is_file()
        data["file"].append(file)
        data["sentence"].append(
            sentences[int(name.split("-", 1)[0].split("S", 1)[1][:-1]) - 1]
        )
        
        # TODO
        # data["arousal"].append(float(details["act"].split("+", 1)[0]))
        # data["valence"].append(float(details["val"].split("+", 1)[0]))
        # data["dominance"].append(float(details["dom"].split("+", 1)[0]))
        # data["audio"].append(details.get("audio", ""))
        # data["video"].append(details.get("video", ""))

    # copy labels of uni-modal ratings but discard their features
    data = pd.DataFrame(data)
    display(data.head())
    unimodal = data.loc[data["file"].apply(lambda x: "Only" in x)].copy()
    unimodal["file"] = unimodal["file"].apply(lambda x: x.split(".")[0])
    labels = ("arousal", "valence", "dominance")
    for key in ("audio", "video"):
        for label in labels:
            data[f"meta_{label}_{key}"] = float("NaN")
    for index, row in data.iterrows():
        if "Only" in row["file"]:
            continue
        for key in ("audio", "video"):
            match = unimodal["file"] == f"{row[key]}_{key}Only"
            if not match.any():
                print(f"{row[key]}_{key}Only")
                continue
            assert match.sum() == 1
            unimodal_index = match[match].index[0]

            for label in labels:
                data.loc[index, f"meta_{label}_{key}"] = unimodal.loc[
                    unimodal_index, label
                ]
    data = data.loc[data["file"].apply(lambda x: "Only" not in x)]

    # model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
    # embeddings = pd.DataFrame(
    #     model.encode(data["sentence"].str.lower().tolist()), index=data.index
    # )

    liwc = load_liwc()
    liwc_features = extract_liwc(liwc, data["sentence"])

    data = pd.concat(
        [
            data,
            # embeddings.add_prefix("all_minilm_l12_v2_"),
            liwc_features.add_prefix("liwc_"),
        ],
        axis=1,
    )
    
    display(data.head())
    # caching.write_hdfs(Path("all_minilm_l12_v2.hdf"), {"df": data})


FileNotFoundError: [Errno 2] No such file or directory: 'FeatureExtraction'

### Create latex result table

In [6]:
import pandas as pd

def create_grouped_latex_table(df, group_column, caption="Your Table Caption", label="tab:your_label"):
    """
    Create a LaTeX table with grouped rows based on a specific column.

    :param df: Pandas DataFrame to convert
    :param group_column: Column name to group by
    :param caption: Caption for the LaTeX table
    :param label: Label for the LaTeX table
    :return: String containing the LaTeX table code
    """
    unique_groups = df[group_column].unique()
    grouped = df.groupby(group_column)

    latex_table = "\\begin{table}[ht]\n\\centering\n\\begin{tabular}{|l|l|r|r|r|}\n\\hline\n"
    column_labels = " & ".join(df.columns) + " \\\\\n\\hline\n"
    latex_table += column_labels

    for group in unique_groups:
        group_df = grouped.get_group(group)
        for i, row in group_df.iterrows():
            if i == group_df.index[0]:  # First row of the group
                latex_table += f"\\multirow{{{len(group_df)}}}{{*}}{{{row[group_column]}}}"
            latex_table += " & " + " & ".join([str(row[col]) for col in df.columns if col != group_column])
            latex_table += " \\\\\n"
            if i == group_df.index[-1]:  # Last row of the group
                latex_table += "\\hline\n"

    latex_table += "\\end{tabular}\n"
    latex_table += f"\\caption{{{caption}}}\n"
    latex_table += f"\\label{{{label}}}\n"
    latex_table += "\\end{table}"

    return latex_table

# Example DataFrame
# data = {
#     "Task Name": ["Task1", "Task1", "Task2", "Task2", "Task2"],
#     "Modeling": ["Model A", "Model B", "Model A", "Model B", "Model C"],
#     "RMSE": [1.2, 1.3, 1.1, 1.4, 1.5],
#     "PCC": [0.7, 0.75, 0.65, 0.8, 0.85],
#     "CCC": [0.9, 0.85, 0.88, 0.86, 0.89]
# }
# df = pd.DataFrame(data)

# Generate LaTeX table
latex_table = create_grouped_latex_table(results_df, "Task", caption="Grouped Performance Metrics", label="tab:grouped_performance")
print(latex_table)


AttributeError: 'DataFrame' object has no attribute 'append'

In [14]:
from pathlib import Path

import pandas as pd
from python_tools import caching, features
from python_tools.generic import map_parallel
# from sentence_transformers import SentenceTransformer

import sys

sys.path.insert(0, "/projects/dataset_processed/TPOT/twoertwein")
from python_tools.extract import extract_liwc, load_liwc

def extract(file: Path) -> None:
    name = file.with_suffix(".hdf").name
    # openface+opensmile
    cache = {}
    if file.suffix == ".mp4":
        cache["openface"] = Path(f"openface/{name}")
    if file.suffix == ".mp3" or "Only" not in file.name:
        cache["opensmile_eGeMAPSv02"] = Path(f"opensmile_eGeMAPSv02/{name}")
        cache["opensmile_vad_opensource"] = Path(f"opensmile_vad_opensource/{name}")
    features.extract_features(video=file, audio=file, caches=cache)


if __name__ == "__main__":
    map_parallel(
        extract,
        Path("/projects/dataset_original/UMEME/media/").glob("*.mp*"),
        workers=7,
    )

    sentences = (
        "How can I not",
        "I’m quite sure that we will find some way or another",
        "Ella Jorgenson made the pudding",
        "The floor was completely covered",
        "They are just going to go ahead regardless",
        "It has all been scheduled since Wednesday",
        "I am going shopping",
        "A preliminary study shows rats to be more inquisitive than once thought",
        "That’s it the meeting is finished",
        "I don’t know how she could miss this opportunity",
        "It is raining outside",
        "Your dog is insane",
        "She told me what you did",
        "Your grandmother is on the phone",
        "Only I joined her in the ceremony",
    )
    data = {
        "file": [],
        "name": [],
        "sentence": [],
        "valence": [],
        "arousal": [],
        "dominance": [],
        "audio": [],
        "video": [],
    }
    for block in (
        Path("/projects/dataset_original/UMEME/evaluation.txt")
        .read_text()
        .split("\n\n")
    ):
        if not block:
            continue
        name, *details = block.split("\n")
        details = {
            detail.split(":", 1)[0].strip(): detail.split(":", 1)[1].strip()
            for detail in details
        }

        data["name"].append(details["speaker"])
        match details["modality"]:
            case "av":
                file = f"{name}_original.mp4"
            case "video":
                file = f"{name}_videoOnly.mp4"
            case "audio":
                file = f"{name}_audioOnly.mp3"
            case _:
                assert False, _
        assert (Path("/projects/dataset_original/UMEME/media") / file).is_file()
        data["file"].append(file)
        data["sentence"].append(
            sentences[int(name.split("-", 1)[0].split("S", 1)[1][:-1]) - 1]
        )
        data["arousal"].append(float(details["act"].split("+", 1)[0]))
        data["valence"].append(float(details["val"].split("+", 1)[0]))
        data["dominance"].append(float(details["dom"].split("+", 1)[0]))
        data["audio"].append(details.get("audio", ""))
        data["video"].append(details.get("video", ""))

    # copy labels of uni-modal ratings but discard their features
    data = pd.DataFrame(data)
    unimodal = data.loc[data["file"].apply(lambda x: "Only" in x)].copy()
    unimodal["file"] = unimodal["file"].apply(lambda x: x.split(".")[0])
    labels = ("arousal", "valence", "dominance")
    for key in ("audio", "video"):
        for label in labels:
            data[f"meta_{label}_{key}"] = float("NaN")
    for index, row in data.iterrows():
        if "Only" in row["file"]:
            continue
        for key in ("audio", "video"):
            match = unimodal["file"] == f"{row[key]}_{key}Only"
            if not match.any():
                print(f"{row[key]}_{key}Only")
                continue
            assert match.sum() == 1
            unimodal_index = match[match].index[0]

            for label in labels:
                data.loc[index, f"meta_{label}_{key}"] = unimodal.loc[
                    unimodal_index, label
                ]
    data = data.loc[data["file"].apply(lambda x: "Only" not in x)]

    # model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2")
    # embeddings = pd.DataFrame(
    #     model.encode(data["sentence"].str.lower().tolist()), index=data.index
    # )

    liwc = load_liwc()
    liwc_features = extract_liwc(liwc, data["sentence"])

    data = pd.concat(
        [
            data,
            # embeddings.add_prefix("all_minilm_l12_v2_"),
            liwc_features.add_prefix("liwc_"),
        ],
        axis=1,
    )
    display(data.head())
    
    # caching.write_hdfs(Path("all_minilm_l12_v2.hdf"), {"df": data})


FileNotFoundError: [Errno 2] No such file or directory: 'FeatureExtraction'

In [12]:
pip install cloudpickle dask distributed pympi-ling nltk beartype

Collecting beartype
  Downloading beartype-0.16.4-py3-none-any.whl.metadata (29 kB)
Downloading beartype-0.16.4-py3-none-any.whl (819 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.1/819.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: beartype
Successfully installed beartype-0.16.4
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd

# Assuming the HDF file is "all_minilm_l12_v2.hdf" and the key for the dataframe is "df"
file_path = "all_minilm_l12_v2.hdf"

# Read the dataframe from the HDF file
try:
    stored_df = pd.read_hdf(file_path, key='df')
except Exception as e:
    stored_df = f"An error occurred while reading the HDF file: {e}"

stored_df


