### Testing utility functions
* `list[emoji]` -> `str` (`process_emoji_list_to_str`)
* `str` -> `list[emoji]` (`unprocess_emoji_list_from_str`)

In [28]:
import emoji
import ast

def emoji_to_unicode(emoji_str):
    """Return the code points of ``emoji_str`` as space-separated ``U+XXXX`` tokens.

    Each character (code point) of the input yields one token written in
    uppercase hexadecimal, so an emoji made of several code points produces
    several tokens. An empty string yields an empty string.
    """
    code_points = map(ord, emoji_str)
    return ' '.join(f"U+{code_point:X}" for code_point in code_points)

def process_emoji_list_to_str(emoji_list):
    """Join emoji descriptions into one ' [EM] '-separated string ending in '.'.

    Leading and trailing colons are stripped from every description before
    joining, so ':necktie:' contributes 'necktie'.
    """
    bare_names = [name.strip(':') for name in emoji_list]
    return ' [EM] '.join(bare_names) + '.'

def unprocess_emoji_list_from_str(emoji_str):
    """Split a ' [EM] '-separated description string back into ':name:' codes.

    This is the inverse of ``process_emoji_list_to_str``.

    Args:
        emoji_str: Descriptions separated by ' [EM] ', normally ending in '.'.

    Returns:
        A list of ':name:' strings, or an empty list when the string holds no
        descriptions (for example '.' or '').
    """
    # Only drop the final character when it really is the terminating period,
    # so an unterminated string does not lose the last letter of its last name.
    body = emoji_str[:-1] if emoji_str.endswith('.') else emoji_str
    # An empty body is what an empty list encodes to; splitting it would give
    # [''] and produce the bogus code '::'.
    if not body:
        return []
    return [f":{desc}:" for desc in body.split(' [EM] ')]

def emoji_str_from_description(desc_str: str) -> str:
    """
    Turn a sentence of emoji descriptions into the emoji characters themselves.

    Args:
        desc_str (str): Sentence of the form "This is <desc> [EM] <desc>.". The first
                        eight characters ("This is " including its trailing space) and
                        the final character are discarded without being checked.

    Returns:
        str: The results of ``emoji.emojize`` on each ":<desc>:" code, concatenated
             in their original order.

    Example:
        >>> emoji_str_from_description('This is face_savoring_food [EM] bread.')
        '😋🍞'
    """
    prefix_length = len("This is ")  # 8 characters, counting the trailing space
    body = desc_str[prefix_length:-1]
    rendered = []
    for name in body.split(' [EM] '):
        rendered.append(emoji.emojize(f":{name}:"))
    return ''.join(rendered)

# Smoke checks for the helpers: description -> emoji, emoji -> code points,
# list -> str, and the list -> str -> list round trip (one result per line).
print(
    emoji_str_from_description('This is face_savoring_food [EM] bread.'),
    emoji_to_unicode('🎷'),
    process_emoji_list_to_str([':necktie:', ':chart_increasing:']),
    unprocess_emoji_list_from_str(process_emoji_list_to_str([':necktie:', ':chart_increasing:'])),
    sep='\n',
)


😋🍞
U+1F3B7
necktie [EM] chart_increasing.
[':necktie:', ':chart_increasing:']


In [None]:
import csv
import ast
import os

def generate_csv(input_file_path, output_file_path, img_folder):
    """Rewrite an input CSV into the EM/EN/unicode/label/strategy/image layout.

    Reads the ``sent1``, ``sent2``, ``label`` and ``strategy`` columns of each
    input row and writes exactly one output row per input row.

    Args:
        input_file_path: Path of the CSV to read (UTF-8, with a header row).
        output_file_path: Path of the CSV to write; opened in 'w' mode, so an
            existing file is overwritten.
        img_folder: Folder used for the ``image`` column; the n-th data row
            (0-based) gets ``os.path.join(img_folder, "<n>.png")``.
    """
    columns = ['EM', 'EN', 'unicode', 'label', 'strategy', 'image']  # output header, in this order

    with open(input_file_path, newline='', encoding='utf-8') as source:
        with open(output_file_path, 'w', newline='', encoding='utf-8') as target:
            writer = csv.DictWriter(target, fieldnames=columns)
            writer.writeheader()

            for row_index, record in enumerate(csv.DictReader(source)):
                emoji_text = emoji_str_from_description(record['sent1'])
                # Drop the first two whitespace-separated words and the final character.
                sentence_words = record['sent2'].split()
                english_text = ' '.join(sentence_words[2:])[:-1]
                writer.writerow({
                    'EM': emoji_text,
                    'EN': english_text,
                    'unicode': emoji_to_unicode(emoji_text),
                    'label': record['label'],
                    'strategy': record['strategy'],
                    'image': os.path.join(img_folder, f"{row_index}.png"),
                })

    print(f"Conversion complete! Output saved to {output_file_path}")

In [39]:
# One (input CSV, output CSV, image folder) triple per split, in processing order.
csvs_to_generate = [
    (os.path.join('.', 'original_ELCo_dataset', f'{split}.csv'),
     os.path.join('.', 'generated_img_dataset', f'{split}.csv'),
     os.path.join('.', 'generated_img_dataset', f'{split}_google'))
    for split in ('train', 'test', 'val')
]

# The loop names deliberately avoid `input`: binding it at notebook top level
# would shadow the builtin input() for every cell executed afterwards.
for input_csv, output_csv, img_folder in csvs_to_generate:
    generate_csv(input_csv, output_csv, img_folder)

Conversion complete! Output saved to ./generated_img_dataset/train.csv
Conversion complete! Output saved to ./generated_img_dataset/test.csv
Conversion complete! Output saved to ./generated_img_dataset/val.csv


### Counting missing numbered files and dataset rows

In [None]:
import os


def count_missing_numbers(folder_path):
    """List the integers missing from the numeric filenames in a folder.

    A directory entry counts as numbered when the part of its name before the
    first '.' consists only of digits (e.g. '12.png'); other entries are ignored.

    Args:
        folder_path: Directory to scan (its immediate entries only).

    Returns:
        Sorted list of the integers between the smallest and largest number
        found (inclusive) that no filename carries. Empty when the folder
        contains no numbered files.
    """
    present = set()
    for filename in os.listdir(folder_path):
        stem = filename.split('.')[0]
        if stem.isdigit():
            present.add(int(stem))

    # With no numbered file there is no range to check; indexing the first and
    # last element of an empty list would raise IndexError.
    if not present:
        return []

    # Set membership keeps the scan linear in the size of the range.
    return [num for num in range(min(present), max(present) + 1) if num not in present]

# Report which numbers are absent from the folder's filenames, then how many.
folder_path = 'google_dataset'
missing_numbers = count_missing_numbers(folder_path)
print(f"Missing numbers in filenames: {missing_numbers}", len(missing_numbers), sep='\n')

Missing numbers in filenames: [26, 42, 375, 376, 488, 544, 566, 622, 630, 638, 1343, 1521, 1523, 1528, 1529, 1530, 1531, 1533, 1534, 1537, 1539, 1541, 1545, 1551, 1553, 1554, 1555, 1556, 1558]
29


In [18]:
# Row counts: the true/false label files first, then each original ELCo split
# and the total across the three splits.
import pandas as pd

true_df = pd.read_csv('dataset_only_true.csv')
false_df = pd.read_csv('dataset_only_false.csv')

print(f"Number of true rows: {true_df.shape[0]}")
print(f"Number of false rows: {false_df.shape[0]}")

train_df = pd.read_csv('original_ELCo_dataset/train.csv')
val_df = pd.read_csv('original_ELCo_dataset/val.csv')
test_df = pd.read_csv('original_ELCo_dataset/test.csv')

print(f"Number of rows in train.csv: {train_df.shape[0]}")
print(f"Number of rows in val.csv: {val_df.shape[0]}")
print(f"Number of rows in test.csv: {test_df.shape[0]}")
print(f"Sums to: {sum(frame.shape[0] for frame in (train_df, val_df, test_df))}")

Number of true rows: 1655
Number of false rows: 1655
Number of rows in train.csv: 2398
Number of rows in val.csv: 394
Number of rows in test.csv: 518
Sums to: 3310
