### Dependencies

In [1]:
import emoji
import os
import cv2
import grapheme
import numpy as np
import shutil
import pandas as pd

# Seed NumPy's legacy global RNG so any later np.random calls are repeatable across runs.
np.random.seed(0)

# Generate CSVs

### Testing utility functions
* `list[emoji]` -> `str` 
* `str` -> `list[emoji]`

In [2]:
import emoji
import ast

def emoji_to_unicode(emoji_str):
    """Render every code point of ``emoji_str`` as ``U+<HEX>``, space-separated.

    The string is walked one code point at a time, so an emoji built from
    several code points produces several ``U+`` tokens. An empty string
    yields an empty string.
    """
    code_points = map(ord, emoji_str)
    return ' '.join("U+" + format(cp, 'X') for cp in code_points)

def process_emoji_list_to_str(emoji_list):
    """Join emoji shortcodes into one ' [EM] '-separated string ending in '.'.

    Leading and trailing colons are stripped from each shortcode first, e.g.
    [':necktie:', ':chart_increasing:'] -> 'necktie [EM] chart_increasing.'.
    An empty list produces just '.'.
    """
    bare_names = [shortcode.strip(':') for shortcode in emoji_list]
    return ' [EM] '.join(bare_names) + '.'

def unprocess_emoji_list_from_str(emoji_str):
    """Split a ' [EM] '-separated description string back into emoji shortcodes.

    Inverse of ``process_emoji_list_to_str``.

    Args:
        emoji_str: Descriptions joined by ' [EM] ', normally terminated by a
            single '.', e.g. 'necktie [EM] chart_increasing.'.

    Returns:
        list[str]: One ':name:' shortcode per description. An empty list when
        the string carries no descriptions ('' or '.').
    """
    # Drop the terminator only when it is really there; slicing blindly would
    # chop the last letter off the final description of an unterminated string.
    body = emoji_str[:-1] if emoji_str.endswith('.') else emoji_str
    if not body:
        # ''.split(...) would give [''] and therefore the bogus shortcode '::'.
        return []
    return [f":{desc}:" for desc in body.split(' [EM] ')]

def emoji_str_from_description(desc_str: str) -> str:
    """
    Turn a "This is ... ." description sentence into the emojis it names.

    The first 8 characters (the "This is " lead-in, including its trailing
    space) and the final character (the closing ".") are dropped
    unconditionally, without checking what they are. The remainder is split
    on ' [EM] ' and each piece is wrapped in colons and handed to
    ``emoji.emojize``.

    Args:
        desc_str (str): Sentence of the form "This is <desc> [EM] <desc>.".

    Returns:
        str: The concatenated results of ``emoji.emojize`` for every description.

    Example:
        >>> emoji_str_from_description('This is face_savoring_food [EM] bread.')
        '😋🍞'
    """
    lead_in_len = 8  # len("This is ")
    inner = desc_str[lead_in_len:-1]
    pieces = []
    for name in inner.split(' [EM] '):
        pieces.append(emoji.emojize(f":{name}:"))
    return ''.join(pieces)

# Quick smoke checks for the helpers above: description -> emoji, emoji -> code points,
# and a list -> str -> list round trip.
sample_shortcodes = [':necktie:', ':chart_increasing:']
print(emoji_str_from_description('This is face_savoring_food [EM] bread.'))
print(emoji_to_unicode('🎷'))
encoded_sample = process_emoji_list_to_str(sample_shortcodes)
print(encoded_sample)
print(unprocess_emoji_list_from_str(encoded_sample))


😋🍞
U+1F3B7
necktie [EM] chart_increasing.
[':necktie:', ':chart_increasing:']


In [None]:
import csv
import ast
import os

def generate_csv(input_file_path, output_file_path):
    """Convert a description-based CSV into an emoji/English CSV.

    Reads ``input_file_path`` with ``csv.DictReader`` and writes one output
    row per input row with the columns ``sent1``, ``sent2``, ``label`` and
    ``strategy``:

    * ``sent1`` is the input ``sent1`` passed through
      ``emoji_str_from_description``.
    * ``sent2`` is the input ``sent2`` with its first two whitespace-separated
      words and its final character removed (presumably the "This is" lead-in
      and the closing "." -- confirm against the data). Runs of whitespace are
      collapsed to single spaces as a side effect of the split/join.
    * ``label`` and ``strategy`` are copied unchanged.

    Args:
        input_file_path: Path of the source CSV; must contain the columns
            ``sent1``, ``sent2``, ``label`` and ``strategy``.
        output_file_path: Path of the CSV to write. Missing parent folders are
            created; an existing file is overwritten.

    Raises:
        KeyError: If an input row lacks one of the required columns.
    """
    # Create the destination folder up front so a fresh checkout (no
    # 'converted' directory yet) does not fail when the output file is opened.
    out_dir = os.path.dirname(output_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(input_file_path, newline='', encoding='utf-8') as fin, \
        open(output_file_path, 'w', newline='', encoding='utf-8') as fout:

        reader = csv.DictReader(fin)
        # The keys written below must match these field names exactly,
        # otherwise DictWriter.writerow raises ValueError.
        fieldnames = ['sent1', 'sent2', 'label', 'strategy']
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            em = emoji_str_from_description(row['sent1'])
            # Drop the first two words and the last character of the sentence.
            en = ' '.join(row['sent2'].split()[2:])[:-1]
            writer.writerow({
                'sent1': em,
                'sent2': en,
                'label': row['label'],
                'strategy': row['strategy'],
            })

    print(f"Conversion complete! Output saved to {output_file_path}")

In [8]:
# One (split name, source CSV, destination CSV) tuple per dataset split.
# NOTE(review): a later cell unpacks four fields from this list (the fourth
# being an image folder); these 3-tuples do not provide one -- confirm which
# definition is intended before running the notebook top to bottom.
csvs_to_generate = [
    (
        split,
        os.path.join('.', 'originals', f'{split}.csv'),
        os.path.join('.', 'converted', f'{split}.csv'),
    )
    for split in ('train', 'test', 'val')
]

# The loop variables are deliberately not named `input`/`output`: at notebook
# top level a variable called `input` replaces the built-in input() for the
# rest of the kernel session.
for folder_type, input_csv, output_csv in csvs_to_generate:
    generate_csv(input_csv, output_csv)

ValueError: dict contains fields not in fieldnames: 'sent1', 'sent2'

In [101]:
# Report how many row indices were skipped per split, then dump the full mapping.
# NOTE(review): `skipped_indices` is not defined in any cell visible here; it is
# presumably a dict of split name -> set of skipped row indices built elsewhere -- confirm.
for key in skipped_indices:
    value = skipped_indices[key]
    print(f"Length of skipped_indices['{key}']: {len(value)}")
print(skipped_indices)

Length of skipped_indices['train']: 57
Length of skipped_indices['test']: 25
Length of skipped_indices['val']: 10
{'train': {1664, 386, 2309, 1542, 1031, 394, 2317, 2318, 2321, 402, 2324, 2326, 2328, 281, 1187, 422, 2343, 1448, 2346, 2348, 2351, 1586, 2356, 311, 1594, 2238, 2241, 1606, 1607, 971, 1109, 1111, 475, 1116, 1117, 1118, 1119, 1121, 1122, 1636, 1125, 1127, 1129, 1131, 1133, 1134, 367, 879, 880, 882, 1135, 1136, 1138, 1651, 2038, 1659, 2045}, 'test': {136, 26, 292, 42, 325, 335, 341, 342, 346, 349, 224, 225, 104, 106, 107, 108, 109, 362, 111, 492, 495, 114, 116, 246, 248}, 'val': {352, 354, 356, 200, 203, 23, 24, 155, 350, 351}}


### Ensure CSV row counts match the image folders

In [102]:
# Sanity check: each converted CSV should have exactly one data row per image file.
# NOTE(review): this loop unpacks four fields per entry (the last being an image
# folder), whereas the `csvs_to_generate` list visible earlier holds 3-tuples --
# confirm which definition is expected to be live when this cell runs.
for folder_type, _, output_csv, img_folder in csvs_to_generate:
    # Data rows = physical lines in the file minus the header line.
    with open(output_csv, 'r', encoding='utf-8') as csv_file:
        csv_row_count = len(csv_file.readlines()) - 1

    # Only regular files directly inside the folder count as images; sub-folders are ignored.
    img_file_count = sum(
        1
        for entry in os.listdir(img_folder)
        if os.path.isfile(os.path.join(img_folder, entry))
    )

    if csv_row_count == img_file_count:
        print(f"Match for {folder_type}: CSV rows = {csv_row_count}, Images = {img_file_count}")
    else:
        print(f"Mismatch for {folder_type}: CSV rows = {csv_row_count}, Images = {img_file_count}")

Match for train: CSV rows = 2341, Images = 2341
Match for test: CSV rows = 493, Images = 493
Match for val: CSV rows = 384, Images = 384
