# Generates images for processing by the VLM
Prerequisite for running this script: download the emoji image datasets used by the respective generators.
  
Download and unzip google's emoji dataset here:
* https://github.com/googlefonts/noto-emoji/
 
Download and unzip twitter's emoji dataset here:
* https://github.com/jdecked/twemoji/

### The folder containing the images needs to be at the same level as this script (store under `dataset_following_elco_split`)

### Dependencies

In [92]:
import emoji
import os
import cv2
import grapheme
import numpy as np
import shutil
import pandas as pd

# Seed NumPy's global RNG so results of np.random calls made later in this
# file (e.g. np.random.shuffle) are repeatable from run to run.
np.random.seed(0)

In [93]:
# Side length, in pixels, of the square output image.
OUTPUT_SIZE = 224
# Emojis are laid out on a square grid with this many cells per row.
MAX_EMOJIS_PER_ROW = 3
# Longest emoji sequence that still fits on the grid (rows x columns).
MAX_EMOJI_SEQUENCE_LENGTH = MAX_EMOJIS_PER_ROW ** 2


### Utility Functions

In [94]:
def emoji_str_from_description(desc_str):
    """Turn a description sentence into the string of emojis it names.

    The first 8 characters and the last character are discarded without
    inspection (for a sentence such as "This is a [EM] b." that removes the
    leading "This is " and the trailing "."). What remains is split on
    ' [EM] ' and each piece is wrapped in colons and passed to
    ``emoji.emojize``; the results are concatenated.
    """
    names = desc_str[8:-1].split(' [EM] ')
    return ''.join(emoji.emojize(f":{name}:") for name in names)

def get_png_image_from_local_repo(emoji, image_dir, filename_getter_fn, extension="png"):
  """Load one emoji image from a local image set, resized to one grid cell.

  Args:
    emoji: The emoji to look up (passed through to ``filename_getter_fn``).
    image_dir: Directory that holds the image files.
    filename_getter_fn: Callable ``(emoji, extension) -> filename`` that
      implements the image set's file-naming scheme.
    extension: File extension handed to ``filename_getter_fn``.

  Returns:
    The image resized to ``OUTPUT_SIZE // MAX_EMOJIS_PER_ROW`` pixels on each
    side, or ``None`` (after printing a message) when the file does not exist
    or cannot be decoded.
  """
  filename = filename_getter_fn(emoji, extension)
  path = f'{image_dir}/{filename}'
  if not os.path.exists(path):
    print(f"Couldn't find image for {emoji} at {path}")
    return None

  # IMREAD_COLOR drops any alpha channel and yields a 3-channel image
  # (OpenCV orders the channels B, G, R).
  img = cv2.imread(path, cv2.IMREAD_COLOR)
  if img is None:
    # cv2.imread signals an unreadable/corrupt file by returning None rather
    # than raising; report it like a missing file instead of crashing in resize.
    print(f"Couldn't read image for {emoji} at {path}")
    return None

  cell_size = OUTPUT_SIZE // MAX_EMOJIS_PER_ROW
  return cv2.resize(img, (cell_size, cell_size))

def emoji_to_noto_filename(emoji, extension="png"):
    """Build the Noto-style image filename for an emoji.

    Every code point is written as lowercase hex, zero-padded to at least
    four digits. The parts are joined with underscores behind an ``emoji_u``
    prefix, e.g. ``emoji_u1f9cf_200d_2640.png``. The variation selector
    U+FE0F is left out of the name, except when it is the very first code
    point (the first code point is always kept).

    Args:
        emoji: The emoji as a string of one or more code points.
        extension: File extension to append, without the leading dot.

    Returns:
        The filename only, with no directory component.

    Raises:
        IndexError: If ``emoji`` is an empty string.
    """
    hex_codes = [f"{ord(char):04x}" for char in emoji]
    first, rest = hex_codes[0], hex_codes[1:]
    # Drop variation selectors from the tail; they are not part of the name.
    kept = [code for code in rest if code != 'fe0f']
    return f"emoji_u{'_'.join([first] + kept)}.{extension}"

In [95]:
# # count the lengths and graph it
# elco_df['length'] = elco_df['EM'].apply(lambda x: len(list(grapheme.graphemes(x))))
# elco_df['length'].hist(bins=elco_df['length'].max())
# elco_df['length'].describe()

### Main method to generate image datasets

In [None]:
# Per-split sets of 0-based CSV row indices for which no image was written.
skipped_indices = {split: set() for split in ('train', 'test', 'val')}
def generate_img_folder(folder_type, csv_file_path, local_image_dir, output_folder, filename_generator_fn, randomise=False):
  """Render each emoji sequence of one dataset split as a grid image.

  Reads ``csv_file_path``, converts every ``sent1`` description into emojis
  and writes ``<output_folder>/<row index>.png``. The emoji images are placed
  left-to-right, top-to-bottom on a grid that is MAX_EMOJIS_PER_ROW cells
  wide. A row produces no image, and its index is added to
  ``skipped_indices[folder_type]``, when it holds more than
  MAX_EMOJI_SEQUENCE_LENGTH emojis or when any of its emoji images cannot be
  loaded.

  WARNING: an existing ``output_folder`` is deleted before anything is written.

  NOTE: indices are only ever added to ``skipped_indices``; calling this
  function again for the same ``folder_type`` does not clear earlier entries.

  Args:
    folder_type: Key under which skipped row indices are recorded
      (e.g. 'train', 'test', 'val').
    csv_file_path: CSV file with a ``sent1`` column of emoji descriptions.
    local_image_dir: Directory holding the individual emoji images.
    output_folder: Directory to (re)create and fill with the generated images.
    filename_generator_fn: Callable ``(emoji, extension) -> filename`` for the
      image set in ``local_image_dir``.
    randomise: If true, pad the sequence with blanks up to
      MAX_EMOJI_SEQUENCE_LENGTH and shuffle it, so the emojis land in random
      grid cells.
  """
  df = pd.read_csv(csv_file_path)
  df['EM'] = df['sent1'].apply(emoji_str_from_description)

  # Start from an empty folder so images from an earlier run cannot linger.
  if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
  os.makedirs(output_folder)

  # setdefault lets callers use split names beyond the pre-registered ones
  # without hitting a KeyError halfway through the run.
  skipped = skipped_indices.setdefault(folder_type, set())
  cell_size = OUTPUT_SIZE // MAX_EMOJIS_PER_ROW
  undone = 0

  for i, text in enumerate(df['EM']):
    # Some sequences contain stray ',' and ' ' graphemes between the emojis.
    units = [unit for unit in grapheme.graphemes(text) if unit not in (',', ' ')]
    if len(units) > MAX_EMOJI_SEQUENCE_LENGTH:
      print(f"Skipping {text} because it's too long")
      skipped.add(i)
      undone += 1
      continue

    output_filename = f'{output_folder}/{i}.png'

    if randomise:
      # Blank placeholders fill the grid so the shuffle spreads the emojis
      # over all cells; blanks are skipped when drawing.
      units += [' '] * (MAX_EMOJI_SEQUENCE_LENGTH - len(units))
      np.random.shuffle(units)

    # np.zeros gives an all-black OUTPUT_SIZE x OUTPUT_SIZE 3-channel canvas.
    canvas = np.zeros((OUTPUT_SIZE, OUTPUT_SIZE, 3), dtype=np.uint8)

    for j, unit in enumerate(units):
      if unit == ' ':
        continue
      img = get_png_image_from_local_repo(unit, local_image_dir, filename_generator_fn)
      if img is None:
        # One missing emoji invalidates the whole row: record it and give up.
        skipped.add(i)
        undone += 1
        break

      # Defensive: drop a fourth (alpha) channel should the loader return one.
      if img.shape[2] == 4:
        img = img[..., :3]

      # Cell j sits at column j % width, row j // width of the grid.
      x = (j % MAX_EMOJIS_PER_ROW) * cell_size
      y = (j // MAX_EMOJIS_PER_ROW) * cell_size
      canvas[y:y + img.shape[0], x:x + img.shape[1]] = img
    else:
      # Only reached when the loop was not cut short by a missing image.
      cv2.imwrite(output_filename, canvas)

  print(f"Number of rows skipped in {folder_type}: {undone}")

### Calling the main method for each split (val, test, train)

In [97]:
# Directory with the individual emoji PNGs used as the image source.
local_image_dir = 'googlefonts-noto-emoji-main-png-512'

# Source CSV for each split, processed in the order val, test, train.
datasets = {
  split: os.path.join('.', 'original_ELCo_dataset', f'{split}.csv')
  for split in ('val', 'test', 'train')
}

# Each split gets its own output folder: generated_img_dataset/<split>_google.
for folder_type, csv_file_path in datasets.items():
  output_folder = os.path.join('generated_img_dataset', f'{folder_type}_google')
  generate_img_folder(folder_type, csv_file_path, local_image_dir, output_folder, emoji_to_noto_filename, randomise=False)


Couldn't find image for 🇦🇽 at googlefonts-noto-emoji-main-png-512/emoji_u1f1e6_1f1fd.png
Couldn't find image for 🇺🇸 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fa_1f1f8.png
Couldn't find image for 🇻🇪 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fb_1f1ea.png
Couldn't find image for 🇦🇽 at googlefonts-noto-emoji-main-png-512/emoji_u1f1e6_1f1fd.png
Couldn't find image for 🇺🇸 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fa_1f1f8.png
Couldn't find image for 🇫🇷 at googlefonts-noto-emoji-main-png-512/emoji_u1f1eb_1f1f7.png
Couldn't find image for 🇺🇳 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fa_1f1f3.png
Couldn't find image for 🇻🇪 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fb_1f1ea.png
Couldn't find image for 🇦🇫 at googlefonts-noto-emoji-main-png-512/emoji_u1f1e6_1f1eb.png
Couldn't find image for 🇻🇪 at googlefonts-noto-emoji-main-png-512/emoji_u1f1fb_1f1ea.png
Undone: 10
Skipping 👬👬👬👬👬👬👬👬👬👬👬 because it's too long
Couldn't find image for 🇷🇺 at googlefonts-noto-emoji-mai

# Generate CSVs

### Testing utility functions
* `list[emoji]` -> `str` 
* `str` -> `list[emoji]`

In [98]:
import emoji
import ast

def emoji_to_unicode(emoji_str):
    """Spell out each code point of ``emoji_str`` as ``U+<HEX>``.

    The hex digits are uppercase and not zero-padded; the entries are
    separated by single spaces. An empty input gives an empty string.
    """
    code_points = map(ord, emoji_str)
    return ' '.join(f"U+{code_point:X}" for code_point in code_points)

def process_emoji_list_to_str(emoji_list):
    """Join emoji aliases into a single ' [EM] '-separated, dot-terminated string.

    Leading and trailing colons are stripped from every alias first, so
    ``[':necktie:', ':chart_increasing:']`` becomes
    ``'necktie [EM] chart_increasing.'``.
    """
    bare_names = [alias.strip(':') for alias in emoji_list]
    return ' [EM] '.join(bare_names) + '.'

def unprocess_emoji_list_from_str(emoji_str):
    """Split a ' [EM] '-separated string back into a list of ``:alias:`` names.

    The final character of ``emoji_str`` is dropped without inspection (the
    terminating '.' in strings built by ``process_emoji_list_to_str``), then
    every piece is wrapped in colons.
    """
    without_last_char = emoji_str[:-1]
    return [':' + name + ':' for name in without_last_char.split(' [EM] ')]

def emoji_str_from_description(desc_str: str) -> str:
    """
    Translate an emoji description sentence into the emojis themselves.

    Args:
        desc_str (str): Sentence of the form "This is <name> [EM] <name>.".
                        As many characters as "This is " has (eight) are cut from
                        the front and one from the end, without checking what they are.

    Returns:
        str: The emojis for the ' [EM] '-separated names, concatenated in order.

    Example:
        >>> emoji_str_from_description('This is face_savoring_food [EM] bread.')
        '😋🍞'
    """
    prefix_len = len("This is ")
    body = desc_str[prefix_len:-1]
    aliases = (f":{name}:" for name in body.split(' [EM] '))
    return ''.join(map(emoji.emojize, aliases))

# Smoke checks: print what each helper returns, ending with a
# process -> unprocess round trip of the same alias list.
print(emoji_str_from_description('This is face_savoring_food [EM] bread.'))
print(emoji_to_unicode('🎷'))
_sample_aliases = [':necktie:', ':chart_increasing:']
print(process_emoji_list_to_str(_sample_aliases))
print(unprocess_emoji_list_from_str(process_emoji_list_to_str(_sample_aliases)))


😋🍞
U+1F3B7
necktie [EM] chart_increasing.
[':necktie:', ':chart_increasing:']


In [99]:
import csv
import ast
import os

def generate_csv(folder_type, input_file_path, output_file_path, img_folder):
    """Write the CSV that pairs each kept row of a split with its image file.

    Rows whose 0-based position is listed in ``skipped_indices[folder_type]``
    are left out. Every other row yields one output record:

    * ``EM``: emojis decoded from the ``sent1`` column.
    * ``EN``: ``sent2`` without its first two whitespace-separated words and
      without its final character, with the remaining words single-spaced.
    * ``unicode``: the code points of ``EM``.
    * ``label`` / ``strategy``: copied from the input row.
    * ``image``: ``<img_folder>/<row position>.png``.

    Args:
        folder_type: Key into ``skipped_indices`` for this split.
        input_file_path: Source CSV with ``sent1``, ``sent2``, ``label`` and
            ``strategy`` columns.
        output_file_path: Destination CSV; overwritten if it exists.
        img_folder: Folder prefix used for the ``image`` column.
    """
    columns = ['EM', 'EN', 'unicode', 'label', 'strategy', 'image']

    with open(input_file_path, newline='', encoding='utf-8') as src, \
            open(output_file_path, 'w', newline='', encoding='utf-8') as dst:
        out = csv.DictWriter(dst, fieldnames=columns)
        out.writeheader()

        # Row positions must keep counting over skipped rows, because the
        # image files are named after the position in the *input* CSV.
        for position, record in enumerate(csv.DictReader(src)):
            if position in skipped_indices[folder_type]:
                continue

            emojis = emoji_str_from_description(record['sent1'])
            sent2_words = record['sent2'].split()
            english = ' '.join(sent2_words[2:])[:-1]
            image_path = os.path.join(img_folder, f"{position}.png")

            out.writerow({
                'EM': emojis,
                'EN': english,
                'unicode': emoji_to_unicode(emojis),
                'label': record['label'],
                'strategy': record['strategy'],
                'image': image_path,
            })

    print(f"Conversion complete! Output saved to {output_file_path}")

In [100]:
# One (split, source CSV, generated CSV, image folder) tuple per dataset split,
# in the order train, test, val.
csvs_to_generate = [
    (
        split,
        os.path.join('.', 'original_ELCo_dataset', f'{split}.csv'),
        os.path.join('.', 'generated_img_dataset', f'{split}.csv'),
        os.path.join('.', 'generated_img_dataset', f'{split}_google'),
    )
    for split in ('train', 'test', 'val')
]

# The loop variables deliberately avoid the name `input`: at module level it
# would replace the builtin input() for the rest of the session.
for folder_type, input_csv, output_csv, img_folder in csvs_to_generate:
    generate_csv(folder_type, input_csv, output_csv, img_folder)

Conversion complete! Output saved to ./generated_img_dataset/train.csv
Conversion complete! Output saved to ./generated_img_dataset/test.csv
Conversion complete! Output saved to ./generated_img_dataset/val.csv


In [101]:
# Print how many row indices were recorded per split, then the raw sets.
for split_name in skipped_indices:
    skipped_count = len(skipped_indices[split_name])
    print(f"Length of skipped_indices['{split_name}']: {skipped_count}")
print(skipped_indices)

Length of skipped_indices['train']: 57
Length of skipped_indices['test']: 25
Length of skipped_indices['val']: 10
{'train': {1664, 386, 2309, 1542, 1031, 394, 2317, 2318, 2321, 402, 2324, 2326, 2328, 281, 1187, 422, 2343, 1448, 2346, 2348, 2351, 1586, 2356, 311, 1594, 2238, 2241, 1606, 1607, 971, 1109, 1111, 475, 1116, 1117, 1118, 1119, 1121, 1122, 1636, 1125, 1127, 1129, 1131, 1133, 1134, 367, 879, 880, 882, 1135, 1136, 1138, 1651, 2038, 1659, 2045}, 'test': {136, 26, 292, 42, 325, 335, 341, 342, 346, 349, 224, 225, 104, 106, 107, 108, 109, 362, 111, 492, 495, 114, 116, 246, 248}, 'val': {352, 354, 356, 200, 203, 23, 24, 155, 350, 351}}


### Ensures csv rows match the image folders

In [102]:
for folder_type, _, output_csv, img_folder in csvs_to_generate:
    # Count parsed CSV records, not physical lines: a quoted field may span
    # several lines and blank lines are not records. DictReader consumes the
    # header row itself, so no manual "- 1" is needed.
    with open(output_csv, newline='', encoding='utf-8') as csv_file:
        csv_row_count = sum(1 for _row in csv.DictReader(csv_file))

    # Count the regular files (not sub-directories) in the image folder.
    img_file_count = sum(
        1 for entry in os.listdir(img_folder)
        if os.path.isfile(os.path.join(img_folder, entry))
    )

    # Same report line either way; only the leading word differs.
    status = "Match" if csv_row_count == img_file_count else "Mismatch"
    print(f"{status} for {folder_type}: CSV rows = {csv_row_count}, Images = {img_file_count}")

Match for train: CSV rows = 2341, Images = 2341
Match for test: CSV rows = 493, Images = 493
Match for val: CSV rows = 384, Images = 384
