In [None]:
# Generates images for processing by the VLM.
# Prerequisite for running this notebook: you need the emoji image sets used by the respective generators.
# Download and unzip Google's emoji dataset from:
# https://github.com/googlefonts/noto-emoji/
#
# Download and unzip Twitter's emoji dataset from:
# https://github.com/jdecked/twemoji/
#
# Yes, I am not a data person.

# Side length, in pixels, of every generated image (the output is square).
OUTPUT_SIZE = 224
# Number of emoji cells in one row of the output grid.
MAX_EMOJIS_PER_ROW = 3
# The grid has as many rows as columns, so this is the most emojis one image can hold.
MAX_EMOJI_SEQUENCE_LENGTH = MAX_EMOJIS_PER_ROW ** 2


In [None]:
import pandas as pd
import grapheme

elco_df = pd.read_csv('elco.csv')

# Count the emojis in every sequence and plot the distribution.
# The 'EM' column also contains ',' and ' ' separator graphemes; generate_dataset
# drops them before laying out a sequence, so they are excluded here as well.
# Otherwise the lengths would not match what the generator actually has to fit.
elco_df['length'] = elco_df['EM'].apply(
    lambda text: sum(1 for unit in grapheme.graphemes(text) if unit not in (',', ' '))
)
elco_df['length'].hist(bins=elco_df['length'].max())
elco_df['length'].describe()

In [None]:
# Count the sequences that exceed each length threshold.
# A notebook cell only displays its last expression, so both counts are returned
# together in one dict instead of as two separate expressions (the first of which
# would be silently discarded).
{
    f'length > {MAX_EMOJI_SEQUENCE_LENGTH}': int((elco_df['length'] > MAX_EMOJI_SEQUENCE_LENGTH).sum()),
    'length > 4': int((elco_df['length'] > 4).sum()),
}

In [None]:
# common utils
import os
import cv2
import grapheme
import numpy as np
import shutil
# Seed NumPy's global RNG so the np.random.shuffle call in generate_dataset
# (used when randomise=True) starts from the same state on every fresh run.
np.random.seed(0)

def get_png_image_from_local_repo(emoji, image_dir, filename_getter_fn, extension="png"):
  """Load the image of one emoji from a local directory, resized to one grid cell.

  Args:
    emoji: The emoji to look up.
    image_dir: Directory that contains the emoji image files.
    filename_getter_fn: Callable ``(emoji, extension) -> filename`` mapping the emoji
      to the file name used inside ``image_dir``.
    extension: File extension handed to ``filename_getter_fn``.

  Returns:
    A 3-channel image array of shape ``(cell, cell, 3)`` with
    ``cell = OUTPUT_SIZE // MAX_EMOJIS_PER_ROW``, or ``None`` (after printing a
    message) when the file does not exist or cannot be decoded.
  """
  filename = filename_getter_fn(emoji, extension)
  path = os.path.join(image_dir, filename)
  if not os.path.exists(path):
    print(f"Couldn't find image for {emoji} at {path}")
    return None

  # IMREAD_COLOR always yields three channels (OpenCV's BGR order); alpha is dropped.
  img = cv2.imread(path, cv2.IMREAD_COLOR)
  if img is None:
    # cv2.imread reports an unreadable/corrupt file by returning None, not by raising.
    print(f"Couldn't read image for {emoji} at {path}")
    return None

  cell_size = OUTPUT_SIZE // MAX_EMOJIS_PER_ROW
  return cv2.resize(img, (cell_size, cell_size))

def generate_dataset(local_image_dir, output_folder, filename_generator_fn, randomise=False, emoji_sequences=None):
  """Render each emoji sequence as a grid image and write it to ``output_folder``.

  Every sequence is split into graphemes, the ',' and ' ' separators are dropped,
  and the remaining emojis are pasted row by row onto a black
  ``OUTPUT_SIZE x OUTPUT_SIZE`` canvas with ``MAX_EMOJIS_PER_ROW`` cells per row.
  The image is saved as ``<output_folder>/<i>.png`` where ``i`` is the position of
  the sequence in the input, so skipped sequences leave gaps in the numbering.

  Args:
    local_image_dir: Directory holding the per-emoji image files.
    output_folder: Destination directory. It is deleted and recreated on every call.
    filename_generator_fn: Callable ``(emoji, extension) -> filename`` for the image set.
    randomise: When true, the sequence is padded with blanks up to
      ``MAX_EMOJI_SEQUENCE_LENGTH`` and shuffled with NumPy's global RNG, so the
      emojis land in random cells of the grid.
    emoji_sequences: Iterable of emoji strings to render. Defaults to the ``'EM'``
      column of the notebook-level ``elco_df``.

  Side effects:
    Prints a message for every sequence that is too long and a final
    ``Undone: <n>`` summary counting the sequences for which no image was written.
  """
  if emoji_sequences is None:
    emoji_sequences = elco_df['EM']

  # Start from an empty folder so images from a previous run never linger.
  if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
  os.makedirs(output_folder)

  cell_size = OUTPUT_SIZE // MAX_EMOJIS_PER_ROW
  undone = 0

  for i, text in enumerate(emoji_sequences):
    # The raw sequences contain ',' and ' ' separators between emojis; drop them.
    units = [unit for unit in grapheme.graphemes(text) if unit != ',' and unit != ' ']
    if len(units) > MAX_EMOJI_SEQUENCE_LENGTH:
      print(f"Skipping {text} because it's too long")
      undone += 1
      continue

    if randomise:
      # Pad with blank slots to a full grid, then shuffle so emojis land in random cells.
      units.extend(' ' * (MAX_EMOJI_SEQUENCE_LENGTH - len(units)))
      np.random.shuffle(units)

    # np.zeros gives a black canvas; cells without an emoji stay black.
    canvas = np.zeros((OUTPUT_SIZE, OUTPUT_SIZE, 3), dtype=np.uint8)

    generated = True
    for j, unit in enumerate(units):
      if unit == ' ':
        continue  # blank padding slot
      img = get_png_image_from_local_repo(unit, local_image_dir, filename_generator_fn)
      if img is None:
        # One missing emoji invalidates the whole sequence: write nothing for it.
        undone += 1
        generated = False
        break

      # Paste into grid cell j (row-major), keeping only the first three channels
      # in case the loader ever returns an alpha channel.
      row, col = divmod(j, MAX_EMOJIS_PER_ROW)
      y = row * cell_size
      x = col * cell_size
      canvas[y:y + img.shape[0], x:x + img.shape[1]] = img[..., :3]

    if generated:
      output_filename = os.path.join(output_folder, f'{i}.png')
      # cv2.imwrite returns False instead of raising when the file cannot be written.
      if not cv2.imwrite(output_filename, canvas):
        print(f"Couldn't write image to {output_filename}")
        undone += 1

  print(f"Undone: {undone}")

In [None]:
def emoji_to_noto_filename(emoji, extension="png"):
    """Build the file name this notebook expects for an emoji in the noto-emoji image set.

    The name is ``emoji_u`` followed by the emoji's code points as lower-case
    hexadecimal (zero-padded to at least four digits) joined with underscores,
    e.g. ``emoji_u1f9cf_200d_2640.png``. Variation selectors (U+FE0F) after the
    first code point are left out.

    Args:
        emoji: The emoji as a string of one or more code points.
        extension: File extension to append, without the leading dot.

    Returns:
        The file name, e.g. ``"emoji_u2764.png"`` for U+2764 U+FE0F.

    Raises:
        ValueError: If ``emoji`` is empty.
    """
    if not emoji:
        raise ValueError("emoji must contain at least one code point")

    hex_codepoints = [f"{ord(char):04x}" for char in emoji]
    # The first code point is always kept; later variation selectors are dropped.
    kept = hex_codepoints[:1] + [cp for cp in hex_codepoints[1:] if cp != 'fe0f']
    return f"emoji_u{'_'.join(kept)}.{extension}"

# Render the dataset with the images from the noto-emoji png/72 directory.
local_image_dir = 'noto-emoji-main/noto-emoji-main/png/72'
output_folder = 'google_dataset'

generate_dataset(
    local_image_dir=local_image_dir,
    output_folder=output_folder,
    filename_generator_fn=emoji_to_noto_filename,
    randomise=False,
)

In [None]:
def emoji_to_twemoji_filename(emoji, extension="png"):
    """Build the file name of an emoji in the Twemoji asset set.

    The name is the emoji's code points as lower-case hexadecimal (no padding)
    joined with hyphens, e.g. ``1f9cf-200d-2640-fe0f.png``. This follows Twemoji's
    own lookup rule: variation selectors (U+FE0F) are removed unless the sequence
    contains a zero-width joiner (U+200D), in which case the sequence is kept
    as is. A keycap such as U+0031 U+FE0F U+20E3 therefore maps to ``31-20e3.png``.

    Args:
        emoji: The emoji as a string of one or more code points.
        extension: File extension to append, without the leading dot.

    Returns:
        The file name, e.g. ``"2764.png"`` for U+2764 U+FE0F.
    """
    hex_codepoints = [f"{ord(char):x}" for char in emoji]
    if '200d' not in hex_codepoints:
        # No ZWJ in the sequence: Twemoji names the asset without variation selectors.
        hex_codepoints = [cp for cp in hex_codepoints if cp != 'fe0f']
    return f"{'-'.join(hex_codepoints)}.{extension}"

# Render the dataset with the images from the Twemoji assets/72x72 directory.
local_image_dir = 'twemoji-main/twemoji-main/assets/72x72'
output_folder = 'twitter_dataset'

generate_dataset(
    local_image_dir=local_image_dir,
    output_folder=output_folder,
    filename_generator_fn=emoji_to_twemoji_filename,
    randomise=False,
)
