In [23]:
# Function to convert emojis to Unicode
def emoji_to_unicode(emoji_str):
    return ' '.join([f"U+{ord(char):X}" for char in emoji_str])

def process_emoji_list_to_str(emoji_list):
    desc_processed = ' [EM] '.join(desc.strip(':') for desc in emoji_list)
    return f"{desc_processed}."

def unprocess_emoji_list_from_str(emoji_str):
    s = emoji_str[:-1].split(' [EM] ')
    return [f":{desc}:" for desc in s]

import ast
# write some tests for processing and unprocessing
desc_list = ast.literal_eval("[':necktie:', ':chart_increasing:']")
print(process_emoji_list_to_str(desc_list))
print(unprocess_emoji_list_from_str(process_emoji_list_to_str(desc_list)))


necktie [EM] chart_increasing.
[':necktie:', ':chart_increasing:']


In [24]:
import csv
import ast

input_csv = 'ELCo.csv'
output_csv = 'dataset_only_true.csv'


with open(input_csv, newline='', encoding='utf-8') as fin, \
     open(output_csv, 'w', newline='', encoding='utf-8') as fout:

    reader = csv.DictReader(fin)

    # Update fieldnames to include all keys in writer.writerow()
    fieldnames = ['sent1', 'sent2', 'unicode', 'label', 'strategy', 'attribute', 'filename', 'emoji']
    writer = csv.DictWriter(fout, fieldnames=fieldnames)
    writer.writeheader()

    i = 0
    for row in reader:
        desc_list = ast.literal_eval(row['Description'])  # Convert string to list 

        sent1 = process_emoji_list_to_str(desc_list)
        sent2 = row['EN']

        label = 1

        unicode_repr = emoji_to_unicode(row['EM'])

        writer.writerow({
            'sent1': sent1,
            'sent2': sent2,
            'unicode': unicode_repr,  # Added this to match fieldnames
            'label': label,
            'strategy': row['Composition strategy'],
            'attribute': row['Attribute'],
            'filename': f"{i}.png",
            'emoji': row['EM'],
        })

        i += 1

print(f"Conversion complete! Output saved to {output_csv}")


Conversion complete! Output saved to dataset_only_true.csv


In [25]:
import csv
import ast
import pandas as pd

input_csv_folder = 'original_ELCo_dataset'
input_csvs = ['train.csv', 'val.csv', 'test.csv']
output_csv = 'dataset_only_false.csv'
reference_csv = 'ELCo_no_punctuation.csv'

elco_df = pd.read_csv(reference_csv)

with open(output_csv, 'w', newline='', encoding='utf-8') as fout:
  
    fieldnames = ['sent1', 'sent2', 'unicode', 'label', 'strategy', 'attribute', 'filename', 'emoji']
    writer = csv.DictWriter(fout, fieldnames=fieldnames)
    writer.writeheader()

    for input_csv in input_csvs:
        with open(f"{input_csv_folder}/{input_csv}", newline='', encoding='utf-8') as fin:
            reader = csv.DictReader(fin)

            for row in reader:
                if row['label'] == '1':
                    continue
                

                sent1 = row['sent1']
                # strip the 'this is ' prefix
                tmp = sent1[8:]
                # unprocess the emoji string
                tmp = str(unprocess_emoji_list_from_str(tmp))

                # get the unique row index of elco_df where the Description field matches tmp
                elco_row = elco_df[elco_df['Description'] == tmp].index[0]

                sent2 = row['sent2']
                label = 0
                unicode_repr = emoji_to_unicode(elco_df.loc[elco_row, 'EM'])

                writer.writerow({
                    'sent1': sent1,
                    'sent2': sent2,
                    'unicode': unicode_repr,
                    'label': label,
                    'strategy': elco_df.loc[elco_row, 'Composition strategy'],
                    'attribute': elco_df.loc[elco_row, 'Attribute'],
                    'filename': f"{elco_row}.png",
                    'emoji': elco_df.loc[elco_row, 'EM'],
                })

print(f"Conversion complete! Output saved to {output_csv}")

Conversion complete! Output saved to dataset_only_false.csv


In [26]:
### Utility stuff

In [27]:
import os


def count_missing_numbers(folder_path):
  # Get a list of all files in the folder
  files = os.listdir(folder_path)
  
  # Extract numbers from filenames and convert them to integers
  numbers = sorted([int(f.split('.')[0]) for f in files if f.split('.')[0].isdigit()])
  
  # Find the missing numbers
  missing_numbers = [num for num in range(numbers[0], numbers[-1] + 1) if num not in numbers]
  
  return missing_numbers

folder_path = 'google_dataset'
missing_numbers = count_missing_numbers(folder_path)
print(f"Missing numbers in filenames: {missing_numbers}")
print(len(missing_numbers))

Missing numbers in filenames: [26, 42, 375, 376, 488, 544, 566, 622, 630, 638, 1343, 1521, 1523, 1528, 1529, 1530, 1531, 1533, 1534, 1537, 1539, 1541, 1545, 1551, 1553, 1554, 1555, 1556, 1558]
29


In [28]:
# count number of true and false rows in csvs
import pandas as pd
true_df = pd.read_csv('dataset_only_true.csv')
false_df = pd.read_csv('dataset_only_false.csv')

print(f"Number of true rows: {len(true_df)}")
print(f"Number of false rows: {len(false_df)}")


train_df = pd.read_csv('original_ELCo_dataset/train.csv')
val_df = pd.read_csv('original_ELCo_dataset/val.csv')
test_df = pd.read_csv('original_ELCo_dataset/test.csv')

print(f"Number of rows in train.csv: {len(train_df)}")
print(f"Number of rows in val.csv: {len(val_df)}")
print(f"Number of rows in test.csv: {len(test_df)}")
print(f"Sums to: {len(train_df) + len(val_df) + len(test_df)}")

Number of true rows: 1655
Number of false rows: 1655
Number of rows in train.csv: 2398
Number of rows in val.csv: 394
Number of rows in test.csv: 518
Sums to: 3310


In [29]:
import pandas as pd
import regex

# Function to extract all emojis from a text string using Unicode properties.
def extract_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.findall(text) if isinstance(text, str) else []

# Read the CSV files
df_true = pd.read_csv('dataset_only_true.csv')
df_false = pd.read_csv('dataset_only_false.csv')
df = pd.concat([df_true, df_false])
df_emoji = pd.read_csv('/home/andrew/CS4248_project/dataset/noto-emoji/emoji_dataset.csv')

# Create a new column with the extracted emojis from the original 'emoji' column
emojis = df['emoji'].apply(extract_emojis)
print(emojis)
file_name_list = []
for emoji_list in emojis:
    file_name = []
    for emoji in emoji_list:
        df_emoji_row = df_emoji[df_emoji['emoji'] == emoji]
        if len(df_emoji_row) == 0:
            print(f"Emoji {emoji} not found in emoji dataset")
            continue
        file_name.append("noto-emoji/png/512/"+f"{df_emoji_row['filename'].values[0]}")
    file_name_list.append(file_name)
df['separate_filenames'] = file_name_list

df.to_csv('merged_emoji.csv', index=False)


0                               [👔, 📈]
1                            [🏢, 🤑, 🤑]
2                            [👨, 💻, 🤝]
3       [🏢, 🧑, 🤝, 🧑, 🧑, 🤝, 🧑, 🧑, 🤝, 🧑]
4                            [👩, 💻, 🤑]
                     ...              
1650                            [👍, 👣]
1651                            [👏, 🪜]
1652                         [😤, 🗣, 💬]
1653                            [💨, 🤬]
1654                         [👍, 👣, ➡]
Name: emoji, Length: 3310, dtype: object


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the merged CSV file, preserving the header row.
df = pd.read_csv("/home/andrew/CS4248_project/dataset/merged_emoji.csv")  # header=0 by default

# Split the data.
# train: 80%, val: 10%, test: 10%
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the splits to CSV files (the column names will be preserved).
train_df.to_csv("/home/andrew/CS4248_project/dataset/train.csv", index=False)
val_df.to_csv("/home/andrew/CS4248_project/dataset/val.csv", index=False)
test_df.to_csv("/home/andrew/CS4248_project/dataset/test.csv", index=False)

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (2648, 9)
Validation shape: (331, 9)
Test shape: (331, 9)


In [14]:
import pandas as pd
import regex

# Function to extract all emojis from a text string using Unicode properties.
def extract_emojis(text):
    emoji_pattern = regex.compile(r'\p{Emoji}', flags=regex.UNICODE)
    return emoji_pattern.findall(text) if isinstance(text, str) else []

# Read the CSV files
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_val = pd.read_csv('val.csv')
df_emoji = pd.read_csv('/home/andrew/CS4248_project/dataset/noto-emoji/emoji_dataset.csv')

# Create a new column with the extracted emojis from the original 'emoji' column
for df in [df_train, df_test, df_val]:
    emojis = df['EM'].apply(extract_emojis)
    print(emojis)
    file_name_list = []
    for emoji_list in emojis:
        file_name = []
        for emoji in emoji_list:
            df_emoji_row = df_emoji[df_emoji['emoji'] == emoji]
            if len(df_emoji_row) == 0:
                print(f"Emoji {emoji} not found in emoji dataset")
                continue
            file_name.append("noto-emoji/png/512/"+f"{df_emoji_row['filename'].values[0]}")
        file_name_list.append(file_name)
    df['separate_filenames'] = file_name_list
    
    if df is df_train:
        df.to_csv('train.csv', index=False)
    elif df is df_test:
        df.to_csv('test.csv', index=False)
    elif df is df_val:
        df.to_csv('val.csv', index=False)


0                      [🎇, 🔮]
1                      [🔆, 🔮]
2                      [🌟, 🎓]
3       [😎, 👨, 🔬, 👩, ⚕, 🧑, 💻]
4                      [☀, 🔮]
                ...          
2393                [🔈, 🗑, 🚮]
2394                   [👂, 📋]
2395       [🤔, 🧠, 🔊, 🗣, ☺, 👍]
2396          [🤔, 🧠, 🗣, 🙂, 🆗]
2397                [🗣, 🧏, ♀]
Name: EM, Length: 2398, dtype: object


0                              [👔, 📈]
1                           [🏢, 🤑, 🤑]
2                           [👨, 💻, 🤝]
3      [🏢, 🧑, 🤝, 🧑, 🧑, 🤝, 🧑, 🧑, 🤝, 🧑]
4                           [👩, 💻, 🤑]
                    ...              
513                            [👍, 👣]
514                            [👏, 🪜]
515                         [😤, 🗣, 💬]
516                            [💨, 🤬]
517                         [👍, 👣, ➡]
Name: EM, Length: 518, dtype: object
0         [⏱, 🗺, 🥲]
1            [👣, 🛣]
2            [😪, 🗺]
3      [⏳, 🚗, ✈, 🛤]
4         [🛤, 🚢, 🗺]
           ...     
389       [🍃, 🥬, 🍃]
390          [🆕, 👃]
391    [🌬, 👃, 👌, 😌]
392          [😋, 🥬]
393       [🧼, 👃, 💨]
Name: EM, Length: 394, dtype: object


In [1]:
import pandas as pd
import re

# Load the CSV
df_train = pd.read_csv("train.csv")
df_val = pd.read_csv("val.csv")
df_test = pd.read_csv("test.csv")
# Function to check if string contains ONLY non-English characters
def is_pure_emoji(text):
    return not bool(re.search(r'[A-Za-z]', str(text)))
for df in [df_train, df_test, df_val]:


    # Keep only rows where EM is pure emoji (no English letters)
    df_filtered = df[df['EM'].apply(is_pure_emoji)]

    # Save the result
    if df is df_train:
        df_filtered.to_csv("train_only_emoji.csv", index=False)
    elif df is df_val:
        df_filtered.to_csv("val_only_emoji.csv", index=False)
    elif df is df_test:
        df_filtered.to_csv("test_only_emoji.csv", index=False)
    print(f"Filtered {df} saved as {df}_only_emoji.csv")
    


Filtered                EM             EN  \
0              🎇🔮  bright future   
1              🔆🔮  bright future   
2              🌟🎓  bright future   
3     😎👨‍🔬👩‍⚕️🧑‍💻  bright future   
4             ☀️🔮  bright future   
...           ...            ...   
2393         🔈🗑️🚮   sound advice   
2394           👂📋   sound advice   
2395     🤔🧠🔊🗣️☺️👍   sound advice   
2396       🤔🧠🗣️🙂🆗   sound advice   
2397       🗣️🧏‍♀️   sound advice   

                                                unicode  label  strategy  \
0                                       U+1F387 U+1F52E      1         0   
1                                       U+1F506 U+1F52E      1         0   
2                                       U+1F31F U+1F393      1         1   
3     U+1F60E U+1F468 U+200D U+1F52C U+1F469 U+200D ...      1         2   
4                                 U+2600 U+FE0F U+1F52E      1         0   
...                                                 ...    ...       ...   
2393                     U