In [1]:
import pandas as pd
# Show every row when a DataFrame is displayed (turns off pandas' row truncation).
pd.set_option('display.max_rows', None)
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [2]:
# Read the emoji/language table and drop the rows whose language tag is "und"
# (presumably the "undetermined" language tag -- confirm against the data source).
emoji_file = "twitter_data/emoji_by_language.csv"
orig_df = pd.read_csv(emoji_file).loc[lambda frame: frame["lang"] != "und"]

# Aggregation 1: Raw Counts

How many languages is each emoji represented in? I.e., in how many languages does each emoji appear?

In [3]:
df = orig_df.copy()

# {emoji -> number of distinct languages in which that emoji has a row}.
emoji_lang_appearances = df.groupby("emoji")["lang"].nunique().to_dict()

# Pool the counts over all languages, then derive each emoji's share of the
# total ("pdf") and the running share from most to least used ("cdf").
df = df.groupby("emoji")[["count"]].sum()
df["pdf"] = df["count"].div(df["count"].sum())
df = df.sort_values(by="pdf", ascending=False).reset_index()
df["cdf"] = df["pdf"].cumsum()
df["lang_appearances"] = df["emoji"].map(emoji_lang_appearances)

# The 25 most used emoji by raw count, followed by a preview of the top 100 rows.
ranked_by_cdf = df.sort_values(by="cdf", ascending=True)
list_top_emoji = ranked_by_cdf["emoji"].head(25).tolist()
ranked_by_cdf.head(100)


Unnamed: 0,emoji,count,pdf,cdf,lang_appearances
0,:loudly_crying_face:,1666332,0.086124,0.086124,62
1,:face_with_tears_of_joy:,1611427,0.083286,0.16941,63
2,:rolling_on_the_floor_laughing:,795204,0.0411,0.21051,64
3,:red_heart:,667475,0.034498,0.245009,64
4,:pleading_face:,473567,0.024476,0.269485,62
5,:smiling_face_with_heart-eyes:,336008,0.017367,0.286851,60
6,:smiling_face_with_hearts:,293209,0.015154,0.302006,60
7,:folded_hands:,283212,0.014638,0.316644,62
8,:smiling_face_with_tear:,265011,0.013697,0.330341,58
9,:sparkles:,260981,0.013489,0.343829,56


# Aggregation 2: Sum of Per-Language Proportions

$$P(emoji) = \sum_{lang}{\frac{count(emoji|lang)}{count(lang)}}$$

In [4]:
df = orig_df.copy()

# Count of each emoji within each language: one row per (emoji, lang) pair.
# Only the numeric `count` column is summed; summing the whole frame would also
# "sum" (concatenate) string columns on pandas >= 2.0.
emoji_lang = df.groupby(['emoji', 'lang'], as_index=False)['count'].sum()

# Total emoji count per language, i.e. the denominator count(lang) of the formula.
lang = df.groupby('lang')[['count']].sum()

# Share of each language's emoji usage taken by each emoji. The per-language
# total is looked up as a scalar through `map`; indexing `lang.loc[code]` would
# return a whole row and only work by accident when `lang` has a single column.
emoji_lang['proportion'] = emoji_lang['count'] / emoji_lang['lang'].map(lang['count'])

# Per-emoji score: the sum of its per-language proportions.
new_df = emoji_lang.groupby('emoji', as_index=False)[['count', 'proportion']].sum()

In [5]:
# The 25 emoji with the largest summed per-language proportion.
agg2_ranked = new_df.sort_values(by="proportion", ascending=False)
agg2_list = agg2_ranked["emoji"].iloc[:25].tolist()

# Aggregation 3: Counts re-weighted by Language

How much "voting power" does each language have?

In [6]:
df = orig_df.copy()

# Each language's share of the overall emoji count, largest share first.
total_count = df["count"].sum()
language_weight_df = (
    df.groupby("lang")[["count"]]
    .sum()
    .div(total_count)
    .sort_values(by="count", ascending=False)
)
print(language_weight_df.head(10))

# Same shares as a plain {lang -> share} mapping.
language_weight = language_weight_df["count"].to_dict()

         count
lang          
en    0.384414
es    0.104100
ar    0.085227
ja    0.080167
pt    0.072360
in    0.071406
tl    0.031040
fr    0.026545
tr    0.026364
ko    0.018307


In [7]:
# Add the running total of the shares as "cdf" and turn the `lang` index into a column.
# NOTE(review): not idempotent -- running this cell twice resets the index a second time.
language_weight_df = language_weight_df.assign(
    cdf=language_weight_df["count"].cumsum()
).reset_index()

Re-weight for equal vote

In [8]:
df = orig_df.copy()
language_weight = df.groupby("lang").agg({"count": sum}).reset_index()
num_langs = len(language_weight)
a = compute_class_weight("balanced", classes=language_weight.lang.unique(), y=df["lang"])
language_reweight = {c: w for c, w in zip(language_weight.lang.unique(), a)}

In [9]:
df = orig_df.copy()

# Scale every row's count by the weight of its language.
df["new_counts"] = df["count"] * df["lang"].map(language_reweight)

# Pool raw and re-weighted counts per emoji; "pdf"/"cdf" are based on the
# re-weighted counts.
df = df.groupby("emoji")[["count", "new_counts"]].sum()
df["pdf"] = df["new_counts"].div(df["new_counts"].sum())
df = df.sort_values(by="pdf", ascending=False).reset_index()
df["cdf"] = df["pdf"].cumsum()
df["lang_appearances"] = df["emoji"].map(emoji_lang_appearances)

# The 25 top emoji after re-weighting, then the full ranked table.
reweighted_by_cdf = df.sort_values(by="cdf", ascending=True)
list_top_emoji_reweighted = reweighted_by_cdf["emoji"].head(25).tolist()
reweighted_by_cdf

Unnamed: 0,emoji,count,new_counts,pdf,cdf,lang_appearances
0,:face_with_tears_of_joy:,1611427,562498.833727,0.09360387,0.093604,63
1,:loudly_crying_face:,1666332,452440.817274,0.07528942,0.168893,62
2,:rolling_on_the_floor_laughing:,795204,269820.346428,0.04490006,0.213793,64
3,:red_heart:,667475,200947.709885,0.03343915,0.247232,64
4,:folded_hands:,283212,151131.341463,0.02514935,0.272382,62
5,:pleading_face:,473567,136952.195752,0.02278984,0.295172,62
6,:smiling_face_with_heart-eyes:,336008,99346.491672,0.01653197,0.311704,60
7,:beaming_face_with_smiling_eyes:,203793,89787.487344,0.01494129,0.326645,63
8,:smiling_face_with_tear:,265011,89535.408435,0.01489934,0.341544,58
9,:grinning_face_with_sweat:,256558,89282.25485,0.01485721,0.356401,63


What's the difference between these lists?

In [10]:
# Emoji that are in exactly one of the two top-25 lists (raw vs. re-weighted).
set(list_top_emoji) ^ set(list_top_emoji_reweighted)

{':clown_face:', ':relieved_face:', ':skull:', ':slightly_smiling_face:'}

In [11]:
# Emoji that are in exactly one of the two top-25 lists (raw vs. Aggregation 2).
set(list_top_emoji) ^ set(agg2_list)

{':backhand_index_pointing_down:',
 ':clown_face:',
 ':grinning_face:',
 ':kiss_mark:',
 ':relieved_face:',
 ':skull:',
 ':slightly_smiling_face:',
 ':sparkles:',
 ':weary_face:',
 ':white_heart:'}

In [12]:
# Emoji that are in exactly one of the two top-25 lists (re-weighted vs. Aggregation 2).
set(list_top_emoji_reweighted) ^ set(agg2_list)

{':backhand_index_pointing_down:',
 ':grinning_face:',
 ':kiss_mark:',
 ':sparkles:',
 ':weary_face:',
 ':white_heart:'}

## Language Resource Tiers

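**High Resource**: languages within the top 90% of cumulative emoji usage (`cdf < 0.9`)

**Low Resource**: languages between 90% and 99.9% of cumulative emoji usage (`0.9 <= cdf < 0.999`); languages beyond 99.9% are treated as not having enough data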

In [13]:
# Languages whose cumulative usage share is still below 90%.
high_resource = language_weight_df.loc[language_weight_df["cdf"] < .9, "lang"].tolist()
high_resource

['en', 'es', 'ar', 'ja', 'pt', 'in', 'tl', 'fr', 'tr', 'ko']

In [14]:
# Languages whose cumulative usage share falls in [0.9, 0.999).
in_low_band = language_weight_df["cdf"].ge(.9) & language_weight_df["cdf"].lt(.999)
low_resource = language_weight_df.loc[in_low_band, "lang"].tolist()

In [15]:
# Remaining languages: cumulative usage share at or beyond 0.999.
language_weight_df.loc[language_weight_df["cdf"].ge(.999), "lang"].tolist()

['bn',
 'gu',
 'bg',
 'ckb',
 'si',
 'kn',
 'ps',
 'pa',
 'am',
 'or',
 'sd',
 'my',
 'hy',
 'dv',
 'lo',
 'ka',
 'km',
 'ug',
 'bo']

In [16]:
# Report the size of both language tiers, one per line.
print(
    f"Number of languages with high resources: {len(high_resource)}",
    f"Number of languages with low resources: {len(low_resource)}",
    sep="\n",
)

Number of languages with high resources: 10
Number of languages with low resources: 36


In [17]:
# Count the languages outside both tiers (cumulative usage share >= 0.999).
print(f"Number of languages with not enough data: {len(language_weight_df[language_weight_df.cdf >= .999].lang.tolist())}")

Number of languages with not enough data: 19


## Emoji Tiers

**Most popular**: emoji within the top 50% of cumulative usage (`cdf < 0.5`)

**Good representation**: emoji between 50% and 90% of cumulative usage (`0.5 <= cdf < 0.9`)

In [44]:
# Emoji whose cumulative share is below 50%.
# NOTE(review): `df` is whichever frame was bound last; when the notebook is run
# top to bottom that is the re-weighted table -- confirm this is the intended one.
top_popular_emojis = df.loc[df["cdf"] < .5, "emoji"].tolist()

In [19]:
# Rows of the original data restricted to the most popular emoji.
top_emojis = orig_df.loc[orig_df["emoji"].isin(top_popular_emojis)]

In [20]:
# For every language, the number of distinct selected emoji that have at least one row.
emoji_lang_pairs = top_emojis.groupby(['lang', 'emoji']).size().reset_index()
lang_emoji_number = emoji_lang_pairs.groupby('lang').size().reset_index()

In [21]:
# Emoji coverage of the high-resource languages.
lang_emoji_number.loc[lang_emoji_number["lang"].isin(high_resource)]

Unnamed: 0,lang,0
1,ar,24
13,en,24
14,es,24
19,fr,24
25,in,24
29,ja,24
33,ko,24
47,pt,24
58,tl,24
59,tr,24


In [22]:
# Emoji coverage of the low-resource languages.
lang_emoji_number.loc[lang_emoji_number["lang"].isin(low_resource)]

Unnamed: 0,lang,0
5,ca,24
7,cs,24
8,cy,24
9,da,24
10,de,24
12,el,24
15,et,24
16,eu,24
17,fa,24
18,fi,24


### Good-representation emojis (50%–90% of cumulative usage)

In [43]:
# Emoji whose cumulative share falls in [0.5, 0.9).
# NOTE(review): `df` is whichever frame was bound last -- confirm it is the intended table.
in_mid_band = df["cdf"].lt(.9) & df["cdf"].ge(.5)
less_popular_emojis = df.loc[in_mid_band, "emoji"].tolist()

In [38]:
# Rows of the original data restricted to the less popular emoji.
# NOTE(review): this rebinds `top_emojis`, which earlier held the most popular subset.
top_emojis = orig_df.loc[orig_df["emoji"].isin(less_popular_emojis)]

In [39]:
# For every language, the number of distinct selected emoji that have at least one row.
emoji_lang_pairs = top_emojis.groupby(['lang', 'emoji']).size().reset_index()
lang_emoji_number = emoji_lang_pairs.groupby('lang').size().reset_index()

In [40]:
# Emoji coverage of the high-resource languages.
lang_emoji_number.loc[lang_emoji_number["lang"].isin(high_resource)]

Unnamed: 0,lang,0
1,ar,227
13,en,227
14,es,227
19,fr,227
25,in,227
29,ja,227
33,ko,227
47,pt,227
58,tl,227
59,tr,227


In [41]:
# Emoji coverage of the low-resource languages.
lang_emoji_number.loc[lang_emoji_number["lang"].isin(low_resource)]

Unnamed: 0,lang,0
5,ca,225
7,cs,222
8,cy,221
9,da,224
10,de,227
12,el,216
15,et,226
16,eu,225
17,fa,223
18,fi,222


In [31]:
# Total count for every (language, emoji) pair.
temp_df = orig_df.groupby(['lang', 'emoji'], as_index=False)['count'].sum()

In [36]:
# Number of distinct emoji in the data.
temp_df['emoji'].nunique()

3372

In [46]:
# Total count covered by the two language tiers combined with the two emoji tiers.
kept_langs = high_resource + low_resource
kept_emojis = top_popular_emojis + less_popular_emojis
in_scope = temp_df["lang"].isin(kept_langs) & temp_df["emoji"].isin(kept_emojis)
temp_df.loc[in_scope, "count"].sum()

17179322