In [15]:
import pandas as pd
pd.set_option('display.max_rows', None)
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [16]:
emoji_file = "emoji_by_language.csv"
orig_df = pd.read_csv(emoji_file)
orig_df = orig_df[orig_df.lang!="und"]

# Aggregation 1: Raw Counts

How many languages are represented by the each emoji? i.e., in how many languages does each emoji appear?

In [17]:
df = orig_df
emoji_lang_appearances = df.groupby("emoji").agg({"lang": lambda x: len(np.unique(x))}).to_dict()["lang"]
df = df.groupby("emoji").agg({"count": sum})
df["pdf"] = df["count"] / df["count"].sum()
df = df.sort_values(by="pdf", ascending=False).reset_index()
df["cdf"] = df["pdf"].cumsum()
df["lang_appearances"] = df.apply(lambda x: emoji_lang_appearances[x.emoji], axis="columns")
list_top_emoji = df.sort_values(by="cdf", ascending=True).head(25).emoji.tolist()
df.sort_values(by="cdf", ascending=True).head(100)


Unnamed: 0,emoji,count,pdf,cdf,lang_appearances
0,:loudly_crying_face:,1666332,0.086124,0.086124,62
1,:face_with_tears_of_joy:,1611427,0.083286,0.16941,63
2,:rolling_on_the_floor_laughing:,795204,0.0411,0.21051,64
3,:red_heart:,667475,0.034498,0.245009,64
4,:pleading_face:,473567,0.024476,0.269485,62
5,:smiling_face_with_heart-eyes:,336008,0.017367,0.286851,60
6,:smiling_face_with_hearts:,293209,0.015154,0.302006,60
7,:folded_hands:,283212,0.014638,0.316644,62
8,:smiling_face_with_tear:,265011,0.013697,0.330341,58
9,:sparkles:,260981,0.013489,0.343829,56


# Aggregation 2: 

$$P(emoji) = \sum_{emoji}{\frac{count(emoji|lang)}{count(lang)}}$$

# Aggregation 3: Counts re-weighted by Language

How much "voting power" does each language have?

In [22]:
df = orig_df
language_weight_df = df.groupby("lang").agg({"count": sum}) / df["count"].sum()
language_weight_df = language_weight_df.sort_values(by="count", ascending=False)
print(language_weight_df.head(10))
language_weight = language_weight_df.to_dict()["count"]

         count
lang          
en    0.384414
es    0.104100
ar    0.085227
ja    0.080167
pt    0.072360
in    0.071406
tl    0.031040
fr    0.026545
tr    0.026364
ko    0.018307


In [24]:
language_weight_df["cdf"] = language_weight_df["count"].cumsum()
language_weight_df.reset_index()

Unnamed: 0,lang,count,cdf
0,en,0.3844145,0.384414
1,es,0.1041004,0.488515
2,ar,0.08522689,0.573742
3,ja,0.08016736,0.653909
4,pt,0.0723596,0.726269
5,in,0.07140612,0.797675
6,tl,0.03103993,0.828715
7,fr,0.02654495,0.85526
8,tr,0.02636416,0.881624
9,ko,0.01830722,0.899931


Re-weight for equal vote

In [5]:
df = orig_df
language_weight = df.groupby("lang").agg({"count": sum}).reset_index()
num_langs = len(language_weight)
a = compute_class_weight("balanced", classes=language_weight.lang.unique(), y=df["lang"])
language_reweight = {c: w for c, w in zip(language_weight.lang.unique(), a)}

In [9]:
df = orig_df
df["new_counts"] = df.apply(lambda x: x["count"]*language_reweight[x["lang"]], axis="columns")
df = df.groupby("emoji").agg({"new_counts": sum})
df["pdf"] = df["new_counts"] / df["new_counts"].sum()
df = df.sort_values(by="pdf", ascending=False).reset_index()
df["cdf"] = df["pdf"].cumsum()
df["lang_appearances"] = df.apply(lambda x: emoji_lang_appearances[x.emoji], axis="columns")
list_top_emoji_reweighted = df.sort_values(by="cdf", ascending=True).head(25).emoji.tolist()
df.sort_values(by="cdf", ascending=True)

Unnamed: 0,emoji,new_counts,pdf,cdf,lang_appearances
0,:face_with_tears_of_joy:,562498.833727,0.09360387,0.093604,63
1,:loudly_crying_face:,452440.817274,0.07528942,0.168893,62
2,:rolling_on_the_floor_laughing:,269820.346428,0.04490006,0.213793,64
3,:red_heart:,200947.709885,0.03343915,0.247232,64
4,:folded_hands:,151131.341463,0.02514935,0.272382,62
5,:pleading_face:,136952.195752,0.02278984,0.295172,62
6,:smiling_face_with_heart-eyes:,99346.491672,0.01653197,0.311704,60
7,:beaming_face_with_smiling_eyes:,89787.487344,0.01494129,0.326645,63
8,:smiling_face_with_tear:,89535.408435,0.01489934,0.341544,58
9,:grinning_face_with_sweat:,89282.25485,0.01485721,0.356401,63


What's the difference between these lists?

In [7]:
set(list_top_emoji).symmetric_difference(set(list_top_emoji_reweighted))

{':clown_face:', ':relieved_face:', ':skull:', ':slightly_smiling_face:'}