In [None]:
import pandas as pd
import re
import regex
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas as gp
import geoplot as gplt
import geoplot.crs as gcrs
import contextily as ctx
import numpy as np
import shapely as shapely
from shapely.geometry import Polygon
from shapely.ops import transform
import collections
from collections import Counter
from typing import List, Tuple, Dict, Union, Generator, Optional
from pyproj import Transformer, CRS, Proj
import glob
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import advertools as adv
import warnings
import emoji
warnings.filterwarnings('ignore')

In [None]:
# load the cleaned tweet export (GeoJSON) with geopandas
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists
df = gp.read_file(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\RawData_Cleaned_Final.geojson")

In [None]:
# preview the first rows of the raw import
df.head()

In [None]:
# convert to geodf: build point geometry from the long/lat columns and declare it as
# WGS 84 (EPSG:4326), since that's how the data was exported from pgadmin
gdf = gp.GeoDataFrame(df,geometry =gp.points_from_xy(df.long,df.lat),crs =4326)

In [None]:
# reproject to Mollweide (ESRI:54009) for visualization purposes later
# NOTE: inplace=True mutates gdf, so every later cell works with projected coordinates
gdf.to_crs("ESRI:54009",inplace=True)

## Start of temporal exploratory analysis:

In [None]:
# first I'll create columns with aggregated date information to make the creation of temporal subsets more straightforward
gdf['post_publish_date'] = pd.to_datetime(gdf['post_publish_date'])
gdf['Month/Year'] = gdf['post_publish_date'].dt.to_period('M')  # new column: monthly period of each post
gdf['Week/Month'] = gdf['post_publish_date'].dt.to_period('W')  # new column: weekly period of each post
gdf['HalfMonth'] = gdf['post_publish_date'] + pd.offsets.SemiMonthEnd()  # new column: timestamp shifted forward by one semi-month-end offset (semi-monthly, not strictly biweekly)
# NOTE(review): the offset is added to the full timestamp - presumably the time of day is kept, so
# values may not collapse to one label per half-month; verify before grouping on this column
gdf.head()

In [None]:
# chart showing posts per month
# note - dataset contains no data for november
sns.set(style = 'whitegrid', font_scale=1.5)
# posts per monthly period, in chronological order
y = gdf['Month/Year'].value_counts().sort_index()
# derive the month names from the counted periods instead of typing them by hand, so the
# labels can never get out of sync with the bars when a month is missing from the data
x = list(pd.PeriodIndex(y.index).strftime('%B'))
fig, ax = plt.subplots(figsize = (20,5))
width = 0.75
ax.bar(x, y.values, width)
ax.set_title("Number of Posts per Month", size =25)
plt.show()

In [None]:
# chart showing posts per week (weekly periods in chronological order)
# note - no data for november
sns.set(style = 'whitegrid', font_scale=2)
fig,ax =plt.subplots(figsize = (25,5))
# pandas draws on the current axes, i.e. the ax created on the line above
gdf['Week/Month'].value_counts().sort_index().plot(kind = 'bar')
plt.title("Number of Tweets per Week", size =35)
plt.xticks(rotation=90)

### Note: Significant gaps in data for April, October, and November are due to technical errors during prior data collection and cannot be avoided

In [None]:
# now let's create sub-datasets for each month (these will also be the temporal subsets for the typicality calculations)
# each subset keeps the rows whose 'Month/Year' period equals the given 2020 month
gdf_jan = gdf[gdf['Month/Year'] == '2020-01']
gdf_feb = gdf[gdf['Month/Year'] == '2020-02']
gdf_mar = gdf[gdf['Month/Year'] == '2020-03']
gdf_apr = gdf[gdf['Month/Year'] == '2020-04']
gdf_may = gdf[gdf['Month/Year'] == '2020-05']
gdf_jun = gdf[gdf['Month/Year'] == '2020-06']
gdf_jul = gdf[gdf['Month/Year'] == '2020-07']
gdf_aug = gdf[gdf['Month/Year'] == '2020-08']
gdf_sep = gdf[gdf['Month/Year'] == '2020-09']
gdf_oct = gdf[gdf['Month/Year'] == '2020-10']
# reminder - no data for november, so no gdf_nov subset is created
gdf_dec = gdf[gdf['Month/Year'] == '2020-12']
gdf_jan.head()

### Now I'll pinpoint more precise time frames with no data available

In [None]:
# weekly post counts within April, used to locate the weeks with no data
sns.set(style = 'whitegrid')
fig,ax =plt.subplots(figsize = (10,5))
# sort_index() puts the weeks in chronological order; without it value_counts() orders the
# bars by frequency, which hides where in the month the gaps are
gdf_apr['Week/Month'].value_counts().sort_index().plot(kind = 'bar', width = 0.5, ax=ax)
plt.title("Number of Tweets per Week", size = 20)
plt.xticks(rotation=0)

### Note: no data for the 2nd, 3rd, and 4th weeks of April

In [None]:
# weekly post counts within October, used to locate the weeks with no data
sns.set(style = 'whitegrid')
fig,ax =plt.subplots(figsize = (15,5))
# sort_index() puts the weeks in chronological order; without it value_counts() orders the
# bars by frequency, which hides where in the month the gaps are
gdf_oct['Week/Month'].value_counts().sort_index().plot(kind = 'bar', ax=ax)
plt.title("Number of Tweets per Week", size = 20)
plt.xticks(rotation=0)

### Note: no data for the 3rd and 4th weeks of October

In [None]:
# let's look at changing popular emojis over time: top 10 emojis of the 'emoji' column per monthly subset
# NOTE(review): most_common_emojis is defined in a later cell of this notebook, so this cell
# raises NameError when the notebook is run top to bottom on a fresh kernel
top_emojis_jan = most_common_emojis(gdf_jan['emoji'], 10)
top_emojis_feb = most_common_emojis(gdf_feb['emoji'], 10)
top_emojis_mar = most_common_emojis(gdf_mar['emoji'], 10)
top_emojis_apr = most_common_emojis(gdf_apr['emoji'], 10)
top_emojis_may = most_common_emojis(gdf_may['emoji'], 10)
top_emojis_jun = most_common_emojis(gdf_jun['emoji'], 10)
top_emojis_jul = most_common_emojis(gdf_jul['emoji'], 10)
top_emojis_aug = most_common_emojis(gdf_aug['emoji'], 10)
top_emojis_sep = most_common_emojis(gdf_sep['emoji'], 10)
top_emojis_oct = most_common_emojis(gdf_oct['emoji'], 10)
top_emojis_dec = most_common_emojis(gdf_dec['emoji'], 10)

# column labels for the month-by-month comparison table (no November - there is no data for it)
Months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "December"]

In [None]:
# note: this part will not be performed with HLL data due to lack of temporal information

In [None]:
# pull the ranked 'Emoji' column of each monthly top-10 table out as a plain Python list
jan_list = top_emojis_jan['Emoji'].tolist()
feb_list = top_emojis_feb['Emoji'].tolist()
mar_list = top_emojis_mar['Emoji'].tolist()
apr_list = top_emojis_apr['Emoji'].tolist()
may_list = top_emojis_may['Emoji'].tolist()
jun_list = top_emojis_jun['Emoji'].tolist()
jul_list = top_emojis_jul['Emoji'].tolist()
aug_list = top_emojis_aug['Emoji'].tolist()
sep_list = top_emojis_sep['Emoji'].tolist()
oct_list = top_emojis_oct['Emoji'].tolist()
dec_list = top_emojis_dec['Emoji'].tolist()

In [None]:
# side-by-side table: one column per month, rows are the ranked emojis of that month
month_lists = [jan_list, feb_list, mar_list, apr_list, may_list, jun_list,
               jul_list, aug_list, sep_list, oct_list, dec_list]
# most_common_emojis drops blank entries after truncating to the requested quantity, so a month
# can come back with fewer than 10 emojis; wrapping each list in a Series pads the shorter
# columns with NaN instead of raising a length-mismatch error on assignment
top_emojis_over_time = pd.DataFrame(
    {month: pd.Series(ranked, dtype=object) for month, ranked in zip(Months, month_lists)},
    columns=Months,
)

top_emojis_over_time

## Now I'll generate some visualizations to get an overview of the data

In [None]:
# create wordcloud of all hashtags in the dataset
def make_wordcloud(newlist):
    """Draw a word cloud of the hashtags in ``newlist``.

    newlist: iterable with one comma-separated hashtag string per post.
             Entries that are not strings (e.g. missing values) are skipped.
    Returns nothing; the figure is shown with matplotlib.
    """
    # lower-case every post's hashtag string and split it into individual hashtags
    flat_list = [
        tag
        for item in newlist
        if isinstance(item, str)
        for tag in item.lower().split(',')
    ]

    text = " ".join(flat_list)
    stopwords = set(STOPWORDS)
    wordcloud =WordCloud(stopwords=stopwords,
                         prefer_horizontal = 1,
                         colormap = "winter",
                         background_color="white",
                         width=1600, 
                         height=800,
                         collocations = False,
                         normalize_plurals=False).generate(text)    

    plt.figure(figsize=(20,10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    # tighten the layout only after the image axes exist; on an empty figure there is nothing to lay out
    plt.tight_layout(pad=0)
    plt.show()
    
# word cloud over the 'hashtags' column of the whole dataset
make_wordcloud(gdf['hashtags'])

## let's take a closer look at some of the most popular hashtags

In [None]:
# create a function to count the total frequency of the most commonly used hashtags
def most_common_hashtags(labels, quantity):
    """Return the ``quantity`` most common hashtags as a DataFrame.

    labels: iterable with one comma-separated hashtag string per post; entries that
            are not strings (e.g. missing values) are skipped.
    quantity: number of most common hashtags to return.

    Each hashtag is counted at most once per post. The result has the columns
    "Hashtag" and "Occurence number", is sorted by count in ascending order and
    has a fresh 0..n-1 index.
    """
    # blank tokens are dropped BEFORE taking the top `quantity`, so a blank placeholder
    # can no longer use up one of the requested slots
    counts = Counter(
        tag
        for label in labels
        if isinstance(label, str)
        for tag in set(label.split(','))
        if tag.strip()
    )
    ranked = pd.DataFrame(counts.most_common(quantity), columns=["Hashtag", "Occurence number"])

    return ranked.sort_values(by="Occurence number", ascending=True).reset_index(drop=True)


In [None]:
# top 50 hashtags of the whole dataset, printed from most to least frequent
top_hashtags = most_common_hashtags(gdf['hashtags'], 50)
print(top_hashtags.sort_values(['Occurence number'], ascending=[False]))

# explanations of some popular hashtags:
- p2000 = primarily from the Dutch safety alert system (bot)
- rrm = Rotterdam-Rijnmond
- wetter = 'weather' in German; primarily from a German weather alert system (bot)
- canyaman = famous Turkish actor
- loveisland = famous British reality show
- nehody = 'accidents' in Czech and Slovak (bot)
- günaydın = 'good morning' in Turkish
- bts = famous K-pop group
- gfvip = Grande Fratello VIP (Italian reality show)
- yomequedoencasa = 'I stay at home' in Spanish
- mon = several possibilities, one is the Mon district of Nagaland, India
- ken = abbreviation for a city in the Netherlands (bot)
- lfc = Liverpool Football Club
- mufc = Manchester United Football Club
- picemiyeti = "Pi Society" in Turkish - the name of a popular radio show

In [None]:
# create a separate function to create an emoji-cloud
import emojis

def make_emojilist(newlist):
    """Flatten comma-separated emoji strings into one space-separated string.

    newlist: iterable with one comma-separated string per post.
    Returns every comma-separated piece of every (lower-cased) entry, joined by single spaces.
    """
    return " ".join(
        str(piece)
        for entry in newlist
        for piece in entry.lower().split(',')
    )

class EmojiCloud:
    """Emoji word cloud in a single hue, shaded by each emoji's share of all emojis."""

    def __init__(self, font_path='./TwitterColorEmoji-SVGinOT.ttf'):
        """Store the emoji font path and create the underlying WordCloud object."""
        self.font_path = font_path
        self.word_cloud = self.initialize_wordcloud()
        # filled by generate(): maps each emoji to count / total count
        self.emoji_probability = None

    def initialize_wordcloud(self):
        """Return a WordCloud configured with the emoji font and a fixed random seed."""
        cloud_options = dict(
            font_path=self.font_path,
            width=2000,
            height=1000,
            background_color='white',
            random_state=42,
            collocations=False,
        )
        return WordCloud(**cloud_options)

    def color_func(self, word, font_size, position, orientation, random_state=None,
                   **kwargs):
        """Return the HSL colour string for ``word``.

        Hue and saturation are fixed; the third HSL component is 50 for emojis with a
        share of at least 0.10 and otherwise slightly below 75, decreasing with the share.
        Requires generate() to have filled ``emoji_probability`` first.
        """
        hue_saturation = '200, 88%'
        share = self.emoji_probability[word]
        opacity = 50 if share >= 0.10 else 75 - share/0.2 * 5
        return f"hsl({hue_saturation},{opacity}%)"

    def generate(self, text):
        """Count the emojis in ``text`` and draw the recoloured cloud on a new figure."""
        counts = Counter(emojis.iter(text))
        n_total = sum(counts.values())

        self.emoji_probability = {symbol: n/n_total for symbol, n in counts.items()}
        cloud = self.word_cloud.generate_from_frequencies(counts)

        plt.figure(figsize=(20,10))
        recolored = cloud.recolor(color_func=self.color_func, random_state=42)
        plt.imshow(recolored)
        plt.axis("off")

In [None]:
# one long space-separated string of every entry in the 'emoji' column
# (read from df; gdf was built from this same frame)
emojitext = make_emojilist(df['emoji'])

In [None]:
# draw the single-hue emoji cloud; the font file is expected next to the notebook
emoji_cloud = EmojiCloud(font_path='./TwitterColorEmoji-SVGinOT.ttf')
emoji_cloud.generate(emojitext)

In [None]:
# It's a bit hard to differentiate the emojis when they're all the same color - let's tweak it so each emoji is different

In [None]:
# create a separate function to create an emoji-cloud with original colors
class EmojiCloudNoColor:
    """Emoji word cloud drawn without a custom colour function."""

    def __init__(self, font_path='TwitterColorEmoji-SVGinOT.ttf'):
        """Store the emoji font path and create the underlying WordCloud object."""
        self.font_path = font_path
        self.word_cloud = self.initialize_wordcloud_nocolor()
        # filled by generate_nocolor(): maps each emoji to count / total count
        self.emoji_probability = None

    def initialize_wordcloud_nocolor(self):
        """Return a WordCloud configured with the emoji font and a fixed random seed."""
        cloud_options = dict(
            font_path=self.font_path,
            width=2000,
            height=1000,
            background_color='white',
            random_state=42,
            collocations=False,
        )
        return WordCloud(**cloud_options)

    def generate_nocolor(self, text):
        """Count the emojis in ``text`` and draw the cloud on a new figure.

        recolor() is called with color_func=None, i.e. no frequency-based shading is applied.
        """
        counts = Counter(emojis.iter(text))
        n_total = sum(counts.values())

        self.emoji_probability = {symbol: n/n_total for symbol, n in counts.items()}
        cloud = self.word_cloud.generate_from_frequencies(counts)

        plt.figure(figsize=(20,10))
        plt.imshow(cloud.recolor(color_func=None))
        plt.axis("off")

In [None]:
# same text, drawn with the variant that applies no custom colour function
# (emoji_cloud is rebound to the new object)
emoji_cloud = EmojiCloudNoColor(font_path='./TwitterColorEmoji-SVGinOT.ttf')
emoji_cloud.generate_nocolor(emojitext)

### this is way too crowded - let's try again with only most common emojis

In [None]:
# create a function to count the total frequency of the most commonly used emojis
def most_common_emojis(labels, quantity):
    """Return the ``quantity`` most common emojis as a DataFrame.

    labels: iterable with one comma-separated emoji string per post; entries that
            are not strings (e.g. missing values) are skipped.
    quantity: number of most common emojis to return.

    Each emoji is counted at most once per post. The result has the columns
    "Emoji" and "Occurence number", is sorted by count in descending order and
    has a fresh 0..n-1 index.
    """
    # blank tokens are dropped BEFORE taking the top `quantity`, so a blank placeholder
    # can no longer use up one of the requested slots
    counts = Counter(
        symbol
        for label in labels
        if isinstance(label, str)
        for symbol in set(label.split(','))
        if symbol.strip()
    )
    ranked = pd.DataFrame(counts.most_common(quantity), columns=["Emoji", "Occurence number"])

    return ranked.sort_values(by="Occurence number", ascending=False).reset_index(drop=True)

In [None]:
# calculate absolute frequency for emojis as they appear in posts (top 50 of the 'emoji' column)
top_emojis = most_common_emojis(gdf['emoji'], 50)
print(top_emojis.sort_values(['Occurence number'], ascending=[False]))

In [None]:
# let's try it again with emojis with skin tone removed ('emoji generic' column) - see if it made a big difference
# NOTE: this overwrites top_emojis; all later cells use the skin-tone-free ranking
top_emojis = most_common_emojis(gdf['emoji generic'], 50)
print(top_emojis.sort_values(['Occurence number'], ascending=[False]))

### looks like removing the skin tone causes a lot more skin-based emojis to be ranked higher

In [None]:
# emoji cloud restricted to the most common emojis
# NOTE(review): top50emojitext is not defined in any visible cell of this notebook - this cell
# raises NameError on a fresh kernel unless it is built somewhere not shown here
emoji_cloud = EmojiCloudNoColor(font_path='./TwitterColorEmoji-SVGinOT.ttf')
emoji_cloud.generate_nocolor(top50emojitext)

In [None]:
# add column of emoji description (the emoji's textual name from emoji.demojize)
# vectorised over the whole column instead of a manual row counter with chained indexing
top_emojis['Emoji Description'] = top_emojis['Emoji'].map(emoji.demojize)
print(top_emojis.sort_values(['Occurence number'], ascending=False))

In [None]:
# count the number of emojis in the whole gdf
def AllEmojiTotalCounter(totaldataset):
    """Count the emojis in the 'emoji generic' column of ``totaldataset``.

    Every post is split into grapheme clusters (regex ``\\X``); a cluster counts as one
    emoji when any of its characters is a key of emoji.UNICODE_EMOJI['en'].
    """
    return sum(
        1
        for post in totaldataset['emoji generic']
        for cluster in regex.findall(r'\X', post)
        if any(char in emoji.UNICODE_EMOJI['en'] for char in cluster)
    )

# total number of emojis in the dataset
# NOTE(review): later cells hard-code 6923376 as this total - presumably the value this call returned; confirm
AllEmojiTotalCounter(gdf)

In [None]:
# calculate relative frequencies for each emoji: its post count divided by the dataset's total emoji count
top_emojis['Rel Freq'] = (top_emojis['Occurence number']/6923376) # 6923376 = number of emojis in dataset
top_emojis

### that information tells us a little bit about which emojis are used most often, but maybe the typicality measure can extract more meaning

In [None]:
#  set up functions to calculate each component of the typicality equation

# number of a certain emoji within subset
def EmojiSubsetCounter(emojiname, subset):
    """Count the posts in ``subset`` whose 'emoji generic' value contains ``emojiname``.

    This is a substring test per post, so a post is counted once no matter how often
    the emoji appears in it.
    """
    return sum(1 for post in subset['emoji generic'] if emojiname in post)


# number of total emojis in subset
def AllEmojiSubsetCounter(subset):
    """Count the emojis in the 'emoji generic' column of ``subset``.

    Every post is split into grapheme clusters (regex ``\\X``); a cluster counts as one
    emoji when any of its characters is a key of emoji.UNICODE_EMOJI['en'].
    """
    n_emojis = 0
    for post in subset['emoji generic']:
        clusters = regex.findall(r'\X', post)
        n_emojis += sum(
            1
            for cluster in clusters
            if any(char in emoji.UNICODE_EMOJI['en'] for char in cluster)
        )
    return n_emojis


# number of specific emoji in whole dataset
def EmojiTotalCounter(emojiname, totaldataset):
    """Count the posts in ``totaldataset`` whose 'emoji generic' value contains ``emojiname``.

    This is a substring test per post, so a post is counted once no matter how often
    the emoji appears in it.
    """
    matching_posts = [post for post in totaldataset['emoji generic'] if emojiname in post]
    return len(matching_posts)



# number of emojis in whole dataset
def AllEmojiTotalCounter(totaldataset):
    """Count the emojis in the 'emoji generic' column of ``totaldataset``.

    The counting rule is identical to AllEmojiSubsetCounter (defined earlier in this cell),
    so this delegates to it instead of keeping another copy of the same loop.
    """
    return AllEmojiSubsetCounter(totaldataset)


# typicality equation
def TypicalityEquation(emojisubset, allemojisubset, emojitotal, allemojitotal):
    """Return the relative difference between an emoji's share in a subset and in the whole dataset.

    emojisubset / allemojisubset: count of the emoji and of all emojis in the subset.
    emojitotal / allemojitotal: count of the emoji and of all emojis in the whole dataset.

    0 means the subset share equals the overall share; positive values mean the emoji
    is over-represented in the subset, negative values under-represented.
    """
    subset_share = emojisubset/allemojisubset
    overall_share = emojitotal/allemojitotal
    return (subset_share - overall_share)/overall_share


In [None]:
# for each of the top 50 emojis, calculate the typicality per month
TOTAL_EMOJI_COUNT = 6923376  # number of emojis in dataset
monthly_subsets = {
    'T_Jan': gdf_jan, 'T_Feb': gdf_feb, 'T_Mar': gdf_mar, 'T_Apr': gdf_apr,
    'T_May': gdf_may, 'T_Jun': gdf_jun, 'T_Jul': gdf_jul, 'T_Aug': gdf_aug,
    'T_Sep': gdf_sep, 'T_Oct': gdf_oct, 'T_Dec': gdf_dec,
}
# a month's total emoji count does not depend on the emoji being scored, so scan each
# monthly subset once here instead of once per emoji inside the loop
monthly_emoji_totals = {column: AllEmojiSubsetCounter(subset)
                        for column, subset in monthly_subsets.items()}

for rownum in range(min(len(top_emojis), 50)):
    emo = top_emojis.loc[rownum, 'Emoji']
    emocount_total = top_emojis.loc[rownum, 'Occurence number']
    for column, subset in monthly_subsets.items():
        top_emojis.loc[rownum, column] = TypicalityEquation(
            EmojiSubsetCounter(emo, subset), monthly_emoji_totals[column],
            emocount_total, TOTAL_EMOJI_COUNT)
    print("done" + str(rownum))
print(top_emojis)


In [None]:
# rename columns (to make future plots look nicer): drop the "T_" prefix from the monthly typicality columns

top_emojis = top_emojis.rename(columns={"T_Jan": "Jan", "T_Feb": "Feb", "T_Mar": "Mar", "T_Apr": "Apr", "T_May": "May", 
                           "T_Jun": "Jun", "T_Jul": "Jul", "T_Aug": "Aug", "T_Sep": "Sep", "T_Oct": "Oct", "T_Dec": "Dec"})

In [None]:
# prepare data for plotting: one row per month, one column per emoji
# select the monthly typicality columns explicitly so that text columns added in earlier
# cells ('Emoji Description', 'Rel Freq') do not end up as extra rows in the plotted frame
month_columns = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Dec"]
columns_trans = top_emojis.set_index('Emoji')[month_columns].transpose().astype(float)
columns_trans


In [None]:
# here's an example for just one emoji: its typicality per month, with the title built from the emoji's demojized name
columns_trans['❤️'].plot(ylim=(-1,1), title="Temporal Typicality of " + emoji.demojize('❤️').replace(":","").replace("_", " ").title() + " Emoji", linewidth=5,grid=True, ylabel="Typicality", xlabel="Month", fontsize = 12)
# columns_trans['😂'].plot(ylim=(-1,1), title='Temporal Typicality of Face With Tears Of Joy Emoji', linewidth=5,grid=True, ylabel="Typicality", xlabel="Month", fontsize = 12)

In [None]:
# human-readable subplot titles: demojized emoji name without colons/underscores, in title case
titlelist = [
    emoji.demojize(col).replace(":","").replace("_", " ").title()
    for col in columns_trans.columns
]
titlelist

In [None]:
# small multiples (10 x 5 grid): one typicality-per-month panel per emoji, titled with the emoji names from titlelist
columns_trans.plot(ylim=(-1,1),figsize=(100, 150), subplots=True,layout=(10, 5), title=titlelist, grid=True, ylabel="Typicality", xlabel="Month", linewidth = 5, sharex=False, legend=False, fontsize = 30)

In [None]:
# same grid without titles, with shared x/y axes and heavier lines and fonts
columns_trans.plot(ylim=(-1,1),figsize=(100, 150), subplots=True,layout=(10, 5), title=None, grid=True, 
                         linewidth = 10, sharex=True, sharey=True, legend=False, fontsize = 70)

In [None]:
# also perform this calculation on the top 50 emojis by userdays (to avoid bias from overactive users)
top_emojis_ud = {'😂': 190683,
     '❤️': 187938,
     '😍': 130963,
     '👏': 99485,
     '👍': 95117,
     '🙏': 93921,
     '💪': 87392,
     '🤣': 82473,
     '💙': 82002,
     '😊': 69271,
     '🔥': 66638,
     '😉': 65919,
     '🥰': 59354,
     '😎': 58073,
     '😁': 54824,
     '🤔': 52586,
     '👌': 50397,
     '🙌': 45654,
     '💛': 45365,
     '☀️': 44965,
     '💚': 41228,
     '⚽️': 40216,
     '😭': 39637,
     '✨': 38818,
     '💜': 37766,
     '🖤': 37687,
     '😘': 37036,
     '👇': 36136,
     '😅': 35820,
     '🤩': 35565,
     '🔴': 34552,
     '♥️': 33384,
     '📸': 33003,
     '🙄': 31538,
     '💕': 31422,
     '🤗': 30618,
     '🎉': 29751,
     '🎶': 29636,
     '😋': 27232,
     '👀': 27175,
     '😀': 27025,
     '🌈': 26916,
     '👉': 26892,
     '🙈': 24852,
     '🥳': 24477,
     '😜': 24255,
     '✅': 22893,
     '😱': 22194,
     '😷': 21079,
     '🌞': 20908}

In [None]:
# turn the userdays dict into a two-column frame: 'Emoji' and 'Userdays'
top_emojis_ud_df = pd.DataFrame(list(top_emojis_ud.items()), columns=['Emoji', 'Userdays'])

In [None]:
# can't use most_common_emojis here since it ranks emojis itself instead of accepting a fixed list,
# so count the posts containing each listed emoji with the existing EmojiTotalCounter helper
for rownum in range(min(len(top_emojis_ud_df), 50)):
    emo = top_emojis_ud_df.loc[rownum, 'Emoji']
    top_emojis_ud_df.loc[rownum, 'Occurrence number'] = EmojiTotalCounter(emo, gdf)
top_emojis_ud_df

In [None]:
# add columns of emoji description and relative frequency
# (both are whole-column operations, so no row loop is needed)
top_emojis_ud_df['Emoji Description'] = top_emojis_ud_df['Emoji'].map(emoji.demojize)
top_emojis_ud_df['Rel Freq'] = (top_emojis_ud_df['Occurrence number']/6923376) # 6923376 = number of emojis in dataset

top_emojis_ud_df

In [None]:
### it's unclear why the soccer ball emoji is 0, but this loop allows for it
# NOTE(review): presumably the soccer ball key in top_emojis_ud carries a trailing variation
# selector (U+FE0F) that the 'emoji generic' strings lack, so the substring test never matches - verify
TOTAL_EMOJI_COUNT = 6923376  # number of emojis in dataset
monthly_subsets = {
    'T_Jan': gdf_jan, 'T_Feb': gdf_feb, 'T_Mar': gdf_mar, 'T_Apr': gdf_apr,
    'T_May': gdf_may, 'T_Jun': gdf_jun, 'T_Jul': gdf_jul, 'T_Aug': gdf_aug,
    'T_Sep': gdf_sep, 'T_Oct': gdf_oct, 'T_Dec': gdf_dec,
}
# a month's total emoji count does not depend on the emoji being scored, so scan each
# monthly subset once here instead of once per emoji inside the loop
monthly_emoji_totals = {column: AllEmojiSubsetCounter(subset)
                        for column, subset in monthly_subsets.items()}

for rownum in range(min(len(top_emojis_ud_df), 50)):
    emo = top_emojis_ud_df.loc[rownum, 'Emoji']
    emocount_total = top_emojis_ud_df.loc[rownum, 'Occurrence number']
    for column, subset in monthly_subsets.items():
        if emocount_total == 0:
            # typicality is undefined without any occurrence (division by zero); record 0 instead
            top_emojis_ud_df.loc[rownum, column] = 0
        else:
            top_emojis_ud_df.loc[rownum, column] = TypicalityEquation(
                EmojiSubsetCounter(emo, subset), monthly_emoji_totals[column],
                emocount_total, TOTAL_EMOJI_COUNT)
# top_emojis_ud_df

In [None]:
# save df to csv (index=True also writes the row index as the first, unnamed column)
# NOTE(review): absolute, machine-specific path
top_emojis_ud_df.to_csv(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\TopEmojisHLL_MonthlyTypicality.csv", index=True)

In [None]:
# read back in (optional, to avoid re-running above code in the future)
# NOTE: the saved index comes back as an extra 'Unnamed: 0' column, which the plotting cell below drops by name
top_emojis_ud_df = pd.read_csv(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\TopEmojisHLL_MonthlyTypicality.csv")

In [None]:
# quick check of the userdays-based table
top_emojis_ud_df.head()

In [None]:
# rename columns (to make future plots look nicer): drop the "T_" prefix from the monthly typicality columns

top_emojis_ud_df = top_emojis_ud_df.rename(columns={"T_Jan": "Jan", "T_Feb": "Feb", "T_Mar": "Mar", "T_Apr": "Apr", "T_May": "May", 
                           "T_Jun": "Jun", "T_Jul": "Jul", "T_Aug": "Aug", "T_Sep": "Sep", "T_Oct": "Oct", "T_Dec": "Dec"})

# prepare data for plotting: one row per month, one column per emoji.
# The monthly columns are selected explicitly, so this works whether or not the optional
# CSV read-back cell was run (that cell is what introduces the 'Unnamed: 0' column).
# Columns are labelled with the frame's integer row labels (0-49, in top_emojis_ud order).
month_columns = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Dec"]
columns_trans_ud = top_emojis_ud_df[month_columns].transpose().astype(float)

# plot
columns_trans_ud.plot(ylim=(-1,1),figsize=(100, 150), subplots=True,layout=(10, 5), title=None, grid=True, 
                      linewidth = 10, sharex=True, sharey=True, legend=False, fontsize = 70)

In [None]:
# masked face emoji goes off the chart, let's look at it alone
# column 48 is the integer label of the masked face emoji (its position in top_emojis_ud);
# the y-range is widened to 1.5 so the peak stays visible

columns_trans_ud[48].plot(ylim=(-1,1.5), title="Temporal Typicality of " + emoji.demojize('😷').replace(":","").replace("_", " ").title() + " Emoji", linewidth=5, fontsize = 12)

# begin spatial analysis

In [None]:
# create spatial subsets, starting with country boundaries
# (shapefile path is relative to the notebook's working directory)
countries_gdf = gp.read_file("Europe_Clipped_BBox.shp")
countries_gdf

In [None]:
# reproject the boundaries to Mollweide (ESRI:54009) so they match the CRS of the post points
countries_gdf.to_crs("ESRI:54009", inplace=True)

In [None]:
# Plot the custom shapefile (clipped to fit data)
fig, ax = plt.subplots(figsize=(15, 6))
countries_gdf.plot(ax=ax)
ax.set_title("Study Area", fontsize=20)
# hide the axes once, before the figure is shown (a second call after plt.show() has no visible effect)
ax.set_axis_off()
plt.show()

In [None]:
# plot the points on top of study area - this takes quite a while
fig, ax = plt.subplots(figsize=(35, 20))
countries_gdf.boundary.plot(ax=ax, color="black")
gdf.plot(ax=ax, color="purple", markersize=3, alpha = 0.5)
ax.set_title("Twitter Post Locations", fontsize=20)
ax.set_axis_off()
# save before showing: some matplotlib backends release the figure in plt.show(), which would leave the file blank
# NOTE(review): absolute, machine-specific output path
fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\AllPosts_mapped.png", dpi=300, bbox_inches = "tight")
plt.show()

In [None]:
# userdays per country (the same figures are listed as HLL results in the next cell)
# NOTE(review): the name looks like a typo for "country_ud" - kept as is in case other cells use it
county_ud = {'United Kingdom': 810535,
 'France': 230294,
 'Spain': 288819,
 'Italy': 141807,
 'Germany': 142974,
 'Netherlands': 73083,
 'Turkey': 108351,
 'Czech Republic': 10711,
 'Belgium': 39852,
 'Switzerland': 23061,
 'Portugal': 12699,
 'Austria': 18070}


In [None]:
# from HLL data, I know that the top 10 countries by userdays are:
#  ('United Kingdom', 810535),
#  ('Spain', 288819),
#  ('France', 230294),
#  ('Germany', 142974),
#  ('Italy', 141807),
#  ('Turkey', 108351),
#  ('Netherlands', 73083),
#  ('Belgium', 39852),
#  ('Switzerland', 23061),
#  ('Austria', 18070),

# extract the boundary row(s) of each of these countries by English name ('NAME_EN'),
# for making specific country based grids
# note: Germany's boundary is stored as `de`, while the later join result and plot title use "ge"/"GE"

uk = countries_gdf[countries_gdf['NAME_EN'] == "United Kingdom"]
sp = countries_gdf[countries_gdf['NAME_EN'] == "Spain"]
fr = countries_gdf[countries_gdf['NAME_EN'] == "France"]
de = countries_gdf[countries_gdf['NAME_EN'] == "Germany"]
it = countries_gdf[countries_gdf['NAME_EN'] == "Italy"]
tu = countries_gdf[countries_gdf['NAME_EN'] == "Turkey"]
ne = countries_gdf[countries_gdf['NAME_EN'] == "Netherlands"]
be = countries_gdf[countries_gdf['NAME_EN'] == "Belgium"]
sw = countries_gdf[countries_gdf['NAME_EN'] == "Switzerland"]
au = countries_gdf[countries_gdf['NAME_EN'] == "Austria"]

In [None]:
# plot countries: one panel per country, titled with its two-letter label
fig, (ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9,ax10) = plt.subplots(1,10,figsize = (15,10))

country_panels = [
    (ax1, 'UK', uk), (ax2, 'SP', sp), (ax3, 'FR', fr), (ax4, 'GE', de), (ax5, 'IT', it),
    (ax6, 'TU', tu), (ax7, 'NE', ne), (ax8, 'BE', be), (ax9, 'SW', sw), (ax10, 'AU', au),
]

for ax, label, country in country_panels:
    country.plot(ax=ax)
    ax.title.set_text(label)
    ax.set_axis_off()

# lay the panels out only after they have content and titles
plt.tight_layout()
plt.show()

In [None]:
# conduct spatial joins so that the gdf is split up by country - this takes a looooong time
# NOTE(review): with how="right" the result is keyed on the country frame; per the geopandas
# docs it presumably keeps the country polygon as geometry rather than the post point - confirm
# this is intended (only the post attributes are used in the cells below)

uk_join = gdf.sjoin(uk, how="right")
sp_join = gdf.sjoin(sp, how="right")
fr_join = gdf.sjoin(fr, how="right")
ge_join = gdf.sjoin(de, how="right")
it_join = gdf.sjoin(it, how="right")
tu_join = gdf.sjoin(tu, how="right")
ne_join = gdf.sjoin(ne, how="right")
be_join = gdf.sjoin(be, how="right")
sw_join = gdf.sjoin(sw, how="right")
au_join = gdf.sjoin(au, how="right")

In [None]:
# begin individual spatial analysis by country

# United Kingdom

In [None]:
# calculate the top 10 emojis for the United Kingdom
top_emojis_uk = most_common_emojis(uk_join['emoji generic'], 10)
# the subset's total emoji count is the same for every row, so count it once
# instead of re-scanning all of the country's posts for each emoji
uk_emoji_total = AllEmojiSubsetCounter(uk_join)
# add columns of emoji description and typicality (country share vs. whole-dataset share)
for rownum in range(min(len(top_emojis_uk), 10)):
    emo = top_emojis_uk.loc[rownum, 'Emoji']
    top_emojis_uk.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    top_emojis_uk.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, uk_join), uk_emoji_total, emocount_total, 6923376)  # 6923376 = number of emojis in dataset
top_emojis_uk


# Spain

In [None]:
# calculate the top 10 emojis for Spain
top_emojis_sp = most_common_emojis(sp_join['emoji generic'], 10)
# the subset's total emoji count is the same for every row, so count it once
# instead of re-scanning all of the country's posts for each emoji
sp_emoji_total = AllEmojiSubsetCounter(sp_join)
# add columns of emoji description and typicality (country share vs. whole-dataset share)
for rownum in range(min(len(top_emojis_sp), 10)):
    emo = top_emojis_sp.loc[rownum, 'Emoji']
    top_emojis_sp.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    top_emojis_sp.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, sp_join), sp_emoji_total, emocount_total, 6923376)  # 6923376 = number of emojis in dataset
top_emojis_sp


# France

In [None]:
# calculate the top 10 emojis for France
top_emojis_fr = most_common_emojis(fr_join['emoji generic'], 10)
# the subset's total emoji count is the same for every row, so count it once
# instead of re-scanning all of the country's posts for each emoji
fr_emoji_total = AllEmojiSubsetCounter(fr_join)
# add columns of emoji description and typicality (country share vs. whole-dataset share)
for rownum in range(min(len(top_emojis_fr), 10)):
    emo = top_emojis_fr.loc[rownum, 'Emoji']
    top_emojis_fr.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    top_emojis_fr.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, fr_join), fr_emoji_total, emocount_total, 6923376)  # 6923376 = number of emojis in dataset
top_emojis_fr


# Germany

In [None]:
# calculate the top 10 emojis for Germany and add a description and typicality for each
top_emojis_ge = most_common_emojis(ge_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(ge_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_ge), 10)):
    emo = top_emojis_ge.loc[rownum, 'Emoji']
    top_emojis_ge.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_ge.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, ge_join), allemojisubset, emocount_total, 6923376)
top_emojis_ge


In [None]:
# the umbrella emoji seems to be extremely typical for Germany - let's dig a little deeper and find out why that is

In [None]:
# umbrella emoji usage: first line = number of times used in germany, second line = number of times used in europe
print(EmojiTotalCounter('☔', ge_join), EmojiTotalCounter('☔', gdf), sep="\n")


#### One possible explanation for the popularity of the umbrella emoji is its significance in the Hong Kong pro-democracy protests and the Black Lives Matter movement. Of course, it could also just refer to the weather. Let's find out:

In [None]:
# let's look at posts containing umbrellas in germany
# na=False: posts without any emoji count as "no match" instead of putting NaN into the boolean mask
# regex=False: the emoji is matched as a literal substring
# NOTE(review): this filters on '☂' (umbrella), while the counts in the previous cell use
# '☔' (umbrella with rain drops) - these are different characters; confirm which one is intended
ge_umbrellaposts = ge_join[ge_join['emoji'].str.contains('☂', na=False, regex=False)]
ge_umbrellaposts

In [None]:
# create a function to count the most frequently used hashtags
def most_common_hashtags(labels, quantity):
    """
    Count in how many entries each hashtag appears and return the most frequent ones.

    labels (iterable of str) = One comma-separated hashtag string per entry.
        Entries that are not strings (e.g. NaN / None) are skipped.
    quantity (int) = Number of most common hashtags to return.

    Returns a DataFrame with the columns "Hashtag" and "Occurence number",
    sorted by descending count. A hashtag is counted at most once per entry.
    """
    counter = Counter()
    for label in labels:
        # missing values are not strings and cannot be split
        if not isinstance(label, str):
            continue
        # set(): repeated hashtags within one entry only count once
        counter.update(set(label.split(',')))

    # remove the blank placeholder before truncating, so that it cannot
    # take up one of the `quantity` slots
    counter.pop(" ", None)

    top_hashtags = pd.DataFrame(counter.most_common(quantity), columns=["Hashtag", "Occurence number"])\
                        .sort_values(by="Occurence number", ascending=False)

    return top_hashtags.reset_index(drop=True)

In [None]:
# ten most frequent hashtags among the german umbrella posts
most_common_hashtags(labels=ge_umbrellaposts['hashtags'], quantity=10)

OK, it looks like the umbrella emoji is actually mostly weather-related.

# Italy

In [None]:
# calculate the top 10 emojis for Italy and add a description and typicality for each
top_emojis_it = most_common_emojis(it_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(it_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_it), 10)):
    emo = top_emojis_it.loc[rownum, 'Emoji']
    top_emojis_it.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_it.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, it_join), allemojisubset, emocount_total, 6923376)
top_emojis_it


# Turkey

In [None]:
# calculate the top 10 emojis for Turkey and add a description and typicality for each
top_emojis_tu = most_common_emojis(tu_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(tu_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_tu), 10)):
    emo = top_emojis_tu.loc[rownum, 'Emoji']
    top_emojis_tu.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_tu.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, tu_join), allemojisubset, emocount_total, 6923376)
top_emojis_tu


# Netherlands

In [None]:
# calculate the top 10 emojis for the Netherlands and add a description and typicality for each
top_emojis_ne = most_common_emojis(ne_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(ne_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_ne), 10)):
    emo = top_emojis_ne.loc[rownum, 'Emoji']
    top_emojis_ne.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_ne.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, ne_join), allemojisubset, emocount_total, 6923376)
top_emojis_ne

### Shouldn't the red circle emoji have a high typicality, since it is used far more often in the Netherlands than in the dataset as a whole?

In [None]:
# the individual counts that go into the dutch red circle typicality, printed one per line
print(
    EmojiTotalCounter('🔴', ne_join),
    EmojiSubsetCounter('🔴', ne_join),
    AllEmojiSubsetCounter(ne_join),
    EmojiTotalCounter('🔴', gdf),
    sep="\n",
)

In [None]:
# is this phenomenon specific to the netherlands? Calculate the red circle's typicality from the dutch subset
red_circle = '🔴'
ne_redcircle_typicality = TypicalityEquation(
    EmojiSubsetCounter(red_circle, ne_join),
    AllEmojiSubsetCounter(ne_join),
    EmojiTotalCounter(red_circle, gdf),
    6923376,  # number of emojis in dataset
)
ne_redcircle_typicality

In [None]:
# all posts in the dataset that contain the red circle emoji
# na=False: posts without any emoji count as "no match" instead of putting NaN into the boolean mask
# regex=False: the emoji is matched as a literal substring
redcircleposts = gdf[gdf['emoji'].str.contains('🔴', na=False, regex=False)]
most_common_hashtags(redcircleposts['hashtags'], 20)

# translations of some of the hashtags:
#samensterk = together we are strong (covid-related)
#denhaag - city
#hgl - region haaglanden (around den haag)

### Let's look at the temporal trend of posts containing the red circle emoji - does it correspond with the COVID restrictions?

In [None]:
sns.set(style = 'whitegrid')
fig, ax = plt.subplots(figsize=(20, 5))
# the chart is about red circle posts, so restrict the dutch subset to posts containing that emoji
# (na=False: posts without any emoji count as "no match")
ne_redcircle_posts = ne_join[ne_join['emoji'].str.contains('🔴', na=False, regex=False)]
# number of matching posts per weekly period, in chronological order, drawn on the axes created above
ne_redcircle_posts['Week/Month'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title("Number of 🔴 Tweets per Week", size=15)
plt.xticks(rotation=0)
plt.locator_params(nbins=8)

From these results, there doesn't seem to be a significant spike in red circle emojis corresponding with the start of the pandemic (March 2020).

# Belgium

In [None]:
# calculate the top 10 emojis for Belgium and add a description and typicality for each
top_emojis_be = most_common_emojis(be_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(be_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_be), 10)):
    emo = top_emojis_be.loc[rownum, 'Emoji']
    top_emojis_be.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_be.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, be_join), allemojisubset, emocount_total, 6923376)
top_emojis_be


# Switzerland

In [None]:
# calculate the top 10 emojis for Switzerland and add a description and typicality for each
top_emojis_sw = most_common_emojis(sw_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(sw_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_sw), 10)):
    emo = top_emojis_sw.loc[rownum, 'Emoji']
    top_emojis_sw.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_sw.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, sw_join), allemojisubset, emocount_total, 6923376)
top_emojis_sw


# Austria

In [None]:
# calculate the top 10 emojis for Austria and add a description and typicality for each
top_emojis_au = most_common_emojis(au_join['emoji generic'], 10)
# AllEmojiSubsetCounter only depends on the subset, so call it once instead of once per row
allemojisubset = AllEmojiSubsetCounter(au_join)
# walk the (at most 10) rows by their 0-based integer label
for rownum in range(min(len(top_emojis_au), 10)):
    emo = top_emojis_au.loc[rownum, 'Emoji']
    top_emojis_au.loc[rownum, 'Emoji Description'] = emoji.demojize(emo).replace(":","")
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    top_emojis_au.loc[rownum, 'Typicality'] = TypicalityEquation(EmojiSubsetCounter(emo, au_join), allemojisubset, emocount_total, 6923376)
top_emojis_au


In [None]:
# compare top emojis across countries:
# one column per country, rows in the order of each country's top-emoji table

uk_list = top_emojis_uk.Emoji.values.tolist()
sp_list = top_emojis_sp.Emoji.values.tolist()
fr_list = top_emojis_fr.Emoji.values.tolist()
ge_list = top_emojis_ge.Emoji.values.tolist()
it_list = top_emojis_it.Emoji.values.tolist()
tu_list = top_emojis_tu.Emoji.values.tolist()
ne_list = top_emojis_ne.Emoji.values.tolist()
be_list = top_emojis_be.Emoji.values.tolist()
sw_list = top_emojis_sw.Emoji.values.tolist()
au_list = top_emojis_au.Emoji.values.tolist()

# dict order defines the column order
top_emojis_by_country = pd.DataFrame({
    'United Kingdom': uk_list,
    'Spain': sp_list,
    'France': fr_list,
    'Germany': ge_list,
    'Italy': it_list,
    'Turkey': tu_list,
    'Netherlands': ne_list,
    'Belgium': be_list,
    'Switzerland': sw_list,
    'Austria': au_list,
})

# centre the header cells
d = dict(selector="th",
    props=[('text-align', 'center')])

# style the table built in this cell (the styling was previously applied to top_emojis_over_time
# and its result discarded); the styled view is the last expression so that it is what gets displayed
top_emojis_by_country.style.set_properties(**{'width':'6em', 'text-align':'center'})\
        .set_table_styles([d])

# Re-calculate typicality for the next 50 most frequently used emojis

In [None]:
# top 100 emojis, using the emoji column with skin tone removed
top_emojis_100 = most_common_emojis(gdf['emoji generic'], 100)
ranked_top_100 = top_emojis_100.sort_values(by='Occurence number', ascending=False)
print(ranked_top_100)

In [None]:
# we'll filter out the top 50, since those were already analyzed
# .copy(): columns are added to nexttop50 later on, so it has to be an independent frame
# rather than a slice of top_emojis_100 (the original row labels 50, 51, ... are kept)
nexttop50 = top_emojis_100.iloc[50:, :].copy()
nexttop50

In [None]:
# add column of emoji description
nexttop50['Emoji Description'] = nexttop50['Emoji'].map(emoji.demojize)
# calculate relative frequencies for each emoji (whole column at once, not once per row)
nexttop50['Rel Freq'] = (nexttop50['Occurence number']/6923376) # 6923376 = number of emojis in dataset

# monthly subsets keyed by the typicality column they fill (there is no november subset)
monthly_subsets = {
    'T_Jan': gdf_jan,
    'T_Feb': gdf_feb,
    'T_Mar': gdf_mar,
    'T_Apr': gdf_apr,
    'T_May': gdf_may,
    'T_Jun': gdf_jun,
    'T_Jul': gdf_jul,
    'T_Aug': gdf_aug,
    'T_Sep': gdf_sep,
    'T_Oct': gdf_oct,
    'T_Dec': gdf_dec,
}
# AllEmojiSubsetCounter only depends on the subset, so call it once per month instead of once per emoji and month
monthly_allemojis = {col: AllEmojiSubsetCounter(subset) for col, subset in monthly_subsets.items()}

# iterate over the actual row labels instead of a manual counter
for rownum in nexttop50.index:
    emo = nexttop50.loc[rownum, 'Emoji']
    emocount_total = nexttop50.loc[rownum, 'Occurence number']
    for col, subset in monthly_subsets.items():
        nexttop50.loc[rownum, col] = TypicalityEquation(EmojiSubsetCounter(emo, subset), monthly_allemojis[col], emocount_total, 6923376)

print(nexttop50.sort_values(['Occurence number']))

# Instead of most frequently used, now let's look at the most TYPICAL emojis by country

In [None]:
# create a function to count the total frequency of all emojis
def count_all_emojis(labels):
    """
    Split all emoji groupings and count in how many entries each emoji appears.

    labels (iterable of str) = One comma-separated emoji string per entry.
        Entries that are not strings (e.g. NaN / None) are skipped.

    Returns a DataFrame indexed by emoji with a single column "Occurrence number",
    sorted by descending count. An emoji is counted at most once per entry.
    """
    counter = Counter()
    for label in labels:
        # missing values are not strings and cannot be split
        if not isinstance(label, str):
            continue
        # set(): repeated emojis within one entry only count once
        counter.update(set(label.split(',')))

    counts = pd.DataFrame.from_dict(counter, orient='index', columns=["Occurrence number"])

    return counts.sort_values(by="Occurrence number", ascending=False)

In [None]:
# let's test it out on the uk
uk_emojis = count_all_emojis(uk_join['emoji generic'])
uk_emojis.head()

In [None]:
# for each row in table, calculate typicality only if occurrence number >= 1000
# (this is necessary because typicality values become skewed for infrequently used emojis)
# .copy(): columns are added below, so work on an independent frame rather than a slice of uk_emojis
uk_emojis_typ = uk_emojis[uk_emojis['Occurrence number'] >= 1000].copy()
# depends only on the subset, so it is computed once outside the loop
allemojisubset = AllEmojiSubsetCounter(uk_join)
for emo in uk_emojis_typ.index:
    uk_emojis_typ.loc[emo, 'Emoji description'] = emoji.demojize(emo)
    emojisubset = uk_emojis_typ.loc[emo, 'Occurrence number']
    emocount_total = EmojiTotalCounter(emo, gdf)
    # 6923376 = number of emojis in dataset
    uk_emojis_typ.loc[emo, 'Typicality'] = TypicalityEquation(emojisubset, allemojisubset, emocount_total, 6923376)
uk_emojis_typ.sort_values(by="Typicality", ascending=False)

In [None]:
# country name -> joined subset of that country, in the same order as the country sections above
top_countries = dict(zip(
    ["United Kingdom", "Spain", "France", "Germany", "Italy",
     "Turkey", "Netherlands", "Belgium", "Switzerland", "Austria"],
    [uk_join, sp_join, fr_join, ge_join, it_join,
     tu_join, ne_join, be_join, sw_join, au_join],
))

In [None]:
# repeat this process for top 10 countries by userday
country_typ = {}
for country, join in top_countries.items():
    emojicount = count_all_emojis(join['emoji generic'])
    # only keep emojis used at least 1000 times, because typicality values become skewed for
    # infrequently used emojis; .copy() so that the columns added below go into an independent
    # frame rather than a slice of emojicount
    co_emojis_typ = emojicount[emojicount['Occurrence number'] >= 1000].copy()
    # depends only on the subset, so it is computed once per country
    allemojisubset = AllEmojiSubsetCounter(join)
    for emo in co_emojis_typ.index:
        co_emojis_typ.loc[emo, 'Emoji description'] = emoji.demojize(emo)
        emosubset = EmojiSubsetCounter(emo, join)
        emocount_total = EmojiTotalCounter(emo, gdf)
        # use the count computed for THIS emoji and country (emosubset); the stale `emojisubset`
        # left over from the uk-only cell was passed here before
        # 6923376 = number of emojis in dataset
        co_emojis_typ.loc[emo, 'Typicality'] = TypicalityEquation(emosubset, allemojisubset, emocount_total, 6923376)
    country_typ[country] = co_emojis_typ.sort_values(by="Typicality", ascending=False)

country_typ

In [None]:
# save each country's typicality table to its own csv (index=True writes the emoji index as the first column)
# the loop variable is not called `df`, so the raw `df` loaded at the top of the notebook is not overwritten
for country, typ_df in country_typ.items():
    typ_df.to_csv(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\EmojiTypicality_" + country + ".csv", index=True)

In [None]:
# names of the ten countries, used to build the csv file names when reading the tables back in
top_countries_list = [
    "United Kingdom", "Spain", "France", "Germany", "Italy",
    "Turkey", "Netherlands", "Belgium", "Switzerland", "Austria",
]

In [None]:
# read data back in so I don't have to re-run the whole program in the future
# index_col=False keeps pandas from using the first csv column as the index
# NOTE(review): the export cell writes EmojiTypicality_<country>.csv directly into ...\Data, while this
# reads from the TypicalEmojis_ByCountry subfolder - presumably the files were moved by hand; confirm
country_typ = {
    country: pd.read_csv(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\TypicalEmojis_ByCountry\EmojiTypicality_" + country + ".csv", index_col=False)
    for country in top_countries_list
}
country_typ

In [None]:
# per country, drop the emojis with a negative typicality
# the loop variable is not called `df`, so the raw `df` loaded at the top of the notebook is not overwritten
pos_country_typ = {}
for country, typ_df in country_typ.items():
    pos_country_typ[country] = typ_df.drop(typ_df.index[typ_df['Typicality'] < 0])

pos_country_typ

In [None]:
# the most typical emojis per country will now be analyzed in the Emoji-Specific Analysis notebook