In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import geopandas as gp
import geoplot as gplt
import geoplot.crs as gcrs
import contextily as ctx
import numpy as np
import shapely as shapely
from shapely.geometry import Polygon
from shapely.ops import transform
import collections
from collections import Counter
from typing import List, Tuple, Dict, Union, Generator, Optional
from pyproj import Transformer, CRS, Proj
import glob
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import advertools as adv
import warnings
warnings.filterwarnings('ignore')
import emoji

In [2]:
# Build the glob pattern that matches every raw export in the thesis data folder.
file_pattern = os.path.join(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\\", "RawData_*.csv")
# glob.glob returns matches in arbitrary order; sorting keeps the concatenated
# row order (and therefore the index) reproducible between runs.
files = sorted(glob.glob(file_pattern))
# Fail early with a readable message instead of pd.concat's "No objects to concatenate".
if not files:
    raise FileNotFoundError(f"No files match {file_pattern!r}")
# Read each CSV and stack them into one frame, discarding the per-file indexes.
df = pd.concat(map(pd.read_csv, files), ignore_index=True)

In [3]:
# Promote the DataFrame to a GeoDataFrame: one point per row built from the
# long/lat columns, in WGS 84 (EPSG:4326).
gdf = gp.GeoDataFrame(
    df,
    geometry=gp.points_from_xy(df['long'], df['lat']),
    crs="EPSG:4326",
)

In [4]:
# define custom function to take out brackets

def no_brackets(column):
    """Remove curly braces from a single cell value.

    Parameters
    ----------
    column : object
        One cell value (despite the name, not a whole column). Strings have
        every ``{`` and ``}`` removed; anything else is passed through.

    Returns
    -------
    object
        The string without curly braces, or the input unchanged when it is
        not a string.
    """
    # Missing values (NaN / None) are not strings; `'{' in value` would raise
    # TypeError on them, so hand them back untouched.
    if not isinstance(column, str):
        return column
    # re.sub is a no-op when no brace is present, and this also removes a stray
    # closing brace that has no matching opening brace.
    return re.sub(r"[{}]", "", column)

# Strip the curly braces from both list-like text columns; hashtags are
# additionally lower-cased.
hashtags_without_braces = gdf['hashtags'].apply(no_brackets)
gdf['hashtags'] = hashtags_without_braces.str.lower()
gdf['emoji'] = gdf['emoji'].apply(no_brackets)

In [1]:
# Sanity check: preview the first rows of gdf after the hashtag/emoji clean-up.
# NOTE(review): the output stored with this cell is a NameError raised because
# gdf was not defined in the kernel when it was run; re-run the notebook top to
# bottom to refresh it.
gdf.head()

NameError: name 'gdf' is not defined

In [8]:
# Check that every row contains at least one emoji by pulling out the rows
# whose emoji field is an empty string.
gdf_noemoji = gdf.loc[gdf['emoji'].eq('')]
gdf_noemoji.head()

Unnamed: 0,post_publish_date,post_body,hashtags,emoji,long,lat,geometry
30,2020-04-01 00:09:08,Throwback to World Championships 2019 🇮🇳 #wako...,"wakoharyana,wakoantalya2019kickboxing,teamkmt,...",,30.6833,36.9,POINT (30.68330 36.90000)
143,2020-04-01 00:33:35,Mon ami Wald du #restaurant #Brésilien du sol ...,"amitie,cotedazur,nice,folowme,restaurant,franc...",,7.26591,43.734459,POINT (7.26591 43.73446)
155,2020-04-01 00:36:29,"Casino Barcelona, #mcjoy #cariocasmcs #casinob...","mcjoy,djs,espanha,cariocasmcs,chorameliga,casi...",,2.196795,41.386449,POINT (2.19680 41.38645)
228,2020-04-01 01:00:33,Diaspora #Haïti 🇭🇹 in the #USA great position ...,"coronavirus,usa,haïti",,-3.703508,40.477795,POINT (-3.70351 40.47779)
245,2020-04-01 01:08:12,توقعات بموت ٢٤٠ الف أمريكي بسبب فيروس كورونا ....,"السعودية,مصر,فلسطين,الكورونا,covidー19,coronakrise",,9.177347,48.779302,POINT (9.17735 48.77930)


In [9]:
# Some posts have a blank emoji field; show the text of those posts to see why.
gdf_noemoji['post_body']

30         Throwback to World Championships 2019 🇮🇳 #wako...
143        Mon ami Wald du #restaurant #Brésilien du sol ...
155        Casino Barcelona, #mcjoy #cariocasmcs #casinob...
228        Diaspora #Haïti 🇭🇹 in the #USA great position ...
245        توقعات بموت ٢٤٠ الف أمريكي بسبب فيروس كورونا ....
                                 ...                        
4167890    #ToyParliament End Devolution NOW🇬🇧🇬🇧🇬🇧🇬🇧🇬🇧🇬🇧🇬🇧🇬🇧
4167903    #CoronaVirus #COVID19  Las medidas para las Fa...
4167911    #CoronaVirus #COVID19  Las medidas para las Fa...
4167920    🇺🇸 #USA Dans un ton consternant #Trump estime ...
4167948    #CoronaVirus #COVID19  Las medidas para las Fa...
Name: post_body, Length: 147921, dtype: object

### It looks like most rows with a blank emoji field are blank because flag emojis were converted to country codes. Since there is no straightforward way to fix this, I'll remove these rows to avoid confusion later.

In [10]:
# Keep only the rows whose emoji field is not blank, then renumber the index
# from zero (drop=True: the old index values are not needed).
gdf = gdf[gdf['emoji'] != ''].reset_index(drop=True)
gdf

Unnamed: 0,post_publish_date,post_body,hashtags,emoji,long,lat,geometry
0,2020-04-01 00:00:32,👏👏📱Buena iniciativa de @vodafone_es me recuerd...,"quedateencasa,covid_19","👏,📱",2.141227,41.392657,POINT (2.14123 41.39266)
1,2020-04-01 00:00:55,Possibile che ogni frame di loro due sia un qu...,"paolo,gfvip","❤️,🥰",11.331656,44.488735,POINT (11.33166 44.48873)
2,2020-04-01 00:01:04,Arkadaşlar 1 Nisan falan diye şaka yapmaya kal...,1nisan,🤦🏻‍♂️,29.005222,41.021321,POINT (29.00522 41.02132)
3,2020-04-01 00:01:11,Can bildiğin insanla yapılan sohbet de olmasa ...,ibrahimtimur,🐺,28.844873,41.009816,POINT (28.84487 41.00982)
4,2020-04-01 00:01:16,Βαρέθηκα κι εγώ 😂 #Tweet #StayAtHome #covid_19...,"tweet,stayathome,covid_19gr",😂,23.503525,37.081730,POINT (23.50353 37.08173)
...,...,...,...,...,...,...,...
4020041,2020-03-31 23:59:14,Mes 3 amours ❤️ #kids #family #love #lovely #m...,"family,love,kids,myeverything,lovely",❤️,2.277010,48.767030,POINT (2.27701 48.76703)
4020042,2020-03-31 23:59:17,Is it just me or has it NOT RAINED ONCE since ...,"coronavirusuk,covid19uk",😡,-2.155641,53.414608,POINT (-2.15564 53.41461)
4020043,2020-03-31 23:59:18,En mode confinement 🦋💙 Day 16 J + 16 Covid-19 ...,mood,"💙,🦋",2.265175,48.886271,POINT (2.26518 48.88627)
4020044,2020-03-31 23:59:19,🇪🇸 Spain: 🔥🔹🔥🔹🔥🔹⚪⚪⚪✨ Slow growth and improving...,stayathome,"☣️,⚪,✨,🔥,🔹,🤧",-4.000000,40.000000,POINT (-4.00000 40.00000)


In [11]:
# post_body is no longer needed from here on; drop the column to shrink gdf.
gdf.drop(columns='post_body', inplace=True)

#### Here we have another problem: emojis with different skin tones are treated as different emojis in the frequency/typicality calculations. To fix this, I'll remove the skin-tone modifiers from the text descriptions and then convert the descriptions back into emojis.

In [12]:
# Add a column holding the text version of each row's emojis, so that the
# skin-tone modifiers can be removed from the names.
# emoji.demojize is applied to the whole column at once: no hard-coded row
# count and no per-row .loc writes, so it works for any number of rows.
gdf['emoji description'] = gdf['emoji'].apply(emoji.demojize)
gdf.head()

Unnamed: 0,post_publish_date,hashtags,emoji,long,lat,geometry,emoji description
0,2020-04-01 00:00:32,"quedateencasa,covid_19","👏,📱",2.141227,41.392657,POINT (2.14123 41.39266),":clapping_hands:,:mobile_phone:"
1,2020-04-01 00:00:55,"paolo,gfvip","❤️,🥰",11.331656,44.488735,POINT (11.33166 44.48873),":red_heart:,:smiling_face_with_hearts:"
2,2020-04-01 00:01:04,1nisan,🤦🏻‍♂️,29.005222,41.021321,POINT (29.00522 41.02132),:man_facepalming_light_skin_tone:
3,2020-04-01 00:01:11,ibrahimtimur,🐺,28.844873,41.009816,POINT (28.84487 41.00982),:wolf:
4,2020-04-01 00:01:16,"tweet,stayathome,covid_19gr",😂,23.503525,37.08173,POINT (23.50353 37.08173),:face_with_tears_of_joy:


In [2]:
# Demojize a thumbs-up in each skin tone (plus the unmodified one) to see how
# every skin-tone modifier is spelled in the text form.
sample = emoji.demojize(
    "👍🏻 👍🏼 👍🏽 👍🏾 👍🏿 👍"
)
sample

':thumbs_up_light_skin_tone: :thumbs_up_medium-light_skin_tone: :thumbs_up_medium_skin_tone: :thumbs_up_medium-dark_skin_tone: :thumbs_up_dark_skin_tone: :thumbs_up:'

In [5]:
# Remove each skin-tone suffix from the demojized sample, in the same order as
# they are listed here.
for tone_suffix in (
    "_light_skin_tone",
    "_medium-light_skin_tone",
    "_medium_skin_tone",
    "_medium-dark_skin_tone",
    "_dark_skin_tone",
):
    sample = sample.replace(tone_suffix, "")
# Turn the now tone-free names back into emoji characters and show the result.
sample = emoji.emojize(sample, language='alias')
sample

'👍 👍 👍 👍 👍 👍'

In [None]:
# Test removing the skin-tone modifiers from the sample string.
# NOTE(review): this cell repeats the previous cell and has no execution count.
# When the notebook is run in order, `sample` already holds emoji characters at
# this point, so these replacements should find nothing to remove - consider
# deleting this cell.
sample = sample.replace("_light_skin_tone","")
sample = sample.replace("_medium-light_skin_tone","")
sample = sample.replace("_medium_skin_tone","")
sample = sample.replace("_medium-dark_skin_tone","")
sample = sample.replace("_dark_skin_tone","")
sample = emoji.emojize(sample, language='alias')
sample

In [15]:
# Remove every skin-tone suffix from the text descriptions so that all tones of
# an emoji share one name. The replacements run on the whole column at once
# (literal matches, regex=False) instead of ten scalar .loc accesses per row
# guarded by a hard-coded row count.
description_without_tone = gdf['emoji description']
for tone_suffix in (
    "_light_skin_tone",
    "_medium-light_skin_tone",
    "_medium_skin_tone",
    "_medium-dark_skin_tone",
    "_dark_skin_tone",
):
    description_without_tone = description_without_tone.str.replace(tone_suffix, "", regex=False)
gdf['emoji description'] = description_without_tone
gdf.head()

Unnamed: 0,post_publish_date,hashtags,emoji,long,lat,geometry,emoji description
0,2020-04-01 00:00:32,"quedateencasa,covid_19","👏,📱",2.141227,41.392657,POINT (2.14123 41.39266),":clapping_hands:,:mobile_phone:"
1,2020-04-01 00:00:55,"paolo,gfvip","❤️,🥰",11.331656,44.488735,POINT (11.33166 44.48873),":red_heart:,:smiling_face_with_hearts:"
2,2020-04-01 00:01:04,1nisan,🤦🏻‍♂️,29.005222,41.021321,POINT (29.00522 41.02132),:man_facepalming:
3,2020-04-01 00:01:11,ibrahimtimur,🐺,28.844873,41.009816,POINT (28.84487 41.00982),:wolf:
4,2020-04-01 00:01:16,"tweet,stayathome,covid_19gr",😂,23.503525,37.08173,POINT (23.50353 37.08173),:face_with_tears_of_joy:


In [16]:
# Convert the tone-free text descriptions back into emoji characters and store
# them in a new 'emoji generic' column. Applied to the whole column at once, so
# no hard-coded row count and no per-row .loc writes are needed.
gdf['emoji generic'] = gdf['emoji description'].apply(
    lambda description: emoji.emojize(description, language='alias')
)
gdf.head()

Unnamed: 0,post_publish_date,hashtags,emoji,long,lat,geometry,emoji description,emoji generic
0,2020-04-01 00:00:32,"quedateencasa,covid_19","👏,📱",2.141227,41.392657,POINT (2.14123 41.39266),":clapping_hands:,:mobile_phone:","👏,📱"
1,2020-04-01 00:00:55,"paolo,gfvip","❤️,🥰",11.331656,44.488735,POINT (11.33166 44.48873),":red_heart:,:smiling_face_with_hearts:","❤️,🥰"
2,2020-04-01 00:01:04,1nisan,🤦🏻‍♂️,29.005222,41.021321,POINT (29.00522 41.02132),:man_facepalming:,🤦‍♂️
3,2020-04-01 00:01:11,ibrahimtimur,🐺,28.844873,41.009816,POINT (28.84487 41.00982),:wolf:,🐺
4,2020-04-01 00:01:16,"tweet,stayathome,covid_19gr",😂,23.503525,37.08173,POINT (23.50353 37.08173),:face_with_tears_of_joy:,😂


In [17]:
# Write the cleaned gdf to a GeoJSON file so other notebooks can load it
# directly without repeating the processing steps above (the author notes that
# saving as csv causes errors).
gdf.to_file(
    r"C:\Users\saman\OneDrive\Documents\Thesis\Data\RawData_Cleaned_Final.geojson",
    driver="GeoJSON",
)