In [None]:
# this notebook shows the visualization process for the spatial typicality of individual emojis

In [None]:
import warnings
warnings.filterwarnings("ignore")
from pathlib import Path
import geopandas as gp
import pandas as pd
from pyproj import Transformer, CRS, Proj
from shapely.geometry import shape, Point, Polygon, box
from matplotlib.colors import LinearSegmentedColormap
import numpy as np
import shapely.speedups as speedups
import contextily as ctx
from collections import Counter
import matplotlib.pyplot as plt
import mapclassify as mc
speedups.enable()
import emoji
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import matplotlib.colors as mcolors

In [None]:
"""
Defining constants to be used throughout the program

"""

# Grid and projection settings shared by every cell below.
# The grid is laid over the extent of the custom-made EU shapefile.

# Edge length of one square grid cell, in metres (100 km)
GRID_SIZE_METERS = 100000 

# target projection: Mollweide
# (54009 is combined with the "esri" authority below, despite the EPSG_ prefix in the name)
EPSG_CODE = 54009
CRS_PROJ = f"esri:{EPSG_CODE}"

# Input projection WGS 84
CRS_WGS = "epsg:4326"

# define Transformer ahead of time
# with xy-order of coordinates (always_xy=True), WGS 84 -> Mollweide
PROJ_TRANSFORMER = Transformer.from_crs(
    CRS_WGS, CRS_PROJ, always_xy=True)

# also define reverse projection, Mollweide -> WGS 84
PROJ_TRANSFORMER_BACK = Transformer.from_crs(
    CRS_PROJ, CRS_WGS, always_xy=True)

# Project the bounds of the EU shapefile to Mollweide.
# Only the x component ([0]) is kept for XMIN/XMAX and only the y component ([1])
# for YMIN/YMAX; the hard-coded arguments are presumably (longitude, latitude)
# pairs taken from the shapefile extent - TODO confirm.
XMIN = PROJ_TRANSFORMER.transform(-18.729512 , 29.234046)[0]
XMAX = PROJ_TRANSFORMER.transform(39.73858, 29.234046)[0]
YMAX = PROJ_TRANSFORMER.transform(49.59352369, 71.16987838)[1]
YMIN = PROJ_TRANSFORMER.transform(49.59352369, 28.017169)[1]

# color map to use for typicality maps: the middle half (0.25-0.75) of the
# diverging 'BrBG' map, sampled at 10 evenly spaced colours.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in matplotlib 3.9.
BrBG = plt.get_cmap('BrBG')
newcmp = ListedColormap(BrBG(np.linspace(0.25, 0.75, 10)))

In [None]:
# Example emoji whose spatial typicality is mapped first.
# EMOJI is read as a global by grid_typicality further down.
EMOJI = "🙏"

In [None]:
# Read the CSV file containing the HLL data; read_csv already returns a DataFrame,
# so no further conversion is needed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\HLLData_load.csv")
# Build point geometries from the longitude/latitude columns.
# At this stage the coordinates are still WGS 84 (the input CRS), not Mollweide.
gdf = gp.GeoDataFrame(
    df,
    geometry=gp.points_from_xy(df.longitude, df.latitude),
    crs=CRS_WGS)

In [None]:
# Reproject the points in place from WGS 84 to the Mollweide target CRS
gdf.to_crs(CRS_PROJ,inplace=True)

In [None]:
# Inspect the reprojected point data
gdf

In [None]:
# Load the custom study-area shapefile (created in ArcGIS Pro), bring it into
# the projected CRS used for the grid, and draw it as a quick visual check
europe = gp.read_file("Europe_Clipped_BBox.shp").to_crs(CRS_PROJ)
europe.plot()

In [None]:
# now let's create the 100 x 100 km grids
def create_grids():
    """
    Build the square grid covering the projected study-area bounds.

    Cell edges are GRID_SIZE_METERS long. Column origins run from XMIN to XMAX
    (ascending) and row origins from YMIN to YMAX, reversed to descending order.
    Every cell is labelled by its (x, y) corner and extends to x + width and
    down to y - length.

    Returns
    -------
    tuple
        (grid, cols, rows): a GeoDataFrame indexed by ('xbin', 'ybin') holding
        the cell polygons in CRS_PROJ, the list of column origins and the
        (descending) list of row origins.
    """
    cell_w = GRID_SIZE_METERS
    cell_h = GRID_SIZE_METERS
    cols = list(range(int(np.floor(XMIN)), int(np.ceil(XMAX)), cell_w))
    rows = list(range(int(np.floor(YMIN)), int(np.ceil(YMAX)), cell_h))[::-1]

    # one (x, y, polygon) record per cell, column by column
    records = [
        (left, top,
         Polygon([
             (left, top),
             (left + cell_w, top),
             (left + cell_w, top - cell_h),
             (left, top - cell_h)]))
        for left in cols
        for top in rows
    ]

    frame = pd.DataFrame(records)
    frame.columns = ['xbin', 'ybin', 'bin_poly']
    # the cell corner doubles as the lookup key for each polygon
    frame.set_index(['xbin', 'ybin'], inplace=True)

    grid = gp.GeoDataFrame(
        frame.drop(columns=["bin_poly"]),
        geometry=frame.bin_poly)
    grid.crs = CRS_PROJ
    return grid, cols, rows

grid,cols,rows = create_grids()

In [None]:
# Bin edges as arrays for np.digitize: rows were reversed inside create_grids
# (descending order), cols are ascending
ybins = np.array(rows)
xbins = np.array(cols)

def get_best_bins(search_values_x, search_values_y,xbins, ybins): 
    """Will return best bin for a lat and lng input
    
    Note: prepare bins and values in correct matching projection
    
    """
    xbins_idx = np.digitize(search_values_x, xbins, right=False)
    ybins_idx = np.digitize(search_values_y, ybins, right=False)
    return (xbins[xbins_idx-1], ybins[ybins_idx-1])


# Match every point to its grid bin using the projected x/y coordinates;
# the two result arrays are aligned with the rows of gdf
xbins_match, ybins_match = get_best_bins(
    search_values_x=gdf.geometry.x.to_numpy(),
    search_values_y=gdf.geometry.y.to_numpy(),
    xbins=xbins, ybins=ybins)

In [None]:
# Draw the empty grid first; it becomes the base axes for the overlay
base = grid.plot(
    figsize=(22,28), color='white', edgecolor='black', linewidth=0.2)
base.grid(False)
# Hide axes ticks
base.get_xaxis().set_visible(False)
base.get_yaxis().set_visible(False)
# combine with europe geometry
plot = europe.boundary.plot(ax=base, linewidth=0.8)
plt.title("100x100 km Grid Superimposed on Study Area", size =35)

In [None]:
# Attach the matched bin of every record, then swap the point geometry for a
# (xbins_match, ybins_match) index so records can be looked up per grid cell.
# NOTE: this cell mutates gdf in place and cannot be re-run without reloading the data.
gdf.loc[:, 'xbins_match'] = xbins_match
gdf.loc[:, 'ybins_match'] = ybins_match
gdf.drop(columns = ['longitude','latitude','geometry'],inplace =True)
gdf.set_index(['xbins_match', 'ybins_match'], inplace=True)
grid.sort_index(inplace =True)
gdf.sort_index(inplace = True)
# Instead of a spatial join, the shared index identifies which records belong to which grid cell.
# gdf.index repeats once per record, so it is de-duplicated first: common_idx then
# holds exactly one entry per occupied cell regardless of the pandas version.
common_idx = grid.index.intersection(gdf.index.unique())
gdf

In [None]:
# Count the occurrence of each emoji over the whole dataset in preparation of
# the typicality calculations. Each 'emoji' value is split on commas; records
# without a value (NaN) are skipped instead of raising inside Counter.update.
count = Counter()
for emoji_list in gdf['emoji'].dropna().str.split(','):
    count.update(emoji_list)

In [None]:
# Relative frequency of the selected emoji in the total dataset.
# The lookup uses EMOJI (not a hard-coded character) so this baseline always
# matches the emoji that grid_typicality evaluates.
n_t = count[EMOJI]
N_t = sum(count.values())
F_t = n_t/N_t

In [None]:
def grid_typicality(new_test, idx, emoji_char=None, total_frequency=None, target=None):
    """Compute the typicality of one emoji within a single grid cell.

    The emoji's relative frequency in the cell (F_s) is compared with its
    relative frequency in the total dataset (F_t):
    typicality = (F_s - F_t) / F_t. A cell in which the emoji never
    occurs receives -1.0.

    Parameters
    ----------
    new_test : pandas.Series or str
        Comma-separated emoji strings of the records in this cell. A bare
        scalar (as returned by ``.loc`` when a cell holds one record) is
        accepted as well; missing values are ignored.
    idx : hashable
        Row label in ``target`` under which the result is stored.
    emoji_char : str, optional
        Emoji to evaluate. Defaults to the global ``EMOJI``.
    total_frequency : float, optional
        Dataset-wide relative frequency. Defaults to the global ``F_t``.
    target : pandas.DataFrame, optional
        Frame with a 'typicality' column to write into. Defaults to the
        global ``typ``.

    Returns
    -------
    None
        The value is written to ``target.loc[idx, 'typicality']``.
    """
    # fall back to the notebook globals so existing two-argument calls keep working
    if emoji_char is None:
        emoji_char = EMOJI
    if total_frequency is None:
        total_frequency = F_t
    if target is None:
        target = typ

    # a cell with a single record yields a scalar rather than a Series
    if not isinstance(new_test, pd.Series):
        new_test = pd.Series([new_test], dtype=object)

    # calculating frequency for each grid (sub-dataset)
    counter = Counter()
    for emoji_list in new_test.dropna().str.split(','):
        counter.update(emoji_list)

    n_s = counter[emoji_char]
    if n_s == 0:
        target.loc[idx, 'typicality'] = -1.0
    else:
        N_s = sum(counter.values())
        F_s = n_s / N_s
        target.loc[idx, 'typicality'] = (F_s - total_frequency) / total_frequency

In [None]:
# Frame to hold one typicality value per occupied grid cell. The column is created
# as float (NaN until filled) rather than with '' placeholders: an object-dtype
# column is treated as categorical by GeoDataFrame.plot instead of as a numeric scale.
typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

# loop through all indexes shared by the grid and the data
for midx in common_idx:
    grid_typicality(gdf.loc[midx,"emoji"], midx)

In [None]:
# Pair every typicality value with the polygon of its grid cell
geom = grid.loc[common_idx, "geometry"]
typ_gdf = gp.GeoDataFrame(
    data=typ['typicality'],
    geometry=geom,
    crs=CRS_PROJ,
)
typ_gdf

In [None]:
# Empty grid as the base layer
base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
# Typicality per cell on top; `cmap` replaces the deprecated `colormap` alias
plot = typ_gdf.plot(ax=base, column = 'typicality', cmap='Greens', alpha = 0.7, edgecolor='gray', linewidth=0.1)
# combine with europe geometry
europe.boundary.plot(ax=base, alpha=0.3)
plt.title("Typicality of Folded Hands Emoji", size =35)

In [None]:
# repeat for other emojis
EMOJI = '🔴'

# The dataset-wide frequency must be refreshed for the new emoji; otherwise
# grid_typicality would compare against the F_t of the previously selected emoji.
n_t = count[EMOJI]
N_t = sum(count.values())
F_t = n_t/N_t

# float column (NaN until filled) so the values stay numeric for plotting
typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

# loop through all indexes shared by the grid and the data
for midx in common_idx:
    grid_typicality(gdf.loc[midx,"emoji"], midx)


In [None]:
# Pair every typicality value with the polygon of its grid cell
geom = grid.loc[common_idx, "geometry"]
typ_gdf = gp.GeoDataFrame(
    data=typ['typicality'],
    geometry=geom,
    crs=CRS_PROJ,
)
typ_gdf

In [None]:
# Empty grid as the base layer
base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
# Typicality per cell on top; `cmap` replaces the deprecated `colormap` alias
plot = typ_gdf.plot(ax=base, column = 'typicality', cmap='Greens', alpha = 0.7, edgecolor='gray', linewidth=0.1)
# combine with europe geometry
europe.boundary.plot(ax=base, alpha=0.3)
plt.title("Typicality of Red Circle Emoji", size =35)

In [None]:
# repeat for other emojis
EMOJI = '😷'

# The dataset-wide frequency must be refreshed for the new emoji; otherwise
# grid_typicality would compare against the F_t of the previously selected emoji.
n_t = count[EMOJI]
N_t = sum(count.values())
F_t = n_t/N_t

# float column (NaN until filled) so the values stay numeric for plotting
typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

# loop through all indexes shared by the grid and the data
for midx in common_idx:
    grid_typicality(gdf.loc[midx,"emoji"], midx)

# pair every typicality value with the polygon of its grid cell
geom = grid.loc[common_idx, "geometry"]
typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
# `cmap` replaces the deprecated `colormap` alias
plot = typ_gdf.plot(ax=base, column = 'typicality', cmap='Greens', alpha = 0.7, edgecolor='gray', linewidth=0.1)
# combine with europe geometry
europe.boundary.plot(ax=base, alpha=0.3)
plt.title("Typicality of Masked Face Emoji", size =35)

In [None]:
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
top100emojis = gp.read_file(r"C:\Users\saman\OneDrive\Documents\Thesis\Data\Top100Emojis.csv")
# the emojis don't read properly from a csv so we need to re-generate them from the descriptions
# (the whole column is rebuilt at once; no row counter or row-limit guard is needed)
top100emojis['Emoji'] = [
    emoji.emojize(description, language='alias')
    for description in top100emojis['Emoji Description']
]

In [None]:
# Inspect the table with the regenerated 'Emoji' column
top100emojis

In [None]:
# Font properties (size 18), presumably intended for an AnchoredSizeBar scale bar.
# NOTE(review): AnchoredSizeBar and fontprops are not referenced in the other
# visible cells - confirm they are still needed.
from mpl_toolkits.axes_grid1.anchored_artists import AnchoredSizeBar
import matplotlib.font_manager as fm
fontprops = fm.FontProperties(size=18)

In [None]:
# Colour map with 8 discrete steps running beige -> white -> teal.
# LinearSegmentedColormap is already imported at the top of the notebook;
# the previous `mpl.colors...` spelling relied on a name that is never imported.
test_cmap = LinearSegmentedColormap.from_list('beigeblue',['#d9af8c','#FFFFFF','#009999'], N=8)

In [None]:
# Render and save one typicality map ('Greens') for every emoji in the table.
# The total emoji count is the same for every emoji, so it is computed once.
N_t = sum(count.values())

# head(101) keeps the original limit (row labels 0..100)
for EMOJI in top100emojis['Emoji'].head(101):

    #calculating frequency for total dataset
    # (EMOJI and F_t are read as globals by grid_typicality)
    n_t = count[EMOJI]
    F_t = n_t/N_t

    # float column (NaN until filled) so the values stay numeric for plotting
    typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

    # loop through all indexes shared by the grid and the data
    for midx in common_idx:
        grid_typicality(gdf.loc[midx,"emoji"], midx)

    geom = grid.loc[common_idx, "geometry"]
    typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

    base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
    # `cmap` replaces the deprecated `colormap` alias
    plot = typ_gdf.plot(ax=base, column = 'typicality', cmap='Greens', alpha = 0.7, edgecolor='gray', linewidth=0.1)
    # combine with europe geometry
    europe.boundary.plot(ax=base, alpha=0.3)
    emojiname = emoji.demojize(str(EMOJI)).replace(":","")
    plt.title("Typicality of "+ emojiname +" Emoji", size =35)
    fig = plot.get_figure()
    fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\TypicalityMap_" + emojiname + ".png", dpi=300, bbox_inches = "tight")
    # release the figure once it is on disk; otherwise every large figure stays open
    plt.close(fig)

In [None]:
# Render and save one typicality map ('Greens') per emoji, with a title-cased
# emoji name in the figure title and the underscore name in the file name.
# The total emoji count is the same for every emoji, so it is computed once.
N_t = sum(count.values())

# head(101) keeps the original limit (row labels 0..100)
for EMOJI in top100emojis['Emoji'].head(101):

    #calculating frequency for total dataset
    # (EMOJI and F_t are read as globals by grid_typicality)
    n_t = count[EMOJI]
    F_t = n_t/N_t

    # float column (NaN until filled) so the values stay numeric for plotting
    typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

    # loop through all indexes shared by the grid and the data
    for midx in common_idx:
        grid_typicality(gdf.loc[midx,"emoji"], midx)

    geom = grid.loc[common_idx, "geometry"]
    typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

    base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
    emojinameunderscore = emoji.demojize(str(EMOJI)).replace(":","")
    emojiname = emojinameunderscore.replace("_", " ")
    emojiname = emojiname.title()
    # `cmap` replaces the deprecated `colormap` alias
    plot = typ_gdf.plot(ax=base, column = 'typicality', cmap='Greens', alpha = 0.7, edgecolor='gray', linewidth=0.1)
    # combine with europe geometry
    europe.boundary.plot(ax=base, alpha=0.3)
    fig = plot.get_figure()
    fig.suptitle("Typicality of "+ emojiname +" Emoji", size =35, y=0.78)
    fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\TypicalityMap_" + emojinameunderscore + ".png", dpi=300, bbox_inches = "tight")
    # release the figure once it is on disk; otherwise every large figure stays open
    plt.close(fig)

In [None]:
# Render and save one typicality map per emoji using the beige/blue colour map.
# The total emoji count is the same for every emoji, so it is computed once.
N_t = sum(count.values())

# head(101) keeps the original limit (row labels 0..100)
for EMOJI in top100emojis['Emoji'].head(101):

    #calculating frequency for total dataset
    # (EMOJI and F_t are read as globals by grid_typicality)
    n_t = count[EMOJI]
    F_t = n_t/N_t

    # float column (NaN until filled) so the values stay numeric for plotting
    typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

    # loop through all indexes shared by the grid and the data
    for midx in common_idx:
        grid_typicality(gdf.loc[midx,"emoji"], midx)

    geom = grid.loc[common_idx, "geometry"]
    typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

    base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
    emojinameunderscore = emoji.demojize(str(EMOJI)).replace(":","")
    emojiname = emojinameunderscore.replace("_", " ")
    emojiname = emojiname.title()
    # `cmap` replaces the deprecated `colormap` alias
    plot = typ_gdf.plot(ax=base, column = 'typicality', cmap=test_cmap, alpha = 0.8, edgecolor='gray', linewidth=0.1)
    # combine with europe geometry
    europe.boundary.plot(ax=base, alpha=0.3)
    fig = plot.get_figure()
    fig.suptitle("Typicality of "+ emojiname +" Emoji", size =35, y=0.78)
    fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\Spatial_Typicality_BeigeBlue\Grid_"+ emojinameunderscore + ".png", dpi=300, bbox_inches = "tight")
    # release the figure once it is on disk; otherwise every large figure stays open
    plt.close(fig)

In [None]:
# Stand-alone horizontal colour ramp for the beige/blue maps, scaled from -1 to 1.
# Built from names imported at the top of the notebook (mcolors, cm); the
# previous `mpl.` references relied on a name that is never imported.
fig, ax = plt.subplots(figsize=(10, 1))
fig.subplots_adjust(bottom=0.5)
cmap = test_cmap
norm = mcolors.Normalize(vmin=-1, vmax=1)
# draw the colour bar directly into the prepared axes
cb1 = fig.colorbar(cm.ScalarMappable(norm=norm, cmap=cmap),
                   cax=ax,
                   orientation='horizontal')
cb1.set_label('Typicality', fontsize=20)
fig.show()
fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\BeigeBlue_ColorRamp_Horizontal.png", dpi=300, bbox_inches = "tight")


In [None]:
# create custom scale cell: a plain white colour bar without ticks, labelled
# with the area of one grid cell.
# Built from names imported at the top of the notebook (LinearSegmentedColormap,
# mcolors, cm); the previous `mpl.` references relied on a name that is never imported.
white_cmap = LinearSegmentedColormap.from_list('white',['#FFFFFF','#FFFFFF'], N=1)

fig, ax = plt.subplots(figsize=(1,2))
fig.subplots_adjust(bottom=0.5)
norm = mcolors.Normalize(vmin=-1, vmax=1)
# draw the colour bar directly into the prepared axes
cb1 = fig.colorbar(cm.ScalarMappable(norm=norm, cmap=white_cmap),
                   cax=ax,
                   orientation='vertical')
cb1.set_ticks([])
cb1.set_label('= 10,000 square kilometers', labelpad=440, y=0.9, rotation=0, fontsize = 60)
fig.show()
fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\ScaleCell.png", dpi=300, bbox_inches = "tight")


In [None]:
# Largest typicality value in the most recently built typ_gdf
typ_gdf['typicality'].max()

In [None]:
# this part is for a single emoji
EMOJI = '😷'

#calculating frequency for total dataset
# (EMOJI and F_t are read as globals by grid_typicality)
n_t = count[EMOJI]
N_t = sum(count.values())
F_t = n_t/N_t

# float column (NaN until filled) so the values stay numeric for plotting
typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

# loop through all indexes shared by the grid and the data
for midx in common_idx:
    grid_typicality(gdf.loc[midx,"emoji"], midx)

geom = grid.loc[common_idx, "geometry"]
typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
emojinameunderscore = emoji.demojize(str(EMOJI)).replace(":","")
emojiname = emojinameunderscore.replace("_", " ")
emojiname = emojiname.title()
# `cmap` replaces the deprecated `colormap` alias
plot = typ_gdf.plot(ax=base, column = 'typicality', cmap=test_cmap, alpha = 0.8, edgecolor='gray', linewidth=0.1,
                    legend=True)
# combine with europe geometry
europe.boundary.plot(ax=base, alpha=1, edgecolor='black', linewidth=0.3)

fig = plot.get_figure()
fig.suptitle("Typicality of "+ emojiname +" Emoji", size =35, y=0.78)
# fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\ScienceSlam\TypicalityMap_" + emojinameunderscore + ".png", dpi=300, bbox_inches = "tight")


In [None]:
# Small horizontal colour ramp of the beige/blue map; no explicit norm is given.
# Built from `cm`, imported at the top of the notebook; the previous
# `mpl.colorbar` reference relied on a name that is never imported.
fig, ax = plt.subplots(figsize=(6, 1))
fig.subplots_adjust(bottom=0.5)
cmap = test_cmap
# draw the colour bar directly into the prepared axes
cb1 = fig.colorbar(cm.ScalarMappable(cmap=cmap),
                   cax=ax,
                   orientation='horizontal')
cb1.set_label('Typicality')
fig.show()

In [None]:
# this part is for the science slam
EMOJI = '🍺'

#calculating frequency for total dataset
# (EMOJI and F_t are read as globals by grid_typicality)
n_t = count[EMOJI]
N_t = sum(count.values())
F_t = n_t/N_t

# float column (NaN until filled) so the values stay numeric for plotting
typ = pd.DataFrame(index = common_idx, columns = ['typicality'], dtype = float)

# loop through all indexes shared by the grid and the data
for midx in common_idx:
    grid_typicality(gdf.loc[midx,"emoji"], midx)

geom = grid.loc[common_idx, "geometry"]
typ_gdf = gp.GeoDataFrame(data = typ['typicality'], geometry =geom, crs = CRS_PROJ)

base = grid.plot(figsize=(22,28), color='white', edgecolor='black', linewidth=0.1)
emojinameunderscore = emoji.demojize(str(EMOJI)).replace(":","")
emojiname = emojinameunderscore.replace("_", " ")
emojiname = emojiname.title()
# `cmap` replaces the deprecated `colormap` alias
plot = typ_gdf.plot(ax=base, column = 'typicality', cmap=test_cmap, alpha = 0.7, edgecolor='gray', linewidth=0.1)
# combine with europe geometry
europe.boundary.plot(ax=base, alpha=1, edgecolor='black', linewidth=0.3)
fig = plot.get_figure()
fig.suptitle("Typicality of "+ emojiname +" Emoji", size =35, y=0.78)
# fig.savefig(r"C:\Users\saman\OneDrive\Documents\Thesis\Figures\ScienceSlam\TypicalityMap_" + emojinameunderscore + ".png", dpi=300, bbox_inches = "tight")
