# Setup Stuff

In [1]:
# Mount Google Drive inside the Colab runtime so files under
# /drive/MyDrive can be read like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /drive


In [2]:
import plotly.graph_objects as go
from datetime import datetime
import pandas as pd
import numpy as np
import json

In [3]:
# Load the lookup table that is later passed to `Series.map` to relabel the
# values of the `language` column.
# JSON text is UTF-8, so the encoding is given explicitly instead of relying
# on the platform's default locale encoding.
with open('languages.json', encoding='utf-8') as languages_file:
  languages = json.load(languages_file)

In [4]:
# Column names assigned to the header-less dataset file, in file order.
features = [
    # tweet content
    'text_tokens',
    'hashtags',
    'tweet_id',
    'media',
    'links',
    'domains',
    'tweet_type',
    'language',
    'timestamp',
    # "a_" user columns
    'a_user_id',
    'a_follower_count',
    'a_following_count',
    'a_is_verified',
    'a_account_creation',
    # "b_" user columns
    'b_user_id',
    'b_follower_count',
    'b_following_count',
    'b_is_verified',
    'b_account_creation',
    'b_follows_a',
    # engagement columns
    'reply',
    'retweet',
    'retweet_comment',
    'like',
]

In [5]:
# Read one dataset shard from the mounted drive. Fields are separated by the
# \x01 control character and the file carries no header row, so the column
# names are attached afterwards from `features`.
DATASET_PART_PATH = '/drive/MyDrive/part-00000.tsv'

df = pd.read_csv(
    DATASET_PART_PATH,
    sep='\x01',
    header=None,
)
df.columns = features

In [6]:
# Translate each language value through the `languages` lookup table; values
# without an entry in the table keep their original value.
raw_language = df['language']
mapped_language = raw_language.map(languages)
df['language'] = mapped_language.fillna(raw_language)

In [7]:
# Parse the timestamp column once (values are seconds since the Unix epoch)
# and derive all calendar features from that single parsed series instead of
# re-parsing the whole column for every feature.
tweet_time = pd.to_datetime(df['timestamp'], unit='s')

# Each part fits comfortably in int8 (day 1-31, weekday 0-6, hour 0-23).
df['dt_day'] = tweet_time.dt.day.values.astype(np.int8)
df['dt_dow'] = tweet_time.dt.dayofweek.values.astype(np.int8)
df['dt_hour'] = tweet_time.dt.hour.values.astype(np.int8)

In [8]:
def _token_count(tokens):
  """Return the number of tab-separated tokens, or 0 for a missing value."""
  if pd.isnull(tokens):
    return 0
  return str(tokens).count('\t') + 1


df['n_tokens'] = df['text_tokens'].apply(_token_count)

In [9]:
# One 0/1 indicator column per tweet type. Rows whose tweet_type matches none
# of the three labels get 0 in every indicator.
df['type_toplevel'] = (df.tweet_type == 'TopLevel').astype('int64')
df['type_retweet'] = (df.tweet_type == 'Retweet').astype('int64')
df['type_quote'] = (df.tweet_type == 'Quote').astype('int64')

In [10]:
def _count_tab_separated(value):
  """Return how many tab-separated entries `value` holds (0 when missing)."""
  if pd.isnull(value):
    return 0
  return str(value).count('\t') + 1


# (source column, derived count column)
for source_col, count_col in (
    ('hashtags', 'n_hashtags'),
    ('domains', 'n_domains'),
    ('links', 'n_links'),
):
  df[count_col] = df[source_col].apply(_count_tab_separated)

In [11]:
def _media_counter(kind):
  """Build a function counting occurrences of `kind` in a media value."""
  def count(media):
    # Missing media means the tweet carries none of this kind.
    return 0 if pd.isnull(media) else str(media).count(kind)
  return count


# (label searched for in `media`, derived count column)
for media_kind, count_col in (
    ('Photo', 'n_photo'),
    ('GIF', 'n_gif'),
    ('Video', 'n_video'),
):
  df[count_col] = df['media'].apply(_media_counter(media_kind))

In [12]:
# Constant 1 per row: summing it per group yields the number of rows.
df['support'] = 1

# Turn each engagement column into a 0/1 flag: a missing value becomes 0 and
# any positive value becomes 1.
for engagement in ('reply', 'retweet', 'retweet_comment', 'like'):
  df[engagement] = df[engagement].fillna(0)
  df.loc[df[engagement] > 0, engagement] = 1

In [13]:
# The ten most frequent languages; the plots below highlight their bars.
top_10_languages = df['language'].value_counts().index[:10]

# Per-language totals of the numeric/boolean columns (counts, 0/1 flags,
# `support`, ...). `numeric_only=True` restricts the aggregation to those
# columns: since pandas 2.0 the default is False, which would also "sum"
# the object columns (i.e. concatenate every string per language), wasting
# time and memory on values no later cell uses.
# NOTE(review): assumes a_is_verified / b_is_verified were parsed as bool by
# read_csv, so they are kept by numeric_only — confirm on the real data.
tmp = df.groupby(by=['language']).sum(numeric_only=True)
tmp = tmp.reset_index()

# Global distribution of languages

In [14]:
# Number of rows per language, in value_counts order (most frequent first).
counts = df['language'].value_counts()

language_bars = go.Bar(x=counts.index, y=counts.values, orientation='v')
fig = go.Figure(data=[language_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Number of tweets for each day of the week

Analysis of the tweets suggests that the timestamps were shifted by two weeks, which preserves the day of the week of each original tweet.

## Global

In [15]:
# Rows per day of the week over the whole dataset
# (dt_dow comes from pandas' dayofweek, where Monday is 0).
dow_counts = df['dt_dow'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=dow_counts.index, y=dow_counts.values, orientation='v'))

fig.show(renderer="colab")

## Italian

In [16]:
# Rows per day of the week, restricted to Italian tweets.
is_italian = df.language == 'italiano'
dow_counts = df.loc[is_italian, 'dt_dow'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=dow_counts.index, y=dow_counts.values, orientation='v'))

fig.show(renderer="colab")

## English

In [17]:
# Rows per day of the week, restricted to English tweets.
is_english = df.language == 'inglese'
dow_counts = df.loc[is_english, 'dt_dow'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=dow_counts.index, y=dow_counts.values, orientation='v'))

fig.show(renderer="colab")

## Japanese

In [18]:
# Rows per day of the week, restricted to Japanese tweets.
is_japanese = df.language == 'giapponese'
dow_counts = df.loc[is_japanese, 'dt_dow'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=dow_counts.index, y=dow_counts.values, orientation='v'))

fig.show(renderer="colab")

## Spanish

In [19]:
# Rows per day of the week, restricted to Spanish tweets.
is_spanish = df.language == 'spagnolo'
dow_counts = df.loc[is_spanish, 'dt_dow'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(x=dow_counts.index, y=dow_counts.values, orientation='v'))

fig.show(renderer="colab")

# Number of tweets for each hour

Analysis of the tweets suggests that the original hour of the day was preserved; the plots below support this.

## Italian

In [20]:
# Rows per hour of the day (dt_hour), restricted to Italian tweets.
is_italian = df.language == 'italiano'
italian_hour_counts = df.loc[is_italian, 'dt_hour'].value_counts()

hour_bars = go.Bar(
    x=italian_hour_counts.index,
    y=italian_hour_counts.values,
    orientation='v',
)
fig = go.Figure(data=[hour_bars])

fig.show(renderer="colab")

## Japanese

* 11:00 GMT = 20:00 in Japan
* 19:00 GMT = 04:00 in Japan

In [21]:
# Rows per hour of the day (dt_hour), restricted to Japanese tweets.
is_japanese = df.language == 'giapponese'
japanese_hour_counts = df.loc[is_japanese, 'dt_hour'].value_counts()

hour_bars = go.Bar(
    x=japanese_hour_counts.index,
    y=japanese_hour_counts.values,
    orientation='v',
)
fig = go.Figure(data=[hour_bars])

fig.show(renderer="colab")

## English

In [22]:
# Rows per hour of the day (dt_hour), restricted to English tweets.
is_english = df.language == 'inglese'
english_hour_counts = df.loc[is_english, 'dt_hour'].value_counts()

hour_bars = go.Bar(
    x=english_hour_counts.index,
    y=english_hour_counts.values,
    orientation='v',
)
fig = go.Figure(data=[hour_bars])

fig.show(renderer="colab")

## Spanish

In [23]:
# Rows per hour of the day (dt_hour), restricted to Spanish tweets.
is_spanish = df.language == 'spagnolo'
spanish_hour_counts = df.loc[is_spanish, 'dt_hour'].value_counts()

hour_bars = go.Bar(
    x=spanish_hour_counts.index,
    y=spanish_hour_counts.values,
    orientation='v',
)
fig = go.Figure(data=[hour_bars])

fig.show(renderer="colab")

# Number of media

## Number of photos

In [24]:
# Average number of photos per row for each language
# (summed photo count divided by the number of rows, `support`).
tmp['photo_perc'] = tmp['n_photo'] / tmp['support']

# Rank languages from the highest to the lowest ratio; equal ratios are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['photo_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [ratio for ratio, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of GIFs

In [25]:
# Average number of GIFs per row for each language
# (summed GIF count divided by the number of rows, `support`).
tmp['gif_perc'] = tmp['n_gif'] / tmp['support']

# Rank languages from the highest to the lowest ratio; equal ratios are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['gif_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [ratio for ratio, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of videos

In [26]:
# Average number of videos per row for each language
# (summed video count divided by the number of rows, `support`).
tmp['video_perc'] = tmp['n_video'] / tmp['support']

# Rank languages from the highest to the lowest ratio; equal ratios are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['video_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [ratio for ratio, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Number of hashtags and links

## Number of hashtags

In [27]:
# Average number of hashtags per row for each language
# (summed hashtag count divided by the number of rows, `support`).
tmp['hashtag_perc'] = tmp['n_hashtags'] / tmp['support']

# Rank languages from the highest to the lowest ratio; equal ratios are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['hashtag_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [ratio for ratio, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of links

In [28]:
# Average number of links per row for each language
# (summed link count divided by the number of rows, `support`).
tmp['link_perc'] = tmp['n_links'] / tmp['support']

# Rank languages from the highest to the lowest ratio; equal ratios are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['link_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [ratio for ratio, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Number of likes, replies, retweets, comments

## Number of likes

In [29]:
# Share of rows with a like for each language
# (summed 0/1 like flag divided by the number of rows, `support`).
tmp['like_perc'] = tmp['like'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['like_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of replies

In [30]:
# Share of rows with a reply for each language
# (summed 0/1 reply flag divided by the number of rows, `support`).
tmp['reply_perc'] = tmp['reply'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['reply_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of retweets

In [31]:
# Share of rows with a retweet for each language
# (summed 0/1 retweet flag divided by the number of rows, `support`).
tmp['retweet_perc'] = tmp['retweet'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['retweet_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Number of retweets with comments

In [32]:
# Share of rows with a retweet-with-comment for each language
# (summed 0/1 retweet_comment flag divided by the number of rows, `support`).
tmp['comment_perc'] = tmp['retweet_comment'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['comment_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Verified

## Engaged With

In [33]:
# Share of rows whose "a" (engaged-with) user is verified, per language
# (summed a_is_verified divided by the number of rows, `support`).
tmp['a_perc'] = tmp['a_is_verified'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['a_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Engaging

In [34]:
# Share of rows whose "b" (engaging) user is verified, per language
# (summed b_is_verified divided by the number of rows, `support`).
tmp['b_perc'] = tmp['b_is_verified'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['b_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Follower count

## Engaged with follower

In [35]:
# Mean follower count of the "a" (engaged-with) user per language. Despite
# the `_perc` suffix this is an average, not a percentage: the summed
# a_follower_count divided by the number of rows, `support`.
tmp['a_follower_perc'] = tmp['a_follower_count'] / tmp['support']

# Rank languages from the highest to the lowest mean; equal means are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['a_follower_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [mean_followers for mean_followers, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Engaging follower

In [36]:
# Mean follower count of the "b" (engaging) user per language. Despite the
# `_perc` suffix this is an average, not a percentage: the summed
# b_follower_count divided by the number of rows, `support`.
tmp['b_follower_perc'] = tmp['b_follower_count'] / tmp['support']

# Rank languages from the highest to the lowest mean; equal means are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['b_follower_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [mean_followers for mean_followers, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Number of tokens

In [37]:
# Mean number of text tokens per row for each language. Despite the `_perc`
# suffix this is an average: summed n_tokens divided by the number of rows.
tmp['token_perc'] = tmp['n_tokens'] / tmp['support']

# Rank languages from the highest to the lowest mean; equal means are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['token_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [mean_tokens for mean_tokens, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Tweet type

## TopLevel

In [38]:
# Share of rows whose tweet is of type TopLevel, per language
# (summed type_toplevel flag divided by the number of rows, `support`).
tmp['toplevel_perc'] = tmp['type_toplevel'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['toplevel_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Retweet

In [39]:
# Share of rows whose tweet is of type Retweet, per language
# (summed type_retweet flag divided by the number of rows, `support`).
tmp['retweet_type_perc'] = tmp['type_retweet'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['retweet_type_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

## Quote

In [40]:
# Share of rows whose tweet is of type Quote, per language
# (summed type_quote flag divided by the number of rows, `support`).
tmp['quote_perc'] = tmp['type_quote'] / tmp['support']

# Rank languages from the highest to the lowest share; equal shares are
# ordered by language name, also descending.
ranked = sorted(zip(tmp['quote_perc'], tmp['language']), reverse=True)
sortx = [language for _, language in ranked]
sorty = [share for share, _ in ranked]

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in sortx
]

ranking_bars = go.Bar(x=sortx, y=sorty, marker=dict(color=colors), orientation='v')
fig = go.Figure(data=[ranking_bars])

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(xaxis={'tickangle': 45})

fig.show(renderer="colab")

# Cross interactions

## English

In [41]:
# Users who interacted with English tweets at least once.
english_engagers = df.loc[df['language'] == 'inglese', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(english_engagers) & (df['language'] != 'inglese')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## Italian

In [42]:
# Users who interacted with Italian tweets at least once.
italian_engagers = df.loc[df['language'] == 'italiano', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(italian_engagers) & (df['language'] != 'italiano')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## Japanese

In [43]:
# Users who interacted with Japanese tweets at least once.
japanese_engagers = df.loc[df['language'] == 'giapponese', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(japanese_engagers) & (df['language'] != 'giapponese')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## Spanish

In [None]:
# Users who interacted with Spanish tweets at least once.
spanish_engagers = df.loc[df['language'] == 'spagnolo', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(spanish_engagers) & (df['language'] != 'spagnolo')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## German

In [None]:
# Users who interacted with German tweets at least once.
german_engagers = df.loc[df['language'] == 'tedesco', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(german_engagers) & (df['language'] != 'tedesco')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## Hindi

In [None]:
# Users who interacted with Hindi tweets at least once.
hindi_engagers = df.loc[df['language'] == 'hindi', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(hindi_engagers) & (df['language'] != 'hindi')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")

## Korean

In [None]:
# Users who interacted with Korean tweets at least once.
korean_engagers = df.loc[df['language'] == 'coreano', 'b_user_id']

# Rows where those same users appear on a tweet in any other language.
# The selection is done with one boolean mask instead of rebinding `tmp`:
# `tmp` holds the per-language aggregate that the ratio plots depend on, and
# overwriting it here made those cells fail or plot garbage when re-run.
# `isin` accepts the Series directly, so no intermediate Python list is built.
other_language_rows = (
    df['b_user_id'].isin(korean_engagers) & (df['language'] != 'coreano')
)

counts = df.loc[other_language_rows, 'language'].value_counts()

# Red for English, orange for the other top-10 languages, blue otherwise.
colors = [
    '#e74c3c' if language == 'inglese'
    else '#f39c12' if language in top_10_languages
    else '#636efa'
    for language in counts.index
]

fig = go.Figure(go.Bar(
            x=counts.index,
            y=counts.values,
            marker={'color': colors},
            orientation='v'))

# Tilt the language labels so that neighbouring ticks do not overlap.
fig.update_layout(
    xaxis = go.layout.XAxis(
        tickangle = 45)
)

fig.show(renderer="colab")