In this notebook, we calculate the metrics explored in the previous notebook using data collected from other hurricanes. The Tweets for each event are imported from csv files and combined into a single dataframe. The code used to export the data is included below.

In [1]:
# # Code used to export the Tweets to csv (kept for reference; not executed in this notebook):
# import pandas as pd
# from streamcollect.models import Tweet, Event

# print('Exporting Tweets for:', Event.objects.all()[0].name)
# ts = Tweet.objects.filter(data_source__gt=0)
# df = pd.DataFrame(list(ts.values()))
# df['text'] = df['text'].str.replace('\r', ' ') # Must strip to prevent csv row splits
# df.to_csv('{}-tweets-export.csv'.format(Event.objects.all()[0].name))

In [2]:
### Initialisation ###
import pandas as pd

# Directory holding the per-event csv exports
DIR = './data/harvey_user_location/tw_dfs/'

# One export per event; each file is named '<event><suffix>' inside DIR
files = [
    'Hurricane-Irma',
    'Hurricane-Michael',
    'Hurricane-Florence',
    'Hurricane-Willa',
]
suffix = '-tweets-export.csv'

In [3]:
# Load every exported event csv, tag its rows with the event name and combine
# them into a single dataframe `df`.
#
# The frames are collected in a list and concatenated once. This avoids
# DataFrame.append (removed in pandas 2.0), avoids using a bare `except` to
# detect the first iteration (which also hid genuine read errors), and makes
# the cell safe to re-run: `df` is rebuilt from scratch each time instead of
# being appended to.
frames = []
for f in files:
    path = DIR + f + suffix
    # Column 0 is used as the index (presumably the index written by the
    # to_csv export - confirm); it is discarded by ignore_index below.
    df_temp = pd.read_csv(path, index_col=0)
    df_temp['event'] = f
    frames.append(df_temp)

# sort=False keeps the columns in order of first appearance.
df = pd.concat(frames, ignore_index=True, sort=False)

  mask |= (ar1 == a)


In [4]:
# Adding the Harvey tweets from the current database
# NOTE(review): `Tweet` is not imported in this notebook; presumably it is
# streamcollect.models.Tweet made available by the kernel environment - confirm.
ts = Tweet.objects.filter(data_source__gte=1)
df2 = pd.DataFrame(list(ts.values()))
df2['event'] = 'Hurricane-Harvey'

# Drop any Harvey rows left over from a previous run of this cell so that
# re-running it does not duplicate them, then concatenate
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0).
df = pd.concat(
    [df.loc[df.event != 'Hurricane-Harvey'], df2],
    ignore_index=True,
    sort=False,
)

df.event.value_counts()

Hurricane-Florence    1779303
Hurricane-Michael      335709
Hurricane-Willa         53823
Hurricane-Harvey        46872
Hurricane-Irma          42167
Name: event, dtype: int64

We can now explore various metrics examined in the prior notebook. Note that the dataset is heavily skewed towards Florence.

In [28]:
# Count and share of Tweets for each data_source code 1-4, as a percentage
# of all rows in df.
print('Total Tweets per data_source')
n_rows = df.shape[0]
for i in range(1, 5):
    # Vectorised sum, computed once per code (the built-in sum() iterates the
    # Series element by element and was being evaluated twice).
    n_src = (df.data_source == i).sum()
    print('{}: {}, {:.2f}%'.format(i, n_src, n_src / n_rows * 100))

Total Tweets per data_source
1: 532098, 23.57%
2: 85742, 3.80%
3: 94717, 4.19%
4: 1545317, 68.44%


In [23]:
# Share of keyword-streamed Tweets (data_source 1 or 2) that carry coordinates.
# The same mask is used for numerator and denominator; previously only the
# denominator applied the `data_source > 0` bound.
keyword = (df.data_source > 0) & (df.data_source < 3)

tot = int(keyword.sum())
tot_coord = int((keyword & df.coordinates_lon.notnull()).sum())
# Guard against an empty selection rather than dividing by zero.
pct_coord = tot_coord / tot * 100 if tot else float('nan')

print('{} of {} ({:.2f}%) keyword streamed Tweets include coordinates'.format(tot_coord, tot, pct_coord))

6914 of 617840 (1.12%) keyword streamed Tweets include coordinates


In [72]:
# Client names treated as first-party (official Twitter) sources.
# NOTE(review): the place-stream source counts further down also list
# 'Twitter for  iPhone' (two spaces), which does not match any entry here -
# confirm whether it should be counted as first-party.
fp = ['Twitter for iPhone',
 'Twitter Web Client',
 'Twitter for Android',
 'TweetDeck',
 'Twitter for iPad',
 'Twitter Lite',
 'Twitter for Windows',
 'Twitter for Mac',
 'Mobile Web (M2)',
 'iOS',
 'Twitter for Windows Phone',
 'Twitter for Android Tablets',
 'Twitter for BlackBerry']


def _print_first_party_share(sources, label):
    """Print how many entries of `sources` were published by a first-party client.

    sources: Series of client names (missing values count as not first-party).
    label:   noun phrase inserted into the printed line, e.g. 'keyword Tweets'.
    """
    is_first_party = sources.isin(fp)
    hits = int(is_first_party.sum())
    total = len(is_first_party)
    # Guard against an empty selection rather than dividing by zero.
    pct = hits / total * 100 if total else float('nan')
    print('{} of {} ({:.2f}%) {} published by first-party clients'.format(hits, total, pct, label))


# Stream selections by data_source code: 1-2 keyword, 3 coordinate, 4 place.
non_place = (df.data_source < 4) & (df.data_source > 0)
keyword = (df.data_source < 3) & (df.data_source > 0)

_print_first_party_share(df['source'], 'Tweets')
_print_first_party_share(df.loc[non_place, 'source'], 'non-Place Tweets')

print('\n')

_print_first_party_share(df.loc[keyword, 'source'], 'keyword Tweets')
_print_first_party_share(df.loc[df.data_source == 3, 'source'], 'coordinate Tweets')
_print_first_party_share(df.loc[df.data_source == 4, 'source'], 'place Tweets')

1911319 of 2257874 (84.65%) Tweets published by first-party clients
372158 of 712557 (52.23%) non-Place Tweets published by first-party clients


371248 of 617840 (60.09%) keyword Tweets published by first-party clients
910 of 94717 (0.96%) coordinate Tweets published by first-party clients
1539161 of 1545317 (99.60%) place Tweets published by first-party clients


In [70]:
# TODO: this should include all Tweets (i.e. also data_source <= 0)

# Instagram's contribution to the coordinate stream, and how often
# Instagram-sourced Tweets carry coordinates.
is_instagram = df.source == 'Instagram'
has_coords = df.coordinates_lat.notnull()
coord_stream = df.data_source == 3
# Keyword streams are data_source 1-2; the lower bound matches the keyword
# selection used in the other cells of this notebook.
keyword = (df.data_source < 3) & (df.data_source > 0)

for flags, message in [
    (is_instagram[coord_stream],
     '{} of {} ({:.2f}%) coordinate Tweets came from Instagram'),
    (has_coords[is_instagram],
     '{} of {} ({:.2f}%) total Instagram-sourced Tweets include coordinates'),
    (has_coords[is_instagram & keyword],
     '{} of {} ({:.2f}%) Instagram-sourced Tweets from keyword streams include coordinates'),
]:
    hits = int(flags.sum())
    total = len(flags)
    # Guard against an empty selection rather than dividing by zero.
    pct = hits / total * 100 if total else float('nan')
    print(message.format(hits, total, pct))
    print()

63036 of 94717 (66.55%) coordinate Tweets came from Instagram
67816 of 77999 (86.94%) total Instagram-sourced Tweets include coordinates
4780 of 14963 (31.95%) Instagram-sourced Tweets from keyword streams include coordinates


In [101]:
# Twenty most common clients in the geo (coordinate) stream, as a percentage
# of the Tweets in that stream that have a source.
s1 = df.loc[df.data_source == 3, 'source']
n_with_source = s1.count()
print('Total:', n_with_source)
s1.value_counts().div(n_with_source).mul(100).head(20)

Total: 94716


Instagram                   66.552642
WxTweeter                    6.802441
Foursquare                   4.854512
TweetMyJOBS                  3.866295
Untappd                      2.759829
SafeTweet by TweetMyJOBS     2.446260
WxPic                        1.637527
TTN RDU traffic              1.354576
TownTweet                    1.179315
iembot                       1.093796
Cities                       0.839351
circlepix                    0.831961
BubbleLife                   0.635584
Twitter for Android          0.533173
WxUvCast                     0.502555
Tweetbot for iΟS             0.448710
NotableBirds                 0.426538
TTN CHS traffic              0.305123
Twitter for iPhone           0.291397
Squarespace                  0.243887
Name: source, dtype: float64

In [102]:
# Twenty most common clients in the keyword streams (data_source 1-2), as a
# percentage of the Tweets in those streams that have a source.
in_keyword_streams = (df.data_source < 3) & (df.data_source > 0)
s1 = df.loc[in_keyword_streams, 'source']
n_with_source = s1.count()
print('Total:', n_with_source)
s1.value_counts().div(n_with_source).mul(100).head(20)

Total: 617840


Twitter for iPhone           20.918199
Twitter Web Client           20.564547
Twitter for Android          11.944355
IFTTT                         6.570957
Facebook                      5.067655
TweetDeck                     2.977470
dlvr.it                       2.668814
Instagram                     2.421824
Hootsuite Inc.                1.847566
Twitter Lite                  1.798848
Twitter for iPad              1.750291
WordPress.com                 1.698336
Google                        1.328985
Buffer                        1.297261
Global Citizen Mobile App     0.976790
SocialNewsDesk                0.738217
Tweet Old Post                0.659556
Backfire                      0.618607
SocialOomph                   0.612294
Hootsuite                     0.515344
Name: source, dtype: float64

In [103]:
# Twenty most common clients in the place stream (data_source 4), as a
# percentage of the Tweets in that stream that have a source.
s1 = df.loc[df.data_source == 4, 'source']
n_with_source = s1.count()
print('Total:', n_with_source)
s1.value_counts().div(n_with_source).mul(100).head(20)

Total: 1545317


Twitter for iPhone        79.550086
Twitter for Android       14.342559
Twitter Web Client         4.305395
Twitter for iPad           1.403596
Tweetbot for iΟS           0.349378
Echofon                    0.020319
Tweetbot for Mac           0.011130
Trendsmap Alerting         0.010677
Twitter for  iPhone        0.002071
SocialGest                 0.001165
weatherarena               0.001165
SoundHound                 0.000841
Sprout Social              0.000841
GivePulseEvents            0.000518
Twidere for Android #7     0.000129
Oktopost                   0.000065
Slixa                      0.000065
Name: source, dtype: float64