*Analyze Bulk Tweets*

Raw data from: https://archive.org/details/archiveteam-twitter-stream-2021-01
https://archive.org/details/twitterstream 


In [2]:
import pandas as pd 
import csv
import json
import os
import bz2

In [3]:
#Read in all the tweets from a file
def read_tweets_from_bzfile(filename):

    count = 0 
    tweets = []

    with bz2.open(filename, "rb") as data_file:
        for line in data_file:
            try: 
                tweet = json.loads(line)
                #print(tweet['text'])
                tweets.append(tweet)
                count = count + 1
            except:
                pass
    print("file read: {}".format(filename))
    print("total tweets read in file: {}".format(count))

    return tweets

In [4]:
### test the read_tweets_from_bzfile function ###
#tweets = []
#read_tweets_from_bzfile("00.json.bz2", tweets)

In [5]:
rootdir = ".\\2020\\12\\01"  # December 1 of 2020

total_files_read = 0
tweets_df = pd.DataFrame()
tweets = []

for directory, subdirectory, filenames in  os.walk(rootdir):
    print(directory)
    
    for filename in filenames:
        full_path_filename = os.path.join(directory, filename)

        # call the read tweets function and keep track of counters
        tweets.extend( read_tweets_from_bzfile(full_path_filename) )
        total_files_read = total_files_read + 1

        print("total files read so far: {}".format(total_files_read))
        print("results so far: {} tweets".format(len(tweets)))

# store it in a dataframe as infrequently as
# possible to be memory efficient
tweets_df = tweets_df.append(tweets)

print("done. size of tweets array: {}".format(len(tweets_df)))

.\2020
.\2020\12
.\2020\12\01
.\2020\12\01\00
file read: .\2020\12\01\00\29.json.bz2
total tweets read in file: 2407
total files read so far: 1
results so far: 2407 tweets
file read: .\2020\12\01\00\30.json.bz2
total tweets read in file: 2390
total files read so far: 2
results so far: 4797 tweets
file read: .\2020\12\01\00\31.json.bz2
total tweets read in file: 2479
total files read so far: 3
results so far: 7276 tweets
file read: .\2020\12\01\00\32.json.bz2
total tweets read in file: 2540
total files read so far: 4
results so far: 9816 tweets
file read: .\2020\12\01\00\33.json.bz2
total tweets read in file: 2736
total files read so far: 5
results so far: 12552 tweets
file read: .\2020\12\01\00\34.json.bz2
total tweets read in file: 2484
total files read so far: 6
results so far: 15036 tweets
file read: .\2020\12\01\00\35.json.bz2
total tweets read in file: 2522
total files read so far: 7
results so far: 17558 tweets
file read: .\2020\12\01\00\36.json.bz2
total tweets read in file: 245

In [6]:
# number of tweets and min max date

import datetime

print(tweets_df.columns)
print("total tweets: {}".format(len(tweets_df)))
tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])
print("date time range: {} to {}".format(tweets_df['created_at'].min(),tweets_df['created_at'].max()))


Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'retweeted_status', 'is_quote_status', 'quote_count',
       'reply_count', 'retweet_count', 'favorite_count', 'entities',
       'extended_entities', 'favorited', 'retweeted', 'possibly_sensitive',
       'filter_level', 'lang', 'timestamp_ms', 'display_text_range',
       'quoted_status_id', 'quoted_status_id_str', 'quoted_status',
       'quoted_status_permalink', 'extended_tweet', 'delete',
       'withheld_in_countries'],
      dtype='object')
total tweets: 289399
date time range: 2020-12-01 07:29:00+00:00 to 2020-12-01 08:59:59+00:00


In [7]:
# tweets with places listed

filtered_df = tweets_df[tweets_df['place'].notnull()]
print(filtered_df['place'].head())
print("size of filtered df: {}".format(len(filtered_df)))
print("percentage of tweets with identified places: {:.2f}%".format(100* len(filtered_df) / len(tweets_df) ) )
#tweets_df.to_csv("temp.csv")
#filtered_df.to_csv("temp.csv")

122    {'id': '7d04c46babac50ea', 'url': 'https://api...
208    {'id': '1fff0d34abca6d24', 'url': 'https://api...
518    {'id': 'f227fd9a20f20e9d', 'url': 'https://api...
568    {'id': '315b740b108481f6', 'url': 'https://api...
670    {'id': '4313fafe1157eaaf', 'url': 'https://api...
Name: place, dtype: object
size of filtered df: 1401
percentage of tweets with identified places: 0.48%


In [26]:
# within twitter results, the 'place' field is a dict with subdata like city and country and country code
places_df = filtered_df['place']
lines = []
for row in places_df:
    line = {}
    line = row
    lines.append(line)
    print(line['country_code'])
print("total country code mentions: {}".format(len(lines)))

GB
RU
US
GB
MY
ID
BR
FR
CA
PH
JP
GB
JP
IN
DE
ZA
SN
JP
UA
PE
US
GB
EG
IN
ES
CA
US
GB
NG
PA
NG
US
MY
US
PL
US
US
FR
ZA
MY
TH
IT
IT
RU
JP
AR
NO
LK
SA
JP
NL
JP
AE
BH
GB
PH
NL
MX
IN
SG
IN
LB
GN
ES
US
KE
MX
BR
IN
ID
US
LB
US
ES
US
US
NG
PH
TR
ES
IN
KW
ID
AZ
TR
FR
ZA
TR
ID
JP
MX
DE
MY
NG
NG
JP
US
JP
TR
PK
JP
MY
ID
CA
IN
JP
IN
IN
US
US
ID
GB
KE
JP
IT
IN
QA
JP
NG
SA
IT
IN
ZA
ID
TH
ZA
VN
US
GB
JP
SA
US
US
AU
GB
US
MZ
US
ES
US
MW
DE
US
PT
ID
GB
PH
ZA
MX
US
US
GB
AU
US
IN
JP
SA
ES
US
AU
CN
US
TR
US
IT
ZA
JP
NG
NG
GB
IN
US
SA
AU
IN
GR
US
JP
ES
AR
TH
US
US
BR
PH
JP
ID
UG
BD
ES
JP
IN
BR
ES
TR
CO
IN
US
ES
JP
US
CA
IN
US
DE
JP
CH
ES
IN
BR
US
ZA
US
IN
US
TR
PL
IN
GB
GB
PT
MY
SA
BR
US
US
NG
RU
ID
MX
AE
AR
FR
US
BH
JP
JP
PH
IN
ZA
JP
JP
IN
TR
US
GH
PH
ES
FR
MW
GB
US
US
ID
GH
DE
ES
GB
NL
BR
US
IN
NL
DE
TR
HU
US
US
BW
JP
GB
GB
JP
US
PE
BG
JP
GB
IN
SA
SE
BR
ES
ZA
NL
PH
DE
PL
MY
IT
NL
SA
SA
BR
EG
TH
JP
GB
IN
NG
US
US
US
CY
PK
AU
ZM
EG
GB
TH
GB
US
CM
US
GB
BR
RS
FR
GB
GB
KW
US
NG
HU
PH
US
ZA
US
NL
US
ZA
PH
GB
J

In [33]:
countries_df = pd.DataFrame(lines)
print(countries_df['country_code'].value_counts())



US    204
JP    159
GB    109
IN     92
ES     66
     ... 
UY      1
DZ      1
MV      1
BS      1
        1
Name: country_code, Length: 92, dtype: int64


In [12]:
filtered_coords_df = tweets_df[tweets_df['coordinates'].notnull()]
print(filtered_coords_df['coordinates'].head())
print("size of filtered df: {}".format(len(filtered_df)))
print("percentage of tweets with identified coordinates: {:.2f}%".format(100* len(filtered_df) / len(tweets_df) ) )

1196    {'type': 'Point', 'coordinates': [7.096968, 43...
2712    {'type': 'Point', 'coordinates': [38.029633, 4...
2948    {'type': 'Point', 'coordinates': [-95.6969, 29...
3385    {'type': 'Point', 'coordinates': [-5.31220501,...
3701    {'type': 'Point', 'coordinates': [-113.5796029...
Name: coordinates, dtype: object
size of filtered df: 1401
percentage of tweets with identified coordinates: 0.48%


In [34]:
#let's look at coordinates now
coords_df = filtered_coords_df['coordinates']
coords_lines = []
for row in coords_df:
    line = {}
    line = row
    coords_lines.append(line)
    print(line['coordinates'])

print("total coordinates mentions: {}".format(len(coords_lines)))

[7.096968, 43.60839238]
[38.029633, 48.3071]
[-95.6969, 29.9689]
[-5.31220501, 35.88824033]
[-113.5796029, 53.581466]
[7.43908406, 9.11716183]
[139.71767451, 35.56056944]
[135.50320479, 34.66477851]
[16.86604296, 41.10984789]
[77.1116021, 28.44458168]
[-1.5053, 53.8066]
[-4.02189732, 40.75125588]
[139.71456548, 35.73148632]
[139.71767451, 35.56056944]
[-98.0156, 33.6353]
[77.14942932, 28.6925829]
[3.39583, 6.45306]
[47.1228, 42.8256]
[24.75, 42.15]
[-46.541345, -23.51717947]
[77.62265682, 13.00221611]
[20.45273155, 44.82038967]
[-2.73333, 55.5167]
[-2.950121, 53.430379]
[139.69674385, 35.656136]
[-43.30482915, -22.89767847]
[139.62013185, 35.33138079]
[9.08333333, 40.05]
[82.12403995, 26.76957172]
[-3.59144646, 40.49120601]
[1.05, 41.1333]
[136.87940251, 35.16759606]
[28.968588, 41.03756]
[28.9683514, 41.0371099]
[-71.50138889, 42.36916667]
[-2.1581245, 53.6172937]
[139.65429076, 35.85631824]
[-1.505924, 53.683298]
[4.03462124, 49.24427414]
[4.32121488, 51.34474532]
[1.71414, 41.22312]

In [60]:
lat = []
lon = []

for coord_string in coords_lines:
    #the first number is lon, the second is lat
    lon.append( coord_string['coordinates'][0] )
    lat.append( coord_string['coordinates'][1] )



In [56]:
#installation



Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [61]:
# try a folium map

# if you don't already have folium installed then run this once:
# !pip install folium 

# Create a world map to show distributions of users 
import folium
from folium.plugins import MarkerCluster
#empty map
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
#for each coordinate, create circlemarker of user percent
for i in range(len(lat)):
        radius=5
        folium.CircleMarker(location = [lat[i], lon[i]], radius=radius, fill =True).add_to(marker_cluster)
#show the map
world_map