In [1]:
import os
import json
import pandas as pd
from datetime import datetime
import re
import fds_utils as utils
import importlib
_ = importlib.reload(utils)

# Data Loading
This section loads the raw Twitter data, extracts the relevant columns, and formats them for further analysis.

In [2]:
# Load tweets from a JSON Lines file (one JSON document per line).
tweets_data = []
# Open via a context manager so the handle is closed even if reading fails,
# and pin the encoding so the result does not depend on the platform default.
with open("data/geotagged_tweets_20160812-0912.jsons", 'r', encoding="utf-8") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError as e:
            # Best-effort load: report the unparsable line and move on.
            print("Error: ", e)
        else:
            tweets_data.append(tweet)

In [3]:
# Extract the fields of interest and store them in a DataFrame with the
# columns date, text, lang, country, and city.
# NOTE(review): presumably utils.get_data returns one value per tweet for the
# given key (or nested key path when a list is passed) — confirm in fds_utils.
tweets = pd.DataFrame()
tweets['date'] = utils.get_data('created_at', tweets_data)
# Convert Twitter's timestamp format (e.g. "Fri Aug 12 10:04:00 +0000 2016")
# into a standard date string (YYYY-MM-DD). Parsing the whole column at once
# avoids a Python-level strptime/strftime call per row.
tweets['date'] = pd.to_datetime(
    tweets['date'], format='%a %b %d %H:%M:%S +0000 %Y'
).dt.strftime('%Y-%m-%d')
tweets['text'] = utils.get_data('text', tweets_data)
tweets['lang'] = utils.get_data('lang', tweets_data)
tweets['country'] = utils.get_data(['place', 'country'], tweets_data)

# "City" is used here as a collective term that also covers towns and villages.
tweets['city'] = utils.get_data(['place', 'name'], tweets_data)


In [4]:
# Sanity check: (number of rows, number of columns) of the tweets DataFrame.
tweets.shape

(657307, 5)

In [5]:
# Lift the column-width limit so long text values are shown in full
# instead of being truncated in the preview below.
pd.options.display.max_colwidth = None
# Preview the first rows of the frame.
tweets.head()

Unnamed: 0,date,text,lang,country,city
0,2016-08-12,@theblaze @realDonaldTrump https://t.co/TY9DlZ584c,und,United States,Frontenac
1,2016-08-12,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \n\n@realDonaldTrump \n#TrumpPence \n\nhttps://t.co/5GMNZq40V3,en,United States,Baton Rouge
2,2016-08-12,@theblaze @realDonaldTrump https://t.co/n050DBSpv0,und,United States,Frontenac
3,2016-08-12,@HillaryClinton he will do in one year all the things you should have done in eight,en,Australia,Melbourne
4,2016-08-12,"#CNN #newday clear #Trump deliberately throwing this race,in 2007 he knew that #ISIS and destabilization of Mideast started w/Iraq invasion",en,United States,Baltimore


In [6]:
# Persist the tweets DataFrame to disk as a pickle file.
# NOTE(review): the file name says "sample", but no sampling step is visible
# before this call — confirm whether the full frame is meant to be written.
tweets.to_pickle("data/tweets_sample.pkl")