In [8]:
import glob
import json
import os
import time

import pandas as pd

In [2]:
#Support Functions:
#List[JSON Object] -> String[URL]
#Purpose: To inspect tweets on Twitter, use this function to pass JSON objects and generate
#URLs to examine in the browser.
def tweeturlgen(jsonlist):
    """Build the twitter.com status URL for every tweet object in ``jsonlist``.

    Each object must provide ``obj['user']['screen_name']`` and ``obj['id_str']``.
    The URLs are returned as a list, in the same order as the input.
    """
    prefix = "https://twitter.com/"
    return [
        prefix + tweet['user']['screen_name'] + "/status/" + tweet['id_str']
        for tweet in jsonlist
    ]


### Start:

First, let's look at what JSON tweet objects look like. A sample of 5 tweet objects is below:

In [7]:
#Read the sample file, parsing each line as one JSON tweet object.
tweets_data = []
with open("./data/fiveline.json", "r") as tweets_file:
    for line in tweets_file:
        try:
            tweets_data.append(json.loads(line))
        except json.JSONDecodeError:
            #Skip lines that are not valid JSON (a blank line is one such case).
            #Anything else, e.g. KeyboardInterrupt, is allowed to propagate.
            continue

tweets_data

[{'created_at': 'Fri Sep 27 19:03:37 +0000 2019',
  'id': 1177660079400800258,
  'id_str': '1177660079400800258',
  'text': 'RT @tylerwhat16: Y’all better be going out like this to vote on Oct. 21! #ClimateChange #ClimateStrikeCanada #climatestriketoronto #ONPoli…',
  'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>',
  'truncated': False,
  'in_reply_to_status_id': None,
  'in_reply_to_status_id_str': None,
  'in_reply_to_user_id': None,
  'in_reply_to_user_id_str': None,
  'in_reply_to_screen_name': None,
  'user': {'id': 39512124,
   'id_str': '39512124',
   'name': 'Under the Radar',
   'screen_name': 'retail_recruit',
   'location': 'I mute with reckless abandon. #INTJ',
   'url': None,
   'description': "Small L liberal. 🇨🇦 Int'l Politics/ History pro. Still a fashionista in cosmopolitan #Montréal-snark, progressive, festivals, culture, art, fashion #Philotimo",
   'translator_type': 'regular',
   'protected': False,
   'verified': False,
   'fol

### Let's look at a single tweet and see which fields we want for our DataFrame:

Use json.dumps function, with an indent level to easily view the data.

In [15]:
#Pretty-print the tweet at index 2 with a four-space indent, keeping the original key order.
print(json.dumps(tweets_data[2], sort_keys=False, indent=4))
#created_at, id, text, user (id, name, screen_name), followers_count, friends_count, favourites_count, profile_background_color
#profile_text_color, 

{
    "created_at": "Fri Sep 27 19:03:37 +0000 2019",
    "id": 1177660081388871681,
    "id_str": "1177660081388871681",
    "text": "RT @TheBeaverton: Trudeau comes to Montreal climate strike to protest self #cdnpoli #elxn43 #ClimateStrikeCanada",
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 2393055440,
        "id_str": "2393055440",
        "name": "Darren Cargill \ud83c\udde8\ud83c\udde6",
        "screen_name": "reasonablewlvrn",
        "location": "Windsor, ON",
        "url": null,
        "description": "Father, Husband, PallMed MD in Windsor, ON. Going \ud83d\udc1d  RT = chat. Spelling \ud83d\udeab part of plan. speak GIPHY & \ud83d\ude45\ud83c\udffc\u200d trolls. Not a Bot #TrudeauMust

All of the fields (that I have deemed relevant) are listed below. An example of how to load each field is given;
some of them are nested in sub-JSON objects (like the user object).

In [35]:
#Use the tweet at index 2 of the sample as the worked example.
tweetObj = tweets_data[2]

#Now, lets see how we can access all of our fields of interest:

#created_at
print(tweetObj["created_at"])
#id
print(tweetObj["id"])

#text
print(tweetObj["text"])

#user (the whole nested object; left disabled)
#print(tweetObj["user"])

#user->id
print(tweetObj["user"]["id"])
#user->screen_name
#keep this for a basic user reference
print(tweetObj["user"]["screen_name"])


#user->name
#print(tweetObj["user"]["name"])
#I choose not to store the follower/friends data for a user; these can be looked up separately 
#causes largely-redundant columns.

#entities->hashtags [tag1, tag2, tag3...]
#each hashtag entry is a mini-dictionary; its "text" key holds the tag itself
hashTagString = ""
if tweetObj["entities"]["hashtags"]:
    #join puts commas only between tags, so there is no leading comma to strip
    hashTagString = ",".join(item["text"] for item in tweetObj["entities"]["hashtags"])
    print(hashTagString)

#quote count
print(tweetObj["quote_count"])

#reply count
print(tweetObj["reply_count"])

#retweet count
print(tweetObj["retweet_count"])

#favorite count
print(tweetObj["favorite_count"])





Fri Sep 27 19:03:37 +0000 2019
1177660081388871681
RT @TheBeaverton: Trudeau comes to Montreal climate strike to protest self #cdnpoli #elxn43 #ClimateStrikeCanada
2393055440
reasonablewlvrn
cdnpoli,elxn43,ClimateStrikeCanada
0
0
0
0


### Experiment 1: Make a Dataframe from 3 large files:

Here, 3 full JSON files are loaded. A selection of user and top-level fields is put in the DataFrame. Its size
and compression are measured.

In [7]:
#Set up the field lists and the empty DataFrame that the tweets are read into.
#Keys taken from the top level of each tweet object.
topLevelFields = ["created_at", "id", "text"]

#Keys taken from the nested "user" object of each tweet.
userFields = [
    "id", "name", "screen_name", "followers_count",
    "friends_count", "favourites_count",
    "profile_background_color", "profile_text_color",
]

#make an empty DF: top-level columns first, then the user columns prefixed with "user_".
tweetDF = pd.DataFrame(columns=topLevelFields + ["user_" + field for field in userFields])

#NOTE(review): absolute, machine-specific path - confirm it exists on the machine running this.
baseDir = "/home/sean/ca_election_tweets/data/"
commList = ["comments1.json","comments2.json","comments3.json"]

def parsetweet(jsonObj):
    """Flatten one tweet object into a single row of values.

    The row holds the ``topLevelFields`` values read from ``jsonObj`` followed by
    the ``userFields`` values read from ``jsonObj["user"]``, each in list order.
    A missing key raises ``KeyError``.
    """
    topValues = [jsonObj[key] for key in topLevelFields]
    userValues = [jsonObj["user"][key] for key in userFields]
    return topValues + userValues

#build a beast...
#Rows are collected in a plain list and the DataFrame is built once at the end;
#adding one .loc row per tweet gets slower as the frame grows.
tweetRows = []
with open(baseDir + "comments1.json", "r") as tweets_file:
    #the with-block closes the file on exit, so no explicit close() is needed
    for line in tweets_file:
        try:
            tweetRows.append(parsetweet(json.loads(line)))
        except ValueError as e:
            #json.loads reports a line that is not valid JSON with a ValueError subclass
            print('Handling run-time error:', e)

#dtype=object leaves every column untyped so that the memory measurement taken
#before the explicit casts still reflects un-optimized columns.
tweetDF = pd.DataFrame(tweetRows, columns=tweetDF.columns, dtype=object)


FileNotFoundError: [Errno 2] No such file or directory: '/home/sean/ca_election_tweets/data/comments1.json'

In [75]:
#Total bytes used by tweetDF before casting (index included, deep measurement)
tweetDF.memory_usage(deep=True, index=True).sum()

17457200

In [76]:
#Now lets convert our datatypes accordingly
#Tweet and user ids do not fit in 32 bits (e.g. 1177660081388871681), so they get an
#explicit int64 instead of "int", whose width depends on the platform.
#The count columns are cast to int32.
tweetDF = tweetDF.astype({
    "id": "int64",
    "user_id": "int64",
    "user_followers_count": "int32",
    "user_friends_count": "int32",
    "user_favourites_count": "int32",
})


In [77]:
#Memory usage after casting (index included, deep measurement)
tweetDF.memory_usage(deep=True, index=True).sum()

14210628

In [73]:
#Preview the first five rows of the tweet table.
tweetDF.head(n=5)

Unnamed: 0,created_at,id,text,user_id,user_name,user_screen_name,user_followers_count,user_friends_count,user_favourites_count,user_profile_background_color,user_profile_text_color
0,Fri Sep 27 19:03:37 +0000 2019,,RT @CanadianGreens: This is a moment in histor...,172399876,cynthia w Nelson,cindian1,473,2045,10539,000000,0
1,Fri Sep 27 19:03:37 +0000 2019,,RT @tylerwhat16: Y’all better be going out lik...,39512124,Under the Radar,retail_recruit,3389,4497,179789,000000,0
2,Fri Sep 27 19:03:37 +0000 2019,,RT @AndreeLyne_H: Justin Trudeau is promising ...,16272844,Greg Johansen 🏳️‍🌈 🐓🍷,johangreg,3198,1179,220678,C0DEED,333333
3,Fri Sep 27 19:03:37 +0000 2019,,RT @TheBeaverton: Trudeau comes to Montreal cl...,2393055440,Darren Cargill 🇨🇦,reasonablewlvrn,2578,1910,39860,C0DEED,333333
4,Fri Sep 27 19:03:39 +0000 2019,,RT @ewbcalgary: It may have rained and snowed ...,1171638022787543040,debwong,deborahwongsy,16,83,59,F5F8FA,333333


In [20]:
#Keys read from the nested "user" object of a tweet.
userFields = [
    "id",
    "name",
    "screen_name",
    "followers_count",
    "friends_count",
    "favourites_count",
    "profile_background_color",
    "profile_text_color",
]



['user_id',
 'user_name',
 'user_screen_name',
 'user_followers_count',
 'user_friends_count',
 'user_favourites_count',
 'user_profile_background_color',
 'user_profile_text_color']

In [38]:
#Appending rows with loc:
#Nice Trick by user Qinsi on page: https://stackoverflow.com/questions/26309962/appending-a-list-or-series-to-a-pandas-dataframe-as-a-row
#Assigning to an index label that is not present yet adds a new row under that label.
myDF = pd.DataFrame({name: [1, 2, 3] for name in ("A", "B")})
myDF.loc[4] = [4] * 2

### Total Memory Estimate (in RAM):

One 150 MB file, with a subset of columns loaded and data types optimized, turned out to be ~15 MB.
Most fields in our tweet JSON object are not useful. Suppose that our maximum DF size per file is about 50 MB.
We will not exceed about 500 tweet files (20000 tweets per file).

This gives us 500 * 50 = 25000 MB ~ 25 GB. The entire tweet DataFrame could technically be loaded into memory.


### Experimental Code 1: 

Let's load 5 files, and make two tables (for users and tweets).



In [30]:
#Simple Timeit usage https://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python
#Pattern: read time.time() before and after the work; the difference is the elapsed seconds.
#NOTE(review): both readings are taken back to back here with nothing in between, so
#timeEnd - timeStart is close to zero unless the measured code is placed between them.

timeStart = time.time()
timeEnd = time.time()

 





def get3frames(baseDir="./data/", commList=None):
    """Load tweet files into a tweet table and a de-duplicated user table.

    Every line of every file is parsed as one JSON tweet object. Lines that are
    not valid JSON are reported with a printed message and skipped.

    Parameters
    ----------
    baseDir : str
        Directory that contains the tweet files.
    commList : list of str, optional
        File names inside ``baseDir``. Defaults to ``comments1.json``,
        ``comments2.json`` and ``comments3.json``.

    Returns
    -------
    tuple
        ``(tweetDF, userDF, sameUserCount)``: one row per tweet, one row per
        distinct user id, and the number of tweets whose user id had already
        been seen.

    Raises
    ------
    FileNotFoundError
        If a listed file does not exist.
    KeyError
        If a parsed object lacks one of the expected fields.
    """
    #Fields copied from the top level of each tweet.
    #NOTE(review): an earlier comment claimed the last field "doesn't exist" - confirm
    #that favorite_count is present in every tweet object.
    topLevelFields = ["id", "created_at", "text","quote_count","reply_count","retweet_count","favorite_count"]

    #Fields copied from the nested "user" object.
    userFields = ["id", "created_at", "name", "screen_name", "verified", "followers_count", 
                  "friends_count", "favourites_count", "statuses_count", 
                 "profile_background_color","profile_text_color"]

    if commList is None:
        commList = ["comments1.json","comments2.json","comments3.json"]

    def parsetweet(jsonObj):
        """Return the tweet row: the top-level fields plus a comma-joined hashtag string."""
        row = [jsonObj[label] for label in topLevelFields]
        hashtags = jsonObj["entities"]["hashtags"] or []
        row.append(",".join(item["text"] for item in hashtags))
        return row

    def parseuser(jsonObj):
        """Return the user row, or an empty list when this user id was already seen."""
        nonlocal sameUserCount  #the counter lives in the enclosing function
        userId = jsonObj["user"]["id"]
        if userId in seenUserIds:
            sameUserCount += 1
            return []
        #record the user's own id (not the builtin ``id``) so later tweets are recognised
        seenUserIds.add(userId)
        return [jsonObj["user"][label] for label in userFields]

    seenUserIds = set()   #a set gives fast membership tests for user ids
    sameUserCount = 0
    tweetRows = []
    userRows = []

    for filename in commList:
        #the with-block closes the file on exit, so no explicit close() is needed
        with open(os.path.join(baseDir, filename), "r") as tweets_file:
            for line in tweets_file:
                try:
                    tweet = json.loads(line)
                except ValueError as e:
                    print('Handling run-time error:', e)
                    continue
                tweetRows.append(parsetweet(tweet))
                userRow = parseuser(tweet)
                if userRow: #Not empty: first tweet seen for this user
                    userRows.append(userRow)

    #Build each frame once from the collected rows instead of growing it row by row.
    #dtype=object keeps the columns untyped; casting is left to the caller.
    tweetDF = pd.DataFrame(tweetRows, columns=topLevelFields + ["hashtag_list"], dtype=object)
    userDF = pd.DataFrame(userRows, columns=userFields, dtype=object)
    return tweetDF, userDF, sameUserCount

In [28]:
        
#build a beast...
#NOTE(review): this cell expects baseDir, commList, tweetDF, userDF, parsetweet and
#parseuser to exist in the kernel namespace already - confirm before running.
#commList is expected to hold file names relative to baseDir.

timeStart = time.time()
for filename in commList:
    #the with-block closes the file on exit, so no explicit close() is needed
    with open(baseDir + filename, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
                tweetDF.loc[tweetDF.shape[0]] = parsetweet(tweet)
                retList = parseuser(tweet)
                if retList: #Not empty: 
                    userDF.loc[userDF.shape[0]] = retList
            except ValueError as e:
                print('Handling run-time error:', e)
#take the end reading after the work so the duration printed below is meaningful
timeEnd = time.time()


tweetDF.sample(n=10)
tweetDF.shape
print("tweetDF memory:" + str(tweetDF.memory_usage(index=True,deep=True).sum()))
userDF.sample(n=10)
userDF.shape
print("UserDF memory:" + str(userDF.memory_usage(index=True,deep=True).sum()))
print("Time to Execute 3 Files in Total: " + str(timeEnd - timeStart))

userDF.to_csv("./data/userDF.csv")
tweetDF.to_csv("./data/tweetDF.csv")

In [35]:
### Experiment 2: Let's do all 20 DFs:

timeStart = time.time()
#relies on functions in Experiment 1.
#NOTE(review): parsetweet, parseuser, tweetDF, userDF and sameUserCount must already
#exist in the kernel namespace - confirm before running.
#get comment list

baseDir = "./data/tweets/"
#glob returns matches in arbitrary order; sorting makes every run read the files in the same order
commList = sorted(glob.glob(baseDir+"*"))

#build a beast...
for filename in commList: #Note: Glob list gives path from relative root directory!
    #the with-block closes the file on exit, so no explicit close() is needed
    with open(filename, "r") as tweets_file:
        for line in tweets_file:
            try:
                tweet = json.loads(line)
                tweetDF.loc[tweetDF.shape[0]] = parsetweet(tweet)
                retList = parseuser(tweet)
                if retList: #Not empty: 
                    userDF.loc[userDF.shape[0]] = retList
            except ValueError as e:
                print('Handling run-time error:', e)



timeEnd = time.time()
tweetDF.sample(n=5)
tweetDF.shape
print("tweetDF memory:" + str(tweetDF.memory_usage(index=True,deep=True).sum()))
userDF.sample(n=5)
userDF.shape
print("UserDF memory:" + str(userDF.memory_usage(index=True,deep=True).sum()))
print("Number of duplicate users detected:" + str(sameUserCount))
print("Time to Execute 20 Files in Total: " + str(timeEnd - timeStart))

userDF.to_csv("./data/userDF.csv")
tweetDF.to_csv("./data/tweetDF.csv")

Handling run-time error: Expecting value: line 1 column 1 (char 0)


KeyboardInterrupt: 

In [1]:
#Collect the path of every entry in the tweets directory.
baseDir = "./data/tweets/"
#glob returns matches in arbitrary order; sorting makes the list reproducible between runs
commList = sorted(glob.glob(baseDir+"*"))

NameError: name 'glob' is not defined