## Introduction:

This notebook is the first step in our processing pipeline. It takes raw JSON files (~150mb in size), picks
out hand selected fields, and saves them to smaller CSV files.

### Note: You might have to change base directory, to get this to run. 

Script assumes that data folder is ../.. levels up.

In [9]:
import ray
import json
import glob
import pandas as pd

baseDir = "../../data/"

### Support Code:

In [7]:
@ray.remote
def getframes(argTuple):   
    index = argTuple[0]    
    fileName = argTuple[1]

    #Lets read in the tweets, and convert them to a DataFrame
    topLevelFields = ["id","created_at", "text","quote_count","reply_count","retweet_count",
                      "favorite_count","user_id","user_screen_name"]

    userFields = ["id", "created_at", "name", "screen_name", "verified", "followers_count", 
                  "friends_count", "favourites_count", "statuses_count", 
                 "profile_background_color","profile_text_color"]

    userCheckDict = {"test": 1}

    #make an empty DF.
    tweetDF = pd.DataFrame(columns=topLevelFields + ["hashtag_list"])
    userDF = pd.DataFrame(columns=userFields)  #list(map(lambda x: "user_" + x , userFields)))

    def parsetweet(jsonObj):
        retList = []
        for label in topLevelFields[:len(topLevelFields) - 2]: 
            retList.append(jsonObj[label])

        hashTagString = ""
        if jsonObj["entities"]["hashtags"]:
            for item in jsonObj["entities"]["hashtags"]:
                hashTagString = hashTagString + "," + item["text"]
            hashTagString = hashTagString[1:]
            
        #Finally, we need to add foreign keys to the user table.
        retList.append(jsonObj["user"]["id"])
        retList.append(jsonObj["user"]["screen_name"])
        
        retList.append(hashTagString)
        return retList       

    def parseuser(jsonObj):
        #first, extract id from jsonObj for user. Is it in our table of users?
        #To check (quickly) for a users presence, we need a hash table.
        #In python, dictionaries serve the same purpose
        retList = []
        if jsonObj["user"]["id"] not in userCheckDict:
            userCheckDict[jsonObj["user"]["id"]] = 1
            for label in userFields:
                retList.append(jsonObj["user"][label])

        return retList
        
    with open(fileName, "r") as tweets_file:
        for i,line in enumerate(tweets_file):
            try:
                tweet = json.loads(line)
                tweetDF.loc[tweetDF.shape[0]] = parsetweet(tweet)
                retList = parseuser(tweet)
                if retList: #Not empty: 
                    userDF.loc[userDF.shape[0]] = retList
            except ValueError as e:
                print('Handling run-time error:', e)            
        tweets_file.close()
    tweetDF.to_csv(baseDir + "1.stage1/tweets/" + "tweets" + str(index) + ".csv")
    userDF.to_csv(baseDir + "1.stage1/users/" + "users" + str(index) + ".csv")
    return 

### Main Code:

Warning: **Do not** use all 16 cores with Ray; you need 1 or more cores to run your GUI and terminals. I have set it to 12 to be safe.

In [12]:
#Only call once! 

ray.init(num_cpus=12) #One File per CPU. Don't exceed 12 (avoid console lockup).
startDir = "../../data/1.stage0/"
enumList = list(enumerate(glob.glob(startDir+"*")))

idLists = [getframes.remote(name) for name in enumList]

print(ray.get(idLists))

ray.shutdown() #Always shutdown; you will have old processes lying around if you dont.



2019-10-23 21:05:34,707	INFO resource_spec.py:205 -- Starting Ray with 7.42 GiB memory available for workers and up to 3.73 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


[2m[36m(pid=17634)[0m Handling run-time error: Expecting value: line 1 column 1 (char 0)
[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]


In [11]:
#Use if script crashes. Ray doesn't clean up after itself.
ray.shutdown()

### END