# Extract Gnip Data

### Objective:
- Extract Tweets and Retweets from JSON files


#### General Imports

In [None]:
import pandas as pd
import json
import numpy as np
import datetime as dt
import rarfile
import gzip
import os
import tarfile
import re
import matplotlib as plt
from collections import Counter
import csv
import math
import seaborn as sns
sns.set(color_codes=True)
%matplotlib inline

#### How to extract Tweets and Retweets for Gnip Data?

Ideally, if you want to work with a pandas DataFrame, first collect everything in a dictionary of lists and then convert that dictionary to a DataFrame. This is much faster than adding one row at a time to a DataFrame.  
  
Below are the functions to extract some useful information of tweets and retweets **ONLY FOR DATA PULLED USING GNIP**.  
*Field names in GNIP and Public Twitter API are different. So, you can convert if you'd like and use the same functions.*

In [None]:
# Get Retweet Info
def updateRetweetDict(tweetExtract, tweets):
    """Append one row per pure retweet (Gnip ``verb == 'share'``) to ``tweetExtract``.

    Parameters
    ----------
    tweetExtract : dict of str -> list
        Column-oriented accumulator, mutated in place. It must provide a list
        for every column written here (the keys of ``row`` below).
    tweets : iterable of dict
        Gnip activity records. Records whose ``verb`` is not ``'share'`` are
        skipped.

    Raises
    ------
    KeyError
        If a retweet record lacks a required field, or ``tweetExtract`` lacks
        a required column.
    IndexError
        If an id field contains no digits to extract.

    Notes
    -----
    A row is built completely before anything is appended, so a malformed
    record raises without leaving the columns with different lengths.
    """
    for tweet in tweets:
        if tweet['verb'] != 'share':
            continue

        actor = tweet['actor']
        original = tweet['object']  # the activity that was retweeted

        # Actor location is optional; the string "null" marks its absence.
        if 'location' in actor:
            location = actor['location']['displayName']
            locationType = actor['location']['objectType']
        else:
            location = "null"
            locationType = "null"

        row = {
            'actorId': re.findall(r'\d+', actor['id'])[0],  # id number only
            'displayName': actor['displayName'],
            'screenName': actor['preferredUsername'],
            'statusesCount': actor['statusesCount'],
            'favoritesCount': actor['favoritesCount'],
            'friendsCount': actor['friendsCount'],
            'followersCount': actor['followersCount'],
            'listedCounts': actor['listedCount'],
            'actorLanguages': actor.get('languages', []),  # optional field
            'summary': actor.get('summary'),  # optional field, None if absent
            'createdAt': actor['postedTime'],
            'verified': actor['verified'],
            'location': location,
            'locationType': locationType,
            'verb': tweet['verb'],
            'tweetId': re.findall(r':\d+', tweet['id'])[0][1:],  # id number only
            'tweetFavCount': tweet['favoritesCount'],
            'generator': tweet['generator']['displayName'],
            'postedTime': tweet['postedTime'],
            'retweetCount': tweet['retweetCount'],
            'object.body': original['body'],
            'object.tweetId': re.findall(r':\d+', original['id'])[0][1:],  # id number only
            'object.actorId': re.findall(r'\d+', original['actor']['id'])[0],  # id number only
            'object.postedTime': original['postedTime'],
        }

        # Look up every target column first so that a missing column also
        # fails before any list has been modified.
        columns = [tweetExtract[name] for name in row]
        for column, value in zip(columns, row.values()):
            column.append(value)

The body of the original tweet is sometimes worth keeping alongside the retweet, because the original tweet may not belong to the timeline used to filter the data,  
even though we did get a retweet of it.

In [None]:
# Get Tweet Info
def updateTweetDict(tweetExtract, tweets):
    """Append one row per Gnip ``verb == 'post'`` record to ``tweetExtract``.

    Parameters
    ----------
    tweetExtract : dict of str -> list
        Column-oriented accumulator, mutated in place. It must provide a list
        for every column written here (the keys of ``row`` below).
    tweets : iterable of dict
        Gnip activity records. Records whose ``verb`` is not ``'post'`` are
        skipped.

    Raises
    ------
    KeyError
        If a post record lacks a required field, or ``tweetExtract`` lacks a
        required column.
    IndexError
        If an id field or reply link contains no digits to extract.

    Notes
    -----
    A row is built completely before anything is appended, so a malformed
    record raises without leaving the columns with different lengths.
    """
    for tweet in tweets:
        if tweet['verb'] != 'post':
            continue

        actor = tweet['actor']
        entities = tweet['twitter_entities']
        mentions = entities['user_mentions']

        # Actor location is optional; the string "null" marks its absence.
        if 'location' in actor:
            location = actor['location']['displayName']
            locationType = actor['location']['objectType']
        else:
            location = "null"
            locationType = "null"

        # 'inReplyTo' is only present when the post is a reply.
        if 'inReplyTo' in tweet:
            # Take the LAST "/digits" segment of the link: an earlier path
            # segment (a screen name that starts with a digit) would otherwise
            # be mistaken for the status id.
            inReplyTo = re.findall(r'/\d+', tweet['inReplyTo']['link'])[-1][1:]
        else:
            inReplyTo = "null"

        # 'twitter_quoted_status' is only present when the post quotes a tweet.
        if 'twitter_quoted_status' in tweet:
            quoted = tweet['twitter_quoted_status']
            quotedTweetId = re.findall(r':\d+', quoted['id'])[0][1:]  # id number only
            quotedTweetUserId = re.findall(r'\d+', quoted['actor']['id'])[0]  # id number only
        else:
            quotedTweetId = "null"
            quotedTweetUserId = "null"

        row = {
            'actorId': re.findall(r'\d+', actor['id'])[0],  # id number only
            'displayName': actor['displayName'],
            'screenName': actor['preferredUsername'],
            'statusesCount': actor['statusesCount'],
            'favoritesCount': actor['favoritesCount'],
            'friendsCount': actor['friendsCount'],
            'followersCount': actor['followersCount'],
            'listedCounts': actor['listedCount'],
            'actorLanguages': actor.get('languages', []),  # optional field
            'summary': actor.get('summary'),  # optional field, None if absent
            'createdAt': actor['postedTime'],
            'verified': actor['verified'],
            'location': location,
            'locationType': locationType,
            'body': tweet['body'],
            'verb': tweet['verb'],
            'tweetId': re.findall(r':\d+', tweet['id'])[0][1:],  # id number only
            'tweetFavCount': tweet['favoritesCount'],
            'generator': tweet['generator']['displayName'],
            'hashtags': [hashtag['text'] for hashtag in entities['hashtags']],
            'mentionIds': [mention['id_str'] for mention in mentions],
            'mentionScreenNames': [mention['screen_name'] for mention in mentions],
            'inReplyTo': inReplyTo,
            'quotedTweetId': quotedTweetId,
            'quotedTweetUserId': quotedTweetUserId,
            'tweetLanguage': tweet['twitter_lang'],
            'postedTime': tweet['postedTime'],
            'retweetCount': tweet['retweetCount'],
        }

        # Look up every target column first so that a missing column also
        # fails before any list has been modified.
        columns = [tweetExtract[name] for name in row]
        for column, value in zip(columns, row.values()):
            column.append(value)

#### Types of Activity on Twitter:
- Tweet: Write an original post
- Retweet: Similar to share on Facebook (shows echoing)
- Reply: Reply to a tweet
- Quote: Sharing with some changes to the original post
- Like: Liking the tweet
and others but not relevant right now.  
<br>

In GNIP, you can track tweet, retweet, reply and a quote. Using *verb* field we can check if post is a tweet or a retweet.  
- On GNIP, verb for tweet, reply and quote is **post** and verb for pure retweet is **share**.  
*A pure retweet is a tweet that has not been tampered with, i.e. the tweet is shared exactly as it is.*
<br><br>
- A reply is identified by filtering on the verb **post** and then checking whether the record has an *inReplyTo* field
- A quoted tweet is identified by filtering on the verb **post** and then checking whether the record has a *twitter_quoted_status* field, which contains the original tweet

In [None]:
# Retweet accumulator: one empty list per column that updateRetweetDict fills.
retweetsDict = {column: [] for column in (
    'actorId', 'displayName', 'screenName', 'statusesCount', 'favoritesCount',
    'friendsCount', 'followersCount', 'listedCounts', 'actorLanguages', 'summary',
    'createdAt', 'verified', 'location', 'locationType', 'verb', 'tweetId',
    'tweetFavCount', 'generator', 'postedTime', 'retweetCount',
    'object.body', 'object.tweetId', 'object.actorId', 'object.postedTime',
)}

# Tweet accumulator: one empty list per column that updateTweetDict fills.
tweetsDict = {column: [] for column in (
    'actorId', 'displayName', 'screenName', 'statusesCount', 'favoritesCount',
    'friendsCount', 'followersCount', 'listedCounts', 'actorLanguages', 'summary',
    'createdAt', 'verified', 'location', 'locationType', 'body', 'verb', 'tweetId',
    'tweetFavCount', 'generator', 'hashtags', 'mentionIds', 'mentionScreenNames',
    'inReplyTo', 'quotedTweetId', 'quotedTweetUserId', 'tweetLanguage',
    'postedTime', 'retweetCount',
)}



#### Some things to keep in mind:
How the data extracted from Gnip is laid out depends on how you pulled it. I generally prefer splitting the extract into multiple files, with maybe 500 posts in each file, and then storing them all in one rar archive.  
  
So, each JSON file has a maximum of 500 posts. This ensures that if one file gets corrupted, you don't end up losing a lot of data.  
  
Now, there are two different ways people prefer storing their JSON files. Compressing each JSON file and then compressing that folder or just compressing that entire folder.  
  
I will show how to deal with both types of folders but there's not much difference

In [None]:
#When data is compressed in .gz format
def extractDataGz(dataDict, function, path):
    """Parse every gzip-compressed JSON file in ``path`` into ``dataDict``.

    Parameters
    ----------
    dataDict : dict of str -> list
        Column-oriented accumulator handed to ``function``. It must have an
        ``'actorId'`` column, whose length is printed as the progress counter.
    function : callable
        Called as ``function(dataDict, records)`` with the ``'results'`` value
        of each parsed file, e.g. ``updateTweetDict`` or ``updateRetweetDict``.
    path : str
        Directory holding the ``.gz`` files. A trailing separator is optional.

    Returns
    -------
    list of str
        Names of the files that could not be read, decompressed or parsed as
        JSON. Those files are skipped.
    """
    import zlib  # local import: only needed to recognise corrupt deflate data

    wasteFiles = []  # names of corrupt or unparseable files
    for fileNo, fileName in enumerate(os.listdir(path), start=1):
        if fileNo % 100 == 0:  # printing progress
            print("file: ", fileNo)  # number of files visited so far
            print("data: ", len(dataDict['actorId']))  # rows collected so far

        try:
            # gzip.open is the only difference from extractData
            with gzip.open(os.path.join(path, fileName), 'rb') as f1:
                file_content = json.loads(f1.read())
        except (OSError, EOFError, zlib.error, ValueError):
            # OSError: unreadable file or not gzip data; EOFError: truncated
            # gzip stream; zlib.error: corrupt compressed data; ValueError:
            # content is not valid JSON (json.JSONDecodeError subclasses it).
            wasteFiles.append(fileName)
            continue

        # Kept outside the try block so that errors raised by the callback are
        # not mistaken for a corrupt file.
        function(dataDict, file_content['results'])

    return wasteFiles

In [None]:
#When data is JSON format
def extractData(dataDict, function, path):
    """Parse every uncompressed JSON file in ``path`` into ``dataDict``.

    Parameters
    ----------
    dataDict : dict of str -> list
        Column-oriented accumulator handed to ``function``. It must have an
        ``'actorId'`` column, whose length is printed as the progress counter.
    function : callable
        Called as ``function(dataDict, records)`` with the ``'results'`` value
        of each parsed file, e.g. ``updateTweetDict`` or ``updateRetweetDict``.
    path : str
        Directory holding the JSON files. A trailing separator is optional.

    Returns
    -------
    list of str
        Names of the entries that could not be read or parsed as JSON. Those
        entries are skipped.
    """
    wasteFiles = []  # names of corrupt or unparseable files
    for fileNo, fileName in enumerate(os.listdir(path), start=1):
        if fileNo % 100 == 0:  # printing progress
            print("file: ", fileNo)  # number of files visited so far
            print("data: ", len(dataDict['actorId']))  # rows collected so far

        try:
            with open(os.path.join(path, fileName), 'rb') as f1:
                file_content = json.loads(f1.read())
        except (OSError, ValueError):
            # OSError: unreadable entry (e.g. a directory); ValueError: content
            # is not valid JSON (json.JSONDecodeError subclasses it).
            wasteFiles.append(fileName)
            continue

        # Kept outside the try block so that errors raised by the callback are
        # not mistaken for a corrupt file.
        function(dataDict, file_content['results'])

    return wasteFiles

#### Converting to dataframe

In [None]:
# Turn the column dictionary into a DataFrame (pass whichever dictionary you want to work with).
tweetData = pd.DataFrame(data=tweetsDict)

#### Making sure you don't have duplicate data
Sometimes, if you have made multiple pulls and try to combine those, it may happen that you might end up getting duplicate data

In [None]:
# All column names except 'tweetId', which is used as the grouping key.
cols = [*tweetData.columns]
del cols[cols.index('tweetId')]

In [None]:
# Group the rows by tweetId and, for every other column, keep the value from the
# last row of each group, so each tweetId ends up on a single row.
tempData = tweetData.groupby(by='tweetId', as_index=False)[cols]
realTweetData = tempData.aggregate(lambda duplicates: duplicates.iloc[-1])

#### Saving Data in python:

##### Issue:
While using pandas, both to_csv and read_csv functions are not very consistent. Now, issues occur while dealing with text data. Specifically, issues with line terminators.  
Line terminators depend on the operating system used and can be "\r", "\n", or "\r\n".  

For Macs:

In [None]:
# Write the de-duplicated tweets to CSV with an explicit "\n" line terminator.
# NOTE: the keyword is `lineterminator` in pandas >= 1.5; the older spelling
# `line_terminator` was removed in pandas 2.0 (use it only on pandas < 1.5).
realTweetData.to_csv(
    "PATH WHERE YOU WANT TO SAVE YOUR FILE/filename.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    date_format='%Y-%m-%d %H:%M:%S',
    encoding='utf-8',
    lineterminator='\n',
)