# Parse Twitter Data

1. Import retrieved tweets (from JSON file, pickle or similar)
2. Read in individual tweets
3. Create TSV file (and drop unwanted data)

## Get Data and Enrich It

In [14]:
import sys,re,json,os,csv
import numpy as np
import cPickle as pickle
import uuid
from IPython.display import display_javascript, display_html, display

## Read Pickle File with Tweets 

In [15]:
# Relative path of the pickle file that holds the raw tweets.
picklepath = '../data/raw/tweets.p'

In [16]:
# Load the tweets inside a `with` block so the file handle is closed as soon
# as unpickling finishes instead of being left open.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open(picklepath, 'rb') as pickleFile:
    tweets = pickle.load(pickleFile)

## Number of Tweets

In [17]:
# Report the number of entries in `tweets`.
print('We have %d tweets in total' % len(tweets))

We have 281 tweets in total


## What Does a Tweet Look Like?
Let's make JSON look nice (with thanks to [Renderjson](http://caldwell.github.io/renderjson/))

In [18]:
class RenderJSON(object):
    """Show JSON data in the notebook as a tree rendered by renderjson.js.

    Parameters
    ----------
    json_data : dict, list or str
        A dict or list is serialised with ``json.dumps``. Any other value is
        assumed to already be JSON text and is embedded unchanged.
    """

    def __init__(self, json_data):
        if isinstance(json_data, (dict, list)):
            self.json_str = json.dumps(json_data)
        else:
            # Keep the caller's pre-serialised JSON text. (The ``json``
            # module object used to be stored here by mistake, so string
            # input rendered the module's repr instead of the data.)
            self.json_str = json_data
        # Unique element id so each rendered instance gets its own <div>.
        self.uuid = str(uuid.uuid4())

    def _ipython_display_(self):
        """Emit the placeholder <div> and the JavaScript that fills it."""
        display_html('<div id="{}" style="height: 600px; width:100%;"></div>'.format(self.uuid),
            raw=True
        )
        # NOTE(review): rawgit.com was announced as being shut down -- confirm
        # that this URL still serves renderjson.js.
        display_javascript("""
        require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
          document.getElementById('%s').appendChild(renderjson(%s))
        });
        """ % (self.uuid, self.json_str), raw=True)

In [45]:
# Display the tweet at index 12 with RenderJSON as a sample of the data.
RenderJSON(tweets[12])

## Get Rid of Line Breaks in Tweets

In [7]:
# Number of tweets whose text could not be cleaned.
tweetLinebreakError = 0

for tweet in tweets:
    try:
        # Newlines become a single space, carriage returns are dropped.
        tweet['text'] = tweet['text'].replace('\n', ' ').replace('\r', '')
    except (KeyError, TypeError, AttributeError):
        # The text is missing or is not a string: count the failure and
        # store a placeholder. Other exceptions (e.g. KeyboardInterrupt)
        # are no longer swallowed.
        tweetLinebreakError += 1
        tweet['text'] = 'NaN'

print('Failed removing line breaks in %d tweets' % tweetLinebreakError)

Failed removing line breaks in 0 tweets


## Save Data to Disk

### Save as JSON

In [11]:
# Serialise every tweet into one JSON document and write it to disk.
with open('../data/allTweets.json', 'wb') as jsonFile:
    serialisedTweets = json.dumps(tweets)
    jsonFile.write(serialisedTweets)

### Save as Pickle file

In [12]:
# Pickle the tweets. The `with` block closes the file, so the dump is flushed
# to disk when the block ends instead of whenever the handle is collected.
with open('../data/allTweets.p', 'wb') as pickleFile:
    pickle.dump(tweets, pickleFile)
# Also rewrites the JSON export (same output as the "Save as JSON" cell).
with open('../data/allTweets.json', 'wb') as f:
    f.write(json.dumps(tweets))

### Save as TSV

In [13]:
# Column titles, one per field and in the same order as the values that the
# export loop in the next cell appends to each row (18 columns). The header
# used to list only 7 titles, so titles and data columns did not line up.
header = [
    'Tweets ID', 'Tweet Time', 'Tweet Text', 'Tweet Language',
    'User Location', 'Tweet Geo', 'Tweet Place', 'Tweet Likes',
    'Tweet Retweets', 'User Followers', 'User Friends', 'User Listed',
    'User Favourites', 'Tweet Hashtags', 'Tweet Mentions', 'Tweet Links',
    'User Description', 'User Screen Name',
]
# Keep a reference to the file object: csv.writer offers no way to flush or
# close the underlying file, so call outFileHandle.close() once every row
# has been written.
outFileHandle = open('../data/allTweets.tsv', 'wb')
outFile = csv.writer(outFileHandle, delimiter='\t')
outFile.writerow(header)

NameError: name 'header' is not defined

In [None]:
# Exceptions that mean "this field is missing or has an unexpected type" for
# one tweet. Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
FIELD_ERRORS = (KeyError, IndexError, TypeError, AttributeError, UnicodeError)


def _joinEntities(entities, key):
    """Return the lower-cased entity values as one comma-joined UTF-8 string.

    Each item of ``entities`` may be a plain string or a dict that stores its
    text under ``key``.
    """
    # NOTE(review): links are lower-cased like hashtags and mentions, as in
    # the original code; confirm this is wanted for case-sensitive URLs.
    values = []
    for entity in entities:
        if isinstance(entity, dict):
            entity = entity[key]
        values.append(entity.lower())
    # Encode (not decode) so the result matches the other text columns.
    return ','.join(values).encode('utf-8')


def _screenName(tweet):
    """Return the author's screen name as a UTF-8 string.

    Every other user field is read from ``tweet['user']``; the nested
    ``tweet['twitter']['user']`` form is still honoured when it is present.
    """
    if 'twitter' in tweet:
        user = tweet['twitter']['user']
    else:
        user = tweet['user']
    return user['screen_name'].encode('utf-8')


# One entry per TSV column, in output order:
# (error-counter name, label for the summary line, field extractor).
# NOTE(review): 'geo' and 'place' are only exported when they are strings;
# any other value (None, dict, ...) is written as 'NaN' and counted as an
# error, as before -- confirm that this is the intended handling.
FIELDS = [
    ('nIdError', 'ID', lambda t: t['id']),
    ('nDateError', 'Date', lambda t: t['created_at']),
    ('nTextError', 'Text', lambda t: t['text'].encode('utf-8')),
    ('nLanguageError', 'Language', lambda t: t['lang']),
    ('nLocationError', 'Twitter Location',
     lambda t: t['user']['location'].encode('utf-8')),
    ('nGeoError', 'Geo', lambda t: t['geo'].encode('utf-8')),
    ('nPlaceError', 'Place', lambda t: t['place'].encode('utf-8')),
    ('nLikesError', 'Likes', lambda t: t['favorite_count']),
    ('nRetweetsError', 'Retweets', lambda t: t['retweet_count']),
    ('nFollowersError', 'Follower', lambda t: t['user']['followers_count']),
    ('nFriendsError', 'Friends', lambda t: t['user']['friends_count']),
    ('nListedError', 'Listed', lambda t: t['user']['listed_count']),
    ('nFavouritesError', 'Favourites',
     lambda t: t['user']['favourites_count']),
    ('nTagsError', 'Interaction hashtag',
     lambda t: _joinEntities(t['entities']['hashtags'], 'text')),
    ('nMentionsError', 'Interaction mention',
     lambda t: _joinEntities(t['entities']['user_mentions'], 'screen_name')),
    ('nLinksError', 'Interaction link',
     lambda t: _joinEntities(t['entities']['urls'], 'expanded_url')),
    ('nDescriptionError', 'Description',
     lambda t: t['user']['description'].encode('utf-8')),
    ('nScreenNameError', 'Screen name', _screenName),
]

# Every counter starts at zero. Previously several counters were incremented
# without ever being initialised, which raised NameError inside the handlers.
errorCounts = dict((counterName, 0) for counterName, _label, _getter in FIELDS)

# Flat list of every exported value, in row order (one entry per field).
documents = []

for tweet in tweets:
    outList = []
    for counterName, _label, getter in FIELDS:
        try:
            value = getter(tweet)
        except FIELD_ERRORS:
            # Field unavailable for this tweet: write a placeholder so the
            # row keeps its column count, and record the failure.
            value = 'NaN'
            errorCounts[counterName] += 1
        outList.append(value)
        documents.append(value)
    outFile.writerow(outList)

# Re-expose the totals under the per-field variable names.
nIdError = errorCounts['nIdError']
nDateError = errorCounts['nDateError']
nTextError = errorCounts['nTextError']
nLanguageError = errorCounts['nLanguageError']
nLocationError = errorCounts['nLocationError']
nGeoError = errorCounts['nGeoError']
nPlaceError = errorCounts['nPlaceError']
nLikesError = errorCounts['nLikesError']
nRetweetsError = errorCounts['nRetweetsError']
nFollowersError = errorCounts['nFollowersError']
nFriendsError = errorCounts['nFriendsError']
nListedError = errorCounts['nListedError']
nFavouritesError = errorCounts['nFavouritesError']
nTagsError = errorCounts['nTagsError']
nMentionsError = errorCounts['nMentionsError']
nLinksError = errorCounts['nLinksError']
nDescriptionError = errorCounts['nDescriptionError']
nScreenNameError = errorCounts['nScreenNameError']
# No domain column is extracted, so this counter stays at zero; the name is
# kept only because it was defined before.
nDomainsError = 0

# One summary line per exported column. The former summary referenced
# counters that were never defined and therefore raised NameError.
for counterName, label, _getter in FIELDS:
    print('%d %s errors.' % (errorCounts[counterName], label))

In [48]:
from IPython.core.display import HTML

# Read the stylesheet inside a `with` block so the file handle is closed.
with open("../css/custom.css", "r") as cssFile:
    styles = cssFile.read()
# Last expression of the cell: the notebook displays this HTML object as the
# cell output.
HTML(styles)