# Parse Twitter Data

1. Import retrieved JSON files (from S3)
2. Read in individual tweets
3. Geolocate
4. Add gender
5. Create CSV file (and drop unwanted data)

## Get Data and Enrich It

In [1]:
import sys
sys.path.append('/mnt/home/ubuntu/projects/tools/')

In [3]:
import sys,re,json,os,csv,glob
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
import time,random,traceback
import cPickle as pickle
#from geopy import distance
#import geolocator
#geo=geolocator.Geolocator()
#geo.init()

## Read Pickle File with Tweets 

In [4]:
picklepath = '../data/raw/tweets.p'

In [5]:
tweets = pickle.load( open(picklepath, "rb" ) )

## Number of Tweets

In [10]:
print('We have %d tweets in total' % len(tweets))

We have 18 tweets in total


## Get Rid of Line Breaks in Tweets

In [13]:
tweetLinebreakError=0

for tweet in tweets:
  try:
    tweet['text'] = tweet['text'].replace('\n', ' ').replace('\r', '')
  except:
    tweetLinebreakError+=1
    tweet['text'] = 'NaN'

print('Failed removing line breaks in %d tweets' % tweetLinebreakError)

Failed removing line breaks in 0 tweets


## Geolocate From User Location

In [9]:
geoError=0
for tweet in tweets:
  try:
    tweet['geolocated']=geo.geoLocate(tweet['twitter']['retweet']['user']['location'])[0][3] 
  except:
    try:
        tweet['geolocated']=geo.geoLocate(tweet['twitter']['user']['location'])[0][3]
    except:
        geoError+=1
        tweet['geolocated']=None
print('Couldn\'t geolocate %d tweets' % geoError)
print('Geolocated %d tweets' % (len(tweets) - (geoError)))
print('Managed to geolocate %d percent' % (100.0*(1.0-(float(geoError)/len(tweets)))))

Couldn't geolocate 263646 tweets
Geolocated 198249 tweets
Managed to geolocate 42 percent


In [11]:
# Testing that it worked
tweets[0]['geolocated']

u'CH'

## Insert Gender

In [13]:
import gender
g=gender.Gender()
g.gender(tweets[1]['interaction']['author']['name']) #Testing that it works

{u'VALENTINE': {'gender': 'mm',
  'probability': 0.7840717162530856,
  'volume_female': 3324.0,
  'volume_male': 12070.0}}

In [14]:
# Gender of tweeter or retweeter
genderError=0
for tweet in tweets:
  try:
    tweet['gender']=g.gender(tweet['interaction']['author']['name'])
  except:
    genderError+=1
    tweet['gender']=None
print('Couldn\'t add gender probability for %d tweets' % genderError)
print('Managed to add gender to %d p.c.' % (100.0*(1.0-(float(genderError)/len(tweets)))))

Couldn't add gender probability for 21030 tweets
Managed to add gender to 95 p.c.


In [16]:
# Testing that it worked
tweets[1]['gender'].values()[0]['gender']

'mm'

## Save the Topics and Hashtags

In [17]:
topics=['Campaign','Discrimination','Prevention','Testing']
emptyTopics=[0 for t in topics]
header=['id','time','content','type','datasift_lang','twitter_lang','twitter_location','UNGP_location',
        'datasift_gender','UNGP_gender','gender_prob','followers','friends','topic','subtopic']
header.extend(topics)
header.extend(['interaction_hashtags','twitter_mentions','normalised_links','links_domain','user_description',
               'user_screen_name'])

## Save Data to Disk

### Save as JSON

In [18]:
with open('../data/all.json','wb') as f: f.write(json.dumps(tweets))

### Save as TSV

In [19]:
outFile=csv.writer(open('../data/all.tsv','wb'),delimiter='\t')
outFile.writerow(header)

In [20]:
nIdError=0
nDateError=0
nContentError=0
nTypeError=0
nLanguageError=0
nTwitterLanguageError=0
nLocationError=0
nUngpLocationError=0
nGenderError=0
nUngpGenderError=0
nUngpGenderProbError=0
nFollowersError=0
nFriendsError=0
nTopicKeyError=0
nTopicValueError=0
nTopicError=0
nTopicLengthError=0
nTagsError=0
nMentionsError=0
nLinksError=0
nDomainsError=0
nDescriptionError=0
nScreenNameError=0

documents=[]

for tweet in tweets:
  outList=[]
  try:
    outList.append(tweet['interaction']['id'])
    documents.append(tweet['interaction']['id'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nIdError+=1
  try:
    outList.append(tweet['interaction']['created_at'])
    documents.append(tweet['interaction']['created_at'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nDateError+=1
  try:
    outList.append(tweet['interaction']['content'].encode('utf-8').replace('\n',' '))
    documents.append(tweet['interaction']['content'].encode('utf-8').replace('\n',' '))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nContentError+=1
  try:
    outList.append(tweet['interaction']['type'].encode('utf-8'))
    documents.append(tweet['interaction']['type'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nTypeError+=1
  try:
    outList.append(tweet['language']['tag'].encode('utf-8'))
    documents.append(tweet['language']['tag'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nLanguageError+=1
  try:
    outList.append(tweet['twitter']['lang'].encode('utf-8'))
    documents.append(tweet['twitter']['lang'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nTwitterLanguageError+=1
  try:
    outList.append(tweet['twitter']['user']['location'].encode('utf-8'))
    documents.append(tweet['twitter']['user']['location'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nLocationError+=1
  try:
    outList.append(tweet['geolocated'].encode('utf-8'))
    documents.append(tweet['geolocated'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nUngpLocationError+=1
  try:
    outList.append(tweet['demographic']['gender'].encode('utf-8'))
    documents.append(tweet['demographic']['gender'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nGenderError+=1
  try:
    outList.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
    documents.append(tweet['gender'].values()[0]['gender'].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nUngpGenderError+=1
  try:
    outList.append(tweet['gender'].values()[0]['probability'])
    documents.append(tweet['gender'].values()[0]['probability'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nUngpGenderProbError+=1
  try:
    outList.append(tweet['twitter']['user']['followers_count'])
    documents.append(tweet['twitter']['user']['followers_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nFollowersError+=1
  try:
    outList.append(tweet['twitter']['user']['friends_count'])
    documents.append(tweet['twitter']['user']['friends_count'])
  except:
    outList.append('NaN')
    documents.append('NaN')
    nFriendsError+=1
  try:
    outList.append(tweet['interaction']['tag_tree']['topic'].keys()[0].encode('utf-8'))
    documents.append(tweet['interaction']['tag_tree']['topic'].keys()[0].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nTopicKeyError+=1
  try:
    outList.append(tweet['interaction']['tag_tree']['topic'].values()[0][0].encode('utf-8'))
    documents.append(tweet['interaction']['tag_tree']['topic'].values()[0][0].encode('utf-8'))
  except:
    outList.append('NaN')
    documents.append('NaN')
    nTopicValueError+=1
  try:
    tweetTopics=tweet['interaction']['tag_tree']['topic']
    binaryTopics=[0 for e in emptyTopics]
    for t in tweetTopics:
        binaryTopics[topics.index(t)]=1
    outList.extend(binaryTopics)
    documents.extend(binaryTopics)
  except:
    outList.extend(emptyTopics)
    nTopicError+=1
  try:
    tweetTags=','.join([h.lower() for h in tweet['interaction']['hashtags']])
    outList.append(tweetTags.decode('utf-8'))
    documents.append(tweetTags.decode('utf-8'))
  except:
    nTagsError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    tweetMentions=','.join([m.lower() for m in tweet['twitter']['mentions']])
    outList.append(tweetMentions.decode('utf-8'))
    documents.append(tweetMentions.decode('utf-8'))
  except:
    nMentionsError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    tweetLinks=','.join(tweet['links']['normalized_url'])
    outList.append(tweetLinks.decode('utf-8'))
    documents.append(tweetLinks.decode('utf-8'))
  except:
    nLinksError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    tweetDomain=','.join(tweet['links']['domain'])
    outList.append(tweetDomain.decode('utf-8'))
    documents.append(tweetDomain.decode('utf-8'))
  except:
    nDomainsError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    outList.append(tweet['twitter']['user']['description'].encode('utf-8'))
    documents.append(tweet['twitter']['user']['description'].encode('utf-8'))
  except:
    nDescriptionError+=1
    outList.append('NaN')
    documents.append('NaN')
  try:
    outList.append(tweet['twitter']['user']['screen_name'].encode('utf-8'))
    documents.append(tweet['twitter']['user']['screen_name'].encode('utf-8'))
  except:
    nScreenNameError+=1
    outList.append('NaN')
    documents.append('NaN')
    
            
  outFile.writerow(outList)

print "%d ID errors." % nIdError
print "%d Date errors." % nDateError
print "%d Content errors." % nContentError
print "%d Type errors." % nTypeError
print "%d DataSift language errors." % nLanguageError
print "%d Twitter language errors." % nTwitterLanguageError
print "%d Twitter Location errors." % nLocationError
print "%d UNGP Location errors." % nUngpLocationError
print "%d Gender errors." % nGenderError
print "%d UNGP gender errors." % nUngpGenderError
print "%d UNGP gender probability errors." % nUngpGenderProbError
print "%d Follower errors." % nFollowersError
print "%d Friends errors." % nFriendsError
print "%d Topic Key errors." % nTopicKeyError
print "%d Topic Value errors." % nTopicValueError
print "%d Topic errors." % nTopicError
print "%d Topic lengtherrors." % nTopicLengthError
print "%d Interaction hashtag errors." % nTagsError
print "%d Interaction mention errors." % nMentionsError
print "%d Interaction link errors." % nLinksError
print "%d Domain errors." % nDomainsError
print "%d Description errors." % nDescriptionError
print "%d Screen name errors." % nScreenNameError

0 ID errors.
0 Date errors.
0 Content errors.
0 Type errors.
9112 DataSift language errors.
192371 Twitter language errors.
295712 Twitter Location errors.
263646 UNGP Location errors.
187428 Gender errors.
205330 UNGP gender errors.
205330 UNGP gender probability errors.
192371 Follower errors.
192371 Friends errors.
0 Topic Key errors.
0 Topic Value errors.
0 Topic errors.
0 Topic lengtherrors.
404584 Interaction hashtag errors.
367573 Interaction mention errors.
404903 Interaction link errors.
414985 Domain errors.
227110 Description errors.
192371 Screen name errors.


In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)