# Setup and function declarations

In [1]:
%pylab inline
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import urllib2
import numpy as np
import os
import os.path
from scipy import misc
from time import strftime,strptime
import re
import math
import pymysql
import csv
import MySQLdb
def getimage(url, full_name):
    """Return the image at `url` as loaded by `mpimg.imread`, caching it on disk.

    If `full_name` already exists as a file it is read directly and no
    request is made.  Otherwise the URL is downloaded, the raw bytes are
    written to `full_name`, and the saved file is read back.

    Parameters:
        url: address of the image to download.
        full_name: path of the local file used as the cache.

    Returns:
        Whatever `mpimg.imread` returns for the file, or None when the URL
        could not be opened.
    """
    ## Reuse the local copy when the image was downloaded before.
    if os.path.isfile(full_name):
        return mpimg.imread(full_name)

    ## Best effort: any failure to open the URL means "no image".
    ## `Exception` (rather than a bare except) lets KeyboardInterrupt and
    ## SystemExit propagate so a long download loop can still be stopped.
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return None

    ## Always release the connection, even if reading fails part-way.
    try:
        data = response.read()
    finally:
        response.close()

    with open(full_name, "wb") as image_file:
        image_file.write(data)
    return mpimg.imread(full_name)


def getallimages(panda_name, column_name):
    """Download every image referenced in one column of a DataFrame.

    Each non-null cell of `panda_name[column_name]` is iterated as a
    sequence of entries that are indexed with the key 'media_url'.  Every
    URL is fetched through `getimage`, using the last path segment of the
    URL as the local file name.

    Returns:
        A list with the images that `getimage` managed to load; entries
        for which it returned None are left out.
    """
    has_value = panda_name[column_name].notnull()
    media_cells = panda_name.loc[has_value, column_name]

    images = []
    for media_entries in media_cells:
        for entry in media_entries:
            ## The file name is whatever follows the final '/' of the URL.
            picture = getimage(entry['media_url'],
                               entry['media_url'].split('/')[-1])
            if picture is not None:
                images.append(picture)
    return images

def klout_getId(screenname):
    """Query the Klout identity endpoint for a Twitter screen name.

    Parameters:
        screenname: value substituted into the `screenName` query parameter.

    Returns:
        The decoded JSON response, or None when the request or the JSON
        decoding fails.
    """
    ## NOTE(review): the API key is embedded in the URL literal; consider
    ## moving it out of the source.
    url = 'http://api.klout.com/v2/identity.json/twitter?screenName={0}&key=memp3ncn4qvp6c8guzjcc8dp'.format(screenname)
    ## Best effort lookup: failures yield None, but KeyboardInterrupt and
    ## SystemExit are no longer swallowed as they were by a bare except.
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return None
    try:
        return json.load(response)
    except Exception:
        return None
    finally:
        ## Release the connection whether or not decoding succeeded.
        response.close()
    
def klout_getScore(kloutId):
    """Query the Klout score endpoint for a Klout user id.

    Parameters:
        kloutId: value substituted into the user path segment of the URL.

    Returns:
        The decoded JSON response, or None when the request or the JSON
        decoding fails.
    """
    ## NOTE(review): the API key is embedded in the URL literal; consider
    ## moving it out of the source.
    url = 'http://api.klout.com/v2/user.json/{0}/score?key=memp3ncn4qvp6c8guzjcc8dp'.format(kloutId)
    ## Best effort lookup: failures yield None, but KeyboardInterrupt and
    ## SystemExit are no longer swallowed as they were by a bare except.
    try:
        response = urllib2.urlopen(url)
    except Exception:
        return None
    try:
        return json.load(response)
    except Exception:
        return None
    finally:
        ## Release the connection whether or not decoding succeeded.
        response.close()
def extractImageUrl(cell):
    """Return the 'media_url' of the first media entry in `cell`.

    Parameters:
        cell: None, or a sequence of dict-like media entries.

    Returns:
        The first entry's 'media_url', or None when `cell` is None or
        empty (an empty sequence used to raise IndexError).
    """
    if cell is None or len(cell) == 0:
        return None
    return cell[0]['media_url']

def convertDTToDB(cell):
    """Reformat a timestamp string for the database.

    `cell` is parsed with the layout '%a %b %d %H:%M:%S +0000 %Y' and
    returned as '%Y-%m-%d %H:%M:%S'.
    """
    source_layout = '%a %b %d %H:%M:%S +0000 %Y'
    database_layout = '%Y-%m-%d %H:%M:%S'
    parsed = strptime(cell, source_layout)
    return strftime(database_layout, parsed)

def normalizeTweetText(text):
    """Replace characters outside a small whitelist with spaces.

    Word characters and the symbols # : @ / . - , are kept; every other
    character (including whitespace) becomes a single space.

    Parameters:
        text: the string to clean, or None / a float for a missing value.

    Returns:
        The cleaned string, or None when `text` is None or any float.
    """
    ## isinstance (not an exact type comparison) so that float subclasses
    ## such as numpy.float64 are also treated as "missing" instead of being
    ## handed to re.sub, which would raise TypeError.
    if text is None or isinstance(text, float):
        return None
    return re.sub(r'[^\w#:@/\.\-\,]', ' ', text)

## Note: any attribute that may be NULL must go through ensureDBNull, and
## its placeholder in the query template must NOT be wrapped in single
## quotes, because this function adds the quotes itself.
def ensureDBNull(cell):
    """Render `cell` as a SQL value: NULL for None, else a quoted literal.

    Parameters:
        cell: any value; None stands for a missing attribute.

    Returns:
        The string 'NULL' when `cell` is None; otherwise the formatted
        value wrapped in single quotes, with backslashes and single quotes
        inside it doubled so the literal cannot be terminated early.

    Raises:
        Whatever formatting `cell` raises; the offending value is printed
        first to help locate the bad row.
    """
    if cell is None:
        return 'NULL'
    try:
        text = '{0}'.format(cell)
    except Exception:
        ## Show the value that could not be formatted, then let the error
        ## surface (retrying the same expression could never succeed).
        print(cell)
        raise
    ## Assumes the server uses MySQL's default backslash-escape handling
    ## (NO_BACKSLASH_ESCAPES off) - TODO confirm against the deployment.
    escaped = text.replace('\\', '\\\\').replace("'", "''")
    return "'" + escaped + "'"

def extractHashTags(hashTags):
    """Join the 'text' of every hashtag entry into a comma-separated string.

    Returns None when `hashTags` is None or an empty list.
    """
    if hashTags is None or hashTags == []:
        return None
    return ','.join(entry['text'] for entry in hashTags)

def getTweetsInsertQuery(tweets):
    """Build one SQL script with a REPLACE INTO tweet statement per row.

    Parameters:
        tweets: iterable of rows; positions 0-8 of each row are used.
            Positions 4, 6 and 7 go through `ensureDBNull` (so they may be
            NULL); the other positions are placed inside single quotes.

    Returns:
        All statements concatenated, each terminated by ';' and a newline;
        the empty string when `tweets` is empty.
    """
    ## NOTE(review): values are interpolated into the SQL text as-is, so
    ## the rows must already be sanitized by the caller.
    template = (
        "REPLACE INTO tweet VALUES('{0}', '{1}', '{2}', '{3}', {4}, "
        "          '{5}', {6}, {7}, '{8}');\n"
    )
    ## Collect the statements and join once: re-formatting the growing
    ## script for every row copies it each time (quadratic in batch size).
    statements = [
        template.format(tweet[0],
                        tweet[1],
                        tweet[2],
                        tweet[3],
                        ensureDBNull(tweet[4]),
                        tweet[5],
                        ensureDBNull(tweet[6]),
                        ensureDBNull(tweet[7]),
                        tweet[8])
        for tweet in tweets
    ]
    return "".join(statements)

def getUsersInsertQuery(users):
    """Build one SQL script with a REPLACE INTO user statement per row.

    Parameters:
        users: iterable of rows; positions 0-8 of each row are used.
            Positions 2 and 7 go through `ensureDBNull` (so they may be
            NULL), position 3 is inserted bare, and the other positions
            are placed inside single quotes.

    Returns:
        All statements concatenated, each terminated by ';' and a newline;
        the empty string when `users` is empty.
    """
    ## NOTE(review): values are interpolated into the SQL text as-is, so
    ## the rows must already be sanitized by the caller.
    template = (
        "REPLACE INTO user VALUES('{0}', '{1}', {2}, {3}, "
        "          '{4}', '{5}', '{6}', {7}, '{8}');\n"
    )
    ## Collect the statements and join once: re-formatting the growing
    ## script for every row copies it each time (quadratic in batch size).
    statements = [
        template.format(user[0],
                        user[1],
                        ensureDBNull(user[2]),
                        user[3],
                        user[4],
                        user[5],
                        user[6],
                        ensureDBNull(user[7]),
                        user[8])
        for user in users
    ]
    return "".join(statements)

def getPlacesInsertQuery(places):
    """Build one SQL script with a REPLACE INTO place statement per row.

    Parameters:
        places: iterable of rows; positions 0-3 of each row are used and
            each is placed inside single quotes.

    Returns:
        All statements concatenated, each terminated by ';' and a newline;
        the empty string when `places` is empty.
    """
    ## NOTE(review): values are interpolated into the SQL text as-is, so
    ## the rows must already be sanitized by the caller.
    template = "REPLACE INTO place VALUES ('{0}', '{1}', '{2}', '{3}');\n"
    ## Collect the statements and join once: re-formatting the growing
    ## script for every row copies it each time (quadratic in batch size).
    statements = [
        template.format(place[0], place[1], place[2], place[3])
        for place in places
    ]
    return "".join(statements)

def convertNanToNone(text):
    """Return None for a float NaN and every other value unchanged.

    Parameters:
        text: any cell value.

    Returns:
        None when `text` is a float (or float subclass) holding NaN;
        otherwise `text` itself.
    """
    ## Test the value rather than its string form: comparing str(text) to
    ## 'nan' also discarded the literal text "nan" and needed a blanket
    ## except to survive values whose str() conversion fails.
    if isinstance(text, float) and math.isnan(text):
        return None
    return text
def getDBInstance():
    """Return the database connection object; currently always None.

    NOTE(review): this looks like a placeholder - the notebook cell that
    calls it invokes `.autocommit(True)` and `.cursor()` on the result,
    which cannot work on None.  Confirm where the real connection is
    supposed to be created.
    """
    connection = None
    return connection

def insertIntoDB(cursor, dataset, table, chunk_size=10000):
    """Write the rows of `dataset` into `table` in fixed-size chunks.

    Parameters:
        cursor: DB cursor used to execute the generated SQL scripts.
        dataset: object whose `.values` attribute is the sliceable
            sequence of rows (presumably a pandas DataFrame - confirm
            against callers).
        table: one of "tweet", "place" or "user"; selects the function
            that renders the rows as SQL.
        chunk_size: maximum number of rows sent per `execute` call.

    Raises:
        ValueError: if `table` is not one of the supported names.

    Nothing is executed for an empty dataset.
    """
    if table == "tweet":
        getInsertionQueries = getTweetsInsertQuery
    elif table == "place":
        getInsertionQueries = getPlacesInsertQuery
    elif table == "user":
        getInsertionQueries = getUsersInsertQuery
    else:
        ## Previously an unknown name fell through to an unbound variable.
        raise ValueError("unknown table: {0!r}".format(table))

    rows = dataset.values
    ## Advance by a constant step; the earlier `last += last` doubled the
    ## window, so later chunks grew past the intended size.
    for first in range(0, len(rows), chunk_size):
        ## Use the cursor that was passed in.  The old body referred to a
        ## name it also assigned further down, which made that name local
        ## and raised UnboundLocalError on the first execute.
        cursor.execute(getInsertionQueries(rows[first:first + chunk_size]))
        ## Each script holds many statements; step through their results
        ## so the cursor is ready for the next command.  This replaces
        ## closing and re-creating the cursor, which would have left the
        ## caller holding a closed cursor.  Presumably equivalent for the
        ## MySQL drivers this file imports - TODO confirm.
        while cursor.nextset():
            pass
                
def normalizeData(mydata):
    """Reduce a flattened tweet DataFrame to the cleaned columns stored in the DB.

    Steps, in order: drop rows with a duplicate 'id_str', keep only the
    columns of interest, drop rows that are entirely null or lack
    'id_str', reset the index, rename the columns to the tweet./user./
    place. scheme, map NaN to None, and clean individual columns
    (timestamps, text, location, first media URL, hashtags).

    Parameters:
        mydata: DataFrame that contains at least an 'id_str' column.

    Returns:
        A new DataFrame with the renamed, cleaned columns.
    """
    wanted_columns = ['id_str','created_at','text','coordinates.coordinates',
                      'entities.media', 'entities.hashtags','lang','user.id_str','user.screen_name',
                      'user.location','user.verified','user.followers_count','user.friends_count',
                      'user.statuses_count','user.created_at','place.id','place.full_name',
                      'place.country_code','place.bounding_box.coordinates']

    ## Choosing the needed columns, removing duplicates,
    ## removing all-null rows and renaming the columns.
    mydata = mydata.drop_duplicates(subset = ['id_str'])
    ## reindex (instead of mydata[[...]]) creates any wanted column that is
    ## absent from this batch as all-NaN rather than raising KeyError, e.g.
    ## when no record in the batch carries media or coordinates.
    mydata = mydata.reindex(columns = wanted_columns)
    mydata = mydata.dropna(axis = 0, how = 'all')
    mydata = mydata.loc[mydata['id_str'].notnull()]
    mydata = mydata.reset_index(drop = True)
    ## index=str turns the fresh integer index labels into strings.
    mydata = mydata.rename(index=str, columns={'id_str' : 'tweet.id', 'coordinates.coordinates' : 'tweet.coordinates',
                                      'created_at' : 'tweet.created_at', 'text' : 'tweet.text', 'lang' : 'tweet.lang',
                                      'entities.media' : 'tweet.media', 'user.id_str' : 'user.id',
                                      'entities.hashtags' : 'tweet.hashtags',
                                      'place.full_name' : 'place.name', 'place.country_code' : 'place.country',
                                      'place.bounding_box.coordinates' : 'place.polygon'})
    mydata = mydata.applymap(convertNanToNone)
    mydata['tweet.created_at'] = mydata['tweet.created_at'].apply(convertDTToDB)
    mydata['user.created_at'] = mydata['user.created_at'].apply(convertDTToDB)
    ## Repeats the NaN -> None mapping for this one column; presumably a
    ## guard against NaN surviving the frame-wide pass - TODO confirm.
    mydata['tweet.coordinates'] = mydata['tweet.coordinates'].apply(convertNanToNone)
    mydata['tweet.text'] = mydata['tweet.text'].apply(normalizeTweetText)
    mydata['user.location'] = mydata['user.location'].apply(normalizeTweetText)
    mydata['tweet.media'] = mydata['tweet.media'].apply(extractImageUrl)
    ## Hashtags are first flattened to a comma-separated string and then
    ## cleaned with the same character whitelist as the tweet text.
    mydata['tweet.hashtags'] = mydata['tweet.hashtags'].apply(extractHashTags)
    mydata['tweet.hashtags'] = mydata['tweet.hashtags'].apply(normalizeTweetText)
    return mydata.copy()

Populating the interactive namespace from numpy and matplotlib


In [2]:
## Open the DB connection, switch it to autocommit, and create the cursor
## that is later passed to insertIntoDB.
## NOTE(review): getDBInstance() as defined above returns None, so the
## autocommit call below fails until a real connection is returned.
mydb = getDBInstance()
mydb.autocommit(True)
mycursor = mydb.cursor()

In [None]:
## Upload the data as chunks, apparently the max at once is around 10k insertion for this case.
## The input file is expected to hold one JSON document per line.
myjson = []
## `with` closes the file even when reading fails; previously the handle
## was only closed after the whole upload had succeeded.
with open("/media/saed/Data/ubuntu data/data.json", 'r') as myfile:
    a = myfile.readlines()
j = 0
for i in a:
    j += 1
    ## Blank lines carry no record and would make json.loads raise.
    if i.strip():
        myjson.append(json.loads(i))
    ## Flush every 10000 input lines and once more at the last line;
    ## skip the flush when no record was collected since the last one.
    if (j%10000 == 0 or j == len(a)) and myjson:

        mydata = pd.io.json.json_normalize(myjson)
        mydata = normalizeData(mydata)

        ## Tweets
        tweets = mydata[['tweet.id','tweet.created_at','tweet.text','user.id','tweet.coordinates','place.id','tweet.media','tweet.hashtags','tweet.lang']]
        insertIntoDB(mycursor, tweets, 'tweet')

        ## Users
        users = mydata[['user.id','user.screen_name','user.location','user.verified','user.followers_count','user.friends_count','user.statuses_count','user.created_at']]
        ## .copy() so the column added below lands on an independent frame
        ## rather than on a slice of mydata (chained-assignment warning).
        users = users.drop_duplicates(subset = ['user.id']).copy()
        ## Every user is stored with a klout score of 0.0 at this stage.
        users['klout_score'] = [0.0] * len(users)
        insertIntoDB(mycursor, users, 'user')

        ## Places
        places = mydata[['place.id','place.name','place.country','place.polygon']]
        places = places.drop_duplicates(subset = ['place.id']).copy()
        places['place.name'] = places['place.name'].apply(normalizeTweetText)
        ## Only rows with all four place attributes present are stored.
        places = places.dropna(axis = 0, how = 'any')
        insertIntoDB(mycursor, places, 'place')
        myjson = []