# Setup and function declarations

In [1]:
%pylab inline
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import urllib2
import numpy as np
import os
import os.path
from scipy import misc
from time import strftime,strptime
import re
import math
import pymysql


def getimage(url, full_name):
    """Load an image from ``url``, using ``full_name`` as an on-disk cache.

    If ``full_name`` already exists it is read back and no request is made.
    Otherwise the URL is downloaded, the raw bytes are written to
    ``full_name`` and the image is loaded from that file.

    Parameters
    ----------
    url : str
        Address of the image to download.
    full_name : str
        Local path of the cache file.

    Returns
    -------
    The result of ``mpimg.imread(full_name)``, or ``None`` when the
    download fails.
    """
    if os.path.isfile(full_name):
        return mpimg.imread(full_name)

    # Best-effort download: any network/HTTP failure yields None.
    # `except Exception` (rather than a bare except) lets KeyboardInterrupt
    # and SystemExit propagate so a long download loop can be interrupted.
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Always release the connection, even when read() fails.
            response.close()
    except Exception:
        return None

    with open(full_name, "wb") as cache_file:
        cache_file.write(data)
    return mpimg.imread(full_name)


def getallimages(panda_name, column_name):
    """Download every image referenced in one column of a DataFrame.

    Each non-null cell of ``panda_name[column_name]`` is iterated, and every
    entry in it is indexed with ``'media_url'``. The image is fetched with
    ``getimage``, using the last path segment of the URL as the local file
    name. Entries whose download returns ``None`` are left out.

    Returns a list of the loaded images, in row order.
    """
    media_cells = panda_name.loc[panda_name[column_name].notnull(), column_name]
    fetched = (
        getimage(entry['media_url'], entry['media_url'].split('/')[-1])
        for cell in media_cells
        for entry in cell
    )
    return [picture for picture in fetched if picture is not None]

def klout_getId(screenname):
    """Query the Klout identity endpoint for a Twitter screen name.

    Parameters
    ----------
    screenname : str
        Twitter screen name, interpolated into the request URL as-is.

    Returns
    -------
    The decoded JSON response, or ``None`` if the request or the decoding
    fails for any reason.
    """
    # NOTE(review): the API key is embedded in this URL literal; consider
    # loading it from configuration instead of keeping it in the notebook.
    url = 'http://api.klout.com/v2/identity.json/twitter?screenName={0}&key=memp3ncn4qvp6c8guzjcc8dp'.format(screenname)
    try:
        response = urllib2.urlopen(url)
        try:
            return json.load(response)
        finally:
            # Release the connection whether or not decoding succeeded.
            response.close()
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return None
    
def klout_getScore(kloutId):
    """Query the Klout score endpoint for a Klout user id.

    Parameters
    ----------
    kloutId
        Identifier interpolated into the request URL as-is.

    Returns
    -------
    The decoded JSON response, or ``None`` if the request or the decoding
    fails for any reason.
    """
    # NOTE(review): the API key is embedded in this URL literal; consider
    # loading it from configuration instead of keeping it in the notebook.
    url = 'http://api.klout.com/v2/user.json/{0}/score?key=memp3ncn4qvp6c8guzjcc8dp'.format(kloutId)
    try:
        response = urllib2.urlopen(url)
        try:
            return json.load(response)
        finally:
            # Release the connection whether or not decoding succeeded.
            response.close()
    except Exception:
        # Best-effort lookup; KeyboardInterrupt/SystemExit still propagate.
        return None
def extractImageUrl(cell):
    """Return the ``'media_url'`` of the first media entry in ``cell``.

    Parameters
    ----------
    cell : sequence of dict, None or float
        A media cell: a sequence of dicts each carrying a ``'media_url'``
        key. ``None`` and float cells (the way pandas represents a missing
        value, matching the float check in ``normalizeTweetText``) are
        treated as "no media".

    Returns
    -------
    The first entry's ``'media_url'``, or ``None`` when the cell is missing
    or contains no entries.
    """
    # Missing cell (None / NaN float) or an empty media list: nothing to extract.
    if cell is None or isinstance(cell, float) or len(cell) == 0:
        return None
    return cell[0]['media_url']

def convertDTToDB(cell):
    """Reformat a timestamp string for the database.

    ``cell`` is parsed with the layout ``'%a %b %d %H:%M:%S +0000 %Y'``
    (e.g. ``'Sun Feb 11 16:27:24 +0000 2018'``) and returned as
    ``'%Y-%m-%d %H:%M:%S'`` (e.g. ``'2018-02-11 16:27:24'``).
    """
    source_layout = '%a %b %d %H:%M:%S +0000 %Y'
    target_layout = '%Y-%m-%d %H:%M:%S'
    parsed = strptime(cell, source_layout)
    return strftime(target_layout, parsed)

def normalizeTweetText(text):
    """Replace every character outside a small whitelist with a space.

    Word characters and the symbols ``# : @ / . - ,`` are kept; every other
    character (including whitespace) becomes a single space, so the length
    of the text is unchanged.

    Returns ``None`` when ``text`` is ``None`` or is exactly of type ``float``.
    """
    is_missing = text is None or type(text) is float
    if not is_missing:
        return re.sub(r'[^\w#:@/\.\-\,]', ' ', text)
    return None

## A NULL must reach the database as the bare keyword, so attributes that may
## be NULL are not quoted in the query templates; the quotes are added here,
## and only for non-NULL values.
def ensureDBNull(cell):
    """Render ``cell`` as an SQL literal: ``NULL`` or a single-quoted value.

    Parameters
    ----------
    cell
        ``None``, a string, or any other value with a usable ``str()`` form.

    Returns
    -------
    ``'NULL'`` when ``cell`` is ``None``; otherwise the value wrapped in
    single quotation marks. No escaping is performed.
    """
    if cell is None:
        return 'NULL'
    try:
        return "'" + cell + "'"
    except TypeError:
        # Non-string cells (e.g. a coordinate list) cannot be concatenated
        # to a str; quote their string form instead of raising.
        return "'" + str(cell) + "'"

def getTweetsInsertQuery(tweets):
    """Build one multi-row ``INSERT INTO tweet`` statement.

    Parameters
    ----------
    tweets : iterable of indexable rows
        Each row supplies eight values (indices 0-7). Values 4 and 6 go
        through ``ensureDBNull`` so they may be ``None``; the others are
        interpolated between single quotes.

    Returns
    -------
    str
        ``"INSERT INTO tweet VALUES\\n"`` followed by the rendered rows joined
        with ``",\\n"``. With no rows, only the header line is returned.

    NOTE(review): values are interpolated without escaping, so callers must
    pass text that contains no quote characters.
    """
    # The continuation line's leading blanks are part of the string literal.
    row_template = "('{0}', '{1}', '{2}', '{3}', {4}, \
          '{5}', {6}, '{7}')"
    rendered_rows = [
        row_template.format(row[0],
                            row[1],
                            row[2],
                            row[3],
                            ensureDBNull(row[4]),
                            row[5],
                            ensureDBNull(row[6]),
                            row[7])
        for row in tweets
    ]
    return "INSERT INTO tweet VALUES\n{0}".format(",\n".join(rendered_rows))

def getUsersInsertQuery(users):
    """Build one multi-row ``INSERT INTO user`` statement.

    Parameters
    ----------
    users : iterable of indexable rows
        Each row supplies nine values (indices 0-8). Values 2 and 7 go
        through ``ensureDBNull`` so they may be ``None``; value 3 is
        interpolated without quotes; the others are interpolated between
        single quotes.

    Returns
    -------
    str
        ``"INSERT INTO user VALUES\\n"`` followed by the rendered rows joined
        with ``",\\n"``. With no rows, only the header line is returned.

    NOTE(review): values are interpolated without escaping, so callers must
    pass text that contains no quote characters.
    """
    # The continuation line's leading blanks are part of the string literal.
    row_template = "('{0}', '{1}', {2}, {3}, \
          '{4}', '{5}', '{6}', {7}, '{8}')"
    rendered_rows = [
        row_template.format(row[0],
                            row[1],
                            ensureDBNull(row[2]),
                            row[3],
                            row[4],
                            row[5],
                            row[6],
                            ensureDBNull(row[7]),
                            row[8])
        for row in users
    ]
    return "INSERT INTO user VALUES\n{0}".format(",\n".join(rendered_rows))

def getPlacesInsertQuery(places):
    """Build one multi-row ``INSERT INTO place`` statement.

    Parameters
    ----------
    places : iterable of indexable rows
        Each row supplies four values (indices 0-3), all interpolated
        between single quotes.

    Returns
    -------
    str
        ``"INSERT INTO place VALUES\\n"`` followed by the rendered rows joined
        with ``",\\n"``. Rows that cannot be rendered are printed and skipped.
        With no usable rows, only the header line is returned.

    NOTE(review): values are interpolated without escaping, so callers must
    pass text that contains no quote characters.
    """
    rendered_rows = []
    for place in places:
        try:
            rendered_rows.append(
                "('{0}', '{1}', '{2}', '{3}')".format(place[0],
                                                      place[1],
                                                      place[2],
                                                      place[3]))
        except Exception:
            # Best-effort: show the offending row and carry on with the rest.
            # The call form of print behaves the same on Python 2 and 3.
            print(place)
    return "INSERT INTO place VALUES\n{0}".format(",\n".join(rendered_rows))

def convertNanToNone(text):
    """Map a floating-point NaN to ``None`` and return anything else unchanged.

    Only real NaN values (Python ``float`` or a NumPy floating scalar) are
    converted. A genuine string such as ``'nan'`` is returned untouched, so
    legitimate text is never mistaken for a missing value.
    """
    if isinstance(text, (float, np.floating)) and math.isnan(text):
        return None
    return text
def getDBInstance():
    """Open and return a new pymysql connection to the tweet database.

    Connection settings are read from the environment variables
    ``TWITTER_DB_HOST``, ``TWITTER_DB_USER``, ``TWITTER_DB_PASSWORD`` and
    ``TWITTER_DB_NAME``. When a variable is unset, the value previously
    hard-coded here is used, so existing runs keep working.

    NOTE(review): the fallback literals, in particular the root password,
    should be removed from the notebook and the password rotated once the
    environment variables are in place.
    """
    return pymysql.connect(host=os.environ.get("TWITTER_DB_HOST", "uncg.saadmtsa.club"),
                           user=os.environ.get("TWITTER_DB_USER", "root"),
                           passwd=os.environ.get("TWITTER_DB_PASSWORD", "vJnVubg49U"),
                           db=os.environ.get("TWITTER_DB_NAME", "twitter"))

Populating the interactive namespace from numpy and matplotlib


In [2]:
## Open one database connection through getDBInstance() and create the
## cursor that the insert cells further down use to execute their queries.
mydb = getDBInstance()
mycursor = mydb.cursor()

In [3]:
## Read up to MAX_TWEETS JSON records (one per line) from the dump and
## flatten them into a pandas DataFrame.
DATA_PATH = "/media/saed/Data/ubuntu data/data.json"
MAX_TWEETS = 10000

myjson = []
# `with` closes the file even if a line fails to parse.
with open(DATA_PATH, 'r') as myfile:
    for line in myfile:
        if len(myjson) >= MAX_TWEETS:
            break
        # Skip blank lines instead of handing them to json.loads; reaching
        # the end of a short file simply ends the loop.
        if line.strip():
            myjson.append(json.loads(line))
mypanda = pd.io.json.json_normalize(myjson)

In [4]:
## Work on a copy so the freshly loaded frame (mypanda) stays untouched and
## the cleaning cells can be restarted from here without re-reading the file.
mydata = mypanda.copy()

In [5]:
## Keep one row per tweet id, project the columns needed downstream, discard
## rows that are entirely null or have no tweet id, renumber the rows, and
## finally give the tweet/place columns their prefixed names.
mydata = (
    mydata
    .drop_duplicates(subset=['id_str'])
    [[
        'id_str', 'created_at', 'text', 'coordinates.coordinates',
        'entities.media', 'lang', 'user.id_str', 'user.screen_name',
        'user.location', 'user.verified', 'user.followers_count', 'user.friends_count',
        'user.statuses_count', 'user.created_at', 'place.id', 'place.full_name',
        'place.country_code', 'place.bounding_box.coordinates',
    ]]
    .dropna(axis=0, how='all')
    .loc[lambda frame: frame['id_str'].notnull()]
    .reset_index(drop=True)
    .rename(index=str, columns={
        'id_str': 'tweet.id',
        'coordinates.coordinates': 'tweet.coordinates',
        'created_at': 'tweet.created_at',
        'text': 'tweet.text',
        'lang': 'tweet.lang',
        'entities.media': 'tweet.media',
        'user.id_str': 'user.id',
        'place.full_name': 'place.name',
        'place.country_code': 'place.country',
        'place.bounding_box.coordinates': 'place.polygon',
    })
)

In [6]:
## Normalise the representation: NaN cells become None everywhere, both
## timestamp columns are rewritten in the database layout, free-text columns
## are reduced to the allowed character set, and the media cell is reduced to
## the URL of its first entry.
mydata = mydata.applymap(convertNanToNone)
mydata = mydata.assign(**{
    'tweet.created_at': mydata['tweet.created_at'].apply(convertDTToDB),
    'user.created_at': mydata['user.created_at'].apply(convertDTToDB),
    'tweet.coordinates': mydata['tweet.coordinates'].apply(convertNanToNone),
    'tweet.text': mydata['tweet.text'].apply(normalizeTweetText),
    'user.location': mydata['user.location'].apply(normalizeTweetText),
    'tweet.media': mydata['tweet.media'].apply(extractImageUrl),
})
mydata

Unnamed: 0,tweet.id,tweet.created_at,tweet.text,tweet.coordinates,tweet.media,tweet.lang,user.id,user.screen_name,user.location,user.verified,user.followers_count,user.friends_count,user.statuses_count,user.created_at,place.id,place.name,place.country,place.polygon
0,962724762760839168,2018-02-11 16:27:24,@Gabbyxclairee get it right... https://t.co/ju...,,http://pbs.twimg.com/tweet_video_thumb/DVxJDvT...,en,2191035103,oscarmart91232,,False,202.0,766.0,6106.0,2013-11-12 20:54:06,650ad90d3abfb53b,"Junction City, OR",US,"[[[-123.22293, 44.193041], [-123.22293, 44.233..."
1,962724763566211072,2018-02-11 16:27:24,Going to be an annoying last 35 minutes.,,,en,30269393,TJ_Pittinger,"Tampa, FL",False,5415.0,606.0,164117.0,2009-04-10 17:47:42,00b6a1c7611dcf24,"Connerton, FL",US,"[[[-82.520132, 28.251311], [-82.520132, 28.324..."
2,962724763822108672,2018-02-11 16:27:24,i have a credit card now and idk what to do wi...,,,en,908661288,ashcucs,"Boston, MA",False,472.0,299.0,22791.0,2012-10-27 18:27:18,67b98f17fdcf20be,"Boston, MA",US,"[[[-71.191421, 42.227797], [-71.191421, 42.399..."
3,962724763910266880,2018-02-11 16:27:24,I m essentially being paid to sit here and wat...,,,en,2491889299,lainey_bugs,"Cincinnati, OH",False,803.0,550.0,41082.0,2014-05-12 19:34:11,e444ecd51bd16ff3,"Cincinnati, OH",US,"[[[-84.710722, 39.052962], [-84.710722, 39.221..."
4,962724764157661184,2018-02-11 16:27:24,Hmm I don t think I understand https://t.co/...,,,en,3326890636,FandomFeliciano,"Dallas-Fort Worth, TX",False,1853.0,1499.0,23005.0,2015-06-15 12:17:58,42e46bc3663a4b5f,"Fort Worth, TX",US,"[[[-97.538285, 32.569477], [-97.538285, 32.990..."
5,962724764635860992,2018-02-11 16:27:25,I miss my Daugther,,,en,42798593,simplyySemaj,"Philadelphia, PA",False,398.0,352.0,23572.0,2009-05-27 02:54:33,e4a0d228eb6be76b,"Philadelphia, PA",US,"[[[-75.280284, 39.871811], [-75.280284, 40.137..."
6,962724765919334400,2018-02-11 16:27:25,I haven t worn any makeup for like 2 or 3 week...,,,en,2816396605,sarahbeth102938,"Hawesville, KY",False,576.0,1281.0,11044.0,2014-09-18 04:46:22,c596737f61af0189,"Hawesville, KY",US,"[[[-86.774577, 37.882528], [-86.774577, 37.906..."
7,962724765969547269,2018-02-11 16:27:25,Now on tap: Rusty Rail Wolf King Imperial Stou...,"[-80.0439575, 40.3870542]",,en,703971530,KornerPub,"Mt. Lebanon, PA",False,604.0,239.0,1410.0,2012-07-18 23:19:11,7cd18caa8c0ebb9f,"Mount Lebanon, PA",US,"[[[-80.082398, 40.350594], [-80.082398, 40.401..."
8,962724765801811971,2018-02-11 16:27:25,girl @ this rain and it s cold af,"[-96.7380196, 32.91375244]",,en,107784207,sirsIaysalot,,False,2266.0,269.0,12695.0,2010-01-23 18:36:19,18810aa5b43e76c7,"Dallas, TX",US,"[[[-96.977527, 32.620678], [-96.977527, 33.019..."
9,962724766112153601,2018-02-11 16:27:25,One Thing I ve Learned Is To Not Ask For Shit ...,,,en,995423779,LoveKashhh,Sc: WhyHateMe4,False,488.0,468.0,3888.0,2012-12-07 16:36:07,dc62519fda13b4ec,"Tampa, FL",US,"[[[-82.620093, 27.821353], [-82.620093, 28.171..."


In [7]:
## Project the tweet-level columns. The column order matters:
## getTweetsInsertQuery reads each row by position (indices 0-7).
tweets = mydata[['tweet.id','tweet.created_at','tweet.text','user.id','tweet.coordinates','place.id','tweet.media','tweet.lang']]
tweets

Unnamed: 0,tweet.id,tweet.created_at,tweet.text,user.id,tweet.coordinates,place.id,tweet.media,tweet.lang
0,962724762760839168,2018-02-11 16:27:24,@Gabbyxclairee get it right... https://t.co/ju...,2191035103,,650ad90d3abfb53b,http://pbs.twimg.com/tweet_video_thumb/DVxJDvT...,en
1,962724763566211072,2018-02-11 16:27:24,Going to be an annoying last 35 minutes.,30269393,,00b6a1c7611dcf24,,en
2,962724763822108672,2018-02-11 16:27:24,i have a credit card now and idk what to do wi...,908661288,,67b98f17fdcf20be,,en
3,962724763910266880,2018-02-11 16:27:24,I m essentially being paid to sit here and wat...,2491889299,,e444ecd51bd16ff3,,en
4,962724764157661184,2018-02-11 16:27:24,Hmm I don t think I understand https://t.co/...,3326890636,,42e46bc3663a4b5f,,en
5,962724764635860992,2018-02-11 16:27:25,I miss my Daugther,42798593,,e4a0d228eb6be76b,,en
6,962724765919334400,2018-02-11 16:27:25,I haven t worn any makeup for like 2 or 3 week...,2816396605,,c596737f61af0189,,en
7,962724765969547269,2018-02-11 16:27:25,Now on tap: Rusty Rail Wolf King Imperial Stou...,703971530,"[-80.0439575, 40.3870542]",7cd18caa8c0ebb9f,,en
8,962724765801811971,2018-02-11 16:27:25,girl @ this rain and it s cold af,107784207,"[-96.7380196, 32.91375244]",18810aa5b43e76c7,,en
9,962724766112153601,2018-02-11 16:27:25,One Thing I ve Learned Is To Not Ask For Shit ...,995423779,,dc62519fda13b4ec,,en


In [8]:
## Insert all tweets in one statement. A failure is reported instead of being
## silently discarded, so a broken query or lost connection is visible; the
## notebook still continues with the next cell.
try:
    mycursor.execute(getTweetsInsertQuery(tweets.values))
    mydb.commit()
except Exception as exc:
    print("Tweet insert failed: {0!r}".format(exc))

In [9]:
## One row per user id, with a placeholder klout_score of 0.0 appended as the
## last column. The column order matters: getUsersInsertQuery reads each row
## by position (indices 0-8).
## .assign returns a new frame, so no column is written into a frame derived
## from a slice of mydata (the pattern behind SettingWithCopyWarning).
users = (
    mydata[['user.id','user.screen_name','user.location','user.verified','user.followers_count','user.friends_count','user.statuses_count','user.created_at']]
    .drop_duplicates(subset = ['user.id'])
    .assign(klout_score = 0.0)
)
users

Unnamed: 0,user.id,user.screen_name,user.location,user.verified,user.followers_count,user.friends_count,user.statuses_count,user.created_at,klout_score
0,2191035103,oscarmart91232,,False,202.0,766.0,6106.0,2013-11-12 20:54:06,0.0
1,30269393,TJ_Pittinger,"Tampa, FL",False,5415.0,606.0,164117.0,2009-04-10 17:47:42,0.0
2,908661288,ashcucs,"Boston, MA",False,472.0,299.0,22791.0,2012-10-27 18:27:18,0.0
3,2491889299,lainey_bugs,"Cincinnati, OH",False,803.0,550.0,41082.0,2014-05-12 19:34:11,0.0
4,3326890636,FandomFeliciano,"Dallas-Fort Worth, TX",False,1853.0,1499.0,23005.0,2015-06-15 12:17:58,0.0
5,42798593,simplyySemaj,"Philadelphia, PA",False,398.0,352.0,23572.0,2009-05-27 02:54:33,0.0
6,2816396605,sarahbeth102938,"Hawesville, KY",False,576.0,1281.0,11044.0,2014-09-18 04:46:22,0.0
7,703971530,KornerPub,"Mt. Lebanon, PA",False,604.0,239.0,1410.0,2012-07-18 23:19:11,0.0
8,107784207,sirsIaysalot,,False,2266.0,269.0,12695.0,2010-01-23 18:36:19,0.0
9,995423779,LoveKashhh,Sc: WhyHateMe4,False,488.0,468.0,3888.0,2012-12-07 16:36:07,0.0


In [10]:
## Insert all users in one statement. A failure is reported instead of being
## silently discarded, so a broken query or lost connection is visible; the
## notebook still continues with the next cell.
try:
    mycursor.execute(getUsersInsertQuery(users.values))
    mydb.commit()
except Exception as exc:
    print("User insert failed: {0!r}".format(exc))

In [11]:
## One row per place id, with the place name reduced to the allowed character
## set. The column order matters: getPlacesInsertQuery reads each row by
## position (indices 0-3).
## The explicit .copy() makes `places` an independent frame, so the column
## assignment below does not write into a frame derived from a slice of mydata
## (the pattern behind SettingWithCopyWarning).
places = mydata[['place.id','place.name','place.country','place.polygon']].drop_duplicates(subset = ['place.id']).copy()
places['place.name'] = places['place.name'].apply(normalizeTweetText)
places

Unnamed: 0,place.id,place.name,place.country,place.polygon
0,650ad90d3abfb53b,"Junction City, OR",US,"[[[-123.22293, 44.193041], [-123.22293, 44.233..."
1,00b6a1c7611dcf24,"Connerton, FL",US,"[[[-82.520132, 28.251311], [-82.520132, 28.324..."
2,67b98f17fdcf20be,"Boston, MA",US,"[[[-71.191421, 42.227797], [-71.191421, 42.399..."
3,e444ecd51bd16ff3,"Cincinnati, OH",US,"[[[-84.710722, 39.052962], [-84.710722, 39.221..."
4,42e46bc3663a4b5f,"Fort Worth, TX",US,"[[[-97.538285, 32.569477], [-97.538285, 32.990..."
5,e4a0d228eb6be76b,"Philadelphia, PA",US,"[[[-75.280284, 39.871811], [-75.280284, 40.137..."
6,c596737f61af0189,"Hawesville, KY",US,"[[[-86.774577, 37.882528], [-86.774577, 37.906..."
7,7cd18caa8c0ebb9f,"Mount Lebanon, PA",US,"[[[-80.082398, 40.350594], [-80.082398, 40.401..."
8,18810aa5b43e76c7,"Dallas, TX",US,"[[[-96.977527, 32.620678], [-96.977527, 33.019..."
9,dc62519fda13b4ec,"Tampa, FL",US,"[[[-82.620093, 27.821353], [-82.620093, 28.171..."


In [14]:
## Insert all places in one statement. A failure is reported instead of being
## silently discarded, so a broken query or lost connection is visible; the
## notebook still continues afterwards.
try:
    mycursor.execute(getPlacesInsertQuery(places.values))
    mydb.commit()
except Exception as exc:
    print("Place insert failed: {0!r}".format(exc))