In [246]:
import pandas as pd
import media_mapper as mm
import nltk
from ast import literal_eval
import json
import numpy as np
from string import punctuation
from nltk.corpus import stopwords
import re

##gather data and tweets

In [202]:
#not sure if I should keep this 
def modify_tokens(df):
    '''Clean an hourly tweet dataframe.

    Tokens in the dataframe were created by the library twokenize
    (https://github.com/myleott/ark-twokenize-py) and are stored as the string
    form of a nested list; each string is parsed and its first element kept.
    Neighborhood blocks located in the ocean are removed, and the tweets per
    hour column is formatted as a two-decimal string.

    No stopword filtering happens here (the stopword list that used to be
    built in this function was never applied to anything); top_tokens does
    the filtering.

    INPUT: a dataframe with 'tokens', 'geoid10' and 'tph' columns.
    OUTPUT: a new, modified dataframe. The dataframe passed in is left untouched.
    '''
    #geoids located in the ocean
    odd_ids = ['060750601001016', '060750179021003','060759901000003',\
               '060759901000002', '060750179021000','060750601001000',\
               '060759804011003', '060750201001001']

    #copy the filtered rows so the assignments below never write into a
    #view of the caller's dataframe (avoids SettingWithCopyWarning)
    cleaned = df[~df['geoid10'].isin(odd_ids)].copy()

    #parse the stringified token lists and keep the first element
    cleaned['tokens'] = cleaned['tokens'].apply(lambda x: literal_eval(x)[0])

    #format tweets per hour
    cleaned['tph'] = cleaned['tph'].apply(lambda x: format(x, '.2f'))

    return cleaned

In [44]:
def get_tweets_per_hour(df):
    '''
    INPUT: a dataframe with one row per tweet and 'geoid10', 'date', 'hour'
        and 'text' columns.
    OUTPUT: a new dataframe with one row per (geoid10, date, hour) holding
        'tph', the number of tweets in that hour, and 'tokens', the text of
        those tweets joined with commas. Any other numeric column (except
        'id') is summed per group. The input dataframe is not modified.'''
    keys = ['geoid10', 'date', 'hour']

    #set a count of tweets to determine tweet rate (on a copy, so the
    #caller's dataframe does not gain a 'tweetcnt' column)
    counted = df.assign(tweetcnt=1)

    #only numeric, non-key columns are summed; 'id' is an identifier and
    #its sum is meaningless, so it is left out instead of summed-then-dropped
    numeric_cols = [col for col in counted.select_dtypes(include=[np.number]).columns
                    if col not in keys and col != 'id']

    grouped = counted.groupby(keys)
    #get a total count of tweets
    dfh = grouped[numeric_cols].sum().reset_index()
    #append the tokenized tweet data together
    d_txt = grouped['text'].apply(lambda x: ','.join(x)).reset_index()
    #both frames come from the same groupby, so rows line up by position
    dfh['tokens'] = d_txt['text']
    dfh['tph'] = dfh['tweetcnt']
    dfh = dfh.drop('tweetcnt', axis=1)
    return dfh

###break data into hours and regions. add tweet corpus to the data. 

In [66]:
def tweets_by_hour(df):
    '''
    INPUT: a dataframe with one row per (geoid10, date, hour), as produced by
        get_tweets_per_hour, with 'geoid10', 'hour' and 'tokens' columns.
    OUTPUT: a new dataframe with one row per (geoid10, hour): numeric columns
        are averaged across dates, 'tokens' strings are joined with commas,
        and 'hr_bin' labels the hour as latenight (0-4), dawn (5-9),
        morning (10-13), afternoon (14-18) or evening (19-23).'''
    keys = ['geoid10', 'hour']
    grouped = df.groupby(keys)

    #average only the numeric, non-key columns (text/date columns have no mean)
    numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns
                    if col not in keys]
    hdf = grouped[numeric_cols].mean().reset_index()

    #get a grouped sum of the words
    hour_df_txt = grouped['tokens'].apply(lambda x: ','.join(x)).reset_index()
    #both frames come from the same groupby, so rows line up by position
    hdf['tokens'] = hour_df_txt['tokens']

    #explicit edges reproduce what bins=5 yields when hours span 0-23, but do
    #not shift when some hours are missing from the data
    hdf['hr_bin'] = pd.cut(hdf['hour'], bins=[-1, 4, 9, 13, 18, 23],
                           labels=['latenight', 'dawn', 'morning', 'afternoon', 'evening'])
    return hdf

In [205]:
# Load the raw tweets, tag each with its hour, then collapse to hourly
# counts per block/date and finally to per-block hourly averages.
tweets_raw = mm.pipeline.retrieve_and_merge_tweet_data()
tweets_timed = mm.pipeline.transform_timestamp(tweets_raw, hour = True)
df = get_tweets_per_hour(tweets_timed)
df_hour = tweets_by_hour(df)


In [208]:
# Attach the block geometry to the hourly frame.
# NOTE(review): as laid out here, retrieve_geometry_information is defined in
# the cell *below* this one (In [206]); on a fresh top-to-bottom run this
# line would raise a NameError. The function also strips the first character
# of geoid10, so re-running this cell on its own output strips it again.
df_hour = retrieve_geometry_information(df_hour)

###merge sf tweets and counts with shape geometry

In [206]:
def retrieve_geometry_information(df, shapes_path='../../../data/intermediate_data/sf_only_sql_shapes.csv',
                                  required_col='hour'):
    '''Obtains the geometry data for each geoid10.

    INPUT: df - dataframe with a string 'geoid10' column and a `required_col`
               column ('hour' by default).
           shapes_path - csv of block shapes with 'geoid10' and geometry columns.
           required_col - rows whose value in this column is missing after
               the merge (blocks without tweets) are dropped.
    OUTPUT: a new dataframe with the shape columns merged in. The dataframe
        passed in is not modified.

    NOTE(review): this notebook defines a function with this name twice; the
    definition executed last wins.
    '''
    ###Retrieve the Shape Files for Each Block:
    geodf = pd.read_csv(shapes_path)
    #format the dataframe
    geodf['geoid10'] = geodf['geoid10'].astype('str')
    if 'Unnamed: 0' in geodf.columns:
        geodf = geodf.drop('Unnamed: 0', axis = 1)

    #need to alter the geoid10 column to merge with shape files: the shape
    #file ids have no leading zero. Only a leading '0' is removed, so calling
    #this again on an already-merged frame does not eat another digit.
    tweets = df.copy()
    tweets['geoid10'] = tweets['geoid10'].apply(
        lambda geoid: geoid[1:] if geoid.startswith('0') else geoid)

    #create a new dataframe
    hourlydf = pd.merge(geodf, tweets, on='geoid10', how='outer')
    #drop the blocks that have no tweets
    hourlydf = hourlydf.dropna(subset = [required_col])

    return hourlydf

In [173]:
# Stopword list used as the default for top_tokens: NLTK's English stopwords
# plus twitter/url fragments and every punctuation character.
# NOTE(review): this rebinds the name `stopwords`, which the import cell bound
# to the nltk.corpus.stopwords module; code that still expects the module
# (e.g. stopwords.words(...)) will fail once this cell has run.
stopwords = nltk.corpus.stopwords.words('english')
# '?' and 'via' are separate entries: without the comma between them Python
# concatenates the two literals into the single string '?via'.
stopwords.extend(['...',',,',',,,','..', 't','y','(@',')', 'c','i','I','a', ',',\
                '@','.', 'co', 'com','amp','?', 'via','http','htt','https', '()',']'])
stopwords.extend([str(char) for char in punctuation])
# unicode copy of the list (Python 2 only); not referenced elsewhere in this notebook
sstopwords=[unicode(word) for word in stopwords]

def top_tokens(corpus_list, stopwords= stopwords, number=10):
    '''Return the most frequent tokens of a corpus as one space separated string.

    INPUT: corpus_list - string form of a token list, or of several token
               lists joined by commas (which parses to a tuple of lists).
           stopwords - iterable of tokens to ignore (compared case-insensitively).
           number - how many tokens to return, ten by default.
    OUTPUT: a string with the `number` most common lowercased tokens.
    '''
    #The corpus is stored as a string; turn it back into python objects
    tokens = literal_eval(corpus_list)
    #If there are multiple tweets, flatten the list
    if isinstance(tokens, tuple):
        tokens = [item for sublist in tokens for item in sublist]

    stop = set(word.lower() for word in stopwords)
    #strip urls, then lowercase *before* the stopword check so that
    #capitalised stopwords ('The', 'And', ...) are filtered as well
    cleaned = (re.sub(r'http.*$', '', item).lower() for item in tokens)
    #tokens that were nothing but a url are now empty strings; skip them
    allWordExceptStopDist = nltk.FreqDist(w for w in cleaned if w and w not in stop)
    mostCommon = allWordExceptStopDist.most_common(number)
    return ' '.join(word for word, count in mostCommon)

In [110]:
def add_properties_geo(row):
    '''Builds a GeoJSON Feature dict from one dataframe row.
    The row must provide 'geoid10', 'tph', 'top_ten' and a 'geometry'
    JSON string.'''
    properties = {
        'geoid': row['geoid10'],
        'tweetrate': row['tph'],
        'top_ten': row['top_ten'],
    }
    shape = json.loads(row['geometry'])
    return {"type": "Feature", "geometry": shape, "properties": properties}

def dataframe_to_geojson(df, outfilename):
    '''Takes in a dataframe with a 'tph' rate, 'geoid10', 'top_ten' and
    'geometry' columns and dumps it into a json file as a list of GeoJSON
    Feature objects (one per row).

    The dataframe passed in is not modified: the string casts are applied to
    a copy, so passing a filtered slice no longer triggers
    SettingWithCopyWarning or changes the caller's dtypes.'''
    export_df = df.assign(geoid10=df['geoid10'].astype('str'),
                          tph=df['tph'].astype('str'))
    list_to_export = [add_properties_geo(row) for idx, row in export_df.iterrows()]
    with open(outfilename, 'w') as outfile:
        json.dump(list_to_export, outfile)

In [209]:
#compute the most frequent tokens for every row of the hourly frame
df_hour['top_ten'] = df_hour.tokens.apply(top_tokens)

#geoids located in the ocean
odd_ids = ['060750601001016', '060750179021003','060759901000003',\
               '060759901000002', '060750179021000','060750601001000',\
               '060759804011003', '060750201001001']

#retrieve_geometry_information has already stripped the first character of
#geoid10 by the time this cell runs, so the ids above (with their leading '0')
#would match nothing; compare against both spellings.
ocean_ids = odd_ids + [geoid[1:] for geoid in odd_ids]

#NOTE(review): df_test is not used by the export loop further down, which
#writes df_hour instead -- confirm which frame was meant to be exported.
df_test = df_hour[~df_hour['geoid10'].isin(ocean_ids)]


In [210]:
# List the distinct hour-bin labels present in the hourly frame.
df_hour.hr_bin.unique()

array(['latenight', 'dawn', 'afternoon', 'evening', 'morning'], dtype=object)

In [211]:
# Write one GeoJSON file per hour bin.
for time in df_hour.hr_bin.unique():
    print(time)
    # Filter into a separate variable: reassigning df_hour here would leave
    # only the first bin's rows, so every later bin would export an empty list.
    hour_bin_df = df_hour[df_hour['hr_bin']== time]
    output = '../../../../app/data/test_times/' + time + '.json'
    print(output)
    dataframe_to_geojson(hour_bin_df, output)

latenight
../../../../app/data/test_times/latenight.json
dawn
../../../../app/data/test_times/dawn.json
afternoon
../../../../app/data/test_times/afternoon.json
evening
../../../../app/data/test_times/evening.json
morning
../../../../app/data/test_times/morning.json


#Map By Weekend/WeekDay

###break the data into weekday/weekend

In [39]:
# Hand-picked geoid10 values, grouped by place name.
# goldengate (unfinished): '060750601001000', '060750601001007'*, '060750601001016'
# NOTE(review): most ids in this notebook are 15 characters long, but
# '06075060100102' and '06075026001009' below are 14 -- possibly truncated.
crissyfield = [
    '06075060100102',
    '060750601001052',
    '060750601001009',
]

prosidiooffthegrid = ['060750601001102']

dolorespark = [
    '060750206001004',
    '06075026001009',
]

# NOTE(review): the same id is listed twice here.
pier = [
    '060750101001001',
    '060750101001001',
]

remove = [
    '060750607001000',
    '060750179021003',
    '060750179021000',
    '060759804011003',
    '060750102001002',
    '060750102001000',
    '060759804011000',
]

In [144]:
def get_tweets_per_day(df):
    '''
    INPUT: a dataframe with one row per tweet and 'geoid10', 'DOW', 'date'
        and 'text' columns.
    OUTPUT: a new dataframe with one row per (geoid10, DOW, date) holding
        'tpd', the number of tweets on that day, and 'tokens', the text of
        those tweets joined with commas. Any other numeric column (except
        'id') is summed per group. The input dataframe is not modified.'''
    keys = ['geoid10', 'DOW', 'date']

    #set a count of tweets to determine tweet rate (on a copy, so the
    #caller's dataframe does not gain a 'tweetcnt' column)
    counted = df.assign(tweetcnt=1)

    #only numeric, non-key columns are summed; 'id' is an identifier and
    #its sum is meaningless, so it is left out instead of summed-then-dropped
    numeric_cols = [col for col in counted.select_dtypes(include=[np.number]).columns
                    if col not in keys and col != 'id']

    grouped = counted.groupby(keys)
    #get a total count of tweets
    dfh = grouped[numeric_cols].sum().reset_index()
    #append the tokenized tweet data together
    d_txt = grouped['text'].apply(lambda x: ','.join(x)).reset_index()
    #both frames come from the same groupby, so rows line up by position
    dfh['tokens'] = d_txt['text']
    #tweetcnt is now the tweets per day (tpd)
    dfh['tpd'] = dfh['tweetcnt']
    dfh = dfh.drop('tweetcnt', axis=1)
    return dfh

In [247]:
# Reload the raw tweets, tag each with its day of week, then collapse to
# one row per block / day-of-week / date with a tweets-per-day count.
df = mm.pipeline.retrieve_and_merge_tweet_data()
wkd_df = get_tweets_per_day(mm.pipeline.transform_timestamp(df, DOW = True))


In [248]:
# Blocks that lie in the ocean; drop their rows before aggregating.
odd_ids = [
    '060750601001016', '060750179021003', '060759901000003',
    '060759901000002', '060750179021000', '060750601001000',
    '060759804011003', '060750201001001',
]
in_ocean = wkd_df['geoid10'].isin(odd_ids)
wkd_df = wkd_df[~in_ocean]

In [249]:

# Average the daily counts per block and day of week, and concatenate the
# token strings of the same groups.
by_block_dow = wkd_df.groupby(['geoid10', 'DOW'])
wknd_df = by_block_dow.agg(np.mean).reset_index()
wkd_df_txt = by_block_dow['tokens'].apply(lambda toks: ','.join(toks)).reset_index()
# same grouping on both sides, so rows line up by position
wknd_df['tokens'] = wkd_df_txt['tokens']

In [250]:
# Preview the per-block, per-day-of-week averages.
wknd_df.head()

Unnamed: 0,geoid10,DOW,tpd,tokens
0,60750101001000,0,4.0,"[u'#Repost', u'@robtalcual', u'with', u'repost..."
1,60750101001000,1,2.5,"[u'Dinner', u'in', u'San', u'Fran', u'with', u..."
2,60750101001000,2,2.5,"[u'@scomassf', u'http://t.co/WnccwChDbQ', u'We..."
3,60750101001000,3,4.0,"[u'*', u'crab', u'emoji', u'*', u'@', u'Pier',..."
4,60750101001000,4,7.0,"[u'The', u'#rough', u'#wild', u'#choppy', u'#p..."


###get weekend, weekdays 

In [251]:
# Keep the rows whose day-of-week code is above 5 and average them per block.
# NOTE(review): the preview of wknd_df shows DOW starting at 0; if the codes
# run 0-6 this selects a single day (6) as "weekend" -- confirm the DOW
# convention used by transform_timestamp.
weeknd_df = wknd_df[wknd_df['DOW'] > 5].drop('DOW', axis=1)
weekend_groups = weeknd_df.groupby('geoid10')
df_weeknd = weekend_groups.agg(np.mean).reset_index()
wknd_txt = weekend_groups['tokens'].apply(lambda toks: ','.join(toks)).reset_index()
# same grouping on both sides, so rows line up by position
df_weeknd['tokens'] = wknd_txt['tokens']

In [252]:
# Preview the weekend averages per block.
df_weeknd.head()

Unnamed: 0,geoid10,tpd,tokens
0,60750101001000,6.333333,"[u'Acabo', u'de', u'publicar', u'una', u'foto'..."
1,60750101001001,53.666667,"[u'a', u""gentleman's"", u'quarters', u'at', u'#..."
2,60750101001004,26.0,"[u'BucketHats', u',', u'BreadBowls', u',', u'a..."
3,60750101001005,1.0,"[u'Just', u'posted', u'a', u'photo', u'@', u""P..."
4,60750101001006,2.0,"[u'Idk', u'how', u'many', u'shabby', u'ass', u..."


In [253]:
# Keep the rows whose day-of-week code is below 6 and average them per block.
# NOTE(review): with DOW codes running 0-6 this treats six days as
# "weekday" -- confirm the DOW convention used by transform_timestamp.
dfweekday = wknd_df[wknd_df['DOW'] < 6].drop('DOW', axis=1)

weekday_groups = dfweekday.groupby('geoid10')
dfwkday = weekday_groups.agg(np.mean).reset_index()
wkdy_txt = weekday_groups['tokens'].apply(lambda toks: ','.join(toks)).reset_index()
# same grouping on both sides, so rows line up by position
dfwkday['tokens'] = wkdy_txt['tokens']

In [254]:
# Preview the weekday averages per block.
dfwkday.head()

Unnamed: 0,geoid10,tpd,tokens
0,60750101001000,4.666667,"[u'#Repost', u'@robtalcual', u'with', u'repost..."
1,60750101001001,40.194444,"[u""It's"", u'been', u'a', u'good', u'Sunday', u..."
2,60750101001002,1.0,"[u'Remember', u'why', u'you', u'started', u'ht..."
3,60750101001004,13.416667,"[u'My', u'last', u'#SFO', u'post', u'\U0001f62..."
4,60750101001005,1.6,"[u'I', u'want', u'one', u'for', u'my', u'birth..."


In [255]:
def seperate_weekends(df, weekend, weekend_days=(5, 6)):
    '''Takes a dataframe with a 'DOW' column marked with the day of week.
    If weekend is True, returns a dataframe with just the weekend values.
    If False, returns a dataframe with just the weekday values.
    Performs groupby to get mean tweet per day based on this grouping.

    INPUT: df - dataframe with 'geoid10', 'DOW', 'tokens' and numeric columns.
           weekend - True for the weekend rows, False for every other day.
           weekend_days - DOW codes that count as weekend. The default (5, 6)
               assumes Monday=0 ... Sunday=6, as the DOW values shown in this
               notebook start at 0 -- TODO confirm against transform_timestamp.
    OUTPUT: a new dataframe with one row per geoid10: numeric columns are
        averaged and the 'tokens' strings are joined with commas.
    '''
    is_weekend = df['DOW'].isin(list(weekend_days))
    #weekdays are simply "everything that is not weekend", so no day is lost
    selected = df[is_weekend] if weekend else df[~is_weekend]
    selected = selected.drop('DOW', axis=1)

    #average only the numeric columns (the token strings have no mean)
    numeric_cols = [col for col in selected.select_dtypes(include=[np.number]).columns
                    if col != 'geoid10']
    grouped = selected.groupby('geoid10')
    dfweek = grouped[numeric_cols].mean().reset_index()
    dfweek_txt = grouped['tokens'].apply(lambda x: ','.join(x)).reset_index()
    #both frames come from the same groupby, so rows line up by position
    dfweek['tokens'] = dfweek_txt['tokens']
    return dfweek

###tokenize the text

In [256]:
# Add a 'top_ten' column (most frequent tokens) to the weekday frame first,
# then to the weekend frame.
for frame in (dfwkday, df_weeknd):
    frame['top_ten'] = frame['tokens'].apply(top_tokens)



In [261]:

def retrieve_geometry_information(df, shapes_path='../../../data/intermediate_data/sf_only_sql_shapes.csv',
                                  required_col='tpd'):
    '''Obtains the geometry data for each geoid10.

    INPUT: df - dataframe with a string 'geoid10' column and a `required_col`
               column ('tpd' by default).
           shapes_path - csv of block shapes with 'geoid10' and geometry columns.
           required_col - rows whose value in this column is missing after
               the merge (blocks without tweets) are dropped.
    OUTPUT: a new dataframe with the shape columns merged in. The dataframe
        passed in is not modified.

    NOTE(review): this notebook defines a function with this name twice; the
    definition executed last wins.
    '''
    ###Retrieve the Shape Files for Each Block:
    geodf = pd.read_csv(shapes_path)
    #format the dataframe
    geodf['geoid10'] = geodf['geoid10'].astype('str')
    if 'Unnamed: 0' in geodf.columns:
        geodf = geodf.drop('Unnamed: 0', axis = 1)

    #need to alter the geoid10 column to merge with shape files: the shape
    #file ids have no leading zero. Only a leading '0' is removed, so calling
    #this again on an already-merged frame does not eat another digit.
    tweets = df.copy()
    tweets['geoid10'] = tweets['geoid10'].apply(
        lambda geoid: geoid[1:] if geoid.startswith('0') else geoid)

    #create a new dataframe
    weekdf = pd.merge(geodf, tweets, on='geoid10', how='outer')
    #drop the blocks that have no tweets
    weekdf = weekdf.dropna(subset = [required_col])
    return weekdf


In [245]:
#dfwkday['geoid10'] = df_weeknd.geoid10.apply(lambda x: str(6) + x )

###get shape geometry

In [262]:
# Merge block geometry into the weekend frame, then into the weekday frame.
df_end, df_day = (retrieve_geometry_information(df_weeknd),
                  retrieve_geometry_information(dfwkday))

In [270]:
# Preview the weekday frame after the geometry merge.
df_day.head()

Unnamed: 0,geometry,geoid10,tpd,tokens,top_ten
5,"{""type"":""MultiPolygon"",""coordinates"":[[[[-122....",60750179021023,1.333333,"[u'Not', u'a', u'bad', u'view', u'on', u'my', ...",bridge bay san wherever #sanfran shopping tri...
6,"{""type"":""MultiPolygon"",""coordinates"":[[[[-122....",60750179021008,2.0,"[u'Two', u'lanes', u'blocked', u'in', u'#BayBr...",buena yerba back mins lanes island two eb dela...
14,"{""type"":""MultiPolygon"",""coordinates"":[[[[-122....",60750179021057,1.0,"[u'raw', u'n', u'true']",raw true n
19,"{""type"":""MultiPolygon"",""coordinates"":[[[[-122....",60750179021054,1.0,"[u'@chrisjrn', u'wait', u'what', u'were', u'yo...",@chrisjrn thinking wait
26,"{""type"":""MultiPolygon"",""coordinates"":[[[[-122....",60750179021060,1.0,"[u'The', u'cool', u',', u'grey', u'city', u'of...",bridge bay city … treasure love bottling isla...


###now make them a geojson!

In [272]:
# Write the weekend and weekday frames out as GeoJSON feature lists.
# NOTE(review): this cell (In [272]) relies on the 'tpd' version of
# dataframe_to_geojson, which is defined in the cell below (In [271]); on a
# top-to-bottom run the earlier 'tph' version would still be in effect here.
for frame, target in ((df_end, '../../../../app/data/test_times/weekend.json'),
                      (df_day, '../../../../app/data/test_times/weekday.json')):
    dataframe_to_geojson(frame, target)

In [271]:
def add_properties_geo(row):
    '''Turns one dataframe row into a GeoJSON Feature dict.
    Reads 'geoid10', 'tpd', 'top_ten' and the 'geometry' JSON string.'''
    feature = {"type": "Feature"}
    feature["geometry"] = json.loads(row['geometry'])
    feature["properties"] = dict(geoid=row['geoid10'],
                                 tweetrate=row['tpd'],
                                 top_ten=row['top_ten'])
    return feature

def dataframe_to_geojson(df, outfilename):
    '''Takes in a dataframe with a 'tpd' rate, 'geoid10', 'top_ten' and
    'geometry' columns and dumps it into a json file as a list of GeoJSON
    Feature objects (one per row) for mapping.

    The dataframe passed in is not modified: the string casts are applied to
    a copy, so the caller's dtypes stay as they were.'''
    export_df = df.assign(geoid10=df['geoid10'].astype('str'),
                          tpd=df['tpd'].astype('str'))
    list_to_export = [add_properties_geo(row) for idx, row in export_df.iterrows()]
    with open(outfilename, 'w') as outfile:
        json.dump(list_to_export, outfile)