In [25]:
import pandas as pd
import media_mapper as mm
import nltk
from ast import literal_eval
import json
import numpy as np

##gather data and tweets

In [5]:
def retrieve_and_merge_tweet_data():
    '''Retrieves twitter geo data from SQL, and tweet text. 
    Returns the merged dataframe.'''
    #get SF Data From SQL
    df = mm.pipeline.retrieve_sql_tweets('tweets_with_geoV6')
    #get text data from picke
    dftxt = pd.read_csv('../../data/intermediate_data/json_tweets_in_df_twitokend.csv')
    df = df.set_index('id')
    dftxt = dftxt.set_index('id')
    dfall = df.join(dftxt).reset_index()
    dfall.drop('Unnamed: 0', 1, inplace = True)
    return dfall

In [6]:
df = retrieve_and_merge_tweet_data()

###break data into hours and regions. add tweet corpus to the data. 

In [282]:
hour_df = mm.pipeline.transform_timestamp(df, hour = True)
hour_df['tweetcnt'] = 1
#get a total count of tweets
hdf = hour_df.groupby(['geoid10', 'hour']).agg(sum).reset_index().drop('id', 1)
#get a grouped sum of the words
hour_df_txt = hour_df.groupby(['geoid10', 'hour'])['text'].apply(lambda x: ','.join(x)).reset_index()
#merge these two dataframes together
hdf['tokens'] = hour_df_txt['text']

In [49]:
hdf.head()

Unnamed: 0,geoid10,hour,tweetcnt,tokens
0,60750101001000,0,4,"[u'#ship', u'#photo', u'#edit', u'#sail', u'@'..."
1,60750101001000,1,4,"[u'My', u'lovey', u'@', u'Danville', u',', u'C..."
2,60750101001000,2,3,"[u'What', u'up', u',', u'San', u'Francisco', u..."
3,60750101001000,3,8,"[u'The', u'#rough', u'#wild', u'#choppy', u'#p..."
4,60750101001000,4,2,"[u""I'm"", u'at', u""Scoma's"", u'Restaurant', u'-..."


###get top ten tokens as test

In [129]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([',', '-', ',', '?', '@', '&', ')', '(', ';','.',':','!','..','...'] )

def top_tokens(corpus_list, stopwords= stopwords, number=10):
    '''Takes a list of tokens. Returns the top ten, unless a different number given.'''
    #Checks to make sure the tokens are in a list, and not a string
    tokens = literal_eval(corpus_list)
    #If there are multiple tweets, flatten the list
    if type(tokens) ==tuple:
        tokens =[item for sublist in tokens for item in sublist]  
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in tokens if w not in stopwords) 
    mostCommon= allWordExceptStopDist.most_common(number)
    top_ten_string = ' '.join([tup[0] for tup in mostCommon])
    return top_ten_string

In [291]:
#run this on every column of the text 
top_ten = hdf.tokens.apply(top_tokens)
#make a new column of the top tweets
hdf['top_ten'] = top_ten


###merge sf tweets and counts with shape geometry

In [285]:
###Retrieve the Shape Files for Each Block:
geodf = pd.read_csv('../../data/intermediate_data/sf_only_sql_shapes.csv')
#format the dataframe
geodf['geoid10'] = geodf.geoid10.astype('str')
geodf.drop('Unnamed: 0', axis = 1, inplace = True)
#set the index as the geoid


In [286]:
# need to alter the geoid10 column to merge with shape files
hdf['geoid10'] =hdf['geoid10'].apply(lambda x: x[1:])
#create a new dataframe 
hourlydf = pd.merge(geodf, hdf, on='geoid10', how='outer')
#fill no tweets with a zero value
hourlydf.tweetcnt.fillna(0, inplace = True)
#drop empty hour columns
hourlydf.dropna(subset = ['hour'], inplace = True)

##make a different map for each hour! 

In [128]:
def add_properties_geo(row):
    geoid = row['geoid10']
    tweetrate = row['tweetcnt']
    top_ten = row['top_ten']
    geo_json = {"type": "Feature", "geometry": json.loads(row['geometry']),  "properties": {'geoid': geoid ,'tweetrate': tweetrate, 'top_ten': top_ten }}
    return geo_json

def dataframe_to_geojson(df, outfilename):
    '''Takes in a dataframe with a count, geoid10, and list of tokens. Dumps it into a json geojason file'''
    df['geoid10'] = df['geoid10'].astype('str')
    df["tweetcnt"] = df['tweetcnt'].astype('str')
    list_to_export = []
    for idx, row in df.iterrows():
        list_to_export.append(add_properties_geo(row))
    with open(outfilename, 'w') as outfile:
        json.dump(list_to_export, outfile)

###divide time into five segments for visualization

In [297]:
hourlydf['hr_bin'] = pd.cut(hourlydf.hour, bins = 5, labels = ['latenight', 'dawn','morning','afternoon','evening'])

In [304]:
for time in hourlydf.hr_bin.unique():
    time_df = hourlydf[hourlydf['hr_bin']== time]
    output = 'data/' + time + '.json'
    dataframe_to_geojson(time_df, output)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [289]:
dataframe_to_geojson(test_hour, 'data/testhour.json')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#Map By Weekend/WeekDay

In [139]:
df = retrieve_and_merge_tweet_data()

###break the data into weekday/weekend

In [39]:
#goldengate=['060750601001000', '060750601001007'*, '060750601001016'
crissyfield=['06075060100102', '060750601001052', '060750601001009']


prosidiooffthegrid =['060750601001102']

dolorespark= ['060750206001004', '06075026001009']

pier= ['060750101001001', '060750101001001']

remove= ['060750607001000', '060750179021003', '060750179021000', '060759804011003', '060750102001002', '060750102001000', '060759804011000']

In [140]:
hour_df = mm.pipeline.transform_timestamp(df, DOW = True)


In [141]:
hour_df['tweetcnt'] = 1
#get a total count of tweets
hdf = hour_df.groupby(['geoid10', 'DOW']).agg(sum).reset_index().drop('id', 1)
#get a grouped sum of the words
hour_df_txt = hour_df.groupby(['geoid10', 'DOW'])['text'].apply(lambda x: ','.join(x)).reset_index()
#merge these two dataframes together
hdf['tokens'] = hour_df_txt['text']

###get weekend, weekdays 

In [142]:
dfweekend = hdf[hdf['DOW'] > 5].drop('DOW', 1)
dfwnd = dfweekend.groupby('geoid10').agg(np.sum).reset_index()
wknd_txt = dfweekend.groupby(['geoid10'])['tokens'].apply(lambda x: ','.join(x)).reset_index()
#merge these two dataframes together
dfwnd['tokens'] = wknd_txt['tokens']

In [143]:
dfwnd.head()

Unnamed: 0,geoid10,tweetcnt,tokens
0,60750101001000,19,"[u'Acabo', u'de', u'publicar', u'una', u'foto'..."
1,60750101001001,161,"[u'a', u""gentleman's"", u'quarters', u'at', u'#..."
2,60750101001004,78,"[u'BucketHats', u',', u'BreadBowls', u',', u'a..."
3,60750101001005,2,"[u'Just', u'posted', u'a', u'photo', u'@', u""P..."
4,60750101001006,6,"[u'Idk', u'how', u'many', u'shabby', u'ass', u..."


In [144]:
dfweekday = hdf[hdf['DOW'] < 6].drop('DOW', 1)

dfwkday = dfweekday.groupby('geoid10').agg(np.sum).reset_index()
wkdy_txt = dfweekday.groupby('geoid10')['tokens'].apply(lambda x: ','.join(x)).reset_index()
#merge these two dataframes together
dfwkday['tokens'] = wkdy_txt['tokens']

In [145]:
dfwkday.head()

Unnamed: 0,geoid10,tweetcnt,tokens
0,60750101001000,52,"[u'#Repost', u'@robtalcual', u'with', u'repost..."
1,60750101001001,557,"[u""It's"", u'been', u'a', u'good', u'Sunday', u..."
2,60750101001002,1,"[u'Remember', u'why', u'you', u'started', u'ht..."
3,60750101001004,183,"[u'My', u'last', u'#SFO', u'post', u'\U0001f62..."
4,60750101001005,9,"[u'I', u'want', u'one', u'for', u'my', u'birth..."


In [146]:
#remove the columns I want to remove 
dfweekend = dfweekend[~dfweekend['geoid10'].isin(remove)]
dfwkday = dfwkday[~dfwkday['geoid10'].isin(remove)]

###tokenize the text

In [147]:
#run this on every column of the text 
top_ten_wknd = dfweekend.tokens.apply(top_tokens)
#make a new column of the top tweets
dfweekend['top_ten'] = top_ten_wknd

top_ten_wkdy = dfwkday.tokens.apply(top_tokens)
#make a new column of the top tweets
dfwkday['top_ten'] = top_ten_wkdy

###get shape geometry

In [148]:
###Retrieve the Shape Files for Each Block:
geodf = pd.read_csv('../../data/intermediate_data/sf_only_sql_shapes.csv')
#format the dataframe
geodf['geoid10'] = geodf.geoid10.astype('str')
geodf.drop('Unnamed: 0', axis = 1, inplace = True)
#set the index as the geoid

In [149]:
dfwkday['geoid10'] =dfwkday['geoid10'].apply(lambda x: x[1:])
#create a new dataframe 
dfwkday = pd.merge(geodf, dfwkday, on='geoid10', how='outer')
#fill no tweets with a zero value
dfwkday.tweetcnt.fillna(0, inplace = True)
#drop empty hour columns


In [151]:
dfweekend['geoid10'] =dfweekend['geoid10'].apply(lambda x: x[1:])
#create a new dataframe 
dfweekend = pd.merge(geodf, dfweekend, on='geoid10', how='outer')
#fill no tweets with a zero value
dfweekend.tweetcnt.fillna(0, inplace = True)

###now make them a geojason!

In [152]:
dataframe_to_geojson(dfweekend, 'data/weekend.json')
dataframe_to_geojson(dfwkday, 'data/weekday.json')