# Map Tweet to Census Region

This notebook maps tweets to their corresponding census regions.

*Author: Koki Sasagawa*  
*Date: 4/14/2019*

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from shapely.geometry import Polygon, Point
from datetime import datetime
from pytz import timezone

sys.path.insert(0, '../utils/')
from twitter_processing import tweet_coordinate_mapper
from api import reverse_geocode
from decorators import timer

%matplotlib inline

## 1. Load Tweet data

1. Tweet data located in SF, or mentioning SF
2. Filtered tweets from each month whose coordinates map to the greater San Francisco area

In [2]:
# Folder that holds the raw tweet CSV files
input_dir = '../../raw_data/tweets/'

# Sort the directory listing so the printed positions are stable between runs
files = os.listdir(input_dir)
files.sort()
for i, fname in zip(range(len(files)), files):
    print(i, fname)

0 tweet_2016_02_01.csv
1 tweet_2016_02_02.csv
2 tweet_2016_02_03.csv
3 tweet_2016_02_04.csv
4 tweet_2016_02_05.csv
5 tweet_2016_02_06.csv
6 tweet_2016_02_07.csv
7 tweet_2016_02_08.csv
8 tweet_2016_02_09.csv
9 tweet_2016_02_10.csv
10 tweet_2016_02_11.csv
11 tweet_2016_02_12.csv
12 tweet_2016_02_13.csv
13 tweet_2016_02_14.csv
14 tweet_2016_02_15.csv
15 tweet_2016_02_16.csv
16 tweet_2016_02_17.csv
17 tweet_2016_02_18.csv
18 tweet_2016_02_19.csv
19 tweet_2016_02_20.csv
20 tweet_2016_02_21.csv
21 tweet_2016_02_22.csv
22 tweet_2016_02_23.csv
23 tweet_2016_02_24.csv
24 tweet_2016_02_25.csv
25 tweet_2016_02_26.csv
26 tweet_2016_02_27.csv
27 tweet_2016_02_28.csv
28 tweet_2016_02_29.csv


In [3]:
# Load the first file in the listing as a sample and parse `time` into datetimes.
# The `infer_datetime_format` flag is not passed: it is deprecated since
# pandas 2.0 (format inference is the default there) and only triggers a warning.
tweet1 = pd.read_csv(os.path.join(input_dir, files[0]),
                     parse_dates=['time'])

print(files[0])
print("Dimensions: {}".format(tweet1.shape))
tweet1.head()

tweet_2016_02_01.csv
Dimensions: (3944, 13)


Unnamed: 0,lon,lat,time,time_stamp,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,user_statuses_count
0,-119.306608,37.269176,2016-02-01 05:00:15+00:00,1454302815740,"A guy at the airport:\n""Idk why he cut me off ...",0,0,0,0,0,160,189,1119
1,0.3143,42.308346,2016-02-01 05:01:01+00:00,1454302861683,"@JoptanElMagno @apranorte y lo mejor,no caer e...",0,0,2,0,0,302,727,14007
2,-122.435978,37.770657,2016-02-01 05:01:14+00:00,1454302874659,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,1362
3,-122.435978,37.770657,2016-02-01 05:01:39+00:00,1454302899722,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,1
4,-122.466364,37.780964,2016-02-01 05:01:50+00:00,1454302910678,attended @starparish Speaker Series: a gr8 pre...,0,1,1,0,0,751,748,16076


## 2. Match the timezone

The datetimes used in the tweets are in UTC.  
Convert these datetime objects to the San Francisco timezone (Pacific Time, `US/Pacific`).

In [4]:
# Look at the first parsed timestamp to see which timezone it carries
tweet1['time'].iat[0]

Timestamp('2016-02-01 05:00:15+0000', tz='UTC')

In [5]:
def time_zone_converter(date, zone):
    """Convert a datetime to the specified timezone and format it as text

    :param date: datetime to convert (the tweets carry timezone-aware UTC values)
    :type date: pandas._libs.tslibs.timestamps.Timestamp
    :param str zone: name of the desired timezone, e.g. 'US/Pacific'
    :return: the converted time formatted as 'YYYY-MM-DD HH:MM:SS'
    :rtype: str
    """

    # Shift the instant into the requested zone (zone name resolved by pytz)
    localized = date.astimezone(timezone(zone))
    # The output format carries no UTC offset or zone name
    return localized.strftime('%Y-%m-%d %H:%M:%S')

In [6]:
# Convert every UTC timestamp to San Francisco local time.
# Plain column assignment replaces the column outright; `.loc[:, 'time'] = ...`
# makes recent pandas try to write the new strings into the existing
# timezone-aware datetime column in place instead of swapping the column.
# Mapping over the single column also avoids building a Series for every row.
# NOTE: `time` holds strings afterwards, so reload `tweet1` before re-running.
tweet1['time'] = tweet1['time'].map(
    lambda ts: time_zone_converter(date=ts, zone='US/Pacific'))
tweet1.head()

Unnamed: 0,lon,lat,time,time_stamp,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,user_statuses_count
0,-119.306608,37.269176,2016-01-31 21:00:15,1454302815740,"A guy at the airport:\n""Idk why he cut me off ...",0,0,0,0,0,160,189,1119
1,0.3143,42.308346,2016-01-31 21:01:01,1454302861683,"@JoptanElMagno @apranorte y lo mejor,no caer e...",0,0,2,0,0,302,727,14007
2,-122.435978,37.770657,2016-01-31 21:01:14,1454302874659,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,1362
3,-122.435978,37.770657,2016-01-31 21:01:39,1454302899722,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,1
4,-122.466364,37.780964,2016-01-31 21:01:50,1454302910678,attended @starparish Speaker Series: a gr8 pre...,0,1,1,0,0,751,748,16076


Some of the coordinates look off: not every point appears to lie in San Francisco.
Reverse-geocode a few of them to check which addresses they map to.

In [3]:
# Coordinates of the first preview row
reverse_geocode(lat=37.269176, lon=-119.306608)

'Fresno, CA, United States'

In [4]:
# Coordinates of the second preview row
reverse_geocode(lat=42.308346, lon=0.314300)

'La Fueva, Aragón, España'

In [6]:
# Coordinates of the third preview row
reverse_geocode(lat=37.770657, lon=-122.435978)

'Lower Haight, San Francisco, CA, United States'

## 3. Convert lon, lat coordinates into GIS point

Configure the coordinate-mapper function, originally used to map speed data, for the Twitter data.

In [7]:
# Path to the census-region shapefile. It gets its own name so that `shp_file`
# only ever refers to the loaded GeoDataFrame, never to a string.
shp_path = '../../temp_data/sf_GEOID_GIS_data.shp'

# Load census data
shp_file = gpd.GeoDataFrame.from_file(shp_path)
print('Size of census zones df: {}'.format(shp_file.shape))
shp_file.head()

Size of census zones df: (196, 2)


Unnamed: 0,geoid10,geometry
0,6075016500,"POLYGON ((-122.446471 37.775802, -122.44478 37..."
1,6075016400,"POLYGON ((-122.44034 37.77658, -122.439844 37...."
2,6075016300,"POLYGON ((-122.429152 37.778007, -122.428909 3..."
3,6075016100,"POLYGON ((-122.428909 37.778039, -122.429152 3..."
4,6075016000,"POLYGON ((-122.420425 37.780583, -122.420336 3..."


Remove the region ID 6075990100 from the shp file

In [8]:
# Keep every region whose geoid10 differs from 6075990100
shp_file = shp_file.loc[shp_file["geoid10"].ne(6075990100)]
print(shp_file.shape)

(195, 2)


In [9]:
# Add timer.
# The guard keeps this cell idempotent: without it, every re-run would wrap the
# already-wrapped function in another timer layer.
if not getattr(tweet_coordinate_mapper, "_timed", False):
    tweet_coordinate_mapper = timer(tweet_coordinate_mapper)
    tweet_coordinate_mapper._timed = True

In [10]:
# Directory to store files created
output_dir = "../../temp_data/tweets_mapped/"
# Create it up front so that writing results does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

In [11]:
# Positional indices of the CSV columns to keep: 0 through 11 with position 3 left out
cols = np.concatenate((np.arange(3), np.arange(4, 12)))
cols

array([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10, 11])

In [12]:
# Test on a single day before processing every file.
# The result is bound to a name so it can be previewed; calling `.head()` on
# the function object itself raises AttributeError.
tweets_mapped = tweet_coordinate_mapper(shp_file=shp_file,
                                        input_dir=input_dir,
                                        output_dir=output_dir,
                                        file_name="tweet_2016_02_01.csv",
                                        columns=cols,
                                        col_time="time",
                                        zone="US/Pacific")

# NOTE(review): assumes the mapper returns the mapped frame, as the recorded
# ten-row preview suggests - confirm against twitter_processing.
tweets_mapped.head(10)

Running tweet_coordinate_mapper...
Finished in 1.5909s


Unnamed: 0,lon,lat,time,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,geometry,index_right,geoid10
2,-122.435978,37.770657,2016-01-31 21:01:14,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,POINT (-122.4359785 37.7706565),194,6075016700
3,-122.435978,37.770657,2016-01-31 21:01:39,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,POINT (-122.4359785 37.7706565),194,6075016700
7,-122.435978,37.770657,2016-01-31 21:03:44,Damn. Does Sandy love across the street from t...,1,0,0,0,0,887,956,POINT (-122.4359785 37.7706565),194,6075016700
8,-122.435978,37.770657,2016-01-31 21:04:04,@angeles19820928 @AsiaPrince_JKS MIRA MI PERFI...,0,0,2,0,0,464,695,POINT (-122.4359785 37.7706565),194,6075016700
9,-122.435978,37.770657,2016-01-31 21:04:10,"""We could butt chug two bottles of #wine, no p...",1,0,0,0,0,1218,2067,POINT (-122.4359785 37.7706565),194,6075016700
12,-122.435978,37.770657,2016-01-31 21:04:38,"“It’s been a wonderful wonderful evening, I me...",0,0,0,0,0,1211,129,POINT (-122.4359785 37.7706565),194,6075016700
17,-122.435978,37.770657,2016-01-31 21:05:26,day out with my suga &amp; papi D https://t.co...,0,0,0,0,0,168,289,POINT (-122.4359785 37.7706565),194,6075016700
19,-122.435978,37.770657,2016-01-31 21:05:39,Great events for #BlackHistoryMonth at #UCHast...,2,0,0,0,0,6271,524,POINT (-122.4359785 37.7706565),194,6075016700
20,-122.435978,37.770657,2016-01-31 21:05:54,"@thehipsch Andre, same surname, Civil Engineer...",0,0,1,0,0,933,676,POINT (-122.4359785 37.7706565),194,6075016700
22,-122.435978,37.770657,2016-01-31 21:06:21,@GavuL_ fero is gonna get shit on,0,0,1,0,0,351,265,POINT (-122.4359785 37.7706565),194,6075016700


In [8]:
# Map every daily file to census regions.
# All arguments are passed by keyword, mirroring the single-file test call:
# a positional argument placed after a keyword argument is a SyntaxError.
# NOTE(review): the earlier version passed `[]` for the columns and no
# `col_time`/`zone`; confirm that `cols` and these settings are intended here.
for fname in files:
    tweet_coordinate_mapper(shp_file=shp_file,
                            input_dir=input_dir,
                            output_dir=output_dir,
                            file_name=fname,
                            columns=cols,
                            col_time="time",
                            zone="US/Pacific")

tweet_2016_02_01.csv
tweet_2016_02_02.csv
tweet_2016_02_03.csv
tweet_2016_02_04.csv
tweet_2016_02_05.csv
tweet_2016_02_06.csv
tweet_2016_02_07.csv
tweet_2016_02_08.csv
tweet_2016_02_09.csv
tweet_2016_02_10.csv
tweet_2016_02_11.csv
tweet_2016_02_12.csv
tweet_2016_02_13.csv
tweet_2016_02_14.csv
tweet_2016_02_15.csv
tweet_2016_02_16.csv
tweet_2016_02_17.csv
tweet_2016_02_18.csv
tweet_2016_02_19.csv
tweet_2016_02_20.csv
tweet_2016_02_21.csv
tweet_2016_02_22.csv
tweet_2016_02_23.csv
tweet_2016_02_24.csv
tweet_2016_02_25.csv
tweet_2016_02_26.csv
tweet_2016_02_27.csv
tweet_2016_02_28.csv
tweet_2016_02_29.csv
