# Map Tweet to Census Region

This notebook maps tweets to their corresponding census regions.

*Author: Koki Sasagawa*  
*Date: 4/14/2019*

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from shapely.geometry import Polygon, Point
from datetime import datetime
from pytz import timezone

%matplotlib inline

## 1. Load all Tweet data

1. Tweet data that contains or mentions SF
2. Filtered tweets from each month whose coordinates map to the greater San Francisco area

In [2]:
# Directory that holds the raw tweet CSV files.
input_dir = '../../raw_data/tweets/'

# Sort the listing so the positional index printed below is stable.
files = os.listdir(input_dir)
files.sort()

# Show each file next to its index so it can be picked by position later.
for i in range(len(files)):
    fname = files[i]
    print(i, fname)

0 tweet_2016_02_01.csv
1 tweet_2016_02_02.csv
2 tweet_2016_02_03.csv
3 tweet_2016_02_04.csv
4 tweet_2016_02_05.csv
5 tweet_2016_02_06.csv
6 tweet_2016_02_07.csv
7 tweet_2016_02_08.csv
8 tweet_2016_02_09.csv
9 tweet_2016_02_10.csv
10 tweet_2016_02_11.csv
11 tweet_2016_02_12.csv
12 tweet_2016_02_13.csv
13 tweet_2016_02_14.csv
14 tweet_2016_02_15.csv
15 tweet_2016_02_16.csv
16 tweet_2016_02_17.csv
17 tweet_2016_02_18.csv
18 tweet_2016_02_19.csv
19 tweet_2016_02_20.csv
20 tweet_2016_02_21.csv
21 tweet_2016_02_22.csv
22 tweet_2016_02_23.csv
23 tweet_2016_02_24.csv
24 tweet_2016_02_25.csv
25 tweet_2016_02_26.csv
26 tweet_2016_02_27.csv
27 tweet_2016_02_28.csv
28 tweet_2016_02_29.csv


In [3]:
# Load the first file from the sorted listing and parse the `time` column
# into datetimes while reading.
# `infer_datetime_format` is deprecated in pandas 2.0 and does not affect
# the parsed values, so it is no longer passed.
tweet1 = pd.read_csv(os.path.join(input_dir, files[0]),
                     parse_dates=['time'])

print("Dimensions: {}".format(tweet1.shape))
tweet1.head()

Dimensions: (3944, 13)


Unnamed: 0,lon,lat,time,time_stamp,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,user_statuses_count
0,-119.306608,37.269176,2016-02-01 05:00:15+00:00,1454302815740,"A guy at the airport:\n""Idk why he cut me off ...",0,0,0,0,0,160,189,1119
1,0.3143,42.308346,2016-02-01 05:01:01+00:00,1454302861683,"@JoptanElMagno @apranorte y lo mejor,no caer e...",0,0,2,0,0,302,727,14007
2,-122.435978,37.770657,2016-02-01 05:01:14+00:00,1454302874659,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,1362
3,-122.435978,37.770657,2016-02-01 05:01:39+00:00,1454302899722,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,1
4,-122.466364,37.780964,2016-02-01 05:01:50+00:00,1454302910678,attended @starparish Speaker Series: a gr8 pre...,0,1,1,0,0,751,748,16076


## 2. Match the timezone

The datetimes used in the tweets are in UTC.  
Convert these datetime objects to the San Francisco timezone (US/Pacific, i.e. Pacific Standard Time for these February tweets).

In [4]:
# Peek at the first parsed timestamp to check which timezone it carries.
tweet1['time'].iat[0]

Timestamp('2016-02-01 05:00:15+0000', tz='UTC')

In [5]:
def time_zone_converter(date, zone):
    """Convert a timezone-aware datetime to another timezone and format it.

    :param date: timezone-aware datetime to convert (the tweet timestamps
        passed in by this notebook are UTC)
    :type date: pandas._libs.tslibs.timestamps.Timestamp
    :param zone: desired timezone, either a timezone name that is looked up
        with ``pytz.timezone`` (e.g. ``'US/Pacific'``) or a ready-made
        ``tzinfo`` object
    :type zone: str or datetime.tzinfo
    :return: wall-clock time in ``zone`` formatted as
        ``'%Y-%m-%d %H:%M:%S'`` (the UTC offset is not part of the result)
    :rtype: str
    """

    # Only timezone names go through the pytz lookup; tzinfo objects are
    # handed to astimezone() unchanged.
    target_tz = timezone(zone) if isinstance(zone, str) else zone
    localized = date.astimezone(target_tz)
    return localized.strftime('%Y-%m-%d %H:%M:%S')

In [6]:
# Convert every tweet timestamp to US/Pacific wall-clock strings.
# Only the `time` column is needed, so apply over that Series instead of
# building a row object for every record with DataFrame.apply(axis=1).
# Plain column assignment replaces the column outright; `.loc[:, 'time'] = ...`
# writes in place on recent pandas and may try to keep the old tz-aware dtype.
tweet1['time'] = tweet1['time'].apply(time_zone_converter, zone='US/Pacific')
tweet1.head()

Unnamed: 0,lon,lat,time,time_stamp,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,user_statuses_count
0,-119.306608,37.269176,2016-01-31 21:00:15,1454302815740,"A guy at the airport:\n""Idk why he cut me off ...",0,0,0,0,0,160,189,1119
1,0.3143,42.308346,2016-01-31 21:01:01,1454302861683,"@JoptanElMagno @apranorte y lo mejor,no caer e...",0,0,2,0,0,302,727,14007
2,-122.435978,37.770657,2016-01-31 21:01:14,1454302874659,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,1362
3,-122.435978,37.770657,2016-01-31 21:01:39,1454302899722,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,1
4,-122.466364,37.780964,2016-01-31 21:01:50,1454302910678,attended @starparish Speaker Series: a gr8 pre...,0,1,1,0,0,751,748,16076


## 3. Convert lon, lat coordinates into GIS point

In [56]:
# Build one shapely Point per tweet in (x, y) = (lon, lat) order.
# The Series is created on tweet1's own index because GeoDataFrame aligns a
# geometry Series by label; a fresh 0..n-1 index would only line up when
# tweet1 happens to use the default index.
geom = pd.Series(
    [Point(lon, lat) for lon, lat in zip(tweet1.lon, tweet1.lat)],
    index=tweet1.index,
)
# NOTE(review): no CRS is attached here; the values look like WGS84 lon/lat —
# confirm and set `crs` before joining against the census-region geometries.
tweet1 = gpd.GeoDataFrame(tweet1, geometry=geom)

In [57]:
# Preview the first five rows to check the new `geometry` column.
tweet1.head(n=5)

Unnamed: 0,lon,lat,time,time_stamp,text,hashtags,urls,user_mentions,favorite_count,retweet_count,user_followers_count,user_friends_count,user_statuses_count,geometry
0,-119.306608,37.269176,2016-01-31 21:00:15,1454302815740,"A guy at the airport:\n""Idk why he cut me off ...",0,0,0,0,0,160,189,1119,POINT (-119.3066075 37.2691755)
1,0.3143,42.308346,2016-01-31 21:01:01,1454302861683,"@JoptanElMagno @apranorte y lo mejor,no caer e...",0,0,2,0,0,302,727,14007,POINT (0.3142995000000071 42.308346)
2,-122.435978,37.770657,2016-01-31 21:01:14,1454302874659,I rather him call me his friend then just a wh...,2,0,0,0,0,87,187,1362,POINT (-122.4359785 37.7706565)
3,-122.435978,37.770657,2016-01-31 21:01:39,1454302899722,https://t.co/WNKPn1itmN,0,0,0,0,0,10,47,1,POINT (-122.4359785 37.7706565)
4,-122.466364,37.780964,2016-01-31 21:01:50,1454302910678,attended @starparish Speaker Series: a gr8 pre...,0,1,1,0,0,751,748,16076,POINT (-122.46636427 37.78096427)
