# Features - Time

Assigning each tweet to a time period (weeks before the epoch, the epoch day itself, or weeks after it).

In [1]:
# Env
import os
from dotenv import find_dotenv, load_dotenv

# Data
import pandas as pd
import pandas.io.sql as pdsql

# Database
import psycopg2 as pg

# Faster computations
from multiprocessing import Pool

# For manipulating datetimes
import datetime
import pytz

In [2]:
# Environment
# Pull settings from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
db_url = os.environ.get('DATABASE_URL')

# Fail fast with an actionable message rather than handing None to the driver.
if not db_url:
    raise RuntimeError(
        'DATABASE_URL is not set; define it in the environment or in a .env file.'
    )

# Connection used to read the tweets. The cursor is kept because the final
# clean-up cell closes both handles.
conn = pg.connect(db_url)
curr = conn.cursor()

**Define the time periods**

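Twitter's timestamps are in `UTC`, but the periods are defined in local (`PST`) time. Rather than localizing every tweet, we do a little mathemagic once: calculate each period's cut-off times in UTC and compare the raw timestamps against those.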

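| Time Period | Min (`PST`) | Max (`PST`) |
|---|---|---|
| -4 | 2016-12-30 00:00 | 2017-01-06 00:00 |
| -3 | 2017-01-06 00:00 | 2017-01-13 00:00 |
| -2 | 2017-01-13 00:00 | 2017-01-20 00:00 |
| -1 | 2017-01-20 00:00 | 2017-01-27 00:00 |
| 0 | 2017-01-27 00:00 | 2017-01-28 00:00 |
| 1 | 2017-01-28 00:00 | 2017-02-04 00:00 |
| 2 | 2017-02-04 00:00 | 2017-02-11 00:00 |
| 3 | 2017-02-11 00:00 | 2017-02-18 00:00 |
| 4 | 2017-02-18 00:00 | 2017-02-25 00:00 |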

**Epoch == January 27th, 2017**

The epoch should start at midnight PST: midnight EST would not account for the West Coast, where people may still be awake.

In [3]:
# TIME ZONES
# Eastern and Pacific are used for display; UTC is what the tweets are compared in.
EST, PST, UTC = (
    pytz.timezone(zone_name)
    for zone_name in ('US/Eastern', 'US/Pacific', 'UTC')
)

In [4]:
# Beginning
# Midnight Pacific on the epoch day, re-expressed in Eastern and UTC.
epoch_start_pst = PST.localize(datetime.datetime(year=2017, month=1, day=27))
epoch_start_utc = epoch_start_pst.astimezone(UTC)
epoch_start_est = epoch_start_pst.astimezone(EST)

# One line per zone, preceded by a heading
print(
    'Epoch starts',
    ' '.join(('EST', str(epoch_start_est))),
    ' '.join(('PST', str(epoch_start_pst))),
    ' '.join(('UTC', str(epoch_start_utc))),
    sep='\n',
)

Epoch starts
EST 2017-01-27 03:00:00-05:00
PST 2017-01-27 00:00:00-08:00
UTC 2017-01-27 08:00:00+00:00


In [5]:
# Ending
# Midnight Pacific on the day after the epoch day, re-expressed in Eastern and UTC.
epoch_end_pst = PST.localize(datetime.datetime(year=2017, month=1, day=28))
epoch_end_utc = epoch_end_pst.astimezone(UTC)
epoch_end_est = epoch_end_pst.astimezone(EST)

# One line per zone, preceded by a heading
print(
    'Epoch ends',
    ' '.join(('EST', str(epoch_end_est))),
    ' '.join(('PST', str(epoch_end_pst))),
    ' '.join(('UTC', str(epoch_end_utc))),
    sep='\n',
)

Epoch ends
EST 2017-01-28 03:00:00-05:00
PST 2017-01-28 00:00:00-08:00
UTC 2017-01-28 08:00:00+00:00


In [6]:
# TIME PERIODS
# Precompute the UTC bounds of every period so tweets can be bucketed quickly.

# Width of every non-epoch period: one week
time_period = datetime.timedelta(days=7)

# Period definitions, appended in order from -4 through 4
intervals = []

for i in range(-4, 5):
    if i < 0:
        # 'BEFORE': whole weeks counted back from the epoch start
        i_start = epoch_start_utc - (-i * time_period)
        i_end = epoch_start_utc - ((-i - 1) * time_period)
    elif i == 0:
        # The epoch itself spans exactly the epoch day
        i_start, i_end = epoch_start_utc, epoch_end_utc
    else:
        # 'AFTER': whole weeks counted forward from the epoch end
        i_start = epoch_end_utc + ((i - 1) * time_period)
        i_end = epoch_end_utc + (i * time_period)

    print('{}\t[{},\t{}]'.format(i, i_start, i_end))
    intervals.append({'period': i, 'start': i_start, 'end': i_end})

-4	[2016-12-30 08:00:00+00:00,	2017-01-06 08:00:00+00:00]
-3	[2017-01-06 08:00:00+00:00,	2017-01-13 08:00:00+00:00]
-2	[2017-01-13 08:00:00+00:00,	2017-01-20 08:00:00+00:00]
-1	[2017-01-20 08:00:00+00:00,	2017-01-27 08:00:00+00:00]
0	[2017-01-27 08:00:00+00:00,	2017-01-28 08:00:00+00:00]
1	[2017-01-28 08:00:00+00:00,	2017-02-04 08:00:00+00:00]
2	[2017-02-04 08:00:00+00:00,	2017-02-11 08:00:00+00:00]
3	[2017-02-11 08:00:00+00:00,	2017-02-18 08:00:00+00:00]
4	[2017-02-18 08:00:00+00:00,	2017-02-25 08:00:00+00:00]


In [7]:
# Inspect the period definitions built above: one dict per period with its
# 'period' number and timezone-aware UTC 'start' / 'end' datetimes.
intervals

[{'period': -4,
  'start': datetime.datetime(2016, 12, 30, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 1, 6, 8, 0, tzinfo=<UTC>)},
 {'period': -3,
  'start': datetime.datetime(2017, 1, 6, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 1, 13, 8, 0, tzinfo=<UTC>)},
 {'period': -2,
  'start': datetime.datetime(2017, 1, 13, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 1, 20, 8, 0, tzinfo=<UTC>)},
 {'period': -1,
  'start': datetime.datetime(2017, 1, 20, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 1, 27, 8, 0, tzinfo=<UTC>)},
 {'period': 0,
  'start': datetime.datetime(2017, 1, 27, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 1, 28, 8, 0, tzinfo=<UTC>)},
 {'period': 1,
  'start': datetime.datetime(2017, 1, 28, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 2, 4, 8, 0, tzinfo=<UTC>)},
 {'period': 2,
  'start': datetime.datetime(2017, 2, 4, 8, 0, tzinfo=<UTC>),
  'end': datetime.datetime(2017, 2, 11, 8, 0, tzinfo=<UTC>)},
 {'period': 3,
  's

**Get the tweets**

In [8]:
# Only the tweet id and its timestamp are needed to assign periods.
filter_cols_sql = """
SELECT 
    "tweetID", date
FROM 
    filter_tweets;
"""

# Index the frame by timestamp so tweets can be selected by time range.
date_df = pd.read_sql_query(
    sql=filter_cols_sql,
    con=conn,
    index_col='date',
)

date_df.head()

Unnamed: 0_level_0,tweetID
date,Unnamed: 1_level_1
2017-01-31 19:22:01,826601467255595008
2017-01-31 19:22:02,826601469017194496
2017-01-31 19:22:02,826601470044803073
2017-01-31 19:22:02,826601470321639425
2017-01-31 19:22:02,826601470531219456


In [9]:
# Confirm the index holds timestamps. `Index.dtype_str` was deprecated and
# later removed from pandas; str() of the dtype yields the same text.
str(date_df.index.dtype)

'datetime64[ns]'

### Map Tweets to periods

In [10]:
# Add a column initialized to null
# This way, after we're done assigning values we'll be able to easily find out of range tweets
date_df = date_df.assign(period=None)

for i in intervals:
    # The index holds naive timestamps (Twitter's timestamps are UTC, per the
    # notes above), so convert each bound to UTC and drop the tz before comparing.
    start_ts = pd.Timestamp(i['start']).tz_convert(None)
    end_ts = pd.Timestamp(i['end']).tz_convert(None)

    # Half-open window [start, end). Adjacent periods share a boundary instant,
    # and label slicing includes both endpoints, which counted a boundary tweet
    # in two periods and let the later period overwrite the earlier one.
    # A boolean mask also works regardless of index ordering or duplicate
    # timestamps.
    in_period = (date_df.index >= start_ts) & (date_df.index < end_ts)

    # Get the counts
    print(i['period'], int(in_period.sum()))

    # Assign a time period to the dataframe subset
    date_df.loc[in_period, 'period'] = i['period']

-4 433931
-3 455915
-2 503966
-1 761361
0 256524
1 2720845
2 1345329
3 988963
4 1014865


In [11]:
# Number of null tweets, i.e. rows whose period was never assigned
int(date_df['period'].isnull().sum())

15796

In [12]:
# Turn the 'date' index back into an ordinary column so it is exported too
date_df = date_df.reset_index()

# Export the tweet id, its timestamp and the assigned period (no row index)
date_df.loc[:, ['tweetID', 'date', 'period']].to_csv(
    '../time-periods.csv',
    index=False,
)

In [13]:
# Release the database handles: the cursor first, then its connection
for db_handle in (curr, conn):
    db_handle.close()