# Scrape Template

This template exists to bootstrap faster dataset production.


#### Utilities & Libraries

In [1]:
# Run Utilities script, it has common file names
# NOTE(review): later cells use `pd`, `name_file_path` and `processed_web_dir`
# without importing or defining them in this notebook — presumably they are
# provided by utilities.py; confirm.
%run utilities.py

# Matplotlib + Jupyter
from matplotlib import pyplot as plt
# Render figures inline in the notebook output.
%matplotlib inline

In [None]:
# Password file, to keep secrets off Github

# Make the file if necessary
# NOTE(review): the command below would write secret_key.py into the current
# directory, while the %run further down loads ../../src/secret_key.py —
# confirm the intended location.
#!echo "pw=%%" > secret_key.py

# Running the script is expected to define `pw`, which the database
# connection cell reads.
%run ../../src/secret_key.py

#### Database Connection

In [2]:
import psycopg2 as pg
import pandas.io.sql as psql

## Connect to the database
# Pass the connection parameters as keyword arguments instead of formatting
# them into a DSN string: psycopg2 then takes care of quoting, so a password
# containing spaces, quotes or backslashes cannot corrupt the connection
# string. `pw` is expected to be defined by secret_key.py (run earlier).
conn = pg.connect(dbname='immigration', user='rlrson', password=pw)

#### Time Periods

Caglar has broken the time periods down into before/after the 'epoch' date. Let's pull them in.

In [3]:
# Load every row of the `timeinterval` table over the open connection.
time_periods = pd.read_sql("SELECT * FROM timeinterval;", con=conn)
# Bare last expression so the notebook displays the frame.
time_periods

Unnamed: 0,id,startinterval,endinterval
0,1,2016-12-27 10:00:00+00:00,2017-01-27 10:00:00+00:00
1,2,2017-01-28 10:00:00+00:00,2017-02-27 10:00:00+00:00
2,3,2017-01-21 10:00:00+00:00,2017-01-27 10:00:00+00:00
3,4,2017-01-28 10:00:00+00:00,2017-02-03 10:00:00+00:00
4,5,2017-01-26 10:00:00+00:00,2017-01-27 10:00:00+00:00
5,6,2017-01-28 10:00:00+00:00,2017-01-29 10:00:00+00:00


In [4]:
# Inspect the column dtypes; the interval columns are expected to be
# timezone-aware datetimes.
time_periods.dtypes

id                             int64
startinterval    datetime64[ns, UTC]
endinterval      datetime64[ns, UTC]
dtype: object

In [5]:
# Beginning/end bounds for our SQL queries: the start of the first interval
# and the end of the second one. tz_convert(None) converts to UTC and drops
# the timezone, leaving naive timestamps.
start = time_periods['startinterval'].iloc[0].tz_convert(None)
end = time_periods['endinterval'].iloc[1].tz_convert(None)

#### Week Categorization

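We need to create a mapping that returns the week number for a given date. Weeks are counted relative to the epoch: we start at the epoch and work backwards week by week for a month (negative week numbers), then do the same for the weeks after the epoch (positive week numbers).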

In [10]:
# Time variables used to bucket dates into weeks around the epoch.
# Reference values:
#   epoch_start - (delta_week * 4) >> Timestamp('2016-12-30 10:00:00')
#   start >> Timestamp('2016-12-27 10:00:00+0000', tz='UTC')

# Epoch: a one-day window, stored as timezone-naive UTC timestamps
epoch_start = pd.to_datetime('2017-01-27 10:00:00+00:00', utc=True).tz_convert(None)
epoch_end = pd.to_datetime('2017-01-28 10:00:00+00:00', utc=True).tz_convert(None)

# Length of one week
delta_week = pd.Timedelta('7 days')

# Boundaries 1..4 weeks before the start of the epoch
one_before, two_before, three_before, four_before = (
    epoch_start - (delta_week * n_weeks) for n_weeks in range(1, 5)
)

# Boundaries 1..4 weeks after the end of the epoch
one_after, two_after, three_after, four_after = (
    epoch_end + (delta_week * n_weeks) for n_weeks in range(1, 5)
)


def get_week_number(week):
    """Map a date to its week number relative to the epoch.

    Relies on the module-level bounds `start`/`end` and on the week
    boundaries (`four_before` ... `epoch_start`, `epoch_end` ...
    `four_after`), all of which are timezone-naive timestamps.

    Parameters
    ----------
    week : anything accepted by ``pd.to_datetime`` for a single date
        (string, date, datetime, Timestamp). Timezone-aware values are
        converted to naive UTC before being compared.

    Returns
    -------
    int or None
        -4..-1 for the four weeks leading up to the epoch, 0 for the epoch
        itself, 1..4 for the four weeks after it, -5 / 5 for dates inside
        the [start, end] window but beyond those four weeks. None when the
        date is missing, outside the window, or cannot be interpreted (in
        which case the error and the offending value are printed).
    """
    try:
        # Parse inside the try so a malformed value is reported, not raised.
        week = pd.to_datetime(week)

        # Missing dates (None/NaN -> NaT) compare False against everything and
        # would otherwise fall through every branch into the last bucket.
        if pd.isnull(week):
            return None

        # The bounds are naive UTC; normalise aware inputs the same way
        # instead of letting the comparison below fail.
        if week.tzinfo is not None:
            week = week.tz_convert(None)

        # catch, before/after?
        if (week < start) or (week > end):
            return None

        # Upper bounds of the buckets -5 .. 4, in chronological order.
        upper_bounds = (
            four_before, three_before, two_before, one_before,
            epoch_start, epoch_end,
            one_after, two_after, three_after, four_after,
        )
        for week_number, upper_bound in enumerate(upper_bounds, -5):
            if week < upper_bound:
                return week_number

        # On or after the last boundary, but still inside the window.
        return 5
    except Exception as e:
        # Deliberately best-effort: report the bad value and keep going so one
        # malformed row does not abort a whole mapping run.
        print (e, week)
        return None

#### Tweets

We need a count of tweets from every county, for every week (8 weeks in total: 4 before and 4 after the epoch). The `tweet` table has the columns tweetid, tweet, date, userid, id, lon, lat, cntyid.

In [63]:
# Smoke test: fetch ten tweets to check the connection and the column layout.
# `date::date` casts the stored value down to a plain calendar date.
query_test = "SELECT tweetID, date::date, tweet  FROM tweet LIMIT 10;"
test_df = pd.read_sql(query_test, con=conn)

test_df.head(10)

Unnamed: 0,tweetid,date,tweet
0,817797356456476676,2017-01-07,RT : The Muslim invasion of the west makes me upset every day! Out country will never be the same!
1,817797367529410561,2017-01-07,RT : Muslim Beats White Woman — Cops Arrest HER After Attacker's 5-Word Excuse
2,817797369525927936,2017-01-07,ربِّ أسألك خير ما في هذا اليوم وخير ما بعده ♻️
3,817797376505221121,2017-01-07,RT : US invasions have robbed Muslim countries bone dry...millions dead; millions more homeless; refugees...& war still…
4,817797378958884865,2017-01-07,ربِّ أسألك خير ما في هذا اليوم وخير ما بعده ♻️
5,817797383916519425,2017-01-07,اللهم لا سهل إلا ما جعلته سهلاً وأنت تجعل الحزن إذا شئت سهلا ♻️
6,817797389268434944,2017-01-07,RT : You probably heard about a Muslim mob setting fire to Germany's oldest church. It's not true
7,817797395182391296,2017-01-07,أذكار الأذان:اللهم رب هذه الدعوة التامة والصلاة القائم ♻️
8,817797405789847552,2017-01-07,سبحان الله وبحمده سبحان الله العظيم ♻️
9,817797407626956802,2017-01-07,ربِّ أسألك خير ما في هذا اليوم وخير ما بعده ♻️


In [79]:
# Pull a 75,000-row sample of tweets. There is no ORDER BY, so which rows come
# back is up to the database.
# NOTE(review): the scrape loop further down re-runs this same query, so it
# inherits the LIMIT and the column list (no cntyid) — confirm that is intended.
tweet_query = "SELECT tweetID, date::date, tweet FROM tweet LIMIT 75000;"
tweet_df = pd.read_sql(tweet_query, con=conn)

tweet_df.head(2)

Unnamed: 0,tweetid,date,tweet
0,818777709014896642,2017-01-10,اللهم إني أعوذ بك من الهم والحزن والعجز والكسل والبخل والجبن وغلبة الدين وقهر الرجال ♻️
1,818777721132240896,2017-01-10,اللهم أعذنا من عذاب القبر وعذاب جهنم ♻️


In [12]:
# Report the row count, then show the column dtypes.
# NOTE(review): the saved output of this cell lists date/userid/lon/lat/cntyid,
# which does not match the columns selected by `tweet_query` — it looks stale
# (cells executed out of order); re-run from a fresh kernel to confirm.
print ('Number rows: {}'.format(len(tweet_df)))
tweet_df.dtypes

date       object
userid      int64
lon       float64
lat       float64
cntyid      int64
dtype: object

In [None]:
# work here

#### Parallelize

This is going to take forever unless we parallelize the work.

In [27]:
from multiprocessing import Pool  # faster

def parallelize_series(series, func, processes=6):
    """Apply `func` to every element of `series` using a pool of workers.

    Parameters
    ----------
    series : iterable
        Values to process (e.g. a pandas Series).
    func : callable
        Function applied to each value; it is handed to ``Pool.map``.
    processes : int, optional
        Number of workers in the pool (default 6).

    Returns
    -------
    list
        The results of ``func`` in the same order as the input.

    Raises
    ------
    Whatever `func` raises; the pool is shut down before the error propagates.
    """
    pool = Pool(processes)
    try:
        return pool.map(func, series)
    finally:
        # Always release the workers, even when `func` fails, so repeated
        # runs in a long-lived kernel do not pile up orphaned processes.
        pool.close()
        pool.join()

#### Actual Scrape

In [70]:
# Full data
full_query = ""

# Iteration: stream the query in 5,000-row chunks so the whole result set
# never has to sit in memory at once, reducing each chunk as it arrives.
# NOTE(review): `group_tweets` is not defined in any visible cell — confirm it
# is available (and returns a DataFrame) before running.
chunk_counts = []
for chunk in pd.read_sql(tweet_query, con=conn, chunksize=5000):
    formatted = group_tweets(chunk)
    chunk_counts.append(formatted)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every iteration.
counts = pd.concat(chunk_counts) if chunk_counts else pd.DataFrame()

In [82]:
# work here

In [75]:
# Write out
# NOTE(review): `name_file_path` and `processed_web_dir` are not defined in
# this notebook — presumably provided by utilities.py; confirm.
f = name_file_path('state-tweet-counts.csv', processed_web_dir)

# NOTE(review): `final` is not assigned in any visible cell (the scrape loop
# builds `counts`) — confirm where it comes from before a fresh run.
# Renames column 'cnty' to 'cnt' in place, then writes the CSV without the index.
final.rename(columns={'cnty': 'cnt'}, inplace=True)
final.to_csv(f, index=False)

---

Close the connection, add formatting to the cells.

In [76]:
# Release the database connection opened earlier in the notebook.
conn.close()

In [48]:
# Wide cells
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

# number of text characters per column
pd.set_option('max_colwidth', 140)