## Importing packages

In [None]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role
import pandas as pd
import tweepy as tw
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from io import StringIO
from datetime import datetime as dt

## Make sure to **! pip install tweepy** before importing it

import tweepy as tw

## S3 location of the crime data. `bucket_crime` is reused by every S3 read/write further down,
## and `data` is the dataframe the date-feature cells below operate on, so these must be defined.
bucket_crime = 'mpls-crime-data'  # Our s3 bucket name
data_key = 'all_data_for_prediction.csv'
data_location = 's3://{}/{}'.format(bucket_crime, data_key)
data = pd.read_csv(data_location)

We extract year, month and day to make a date column. This comes in handy for the joins with the twitter dataset (and later when we split data into train and test).

In [None]:
## Assemble one datetime column from the separate Year / Month / Day columns
data['full_date'] = pd.to_datetime(data[['Year', 'Month', 'Day']])

## ISO week number. `Series.dt.week` is deprecated (and removed in pandas 2.0), so the
## isocalendar() accessor is used instead; note it yields the nullable UInt32 dtype.
data["week"] = data["full_date"].dt.isocalendar().week
data["dayofweek"] = data["full_date"].dt.dayofweek  # pandas convention: monday=0 ... sunday=6
data.head()

In [None]:
## Incident count for the same ISO week, weekday and neighborhood one year earlier.
## The calendar year must NOT be a grouping key: with it, every group is confined to a single
## year, so shift() can never reach back to the previous year's row.
## Rows are ordered by date first so that shift() really picks the preceding year.
## NOTE(review): assumes one row per neighborhood per day - TODO confirm against the input file.
ordered = data.sort_values('full_date')
data['prev_year'] = ordered.groupby([ordered['full_date'].dt.isocalendar().week,
                                     ordered['full_date'].dt.weekday,
                                     ordered['Neighborhood']])['count_incidents'].shift()

## Authorizing Twitter with the access tokens and consumer keys

Accessing the Twitter API requires creating a developer account, i.e. essentially telling Twitter that you're an app developer and you would be requiring access to the API for fetching data. This process typically takes a day and requires you to justify your need for using the data.

In [None]:
## Twitter credentials are read from environment variables instead of being hard-coded:
## secrets committed to a notebook are exposed to everyone who can read the file.
## A missing variable raises KeyError immediately rather than failing later during the API call.
## NOTE(review): the keys that were previously embedded here should be revoked and regenerated.
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]


## OAuth handshake: consumer (app) keys first, then the user access token
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

## Authenticated API handle used by the tweet-collection cell
api = tw.API(auth)

## Using user handles of news agencies to retrieve tweets 

We use the local news agencies here in the Twin Cities to gather tweets for a given day. Given the limitations of the free API, we chose to use news agencies rather than individual users (an average user does not tweet as often, and there is a limit on how far back in time we can go).

In [None]:
## Twitter handles whose timelines are pulled, one per line
users = [
    "PioneerPress",
    "StarTribune",
    "WCCO",
    "KSTP",
    "FOX9",
    "MPRnews",
    "Jacob_Frey",
    "melvincarter3",
    "MnDPS_MSP",
    "kare11",
    "TCCrimeWatch",
    "CrimeStoppersMN",
    "UMNews",
]

We hit the *user_timeline* end-point of the twitter API and put a filter on the number of days since when we want the data.

(I would like to mention here that even though the API mentions a cap of 7-14 days, we were able to get data dating back till the beginning of this year, even though it was very sparse)

In [None]:
## Collect up to 2000 timeline items for every handle in `users`.
## `all_tweets` ends up with one element per handle; each element is that handle's list of
## [screen_name, text, created_at] rows.
## NOTE(review): `date_since` is not defined anywhere in the visible notebook - it has to be
## set before this cell runs. TODO confirm the intended start date.
all_tweets = []
for u in users:
    tweets_user_1 = tw.Cursor(api.user_timeline,
                   screen_name=u,since=date_since).items(2000)
    time.sleep(3)
    # Iterating the cursor here is what pulls the items; keep only the three fields we need
    x = [[tweet.user.screen_name, tweet.text, tweet.created_at] for tweet in tweets_user_1]
    all_tweets.append(x)
    # Pause before moving on to the next handle (presumably to stay under the API rate limit)
    time.sleep(3)

In [None]:
## Only the user name, text and tweet time are kept for each tweet. That parsing already
## happens inside the collection loop, which stores one list of rows per handle in
## `all_tweets`, so nothing is rebuilt here: re-reading `tweets_user_1` would replace
## `all_tweets` with rows from the last handle's already-consumed cursor only, and would
## change its shape from "list of per-handle lists" to a flat list of rows.

In [None]:
## Making a dataframe out of the list of lists.
## `all_tweets` holds one list per handle, so it is flattened to one row per tweet first.
## The local names deliberately avoid `tw` (the tweepy alias) and `data` (the crime dataframe):
## reusing them as loop/list variables would silently overwrite both.
tweet_rows = [row for user_rows in all_tweets for row in user_rows]

tweets_data = pd.DataFrame(data=tweet_rows, columns=['User', 'Tweet', 'Date'])

In [None]:
## Show the first rows, a blank separator line, then the (rows, columns) shape

print(tweets_data.head(), "", tweets_data.shape, sep="\n")

## Creating an Amazon comprehend client

boto3 is the package through which we can access AWS's APIs (in this case we call AWS Comprehend's sentiment analysis API). We define an empty list called `resp` which will store the responses (each response contains a weight corresponding to how Positive/Negative/Neutral/Mixed the tweet is).

In [None]:
import boto3

## Comprehend client used for the sentiment calls below
client = boto3.client(service_name='comprehend')

In [None]:
def _detect_sentiment(text):
    """Send one tweet's text to Comprehend and return its raw response."""
    return client.detect_sentiment(Text=text, LanguageCode='en')


## run sentiment analysis on every tweet; the resulting Series of responses is stored as resp[0]
resp = []
resp.append(tweets_data['Tweet'].apply(_detect_sentiment))

In [None]:
## Display the raw responses (a one-element list holding the Series of per-tweet results)
resp

In [1]:
## One list per output column: the overall sentiment label plus the four class scores.
## Each list is in the same order as the responses in resp[0].

label = [response['Sentiment'] for response in resp[0]]

scores = [response['SentimentScore'] for response in resp[0]]
positive = [score['Positive'] for score in scores]
negative = [score['Negative'] for score in scores]
mixed = [score['Mixed'] for score in scores]
neutral = [score['Neutral'] for score in scores]

NameError: name 'resp' is not defined

In [None]:
## Attach the label and the four score lists to the tweets dataframe as new columns
for column, values in (
    ('Sentiment', label),
    ('PositiveWeight', positive),
    ('NegativeWeight', negative),
    ('MixedWeight', mixed),
    ('NeutralWeight', neutral),
):
    tweets_data[column] = values

In [None]:
import warnings
warnings.simplefilter('ignore')
## Reducing the Date column to plain calendar dates.
## The values collected from the API are datetime objects, on which strptime() raises a
## TypeError; pd.to_datetime accepts both datetime objects and date strings, so the cell
## works whether the tweets were just fetched or re-loaded from a file.
tweets_data['Date'] = pd.to_datetime(tweets_data['Date']).dt.date

Since we are interested in the average sentiment of a day, we take a mean of all scores and roll-up the data to a daily level. Multiple other methods can be tried, for e.g. - 

* If you feel 2 highly positive tweets should weigh less than 10 moderately positive tweets, you can multiply them by their count (weighted average)
* Conversely, if you feel strongly weighted tweets (albeit less in number) should be more influential, multiply the sentiment by the inverse of count, etc.

In [None]:
## Daily roll-up: mean and count of each of the four sentiment weight columns
weight_columns = ['PositiveWeight', 'NegativeWeight', 'NeutralWeight', 'MixedWeight']
daily_metrics = {column: ['mean', 'count'] for column in weight_columns}
tweets_data_agg = tweets_data.groupby('Date').agg(daily_metrics)

### Reading in the crimes datafile and rolling up to a precinct-day level

In [None]:
## Load the training file from S3, then total the incidents per reported date and precinct
data_file = 'rik_prediction.csv'
data_location = 's3://{bucket}/{key}'.format(bucket=bucket_crime, key=data_file)
full_train = pd.read_csv(data_location)
crime_counts = (
    full_train
    .groupby(['ReportedDate', 'Precinct'])['count_incidents']
    .sum()
    .reset_index()
)

### Reading in weather data file

In [None]:
## Load the weather file from the same S3 bucket
data_file = 'mpls_weather_data_2017-2019_12_07.csv'
data_location = 's3://{bucket}/{key}'.format(bucket=bucket_crime, key=data_file)
weather = pd.read_csv(data_location)

### Reading in dashboard data-file

This is a data file that contains data at a date, neighborhood and crime type level. This was utilized, by joining with the weather dataset, to generate the plots on the map for our dashboard

In [None]:
## Load the dashboard crime file from the same S3 bucket
data_file = 'dashboard.csv'
data_location = 's3://{bucket}/{key}'.format(bucket=bucket_crime, key=data_file)
dashboard = pd.read_csv(data_location)

In [2]:
def _parse_iso_date(text):
    """Turn a 'YYYY-MM-DD' string into a datetime.date."""
    return dt.strptime(text, '%Y-%m-%d').date()


## Replace the string date columns of both frames with date objects
weather['DATE'] = weather['DATE'].map(_parse_iso_date)
dashboard['reportedDateTime'] = dashboard['reportedDateTime'].map(_parse_iso_date)

NameError: name 'weather' is not defined

In [None]:
## Preview the first rows of the dashboard frame
dashboard.head()

In [None]:
## Keep only the precipitation, snow, snow-depth and average-temperature readings plus the date
relevant_columns = ['PRCP', 'SNOW', 'SNWD', 'TAVG', 'DATE']
weather = weather[relevant_columns]

In [None]:
## Preview the first rows of the trimmed weather frame
weather.head()

In [None]:
## Join dashboard crimes onto weather by date; the right join keeps every weather row,
## including dates that have no matching dashboard record
dashboard_all_data = dashboard.merge(
    weather,
    how='right',
    left_on='reportedDateTime',
    right_on='DATE',
)

In [None]:
# Serialise the merged dashboard frame to CSV text in memory (no index column)
csv_buffer = StringIO()
dashboard_all_data.to_csv(csv_buffer, index=False)

# Store that text as an object in the crime bucket
s3_key = 'data_for_dashboard.csv'
s3_resource = boto3.resource('s3')
dashboard_object = s3_resource.Object(bucket_crime, s3_key)
dashboard_object.put(Body=csv_buffer.getvalue())

In [None]:
## Parse the 'YYYY-MM-DD' strings in crime_counts (crime counts at a daily-precinct level)
## into date objects
crime_counts['ReportedDate'] = [
    dt.strptime(date_text, '%Y-%m-%d').date()
    for date_text in crime_counts['ReportedDate']
]

In [None]:
## Combine weather and crime counts; the inner join keeps only dates present in both frames
full_train = weather.merge(
    crime_counts,
    how='inner',
    left_on='DATE',
    right_on='ReportedDate',
)

In [None]:
## The daily aggregation produced two-level column labels (sentiment weight, metric).
## Flatten each pair into a single name of the form ** sentiment_metric **,
## e.g. ('PositiveWeight', 'mean') -> 'PositiveWeight_mean'.
## (The names are built from the column pairs themselves; the earlier version iterated over
## an undefined variable and raised a NameError.)

colnames = [sentiment + '_' + metric for sentiment, metric in tweets_data_agg.columns]
tweets_data_agg.columns = colnames

## Turn the 'Date' group key back into an ordinary column so it can be used as a merge key
tweets_data_agg.reset_index(inplace=True)

Here we merge the tweets dataset with the full-train dataset (that contains weather and crime data at a daily-precinct level).

In [None]:
## Attach the daily sentiment metrics to every weather/crime row; the left join keeps rows
## for days that have no tweets
final_dataset = full_train.merge(
    tweets_data_agg,
    how='left',
    left_on='ReportedDate',
    right_on='Date',
)

In [None]:
## Preview the first rows of the combined crime / weather / sentiment frame
final_dataset.head()

## Writing to S3 bucket

Now that we are done preparing the dataset (containing crimes, weather and sentiment data, from tweets) we write it out to an S3 location. This dataset is used in the prediction notebook to run our Xgboost model.

In [None]:
# Serialise the final dataset to CSV text in memory (no index column)

csv_buffer = StringIO()
final_dataset.to_csv(csv_buffer, index=False)

# Store that text as an object in the crime bucket
s3_key = 'data_with_sentiment.csv'
s3_resource = boto3.resource('s3')
sentiment_object = s3_resource.Object(bucket_crime, s3_key)
sentiment_object.put(Body=csv_buffer.getvalue())