In [None]:
import tweepy
import pickle
import time
from datetime import datetime

#-------------------------------------------------------------------------------
# This section gathers data using the tweepy package. Each run collects tweets
# for 10 minutes. I originally ran this section as an automated task every
# hour, on the hour.
#-------------------------------------------------------------------------------

# Timestamp captured once when the script starts; it is formatted into the
# name of the pickle file that the stream listener writes when collection ends.
d = datetime.now()
def create_api():
    '''Build an authenticated tweepy API client.

    The four credential strings are blank placeholders in this file and must
    be filled in before the function is useful.

    Returns a tweepy.API instance wrapping an OAuth handler.
    '''
    # Application (consumer) credentials.
    consumer_key, consumer_secret = '', ''
    # User (access token) credentials.
    access_token, access_secret = '', ''

    handler = tweepy.OAuthHandler(consumer_key=consumer_key,
                                  consumer_secret=consumer_secret)
    handler.set_access_token(access_token, access_secret)

    client = tweepy.API(handler)
    return client


class ListStreamListener(tweepy.StreamListener):
    '''Stream listener that buffers tweets for a fixed time window.

    This class overrides tweepy.StreamListener. Tweets are accumulated in
    self.my_tweets until time_limit seconds have passed since the listener
    was created. The first status that arrives after the limit triggers the
    buffer being pickled to disk and the stream being stopped.
    '''
    def __init__(self, time_limit=600):
        '''Record the start time and the length of the collection window.

        time_limit: number of seconds to keep collecting (default 600).
        '''
        self.start_time = time.time()
        self.limit = time_limit
        super(ListStreamListener, self).__init__()
        self.my_tweets = []

    def on_status(self, status):
        '''Store one tweet, or save everything once the time limit is reached.

        While the time limit has not been reached, the tweet is appended to
        self.my_tweets as a tuple. Once the limit is reached, the collected
        tweets are saved as a pickle file and False is returned; the status
        that triggers the save is not itself stored.

        status: the tweepy status object delivered by the stream.
        Returns False after saving, otherwise None.
        '''
        if (time.time() - self.start_time) >= self.limit:
            # `d` is the module-level timestamp taken when the script started.
            with open('C:\\tools\\dat\\tweet_{0}.pickle'.format(d.strftime('%Y%m%d%H%M%S')), 'wb') as f:
                # Dump this listener's own buffer rather than reaching for a
                # module-level instance, so the class works under any name.
                pickle.dump(self.my_tweets, f)

            return False

        #I store hashtags, urls, and user mentions as lists
        entities = status.entities or {}
        ht = [item.get('text') for item in entities.get('hashtags') or []]
        url = [item.get('expanded_url') for item in entities.get('urls') or []]
        um = [item.get('screen_name')
              for item in entities.get('user_mentions') or []]

        # A status may carry no place object; store None instead of raising
        # AttributeError and killing the stream.
        place = status.place
        place_full_name = place.full_name if place is not None else None

        self.my_tweets.append((status.id_str,
                               status.created_at,
                               status.user.utc_offset,
                               status.retweet_count,
                               status.in_reply_to_screen_name,
                               status.source,
                               place_full_name,
                               status.user.location,
                               ht,
                               url,
                               um,
                               status.text))

    def on_error(self, status_code):
        '''Return False when the stream reports status code 420.

        For any other status code nothing is returned (None).
        '''
        if status_code == 420:
            return False

# Authenticate, then start a filtered stream that feeds ListStreamListener.
api = create_api()
# Collect for 600 seconds (10 minutes); after that the listener pickles its
# tweets and returns False from on_status.
my_stream_listener = ListStreamListener(time_limit=600)
my_stream = tweepy.Stream(auth = api.auth, listener=my_stream_listener)
#The geographic coordinates are for the Twin Cities.
# NOTE(review): the four numbers look like a bounding box given as
# west longitude, south latitude, east longitude, north latitude -- confirm
# against the streaming API documentation.
my_stream.filter(locations=[-93.4947038,44.8455126,-92.8447074,45.0582123])

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#-------------------------------------------------------------------------------
# This section creates and saves the data frame for further analysis in R.  
# There are numerous helper methods for natural language processing.
#-------------------------------------------------------------------------------

def load_pickle(file):
    '''Unpickle and return the contents of dat/<file>.

    file: name of a pickle file inside the relative 'dat' directory.
    '''
    path = 'dat/{0}'.format(file)
    with open(path, 'rb') as handle:
        contents = pickle.load(handle)
    return contents

def create_df():
    '''Load every pickle in the 'dat' directory into one tweet data frame.

    Besides the twelve stored tweet fields, three columns are added:

    text_clean -- the tweet text with every token that starts with '@', '#',
                  'http://' or 'https://' removed.
    hour       -- the hour of created_at, as stored (not shifted by utc_diff).
    ToD        -- 'morning' when the local hour is 5-10, 'night' when the
                  local hour is 16-21 (both inclusive), otherwise NaN.

    Returns the assembled pandas DataFrame.
    '''
    # Hours that local time is behind the created_at timestamps.
    # NOTE(review): presumably US Central Standard Time with created_at in
    # UTC; daylight saving time is not handled -- confirm.
    utc_diff = 6
    cols=['id',
          'created_at',
          'utc_offset',
          'retweet_count',
          'reply_screen_name',
          'source',
          'place_full_name',
          'user_location',
          'hashtags',
          'urls',
          'user_mentions',
          'text']

    my_tweets = []
    for file in os.listdir('dat'):
        # extend() avoids rebuilding the whole list for every file.
        my_tweets.extend(load_pickle(file))
    df = pd.DataFrame(my_tweets, columns=cols)
    # I remove the hashtags, urls, and user mentions from the text because
    # they are already stored in their own columns
    df['text_clean'] = df.text.replace('(@|#|http(s)?://)[^ ]+', '', regex=True)
    # to_datetime is a no-op for datetime data and keeps .dt usable when the
    # frame is empty (an empty column would otherwise have object dtype).
    df['hour'] = pd.to_datetime(df.created_at).dt.hour
    # Convert to the local hour with wrap-around. Comparing the raw hour with
    # 16+utc_diff .. 21+utc_diff (22..27) could never match hours 0-3, so most
    # of the night window was silently dropped.
    local_hour = (df.hour - utc_diff) % 24
    # Object dtype so the string labels can be assigned without a dtype clash.
    df['ToD'] = pd.Series(np.nan, index=df.index, dtype=object)
    df.loc[local_hour.between(5, 10), 'ToD'] = 'morning'
    df.loc[local_hour.between(16, 21), 'ToD'] = 'night'
    return df


def create_text(iterable):
    '''Join the strings in *iterable* into one lower-cased text.

    The pieces are separated by a single space so that the last word of one
    piece and the first word of the next stay separate tokens.

    iterable: an iterable of strings (for example a column of tweet texts).
    Returns the combined, lower-cased string ('' for an empty iterable).
    '''
    text = " ".join(iterable)
    return text.lower()


def create_tokens(text):
    '''Split *text* into word tokens using nltk.word_tokenize.'''
    tokens = nltk.word_tokenize(text)
    return tokens


def remove_punc(text):
    '''Return *text* with every character in string.punctuation deleted.'''
    # The third argument of maketrans lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)


def remove_stopwords(tokens):
    '''Return the tokens that are not in NLTK's English stopword list.

    tokens: an iterable of word strings.
    Returns a new list with the stopwords left out, order preserved.
    '''
    # A frozenset gives constant-time membership tests; checking each token
    # against the original list scans the whole list every time.
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))
    tokens_clean = [t for t in tokens if t not in stopwords]
    return tokens_clean
    

def lemmatize_tokens(tokens):
    '''Run every token through nltk.WordNetLemmatizer and return the results
    as a list, in the same order.
    '''
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

    
# def stem_tokens(tokens):
    # #Stemmer
    # porter = nltk.PorterStemmer()
    # return [porter.stem(t) for t in tokens]


def create_pos(tokens):
    '''Tag *tokens* with parts of speech using nltk.pos_tag.'''
    tagged = nltk.pos_tag(tokens)
    return tagged


def create_fdist(tokens):
    '''Build an nltk.FreqDist from *tokens*.'''
    distribution = nltk.FreqDist(tokens)
    return distribution


def get_pos(pos, part='NN'):
    '''Pick out the words whose tag equals *part*.

    pos:  a sequence of pairs whose first item is the word and whose second
          item is its tag.
    part: the tag to keep (default 'NN').
    Returns the matching words as a list, in their original order.
    '''
    matches = []
    for pair in pos:
        if pair[1] == part:
            matches.append(pair[0])
    return matches


def create_textobj(tokens):
    '''Wrap *tokens* in an nltk.Text object.'''
    text_object = nltk.Text(tokens)
    return text_object


def add_sentiment(df):
    '''Append sentiment score columns to *df*.

    Each value of df.text_clean is scored with NLTK's
    SentimentIntensityAnalyzer.polarity_scores, and the resulting score
    dictionaries become new columns placed to the right of the existing ones.

    df: a DataFrame with a text_clean column of strings.
    Returns a new DataFrame; *df* itself is not modified.
    '''
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(s) for s in df.text_clean]
    # Reuse df's index so the column-wise concat lines the scores up with
    # their rows even when df does not have a default 0..n-1 index
    # (for example after filtering).
    sentiment = pd.DataFrame(scores, index=df.index)
    return pd.concat([df, sentiment], axis=1)


def create_clean_tokens(df):
    '''Turn the text_clean column of *df* into a list of cleaned tokens.

    The column is combined into one lower-cased text, punctuation is
    stripped, the text is tokenized, stopwords are dropped and the remaining
    tokens are lemmatized.
    '''
    combined = create_text(df.text_clean)
    unpunctuated = remove_punc(combined)
    content_tokens = remove_stopwords(create_tokens(unpunctuated))
    return lemmatize_tokens(content_tokens)


def create_wordcloud(df, part='NN'):
    '''Build the one-column token table used for a word cloud.

    df:   a DataFrame with a text_clean column.
    part: part-of-speech tag to keep; can be VB or NN (default 'NN').
          A falsy value (None or '') skips the tagging step and keeps every
          cleaned token.

    Returns a DataFrame with a single column: named after *part* when
    filtering by tag, or 'tokens' when no tag is given.
    '''
    tokens_clean = create_clean_tokens(df)
    if part:
        pos = create_pos(tokens_clean)
        # Pass the requested tag through; it used to be hard-coded to 'NN',
        # so asking for 'VB' returned nouns under a 'VB' column header.
        part_list = get_pos(pos, part=part)
        return pd.DataFrame(part_list, columns=[part])
    return pd.DataFrame(tokens_clean, columns=['tokens'])


def save_df(df, name):
    '''Write *df* to '<name>.csv' without the index and print a confirmation.

    df:   the DataFrame to save.
    name: output file name without the .csv extension.
    '''
    target = '{0}.csv'.format(name)
    df.to_csv(target, index=False)
    message = 'I successfully saved the {}!'.format(name)
    print(message)


def get_morn_and_night(df):
    '''Return the rows of *df* whose ToD is 'morning' or 'night'.'''
    time_of_day = df['ToD']
    wanted = (time_of_day == 'morning') | (time_of_day == 'night')
    return df.loc[wanted, :]


def get_morn(df):
    '''Return the rows of *df* whose ToD is 'morning'.'''
    is_morning = df['ToD'] == 'morning'
    return df.loc[is_morning, :]


def get_night(df):
    '''Return the rows of *df* whose ToD is 'night'.'''
    is_night = df['ToD'] == 'night'
    return df.loc[is_night, :]

#The following loads and saves the dataframes
# Build the tweet frame from the pickled files and attach sentiment scores.
df = add_sentiment(create_df())

# Keep only the tweets labelled 'morning' or 'night' and write them to
# data_sentiment.csv.
df_mn = get_morn_and_night(df)
save_df(df_mn, 'data_sentiment')

# Word-cloud token tables, built separately for the morning and the night
# tweets (create_wordcloud is called with its default part='NN').
wc_m = create_wordcloud(get_morn(df))
wc_n = create_wordcloud(get_night(df))

# Written to data_wc_morn.csv and data_wc_night.csv.
save_df(wc_m, 'data_wc_morn')
save_df(wc_n, 'data_wc_night')