In [2]:
import json
import pymongo
import tweepy
import time
import string
from collections import Counter
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import matplotlib.dates as mdates
import pandas as pd
import numpy as np
from time import sleep
from datetime import datetime
import re
import csv
from datetime import datetime, timedelta

import requests
import os


In [3]:
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Args:
        bearer_token: token string placed after the ``Bearer`` scheme.

    Returns:
        A dict holding a single ``Authorization`` entry.
    """
    authorization_value = "Bearer {}".format(bearer_token)
    return {"Authorization": authorization_value}


def connect_to_endpoint(url, headers, params, max_retries=3, rate_limit_wait=600):
    """Send a GET request and return the decoded JSON body.

    Every request is preceded by a one second pause. When the server answers
    with HTTP 429 (too many requests) the function waits ``rate_limit_wait``
    seconds and then re-sends the *same* request, instead of giving up and
    silently losing that query.

    Args:
        url: endpoint to query.
        headers: HTTP headers (e.g. the result of ``create_headers``).
        params: query-string parameters for the request.
        max_retries: how many times a rate-limited request is re-sent
            before giving up. ``0`` means: wait once, then return ``{}``.
        rate_limit_wait: seconds to wait after a 429 response.

    Returns:
        ``response.json()`` on HTTP 200, otherwise an empty dict.
    """
    retries_left = max_retries
    while True:
        time.sleep(1)  # keep consecutive requests at least one second apart
        response = requests.request("GET", url, headers=headers, params=params)
        if response.status_code == 200:
            return response.json()

        # Log every failure together with the parameters that caused it.
        print(params, response.status_code, response.text)
        if response.status_code != 429:
            return {}

        # Rate limited: back off, then retry while attempts remain.
        time.sleep(rate_limit_wait)
        if retries_left <= 0:
            return {}
        retries_left -= 1

    
# Character class with the emoji / pictograph ranges stripped from tweets.
# Compiled once instead of on every call. Note that the broad ranges
# (U+24C2-U+1F251 and U+10000-U+10FFFF) also cover many non-emoji code
# points, CJK text included.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"  # zero width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)


def preprocessTweet(text):
    """Clean a tweet's text: drop emoji, URLs and @-mentions.

    Steps, in order:
      1. remove every run of characters matched by ``_EMOJI_PATTERN``;
      2. remove every token that starts with ``http``;
      3. split on whitespace, discard words beginning with ``@`` and join
         the rest with single spaces (this also collapses whitespace).

    Hashtags are left untouched.

    Args:
        text: raw tweet text.

    Returns:
        The cleaned text.
    """
    twt_noEmoji = _EMOJI_PATTERN.sub('', text)
    # A second, more elaborate URL regex used to run before this one, but its
    # result was immediately overwritten, so it only cost time.
    twt_noURL = re.sub(r'http\S+', '', twt_noEmoji)
    return ' '.join(word for word in twt_noURL.split() if not word.startswith('@'))

def saveTwWithTag(json_response, mongo_db, collection_name, tag):
    """Store the tweets and expansion objects of one search response.

    Tweets found under ``data`` are written to ``mongo_db[collection_name]``
    with their text cleaned by ``preprocessTweet`` and labelled with ``tag``.
    Objects found under ``includes`` are written to fixed collections:
    places to ``twt_api2_places``, users to ``twt_api2_users`` and referenced
    tweets (unchanged) to ``twt_api2_ref_tweets``.

    Args:
        json_response: decoded API response (dict); may lack either section.
        mongo_db: database handle, indexed by collection name.
        collection_name: collection receiving the tweet records.
        tag: label saved with each tweet (callers pass the search term).
    """
    if 'data' in json_response:
        tweets = json_response['data']
        print(len(tweets))
        for tweet in tweets:
            record = {
                'time': tweet['created_at'],
                'twt_id': tweet['id'],
                'text': preprocessTweet(tweet['text']),
                'tag': tag,
            }
            if 'geo' in tweet:
                geo = tweet['geo']
                # keep just the place id when there is one, else the raw geo object
                record['geo'] = geo['place_id'] if 'place_id' in geo else geo
            for optional_key in ('author_id', 'referenced_tweets'):
                if optional_key in tweet:
                    record[optional_key] = tweet[optional_key]
            mongo_db[collection_name].insert_one(record)

    # expansion objects (places, users, referenced tweets), if any
    if 'includes' not in json_response:
        return
    expansions = json_response['includes']

    if 'places' in expansions:
        for place in expansions['places']:
            place_record = {'place_id': place['id']}
            for field in ('name', 'country', 'place_type', 'geo'):
                place_record[field] = place[field]
            mongo_db['twt_api2_places'].insert_one(place_record)

    if 'users' in expansions:
        for user in expansions['users']:
            user_record = {'user_id': user['id']}
            for field in ('name', 'username', 'description'):
                user_record[field] = user[field]
            mongo_db['twt_api2_users'].insert_one(user_record)

    if 'tweets' in expansions:
        for referenced in expansions['tweets']:
            mongo_db['twt_api2_ref_tweets'].insert_one(referenced)

def saveTweetContainsList2Mongo(termlist, collection_name, start_time, end_time, is_retweet=False):
    """Search the given period once per term and store every result page.

    For each term an English-only query (``<term> lang:en``) is sent to the
    endpoint in ``search_url``; all pages of the answer are saved with
    ``saveTwWithTag``, using the term as the tag.

    Relies on the notebook-level names ``search_url``, ``headers`` and
    ``mongo_db`` being defined before the call.

    Args:
        termlist: iterable of query strings; each one is searched separately.
        collection_name: MongoDB collection receiving the tweets.
        start_time: lower bound of the period, as a timestamp string.
        end_time: upper bound of the period, as a timestamp string.
        is_retweet: when False (default) ``-is:retweet`` is appended to the
            query so retweets are excluded; when True they are included.
    """
    for term in termlist:
        print(term)
        query_str = term + ' lang:en'
        if not is_retweet:
            query_str += ' -is:retweet'

        query_params = {'query': query_str,
                        'tweet.fields': 'created_at,geo,author_id,referenced_tweets',
                        'expansions': 'geo.place_id,author_id,referenced_tweets.id',
                        'user.fields': 'description',
                        'place.fields': 'country,geo,id,name,place_type',
                        'max_results': 500,
                        'start_time': start_time,
                        'end_time': end_time}

        json_response = connect_to_endpoint(search_url, headers, query_params)
        if 'meta' not in json_response:
            # empty dict: the request failed, nothing to store for this term
            continue
        saveTwWithTag(json_response, mongo_db, collection_name, term)

        # Pagination: keep following next_token until a page has none.
        # .get() guards against a non-empty page that carries no 'meta',
        # which previously raised KeyError in the loop condition.
        while 'next_token' in json_response.get('meta', {}):
            query_params['next_token'] = json_response['meta']['next_token']
            json_response = connect_to_endpoint(search_url, headers, query_params)
            if not json_response:
                break
            saveTwWithTag(json_response, mongo_db, collection_name, term)

In [4]:
# Configure the API endpoints and credentials.
# The bearer token is read from the BEARER_TOKEN environment variable so the
# secret never has to be typed into the notebook; when the variable is not
# set it falls back to the empty string that was hard-coded here before.
bearer_token = os.environ.get("BEARER_TOKEN", "")
search_url = "https://api.twitter.com/2/tweets/search/all"
count_url = "https://api.twitter.com/2/tweets/counts/all"
headers = create_headers(bearer_token)

In [5]:
# Handle to the "sicss" database on the MongoDB server at localhost:27017.
mongo_db = pymongo.MongoClient(host='localhost', port=27017)['sicss']

In [None]:
# strftime pattern for the RFC 3339 timestamps sent to the Twitter API,
# e.g. 2016-09-07T00:00:00.000Z. The fractional part and the trailing "Z"
# are literal: the previous pattern used %z, which is only empty for naive
# datetimes and would have produced an invalid value for aware ones.
date_format_str = '%Y-%m-%dT%H:%M:%S.000Z'  # times are taken to be UTC

In [None]:
# 2016 election period: collect a 3-minute window of tweets at the start of
# every 30-minute step between 2016-09-07 and 2016-11-08.
start_time = datetime(2016, 9, 7, 0, 0, 0)
end_time = datetime(2016, 11, 8, 0, 0, 0)
while start_time < end_time:
    twt_start = start_time
    twt_end = start_time + timedelta(minutes=3)
    # keep tweets that name one candidate (Clinton or Trump) but not the other
    saveTweetContainsList2Mongo(
        ['(Hillary Clinton -Donald -Trump) OR (Donald Trump -Clinton -Hillary)'],
        'ClintonTrump2016FallTweets',
        twt_start.strftime(date_format_str),
        twt_end.strftime(date_format_str),
        is_retweet=False,
    )
    start_time += timedelta(minutes=30)


In [None]:
# 2008: collect a 3-minute window of tweets at the start of every 30-minute
# step between 2008-10-29 and 2008-11-05.
start_time = datetime(2008, 10, 29, 0, 0, 0)
end_time = datetime(2008, 11, 5, 0, 0, 0)
while start_time < end_time:
    twt_start = start_time
    twt_end = start_time + timedelta(minutes=3)
    # keep tweets that name one candidate (Obama or McCain) but not the other
    saveTweetContainsList2Mongo(
        ['(Barack Obama -McCain ) OR (John McCain -Obama)'],
        'ObamaMicain2008',
        twt_start.strftime(date_format_str),
        twt_end.strftime(date_format_str),
        is_retweet=False,
    )
    start_time += timedelta(minutes=30)


In [None]:
# Manually re-run a single window whose query was lost when the rate limit
# was reached. The call is kept disabled inside a string literal; remove the
# surrounding quotes to execute it.
'''saveTweetContainsList2Mongo(['(Barack Obama -McCain ) OR (John McCain -Obama)'],
                            'ObamaMicain2008', 
                            '2008-11-03T02:30:00.000Z',
                            '2008-11-03T02:33:00.000Z',
                            is_retweet=False)'''

In [None]:
# 2012: collect a 3-minute window of tweets at the start of every 30-minute
# step between 2012-10-19 and 2012-10-26.
start_time = datetime(2012, 10, 19, 0, 0, 0)
end_time = datetime(2012, 10, 26, 0, 0, 0)
while start_time < end_time:
    twt_start = start_time
    twt_end = start_time + timedelta(minutes=3)
    # keep tweets that name one candidate (Obama or Romney) but not the other
    saveTweetContainsList2Mongo(
        ['(Barack Obama -Romney ) OR (Mitt Romney -Obama)'],
        'ObamaRomney2012',
        twt_start.strftime(date_format_str),
        twt_end.strftime(date_format_str),
        is_retweet=False,
    )
    start_time += timedelta(minutes=30)

In [None]:
# Manually re-run a single window whose query was lost when the rate limit
# was reached. The call is kept disabled inside a string literal; remove the
# surrounding quotes to execute it.

'''saveTweetContainsList2Mongo(['(Barack Obama -Romney ) OR (Mitt Romney -Obama)'],
                            'ObamaRomney2012', 
                            '2012-10-24T20:00:00.000Z',
                            '2012-10-24T20:03:00.000Z',
                            is_retweet=False)'''


In [None]:
# 2020: collect a 3-minute window of tweets at the start of every 30-minute
# step between 2020-09-29 03:00 and 2020-10-06 00:00.
start_time = datetime(2020, 9, 29, 3, 0, 0)
end_time = datetime(2020, 10, 6, 0, 0, 0)
while start_time < end_time:
    twt_start = start_time
    twt_end = start_time + timedelta(minutes=3)
    # keep tweets that name one candidate (Biden or Trump) but not the other
    saveTweetContainsList2Mongo(
        ['(Joe Biden -Donald -Trump) OR (Donald Trump -Joe -Biden)'],
        'BidenTrump2020',
        twt_start.strftime(date_format_str),
        twt_end.strftime(date_format_str),
        is_retweet=False,
    )
    start_time += timedelta(minutes=30)

In [None]:
# Manually re-run a missed window; any duplicates this creates are dropped
# afterwards on the MongoDB side. The call is kept disabled inside a string
# literal; remove the surrounding quotes to execute it.
'''saveTweetContainsList2Mongo(['(Joe Biden -Donald -Trump) OR (Donald Trump -Joe -Biden)'],
                            'BidenTrump2020', 
                            '2020-10-03T16:00:00.000Z',
                            '2020-10-03T16:03:00.000Z',
                            is_retweet=False)'''

In [13]:
# Read the 2016 tweets back from MongoDB, split by the candidate they name.
def _exclusive_mention_pipeline(wanted, unwanted):
    """Aggregation pipeline for tweets whose text matches one regex only.

    Args:
        wanted: compiled regex the tweet text must match.
        unwanted: compiled regex the tweet text must not match.

    Returns:
        A two-stage pipeline: the ``$match`` filter, then a ``$project``
        keeping ``twt_id``, ``time`` and ``text`` (``_id`` is dropped).
    """
    match_stage = {'$match': {'text': {'$regex': wanted,
                                       '$not': {'$regex': unwanted}}}}
    project_stage = {'$project': {'twt_id': 1, '_id': 0, 'time': 1, 'text': 1}}
    return [match_stage, project_stage]


# tweets mentioning Clinton but not Trump (case-insensitive)
clt_pipeline = _exclusive_mention_pipeline(re.compile(r"(?i)Clinton"),
                                           re.compile(r"(?i)Trump"))
df_clt_tweets = pd.DataFrame(
    list(mongo_db.get_collection('ClintonTrump2016FallTweets').aggregate(clt_pipeline))
)

# tweets mentioning Trump but not Clinton (case-insensitive)
trp_pipeline = _exclusive_mention_pipeline(re.compile(r"(?i)Trump"),
                                           re.compile(r"(?i)Clinton"))
df_trp_tweets = pd.DataFrame(
    list(mongo_db.get_collection('ClintonTrump2016FallTweets').aggregate(trp_pipeline))
)

In [15]:
# Export both frames as CSV files in the current working directory
# (the DataFrame index is written as the first column).
df_clt_tweets.to_csv('Clinton_twts_2016.csv')
df_trp_tweets.to_csv('Trump_twts_2016.csv')

In [11]:
# Drop the two names once the data has been exported; running this cell
# again (or before the frames exist) raises NameError.
del df_clt_tweets, df_trp_tweets