In [1]:
# Import Relevant Packages
import tweepy
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import requests
from requests.structures import CaseInsensitiveDict
import datetime
import rfc3339
import iso8601

In [2]:
# Read API keys
# The credentials file holds one value per line, in this fixed order:
#   API key, API key secret, bearer token, access token, access token secret.
# NOTE(review): this is an absolute, machine-specific path; point CREDENTIALS_PATH at your
# own copy of the file before running the notebook.
CREDENTIALS_PATH = "/Users/sihanzhang/Desktop/IEOR4524/twitter_credentials_v2.txt"

with open(CREDENTIALS_PATH, 'r') as token_file:
    # splitlines() + strip() tolerate Windows line endings and stray spaces, which would
    # otherwise end up inside the tokens and make authentication fail.
    contents = [line.strip() for line in token_file.read().splitlines()]

if len(contents) < 5:
    raise ValueError(
        "Expected 5 credential lines in {path}, found {count}".format(
            path=CREDENTIALS_PATH, count=len(contents)))

API_KEY = contents[0]
API_KEY_SECRET = contents[1]
BEARER_TOKEN = contents[2]
ACCESS_TOKEN = contents[3]
ACCESS_TOKEN_SECRET = contents[4]

In [3]:
# Define a function to create Twitter API v2 Client
def GetClient():
    """Build a tweepy ``Client`` from the credentials read at the top of the notebook.

    Returns:
        The ``tweepy.Client`` instance constructed with the bearer token, the consumer
        key/secret pair and the access token/secret pair.
    """
    credentials = {
        "bearer_token": BEARER_TOKEN,
        "consumer_key": API_KEY,
        "consumer_secret": API_KEY_SECRET,
        "access_token": ACCESS_TOKEN,
        "access_token_secret": ACCESS_TOKEN_SECRET,
    }
    return tweepy.Client(**credentials)

In [4]:
# Search recent tweets using a company name as the query and return the IDs of the matching tweets
def SearchTweets(client, query, limit):
    """Collect the IDs of recent tweets that match ``query``.

    Args:
        client: object exposing ``search_recent_tweets`` (here a tweepy ``Client``).
        query: search query string handed to the recent-search endpoint.
        limit: maximum number of tweets to pull from the paginated results.

    Returns:
        list: the ``id`` of every tweet yielded, in the order received.
    """
    # Ask for 100 results per page and let the paginator flatten pages into single tweets.
    pages = tweepy.Paginator(client.search_recent_tweets, query=query, max_results=100)
    return [status.id for status in pages.flatten(limit=limit)]

In [5]:
# Function used to transform a datetime object into RFC 3339 form
def get_date_string(date):
    """Return ``date`` formatted as an RFC 3339 timestamp by the ``rfc3339`` package."""
    formatted = rfc3339.rfc3339(date)
    return formatted

# Function used to transform RFC 3339 form timestamp into a datetime object
def get_date_object(date_string):
    """Parse an RFC 3339 / ISO 8601 timestamp string with ``iso8601.parse_date``."""
    parsed = iso8601.parse_date(date_string)
    return parsed

In [6]:
# Elastic search: unquoted terms (every word must appear in the tweet, not necessarily adjacent)

# search_term_list = ['polen capital', 'artisan partners', 'edgewood capital', 'western asset management', 
#                     'alger financial', 'sands capital', 'vanguard etf', 'vanguard fund', 'fidelity']

# Inelastic search: quoted terms (the exact phrase must appear in the tweet)

# search_term_list = ['"polen capital"', '"artisan partners"', '"edgewood capital"', 
#                     '"western asset management"', '"alger financial"', '"sands capital"']

# Active query set: the terms actually searched (and later matched against tweet text)
search_term_list = [
    'vanguard etf',
    'vanguard fund',
    'fidelity etf',
    'fidelity fund',
]

In [7]:
# Get Tweet IDs
TWEETS_PER_TERM = 600  # upper bound on the number of tweets collected for each search term

# Reuse one client for every query instead of rebuilding it per search term.
search_client = GetClient()

tweet_ids = []
for term in search_term_list:
    tweet_ids.extend(SearchTweets(search_client, term, TWEETS_PER_TERM))

# A tweet can match more than one search term (e.g. it mentions both "etf" and "fund").
# Drop repeated IDs, keeping first-seen order, so the same tweet is not fetched and
# scored several times downstream.
tweet_ids = list(dict.fromkeys(tweet_ids))
# tweet_ids

In [8]:
# Number of tweet IDs collected across all search terms
# (counted before the language / retweet filtering applied when the tweets are fetched)
len(tweet_ids)

1846

In [9]:
# When we use Tweet IDs as the search query parameter of the command requests.get,
# there can be as many as 100 Tweet IDs in one request.
# So, we first split the Tweet IDs into groups of size 100 and convert them to strings

# Split the Tweet IDs into groups of size 100
def lol(lst, sz):
    """Split ``lst`` into consecutive slices of at most ``sz`` items (the last may be shorter)."""
    chunks = []
    for start in range(0, len(lst), sz):
        chunks.append(lst[start:start + sz])
    return chunks
# NOTE: despite the name, each element here is still a list of up to 100 tweet IDs;
# the conversion to comma-separated strings happens in a later cell.
tweet_ids_strings = lol(tweet_ids, 100)

In [10]:
# Convert Tweet IDs to strings
def convert(list):
    """Join the items of ``list`` into a single comma-separated string.

    Every item is passed through ``str`` first, so integer tweet IDs are accepted.
    An empty input produces an empty string.
    """
    return ",".join(str(item) for item in list)

In [11]:
# Turn each group of IDs into one comma-separated string (one string per lookup request)
tweet_ids_100 = [convert(id_group) for id_group in tweet_ids_strings]
# tweet_ids_100

In [12]:
def _is_retweet(tweet):
    """Return True when a tweet payload describes a retweet.

    A payload counts as a retweet when one of its ``referenced_tweets`` entries has
    type ``'retweeted'``, or when its text starts with the ``RT @`` prefix.
    """
    references = tweet.get('referenced_tweets') or []
    if any(ref.get('type') == 'retweeted' for ref in references):
        return True
    return tweet.get('text', '').startswith('RT @')


# The function uses the Tweet IDs as the search query parameter of the command requests.get 
# to obtain more detailed information about each tweet
def GetTweets(client, twitter_ids):
    """Look up a batch of tweets by ID and collect the fields used by the analysis.

    Only English tweets that are not retweets are kept.

    Args:
        client: unused; the lookup goes through ``requests`` with the bearer token.
            Kept so existing calls keep working.
        twitter_ids: comma-separated string of tweet IDs for a single request.

    Returns:
        dict: column name -> list of values, with the keys ``author_id``, ``tweet_id``,
        ``created_at``, ``text``, ``retweet``, ``like``, ``reply`` and ``quote``.
        The lists are empty when the request fails or nothing passes the filters.
    """
    import warnings

    dict_metrics = {'author_id':[], 'tweet_id':[], 'created_at':[], 'text':[], 
                    'retweet':[], 'like':[], 'reply':[], 'quote':[]}

    url = "https://api.twitter.com/2/tweets"
    params = {
        "ids": twitter_ids,
        "tweet.fields": "attachments,author_id,context_annotations,conversation_id,created_at,entities,"
                        "geo,id,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,"
                        "reply_settings,source,text,withheld",
    }
    headers = {"Authorization": "Bearer " + BEARER_TOKEN}

    resp = requests.get(url, headers=headers, params=params)
    if resp.status_code != 200:
        # Make the failure visible instead of silently returning None, while still handing
        # the caller an (empty) result it can turn into a DataFrame.
        warnings.warn(
            "Tweet lookup failed with HTTP status {status}; this batch is skipped".format(
                status=resp.status_code),
            RuntimeWarning)
        return dict_metrics

    # 'data' is absent when none of the requested IDs could be returned.
    for tweet in resp.json().get('data', []):
        # Match the retweet marker itself: searching for 'RT ' anywhere in the text also
        # discarded ordinary tweets containing words such as "SMART " or "ALERT ".
        if tweet.get('lang') != 'en' or _is_retweet(tweet):
            continue

        metrics = tweet['public_metrics']
        dict_metrics['author_id'].append(tweet['author_id'])
        dict_metrics['tweet_id'].append(tweet['id'])
        dict_metrics['created_at'].append(get_date_object(tweet['created_at']))
        dict_metrics['text'].append(tweet['text'])
        dict_metrics['retweet'].append(metrics['retweet_count'])
        dict_metrics['like'].append(metrics['like_count'])
        dict_metrics['reply'].append(metrics['reply_count'])
        dict_metrics['quote'].append(metrics['quote_count'])

    return dict_metrics

In [13]:
# Find the company that the tweet mentions
def FindCompany(text, search_term_list):
    """Return the first search term whose words all occur in ``text``.

    Double quotes are stripped from a term before it is split on single spaces, and each
    resulting word is looked up as a plain substring of ``text``. Matching is
    case-sensitive, so callers are expected to pass text in the same case as the terms.

    Args:
        text: tweet text to inspect.
        search_term_list: candidate terms, checked in order.

    Returns:
        The matching entry of ``search_term_list`` exactly as given (quotes included),
        or ``None`` when no term matches.
    """
    for term in search_term_list:
        words = term.replace('"', '').split(" ")
        # Plain substring test: the words are literal text, not regular expressions, so a
        # term containing characters such as '+', '(' or '.' must not be treated as a pattern.
        if all(word in text for word in words):
            return term
    return None

In [14]:
# First construct a DataFrame without company info
# The empty template fixes the column order and supplies the (still unfilled) 'company' column.
METRIC_COLUMNS = ['company', 'author_id', 'tweet_id', 'created_at',
                  'text', 'retweet', 'like', 'reply', 'quote']
batch_frames = [pd.DataFrame({column: [] for column in METRIC_COLUMNS})]

# Reuse one client for every batch instead of rebuilding it each time.
lookup_client = GetClient()
for id_batch in tweet_ids_100:
    batch_frames.append(pd.DataFrame(GetTweets(lookup_client, id_batch)))

# Concatenate once at the end: growing a DataFrame inside the loop re-copies every
# previously collected row on each iteration.
df_metrics_nocomp = pd.concat(batch_frames, ignore_index=True)

In [15]:
# Then, find the company
df_metrics = df_metrics_nocomp.copy()

# Fill the whole 'company' column in one assignment rather than writing it cell by cell
# with .loc, which is slow and can trigger dtype-upcast warnings when strings are written
# into a column that so far only holds NaN.
# The text is lower-cased because the search terms are lower-case.
df_metrics['company'] = [
    FindCompany(str(raw_text).lower(), search_term_list)
    for raw_text in df_metrics_nocomp['text']
]

# Tweets that match no search term have no company (None) and are dropped here.
df_metrics = df_metrics.dropna()
df_metrics = df_metrics.reset_index(drop=True)

In [16]:
# Add a column showing VaderSentiment scores
# Build the analyzer once and reuse it for every tweet: constructing it is loop-invariant
# work and does not need to be repeated per row.
analyzer = SentimentIntensityAnalyzer()

# polarity_scores() returns one dict per tweet; the dicts are stored as-is in the column.
list_VaderSenti = [analyzer.polarity_scores(tweet_text) for tweet_text in df_metrics['text']]
df_metrics['VaderSenti'] = list_VaderSenti

In [17]:
# The DataFrame includes 10 columns:
#   company    - the search term matched in the tweet text
#   author_id  - Author ID
#   tweet_id   - Tweet ID
#   created_at - time of tweet creation
#   text       - the text of the actual tweet
#   retweet / like / reply / quote - how many times it was retweeted, liked, replied to, quoted
#   VaderSenti - dict of VaderSentiment scores ('neg', 'neu', 'pos', 'compound')
df_metrics

Unnamed: 0,company,author_id,tweet_id,created_at,text,retweet,like,reply,quote,VaderSenti
0,vanguard etf,860018737,1522331625601871874,2022-05-05 21:45:17+00:00,The Vanguard Total Stock Market ETF (VTI) info...,0.0,0.0,0.0,0.0,"{'neg': 0.11, 'neu': 0.89, 'pos': 0.0, 'compou..."
1,vanguard etf,860018737,1522331621671817216,2022-05-05 21:45:16+00:00,The Vanguard S&amp;P 500 ETF (VOO) information...,0.0,0.0,0.0,0.0,"{'neg': 0.116, 'neu': 0.884, 'pos': 0.0, 'comp..."
2,vanguard etf,2416188264,1522321435485777921,2022-05-05 21:04:48+00:00,Vanguard Mid-Cap Growth ETF $NYSEARCA:VOT Reac...,0.0,0.0,0.0,0.0,"{'neg': 0.132, 'neu': 0.629, 'pos': 0.239, 'co..."
3,vanguard etf,1355628294490624000,1522320606133469185,2022-05-05 21:01:30+00:00,🌱🔋 $PBW Invesco WilderHill Clean Energy ETF\nv...,0.0,0.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.734, 'pos': 0.266, 'comp..."
4,vanguard etf,89094168,1522316273069309952,2022-05-05 20:44:17+00:00,Vanguard cuts fees on $81.4bn bond ETF https:/...,0.0,0.0,0.0,0.0,"{'neg': 0.239, 'neu': 0.761, 'pos': 0.0, 'comp..."
...,...,...,...,...,...,...,...,...,...,...
884,fidelity fund,935550722363592706,1519936550850777088,2022-04-29 07:08:07+00:00,April has been a month of wins!\n\nWe are exci...,0.0,0.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.703, 'pos': 0.297, 'comp..."
885,fidelity etf,1355628294490624000,1519919846796017666,2022-04-29 06:01:44+00:00,🌱🔋 $QCLN First Trust NASDAQ Clean Edge Green E...,0.0,0.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.693, 'pos': 0.307, 'comp..."
886,fidelity fund,2254293637,1519897916135317505,2022-04-29 04:34:36+00:00,Referring to Musk “Either that means a separat...,0.0,0.0,1.0,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
887,fidelity etf,1355628294490624000,1519874362907709440,2022-04-29 03:01:00+00:00,🌱🔋 $QCLN First Trust NASDAQ Clean Edge Green E...,0.0,0.0,0.0,0.0,"{'neg': 0.0, 'neu': 0.693, 'pos': 0.307, 'comp..."
