# Get Tweets related to hashtag/cashtag 

This notebook pulls data from Twitter using tweepy. You must have access to Twitter's developer API, because the notebook uses your credentials to connect and access data.

### Load libraries

In [16]:
import tweepy
import pandas as pd
import time
from datetime import date,timedelta
from pprint import pprint

### Function to get connection to API

In [21]:
def get_authorized(consumer_key,consumer_secret,access_token,access_token_secret):
    '''
    Build an authenticated tweepy client from the user's Twitter credentials.

    Parameters:
    consumer_key : api key from twitter, supplied by the user
    consumer_secret : api secret from twitter, supplied by the user
    access_token : access token from twitter, supplied by the user
    access_token_secret : access token secret from twitter, supplied by the user

    Returns:
    Tweepy api object used for every subsequent call to twitter
    '''
    oauth_handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    oauth_handler.set_access_token(access_token, access_token_secret)
    # Ask tweepy to wait (and print a notice) whenever the rate limit is reached
    client_options = {'wait_on_rate_limit': True, 'wait_on_rate_limit_notify': True}
    return tweepy.API(oauth_handler, **client_options)


### Function to get tweets related to hashtag

Twitter has a rate limit of 900 requests / 15 min per user auth and 300 requests / 15 min per app auth, and it only allows pulling data from the past week. We pull the tweets made in the past 2 days for each company.

In [23]:
def get_tweets(api,tag,num_tweets=10):
    '''
    This function gets tweets related to a cashtag made in the past two days.

    Parameters:
    api : tweepy api object which helps in connecting to twitter
    tag : cashtag, used verbatim as the search query
    num_tweets : Maximum number of tweets to fetch. If this exceeds 300, no cap
                 is applied and every tweet the search returns for the two-day
                 window is collected.

    Returns:
    A pandas DataFrame with the columns 'Hashtag', 'Timestamp', 'Tweet ID' and
    'Tweet' (one row per tweet; zero rows but the same columns when nothing
    matched), or None when tweepy raises a TweepError.
    '''
    # Extend `columns` and the row built below together to keep more tweet fields
    columns = ['Timestamp', 'Tweet ID', 'Tweet']
    today = date.today()
    try:
        cursor = tweepy.Cursor(api.search, q=tag, since=today-timedelta(days=2), until=today)
        tweets = cursor.items() if num_tweets > 300 else cursor.items(num_tweets)
        # The tweets are pulled while iterating, so the iteration stays inside the try
        rows = [[tweet.created_at, tweet.id, tweet.text] for tweet in tweets]
    except tweepy.TweepError as te:
        print(te.reason)
        return None
    # Naming the columns up front gives empty results the same schema as non-empty ones
    tweets_df = pd.DataFrame(rows, columns=columns)
    tweets_df.insert(0, 'Hashtag', tag)
    return tweets_df

### Check connection to API 
For this you have to edit the file authentication.txt and add your credentials to it, one per line, in this order: API key, API secret key, access token, access token secret.

In [44]:
# Credentials come from twitter after creating a dev account. authentication.txt
# holds them one per line: api key, api secret key, access token, access token secret.
# The with-block closes the file even if reading it raises.
with open("authentication.txt","r") as f:
    lines=f.readlines()
api_key=lines[0].strip()
api_secret_key=lines[1].strip()
access_token=lines[2].strip()
access_token_secret=lines[3].strip()
api = get_authorized(api_key,api_secret_key,access_token,access_token_secret)
print(api)

<tweepy.api.API object at 0x0000018DDF4B40A0>


### Read Tech Companies list file and get tweets for corresponding tech company

In [27]:
# Read the file containing the company list. The loop below reads the company
# name from positional column 3 and the ticker symbol from positional column 10.
hashtags=pd.read_csv("FinalList_Tech.csv",header=0)
retry_later={}        # ticker -> company name for fetches that raised
collected_frames=[]   # one DataFrame per company that returned tweets
print("Getting data from twitter ...")
for _,row in hashtags.iterrows():
    # .iloc makes the positional lookup explicit instead of relying on the
    # deprecated integer fallback of row[...]
    company,ticker=row.iloc[3],row.iloc[10]
    # 2000 is above the 300 threshold of get_tweets, so all tweets from the
    # past two days are requested for the cashtag
    tweets_df=get_tweets(api,'$'+ticker,2000)
    if tweets_df is None:
        print(f"Skipped {company} due to exception, will retry later")
        retry_later[ticker]=company
        continue
    if not tweets_df.empty:
        tweets_df.insert(0, 'Company', company)
        collected_frames.append(tweets_df)
# Concatenate once at the end: DataFrame.append inside the loop copies the whole
# frame on every iteration and no longer exists in pandas 2.x
related_tweets=pd.concat(collected_frames, ignore_index=True) if collected_frames else pd.DataFrame()
print("Done... ")
if retry_later:
    print(f' Some of the companies are left due exception, run below code to try again: {retry_later}')

Getting data from twitter ...


Rate limit reached. Sleeping for: 831
Rate limit reached. Sleeping for: 829
Rate limit reached. Sleeping for: 824


Failed to send request: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Skipped Apple Inc. due to exception, will retry later
Done... 
 Some of the companies are left due exception, run below code to try again: {'AAPL': 'Apple Inc.'}


### Retrying the companies skipped above due to exceptions (ConnectionResetError etc)

The retry loop runs at most 3 times, to avoid going into an infinite loop. It also pulls only 300 tweets for each skipped company.

In [28]:
# Retry the companies whose fetch raised earlier, for at most 3 passes.
remove_keys=[]
num_tries=0
wait_time =180
while len(retry_later)>0:
    num_tries+=1
    if num_tries>3:
        print("Max tries exceeded, exiting... ")
        break
    print("Retrying for skipped ones...")
    # Start every pass with a fresh list: reusing the previous pass' keys would
    # delete entries that are already gone and raise KeyError
    remove_keys=[]
    recovered_frames=[]
    for key,val in retry_later.items():
        time.sleep(wait_time) #sleep in case we reached rate limit earlier
        wait_time += 1
        tweets_df=get_tweets(api,'$'+key,300)
        if tweets_df is None:
            print(f"Skipped {key} due to exception, will retry later")
            continue
        # The request succeeded, so the company is done even if it has no tweets
        remove_keys.append(key)
        if not tweets_df.empty:
            tweets_df.insert(0, 'Company', val)
            recovered_frames.append(tweets_df)

    if recovered_frames:
        related_tweets=pd.concat([related_tweets]+recovered_frames, ignore_index=True)
    # Delete only after the loop so the dict is not modified while iterating it
    for k in remove_keys:
        del retry_later[k]

Retrying for skipped ones...


### Display dataframe

In [29]:
# Preview the first rows of the collected tweets
related_tweets.head()

Unnamed: 0,Company,Hashtag,Timestamp,Tweet ID,Tweet
0,Teledyne Technologies,$TDY,2020-11-28 22:24:35,1332812659805196288,@investing_city $TDY $CGNX $IEX $APH
1,Teledyne Technologies,$TDY,2020-11-28 19:45:39,1332772663077588995,$TDY in Uptrend: 50-day Moving Average moved a...
2,Teledyne Technologies,$TDY,2020-11-27 20:42:52,1332424674148450304,Comcast Corp $CMCSA To Reveal Significant Inve...
3,Teledyne Technologies,$TDY,2020-11-27 17:11:35,1332371504030334977,$TDY in Uptrend: 50-day Moving Average moved a...
4,Teledyne Technologies,$TDY,2020-11-27 14:52:28,1332336496183967745,my long holdings:\n$aapl\n$adbe\n$amd\n$amzn\n...


### Save data as csv

In [46]:
# Write every collected tweet to a CSV whose file name carries today's date
related_tweets.to_csv(f"Twitter_Tech_data_{date.today()}.csv", index=False)

### Add new column number of tweets to original list of tech companies, and save as csv

In [58]:
# Count the tweets per company. When nothing was collected, related_tweets has
# no 'Company'/'Tweet' columns; fall back to an empty count table instead of
# failing with a KeyError.
if {'Company','Tweet'}.issubset(related_tweets.columns):
    num_tweets_per_company=related_tweets.groupby('Company')['Tweet'].count().reset_index()
else:
    num_tweets_per_company=pd.DataFrame(columns=['Company','Tweet'])
# The left merge keeps every company of the original list; companies without
# tweets get a missing value in the 'Tweet' count column.
hashtags=pd.merge(hashtags,num_tweets_per_company,on='Company',how='left')
hashtags.to_csv(f"FinalList_Tech_{date.today()}.csv", index=False)

### Write the entire notebook contents into a script file (.py)
This file can be executed automatically every day if we add the following line to the crontab (e.g. via `crontab -e`): <br>
@daily python3 /Projects/scraper/getTwitterData.py <br>


In [59]:
%%writefile getTwitterData.py
import tweepy
import pandas as pd
import time
from datetime import date,timedelta
from pprint import pprint

def get_authorized(consumer_key,consumer_secret,access_token,access_token_secret):
    '''
    Build an authenticated tweepy client from the user's Twitter credentials.

    Parameters:
    consumer_key : api key from twitter, supplied by the user
    consumer_secret : api secret from twitter, supplied by the user
    access_token : access token from twitter, supplied by the user
    access_token_secret : access token secret from twitter, supplied by the user

    Returns:
    Tweepy api object used for every subsequent call to twitter
    '''
    oauth_handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    oauth_handler.set_access_token(access_token, access_token_secret)
    # Ask tweepy to wait (and print a notice) whenever the rate limit is reached
    client_options = {'wait_on_rate_limit': True, 'wait_on_rate_limit_notify': True}
    return tweepy.API(oauth_handler, **client_options)

def get_tweets(api,tag,num_tweets=10):
    '''
    This function gets tweets related to a cashtag made in the past two days.

    Parameters:
    api : tweepy api object which helps in connecting to twitter
    tag : cashtag, used verbatim as the search query
    num_tweets : Maximum number of tweets to fetch. If this exceeds 300, no cap
                 is applied and every tweet the search returns for the two-day
                 window is collected.

    Returns:
    A pandas DataFrame with the columns 'Hashtag', 'Timestamp', 'Tweet ID' and
    'Tweet' (one row per tweet; zero rows but the same columns when nothing
    matched), or None when tweepy raises a TweepError.
    '''
    # Extend `columns` and the row built below together to keep more tweet fields
    columns = ['Timestamp', 'Tweet ID', 'Tweet']
    today = date.today()
    try:
        cursor = tweepy.Cursor(api.search, q=tag, since=today-timedelta(days=2), until=today)
        tweets = cursor.items() if num_tweets > 300 else cursor.items(num_tweets)
        # The tweets are pulled while iterating, so the iteration stays inside the try
        rows = [[tweet.created_at, tweet.id, tweet.text] for tweet in tweets]
    except tweepy.TweepError as te:
        print(te.reason)
        return None
    # Naming the columns up front gives empty results the same schema as non-empty ones
    tweets_df = pd.DataFrame(rows, columns=columns)
    tweets_df.insert(0, 'Hashtag', tag)
    return tweets_df

# Credentials come from twitter after creating a dev account. authentication.txt
# holds them one per line: api key, api secret key, access token, access token secret.
# The with-block closes the file even if reading it raises.
with open("authentication.txt","r") as f:
    lines=f.readlines()
api_key=lines[0].strip()
api_secret_key=lines[1].strip()
access_token=lines[2].strip()
access_token_secret=lines[3].strip()
api = get_authorized(api_key,api_secret_key,access_token,access_token_secret)

# Read the file containing the company list. The loop below reads the company
# name from positional column 3 and the ticker symbol from positional column 10.
hashtags=pd.read_csv("FinalList_Tech.csv",header=0) #file location
retry_later={}        # ticker -> company name for fetches that raised
collected_frames=[]   # one DataFrame per company that returned tweets
print("Getting data from twitter ...")
for _,row in hashtags.iterrows():
    # .iloc makes the positional lookup explicit instead of relying on the
    # deprecated integer fallback of row[...]
    company,ticker=row.iloc[3],row.iloc[10]
    # 2000 is above the 300 threshold of get_tweets, so all tweets from the
    # past two days are requested for the cashtag
    tweets_df=get_tweets(api,'$'+ticker,2000)
    if tweets_df is None:
        print(f"Skipped {company} due to exception, will retry later")
        retry_later[ticker]=company
        continue
    if not tweets_df.empty:
        tweets_df.insert(0, 'Company', company)
        collected_frames.append(tweets_df)
# Concatenate once at the end: DataFrame.append inside the loop copies the whole
# frame on every iteration and no longer exists in pandas 2.x
related_tweets=pd.concat(collected_frames, ignore_index=True) if collected_frames else pd.DataFrame()
print("Done... ")
if retry_later:
    print(f' Some of the companies are left due exception, run below code to try again: {retry_later}')
    
#Code to retry getting data for failed companies (at most 3 passes)
remove_keys=[]
num_tries=0
wait_time =180
while len(retry_later)>0:
    num_tries+=1
    if num_tries>3:
        print("Max tries exceeded, exiting... ")
        break
    print("Retrying for skipped ones...")
    # Start every pass with a fresh list: reusing the previous pass' keys would
    # delete entries that are already gone and raise KeyError
    remove_keys=[]
    recovered_frames=[]
    for key,val in retry_later.items():
        time.sleep(wait_time) #sleep in case we reached rate limit earlier
        wait_time += 1
        tweets_df=get_tweets(api,'$'+key,300)
        if tweets_df is None:
            print(f"Skipped {key} due to exception, will retry later")
            continue
        # The request succeeded, so the company is done even if it has no tweets
        remove_keys.append(key)
        if not tweets_df.empty:
            tweets_df.insert(0, 'Company', val)
            recovered_frames.append(tweets_df)

    if recovered_frames:
        related_tweets=pd.concat([related_tweets]+recovered_frames, ignore_index=True)
    # Delete only after the loop so the dict is not modified while iterating it
    for k in remove_keys:
        del retry_later[k]
            
#Save list of tweets to a CSV whose file name carries today's date
related_tweets.to_csv(f"Twitter_Tech_data_{date.today()}.csv", index=False)

#Update original tech list with number of tweets and save the file.
#When nothing was collected, related_tweets has no 'Company'/'Tweet' columns;
#fall back to an empty count table instead of failing with a KeyError.
if {'Company','Tweet'}.issubset(related_tweets.columns):
    num_tweets_per_company=related_tweets.groupby('Company')['Tweet'].count().reset_index()
else:
    num_tweets_per_company=pd.DataFrame(columns=['Company','Tweet'])
#The left merge keeps every company of the original list; companies without
#tweets get a missing value in the 'Tweet' count column.
hashtags=pd.merge(hashtags,num_tweets_per_company,on='Company',how='left')
hashtags.to_csv(f"FinalList_Tech_{date.today()}.csv", index=False)

Writing getTwitterData.py
