In [1]:
import os

import pandas as pd
import snscrape.modules.twitter as sntwitter
import tweepy as tw

In [2]:
# Read the Twitter API credentials from environment variables so real keys never
# have to be typed into (and shared along with) the notebook. The 'xxxxxxx'
# placeholders are kept as fallbacks for when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'xxxxxxx')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'xxxxxxx')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'xxxxxxx')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'xxxxxxx')

# Build the authenticated API client that the cells below use through `api`
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tw.API(auth, wait_on_rate_limit=True)

### **The basic method to pull tweets using the Twitter API**

In [3]:
search_word = 'Lebron'

# Keyword arguments handed to the search call through the cursor
search_options = dict(
    q=search_word,
    lang="en",                 # English tweets only
    since='2021-08-07',
    tweet_mode='extended',     # request the full tweet text; the default setting hides part of long tweets
    result_type='mixed',       # three options: popular, recent, or mixed
)

# Stop after the first 100 results
tweets = tw.Cursor(api.search, **search_options).items(100)

# One [id, date, text] row per tweet
tweets_list = [
    [status.id, status.created_at, status.full_text]
    for status in tweets
]

pd.DataFrame(tweets_list, columns=['id', 'date', 'tweet']).head()

Unnamed: 0,id,date,tweet
0,1438196936016809987,2021-09-15 17:44:04,Miami LeBron was on a MISSION. 😤🔥 (via @Courts...
1,1438529411784101898,2021-09-16 15:45:13,Blake Griffin is right. LeBron James wasn’t th...
2,1438285444756779013,2021-09-15 23:35:46,Russ and LeBron getting that offseason work in...
3,1438686807256879104,2021-09-17 02:10:39,RT @INBBallSource: I'm ready to see players li...
4,1438686799795298305,2021-09-17 02:10:37,RT @RussTakeover: This man now has LEBRON JAME...


### **The basic code to scrape tweets using the snscrape library**

In [4]:
# Search string: the term, no retweets, a one-day window, English only
scraper = sntwitter.TwitterSearchScraper(
    'olympics -filter:retweets since:2021-07-29 until:2021-07-30 lang:en'
)

tweets_list = []
for tweet in scraper.get_items():
    if len(tweets_list) >= 100:       # the max count we want
        break
    tweets_list.append([tweet.id, tweet.date, tweet.content])

pd.DataFrame(tweets_list, columns=['id', 'date', 'tweet']).head()

Unnamed: 0,id,date,tweet
0,1420896918494928897,2021-07-29 23:59:59+00:00,I’ve been waiting all day to watch tonight’s #...
1,1420896917140213765,2021-07-29 23:59:58+00:00,@usatodaysports @_janiereed Well said. I real...
2,1420896915089084418,2021-07-29 23:59:58+00:00,@Nguyen_anime3 Read this instead…Yay for the f...
3,1420896910156591106,2021-07-29 23:59:57+00:00,@Karensk8ergirl 😰August16～&amp;September💉I liv...
4,1420896907690483715,2021-07-29 23:59:56+00:00,"Is it just me, or does it bother anyone that t..."


### **This is the method we used to pull tweets, which combines both methods above**

In [5]:
def get_tweet(query,date,date1,maxcount):

    '''
    Pull up to `maxcount` English, non-retweet tweets matching `query`.

    The Twitter API (through tweepy) is tried first. If it returns fewer than
    `maxcount` tweets, the remainder is scraped with snscrape; tweets whose id
    was already collected are skipped so the result holds no duplicates.

    Parameters
    ----------
    query : str
        The search term.
    date : str
        Start date of the period, e.g. '2020-09-30' (used as `since`).
    date1 : str
        End date of the period, e.g. '2020-10-01' (used as `until`).
    maxcount : int
        The max tweet count we want within the period.

    Returns
    -------
    list of list
        Each sub-list contains information about one tweet: [id, date, content].
        Rows coming from the Twitter API are listed first, followed by the
        rows scraped with snscrape.
    '''

    # use the Twitter API approach first; if it pulls enough tweets then stop right away
    api_tweets = tw.Cursor(api.search,
                           q=query+' -filter:retweets',
                           count=100,      # page size; the standard search endpoint documents 100 as the maximum
                           lang="en",
                           since=date,
                           until=date1,
                           tweet_mode='extended',
                           result_type='mixed').items(maxcount)

    rows = [[tweet.id, tweet.created_at, tweet.full_text] for tweet in api_tweets]

    supply_count = maxcount - len(rows)      # how many more tweets we want
    if supply_count <= 0:
        return rows

    # the Twitter API approach didn't pull enough tweets: top up with snscrape
    seen_ids = {row[0] for row in rows}
    scrape_query = query + ' -filter:retweets since:' + date + ' until:' + date1 + ' lang:en'

    for tweet in sntwitter.TwitterSearchScraper(scrape_query).get_items():
        if supply_count <= 0:
            break
        if tweet.id in seen_ids:             # already collected, don't count it twice
            continue
        seen_ids.add(tweet.id)
        rows.append([tweet.id, tweet.date, tweet.content])
        supply_count -= 1

    return rows
    

### **Create a list of the dates for the period (event) we want to research on**

One troublesome thing about this tweet-pulling process is that both methods start pulling tweets from one end of the requested period, either 00:00 of the start date or 24:00 of the end date (i.e., they start gathering data from the very end of the timeline). So if we set the dates to something like 2020-09-01 to 2020-09-07 and ask for 50000 tweets, like this: 

#### **get_tweet('lebron', '2020-09-01', '2020-09-07', 50000)**

chances are that those 50000 tweets will all come from one end of the period (e.g. all from 9/1 and 9/2, or all from 9/6 and 9/7).

So we need to iterate through each day.

In [6]:
# One 'YYYY-MM-DD' string per day: 16 consecutive days starting 2020-09-30
l = [day.strftime('%Y-%m-%d') for day in pd.date_range('2020-09-30', periods=16, freq='D')]

l           # this is the duration dates for the 2020 nba finals

['2020-09-30',
 '2020-10-01',
 '2020-10-02',
 '2020-10-03',
 '2020-10-04',
 '2020-10-05',
 '2020-10-06',
 '2020-10-07',
 '2020-10-08',
 '2020-10-09',
 '2020-10-10',
 '2020-10-11',
 '2020-10-12',
 '2020-10-13',
 '2020-10-14',
 '2020-10-15']

#### **Then iterate through each date**

In [7]:
# Walk through the dates pairwise: each consecutive (start, end) pair in `l`
# is one day's window handed to get_tweet().
tweets_list = []

for day_start, day_end in zip(l, l[1:]):
    # how many tweets we want for each day, here the 100 is just for example
    tweets_list.extend(get_tweet('Lebron', day_start, day_end, 100))

# So here, the dataframe should contain the tweet data with 100 tweets for each day during the period
df = pd.DataFrame(tweets_list, columns=['id', 'date', 'text'])
df.head()

Unnamed: 0,id,date,text
0,1311455787101949952,2020-09-30 23:59:59+00:00,#QuestionOfTheDay \nWho will win game 1 of the...
1,1311455785973624833,2020-09-30 23:59:59+00:00,@8lackJezus @WokeLotus @egchico3 @WoaXMamba @s...
2,1311455776221941762,2020-09-30 23:59:57+00:00,@netorarefanclub @nigel_dylan @stephenasmith @...
3,1311455758777806849,2020-09-30 23:59:53+00:00,@stephenasmith @KingJames Why still debate thi...
4,1311455748673744896,2020-09-30 23:59:50+00:00,@Homeoffree61 How about Stephen Colbert Alec B...


### **Then save the data into a CSV file**

In [8]:
# Write the collected tweets to a CSV file without the dataframe index
csv_path = 'name of the file.csv'
df.to_csv(csv_path, index=False)

### **For those who don't have a Twitter developer account, this is the alternative approach that solely uses the snscrape module to scrape tweets**

In [9]:
tweets_list = []

# Each consecutive pair of dates in `l` is one day's (since, until) window
for day_start, day_end in zip(l, l[1:]):
    day_query = 'lebron james -filter:retweets since:' + day_start + ' until:' + day_end + ' lang:en'
    day_rows = []

    for tweet in sntwitter.TwitterSearchScraper(day_query).get_items():
        if len(day_rows) >= 100:                  # how many we want for each day
            break
        day_rows.append([tweet.id, tweet.date, tweet.content])

    tweets_list.extend(day_rows)

pd.DataFrame(tweets_list, columns=['id', 'date', 'text']).head()

Unnamed: 0,id,date,text
0,1311455787101949952,2020-09-30 23:59:59+00:00,#QuestionOfTheDay \nWho will win game 1 of the...
1,1311455785973624833,2020-09-30 23:59:59+00:00,@8lackJezus @WokeLotus @egchico3 @WoaXMamba @s...
2,1311455776221941762,2020-09-30 23:59:57+00:00,@netorarefanclub @nigel_dylan @stephenasmith @...
3,1311455758777806849,2020-09-30 23:59:53+00:00,@stephenasmith @KingJames Why still debate thi...
4,1311455748673744896,2020-09-30 23:59:50+00:00,@Homeoffree61 How about Stephen Colbert Alec B...


### **Below is the code we ran for pulling data**

For each topic, the "l" list was set to the duration of the event/topic we wanted to research, and the desired daily count was set depending on the topic and the length of the period. But since we filter out retweets, on many days the desired count can't be reached (for example, during the 2020 World Series, apart from games 1 & 5, which Clayton Kershaw started for the Dodgers, the number of tweets containing 'clayton kershaw' on the other days is far smaller).

This situation will be taken into account in the analysis part: if the total tweet count on some days is too low, we might ignore those days and focus only on the dates with a sufficient number of tweets.

In [None]:
# For each topic, the base code looks like this. The only difference between
# topics is the setting passed to get_tweet() (the query and the desired daily
# count) and the date list `l`, which has to be rebuilt to span the topic's period.

# Settings for each topic: label -> (query, desired tweets per day)
topic_settings = {
    # from 2021/7/22 ~ 2021/8/8; ideally we hope to get 50000 tweets each day during the period of olympics
    'simone biles olympics': ('simone biles', 50000),
    # from 2020/10/18 ~ 2020/10/31
    'kershaw 2020 world series': ('clayton kershaw', 30000),
    # from 2020/8/17 ~ 2020/10/12
    'lebron 2020 playoffs': ('lebron james', 10000),
    # from 2020/9/30 ~ 2020/10/14
    'lebron 2020 finals': ('LeBron', 100000),
    # from 2020/9/30 ~ 2020/10/12
    '2020 nba finals': ('NBAFinals', 50000),
    # from 2020/10/18 ~ 2020/10/31
    '2020 world series': ('WorldSeries', 50000),
}

# Pick the topic to pull; make sure `l` covers the period noted next to it above
query, daily_count = topic_settings['lebron 2020 finals']

tweets_list = []

# each consecutive pair of dates in `l` is one day's (start, end) window
for s, e in zip(l, l[1:]):
    tweets_list += get_tweet(query, s, e, daily_count)

df = pd.DataFrame(tweets_list, columns=['id', 'date', 'tweet'])
df.to_csv("filename.csv", index=False)