# Download Tweets

In [1]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import tweepy

# Authentication


In [2]:
# Read the bearer token from the TWITTER_BEARER_TOKEN environment variable so
# the secret does not have to be stored in the notebook. "#" is the original
# placeholder value and is kept as the fallback when the variable is unset.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "#")

# Shared API client used by every request cell below; constructed with
# wait_on_rate_limit=True exactly as before.
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)


# Get Tweet Counts


In [None]:
def get_tweet_counts(keyword, start_time, end_time, retweet=False):
    """Return per-day tweet counts for ``keyword`` as a DataFrame.

    The query is ``keyword`` plus the ``lang:tr`` filter; unless ``retweet``
    is true, ``-is:retweet`` is appended as well. Counts are requested from
    the module-level ``client`` with ``granularity='day'``.

    Parameters
    ----------
    keyword : str
        Search term placed at the start of the query.
    start_time, end_time
        Passed through unchanged to ``client.get_all_tweets_count``.
    retweet : bool, default False
        When True the ``-is:retweet`` filter is omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``tweet_count`` and ``keyword``. An empty frame with
        these columns is returned when the response carries no data.
    """
    query = f'{keyword} lang:tr' if retweet else f'{keyword} lang:tr -is:retweet'

    counts = client.get_all_tweets_count(query=query, granularity='day',
                                         start_time=start_time, end_time=end_time)

    # Guard against a response without data: selecting the columns below from
    # an empty frame would otherwise raise KeyError.
    records = counts.data or []
    if not records:
        return pd.DataFrame(columns=["date", "tweet_count", "keyword"])

    # rename() returns a new frame, so the assignments below never write into
    # a slice of the original (no SettingWithCopyWarning, no inplace mutation).
    df = pd.DataFrame(records)[["start", "tweet_count"]].rename(columns={"start": "date"})
    df["date"] = pd.to_datetime(df["date"]).dt.date
    df["keyword"] = keyword
    return df

In [None]:
# Daily counts for the placeholder query "#", with retweets included.
get_tweet_counts(
    "#",
    start_time="2022-03-01T00:00:00Z",
    end_time="2022-04-01T00:00:00Z",
    retweet=True,
)

In [None]:
# Raw `.data` payload of a counts request for the bare placeholder query
# (no language or retweet filter), shown without any DataFrame processing.
client.get_all_tweets_count(
    query="#",
    granularity="day",
    start_time="2022-03-01T00:00:00Z",
    end_time="2022-04-01T00:00:00Z",
).data

In [None]:
# Flat list of every keyword to query (left empty in this copy of the notebook).
keywords = []

# NOTE(review): the same name `_keywords` is assigned five times, so only the
# last (empty) list survives. Presumably these were five differently named
# keyword groups whose names/values were removed — confirm and restore.
_keywords = []
_keywords = []
_keywords = []
_keywords = []
_keywords = []

# As written, all five entries reference the same `_keywords` list object.
keywords_lists = [_keywords, _keywords, _keywords, _keywords,_keywords]
# Five empty strings, one per entry of keywords_lists; presumably the group
# labels — TODO confirm.
keywords_lists_str = ["", "", "", "", ""]

In [None]:
# Drop the last four keywords, then keep only the count rows whose keyword is
# still in the list, ordered by keyword and date with a fresh 0..n-1 index.
# NOTE(review): `tweet_counts` is not created in any cell visible here, and
# re-running this cell removes four more keywords each time.
keywords = keywords[:-4]
tweet_counts = (
    tweet_counts[tweet_counts["keyword"].isin(keywords)]
    .sort_values(["keyword", "date"])
    .reset_index(drop=True)
)

In [None]:
# Total tweet count per keyword, largest first, as a two-column frame.
(
    tweet_counts.groupby("keyword")["tweet_count"]
    .sum()
    .sort_values(ascending=False)
    .to_frame()
    .reset_index()
)


In [None]:
# Grand total of the tweet_count column over every row left in tweet_counts.
tweet_counts["tweet_count"].sum()


In [None]:
# Query window boundaries as ISO 8601 strings with a "Z" (UTC) suffix.
# Written as two statements: the previous single-line form
# `start_time="...", end_time="..."` is not a valid assignment (SyntaxError).
start_time = "2022-03-01T00:00:00Z"
end_time = "2022-04-01T00:00:00Z"

In [None]:
def time_formatter(time):
    """Format a timestamp as ``YYYY-MM-DDTHH:MM:SSZ``.

    Parameters
    ----------
    time : str, datetime-like or pandas.Timestamp
        Value to format. Timezone-naive values are labelled as UTC without any
        shift (the previous behaviour); timezone-aware values are converted to
        UTC first. Sub-second precision is dropped.

    Returns
    -------
    str
        The timestamp with a ``T`` separator and a trailing ``Z``.

    Raises
    ------
    ValueError
        If ``time`` parses to a missing timestamp (NaT) or cannot be parsed.
    """
    # Parse instead of splitting str(time) on whitespace: the string approach
    # produced "2022-03-01Z" for date-only input, "...ZZ" for input already
    # ending in "Z", and "...+00:00Z" for timezone-aware timestamps.
    stamp = pd.Timestamp(time)
    if pd.isna(stamp):
        raise ValueError(f"Cannot format missing timestamp: {time!r}")
    if stamp.tzinfo is not None:
        stamp = stamp.tz_convert("UTC")
    return stamp.strftime("%Y-%m-%dT%H:%M:%SZ")

# Download

In [None]:
# Flat list of keywords iterated by the download loop (left empty in this copy).
keywords = []

# NOTE(review): the group definitions below are commented out, so `_keywords`
# used in keywords_lists must already exist from an earlier cell; on a fresh
# kernel that skips that cell this raises NameError. Presumably the original
# names/values were removed — confirm and restore.
#_keywords = []
#_keywords = []
#_keywords = []
#_keywords = []
#_keywords = []

# As written, all five entries reference the same `_keywords` object.
keywords_lists = [_keywords, _keywords, _keywords,_keywords, _keywords]
# Five empty strings, one per entry of keywords_lists; presumably the group
# labels — TODO confirm.
keywords_lists_str = ["", "", "", "", ""]

In [None]:
def download_tweets(keyword, start_time, end_time, n_tweets, retweet=False):
    """Download up to ``n_tweets`` tweets matching ``keyword`` in a time window.

    The query is ``keyword`` plus ``lang:tr``; unless ``retweet`` is true,
    ``-is:retweet`` is appended. Requests go through the module-level
    ``client`` and are paginated until ``n_tweets`` tweets are collected or no
    further page is available.

    Parameters
    ----------
    keyword : str
        Search term placed at the start of the query.
    start_time, end_time
        Window boundaries; each is normalised with ``time_formatter``.
    n_tweets : int
        Maximum number of tweets to return.
    retweet : bool, default False
        When True the ``-is:retweet`` filter is omitted.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with columns ``tweet_text``, ``tweet_created_at``,
        ``keyword``, ``tweet_id``, ``author_id``, ``author_name`` and
        ``author_username``. The frame is empty (columns only) when nothing
        matches. Author name/username are ``None`` when the response does not
        include the author's user object.
    """
    columns = ["tweet_text", "tweet_created_at", "keyword", "tweet_id",
               "author_id", "author_name", "author_username"]

    # Page-size bounds for max_results on the full-archive search endpoint
    # (see the tweepy / Twitter API v2 reference); a single request cannot
    # ask for more, so larger n_tweets values are served by pagination.
    min_page_size = 10
    max_page_size = 500

    # Formatting start_time and end_time
    start_time = time_formatter(start_time)
    end_time = time_formatter(end_time)

    # Defining query based on whether to include retweets or not
    query = f'{keyword} lang:tr' if retweet else f'{keyword} lang:tr -is:retweet'

    print(f"Keyword: {keyword} | Time Range: {str(pd.to_datetime(start_time))[:-6]} - {str(pd.to_datetime(end_time))[:-6]} | # of tweets: {n_tweets}")

    tweets = []
    users_by_id = {}
    next_token = None
    while len(tweets) < n_tweets:
        remaining = n_tweets - len(tweets)
        request = {
            "query": query,
            "tweet_fields": ["created_at"],
            "expansions": ["author_id"],
            "start_time": start_time,
            "end_time": end_time,
            "max_results": max(min_page_size, min(max_page_size, remaining)),
        }
        if next_token:
            request["next_token"] = next_token
            # Space out follow-up pages; the full-archive endpoint is
            # documented with a per-second request limit.
            time.sleep(1)

        response = client.search_all_tweets(**request)

        # `data` is None when a page has no tweets; the slice enforces the
        # n_tweets cap when the minimum page size exceeds what is still needed.
        tweets.extend((response.data or [])[:remaining])

        # The author_id expansion delivers the user objects alongside the
        # tweets. Keying them by id keeps every name attached to the right
        # tweet, which a positional pairing with a separate user lookup does
        # not guarantee (repeated or unavailable authors shift the lists).
        for user in (response.includes or {}).get("users", []):
            users_by_id[user.id] = user

        next_token = (response.meta or {}).get("next_token")
        if not next_token:
            break

    # Creating DataFrame
    rows = []
    for tweet in tweets:
        author = users_by_id.get(tweet.author_id)
        rows.append({
            "tweet_text": tweet.text,
            "tweet_created_at": tweet.created_at,
            "keyword": keyword,
            "tweet_id": tweet.id,
            "author_id": tweet.author_id,
            "author_name": author.name if author is not None else None,
            "author_username": author.username if author is not None else None,
        })

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Hourly boundaries from 2021-11-01 00:00 to 2022-01-01 00:00 inclusive; each
# consecutive pair is one download window. The step is given as a Timedelta
# because the "H" frequency alias is deprecated in recent pandas releases,
# while a Timedelta step is accepted by old and new versions alike.
time_range = pd.date_range(start="2021-11-01", end="2022-01-01", freq=pd.Timedelta(hours=1))


In [None]:
# Empty accumulator with the same column layout that download_tweets returns.
tweets_df = pd.DataFrame(
    columns=[
        "tweet_text",
        "tweet_created_at",
        "keyword",
        "tweet_id",
        "author_id",
        "author_name",
        "author_username",
    ]
)

In [None]:
# Display the window boundaries that the download loop iterates over.
time_range

In [None]:
# Download every keyword for every consecutive pair of boundaries in
# time_range. Per-request frames are collected in a list and concatenated once
# at the end: DataFrame.append no longer exists in pandas 2.x, and growing a
# frame inside the loop copies all previously collected rows on every step.
n_windows = len(time_range) - 1
# Keep rows already present in tweets_df (e.g. from an earlier partial run).
frames = [] if tweets_df.empty else [tweets_df]
total_tweets = tweets_df.shape[0]

for i in range(n_windows):
    print(f"Iteration #{i} in {n_windows} | Progress: {i/n_windows*100:.2f}%")
    for keyword in keywords:
        try:
            batch = download_tweets(keyword=keyword,
                                    start_time=time_range[i],
                                    end_time=time_range[i+1],
                                    n_tweets=1000,
                                    retweet=False)
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C still stops the
            # run, and report the real error so API or library failures are
            # not mistaken for an empty result.
            print(f"No tweet for {keyword} in time range {time_range[i]} - {time_range[i+1]} ({type(exc).__name__}: {exc})")
        else:
            if batch.empty:
                print(f"No tweet for {keyword} in time range {time_range[i]} - {time_range[i+1]}")
            else:
                frames.append(batch)
                total_tweets += batch.shape[0]
                print(f"Total Tweets Downloaded: {total_tweets}\n")
        # Pause between keywords and between windows, as before.
        time.sleep(3)
    time.sleep(5)

# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True).
if frames:
    tweets_df = pd.concat(frames, ignore_index=True)

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the collected tweets.
tweets_df.info()

In [None]:
# Write the collected tweets to tweets_all_day.csv without the index column.
tweets_df.to_csv("tweets_all_day.csv", index=False)