# Twitter - Historical Data Extraction

Readme.md
    
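    1. Install Python >= 3.8.0
    2. Install snscrape:
        $:> pip install git+https://github.com/JustAnotherArchivist/snscrape.git
        OR
        $:> pip install snscrape

    3. Set `start_date` = "YYYY-MM-DD" and `end_date` = "YYYY-MM-DD"
    4. Run `ExtractTweets.getTweets(start_date, end_date, keywords)`
    5. Save the returned DataFrame in CSV format (a `save_<date>_local.csv` copy is also written automatically).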

## Imports

In [1]:
import os
import time
from ast import literal_eval
from datetime import datetime
import pandas as pd
import snscrape.modules.twitter as sntwitter # the magic
import warnings
warnings.filterwarnings('ignore')

## Extractor Module

In [2]:
class ExtractTweets:
    """Collect historical tweets one day at a time via snscrape.

    Matching tweets are accumulated in ``self.tweets_df``; a CSV checkpoint
    (``save_<latest date>_local.csv``) is written to the current working
    directory periodically and once more when extraction finishes.
    """

    # Column order of every row stored in ``tweets_df`` (must stay in sync
    # with ``_tweet_to_row``). The 'hastags' spelling is part of the CSV schema.
    COLUMNS = ['date', 'tweet', 'lang', 'retweetCount', 'likeCount', 'replyCount',
               'username', 'user_followersCount', 'user_friendsCount', 'user_statusesCount',
               'user_favouritesCount', 'user_mediaCount', 'longitude', 'latitude',
               'verifiedStatus', 'tweet_url', 'hastags', 'chr_count', 'topic']

    def __init__(self,
                 minTweetCountPerDay=10,
                 minRetweetCount=0,
                 minLikeCount=0,
                 minFollowersCount=0,
                 VerifiedStatus=None,
                 saveBufferDuration=3600):
        """
        :params:
        minTweetCountPerDay - int, number of search results examined per day and topic
        minRetweetCount - int, minimum retweets a tweet needs to be kept
        minLikeCount - int, minimum likes a tweet needs to be kept
        minFollowersCount - int, minimum followers the author needs
        VerifiedStatus - bool or None; None disables the verified-status filter
        saveBufferDuration - int, seconds between intermediate CSV checkpoints
        """
        # The timer starts at construction, so the total reported by getTweets
        # includes any time spent between construction and the call.
        self.start_timer = self._now()
        self.min_tweet_count_perDay = minTweetCountPerDay
        self.minRetweetCount = minRetweetCount
        self.minLikeCount = minLikeCount
        self.minFollowersCount = minFollowersCount
        self.VerifiedStatus = VerifiedStatus
        self.tweets_df = pd.DataFrame(columns=self.COLUMNS)
        self.save_buffer_duration = saveBufferDuration

    @staticmethod
    def _now():
        """Return the current local time truncated to whole seconds."""
        return datetime.now().replace(microsecond=0)

    def _passes_filters(self, tweet):
        """Return True when the tweet satisfies every configured threshold."""
        user = tweet.user
        # Missing counters (None) are treated as 0 so the comparison cannot raise.
        if (tweet.likeCount or 0) < self.minLikeCount:
            return False
        if (tweet.retweetCount or 0) < self.minRetweetCount:
            return False
        if (user.followersCount or 0) < self.minFollowersCount:
            return False
        if self.VerifiedStatus is None:
            return True
        return bool(user.verified) == bool(self.VerifiedStatus)

    @staticmethod
    def _tweet_to_row(tweet, topic):
        """Flatten one scraped tweet into a list ordered like ``COLUMNS``."""
        if tweet.coordinates:
            tweet_longitude, tweet_latitude = tweet.coordinates.longitude, tweet.coordinates.latitude
        else:
            # -1 marks "no coordinates attached to the tweet".
            tweet_longitude, tweet_latitude = -1, -1
        return [
            tweet.date,
            tweet.content,
            tweet.lang,
            tweet.retweetCount,
            tweet.likeCount,
            tweet.replyCount,
            tweet.user.username,
            tweet.user.followersCount,
            tweet.user.friendsCount,
            tweet.user.statusesCount,
            tweet.user.favouritesCount,
            tweet.user.mediaCount,
            tweet_longitude,
            tweet_latitude,
            tweet.user.verified,
            tweet.url,
            tweet.hashtags,
            len(str(tweet.content).strip()),
            topic,
        ]

    def save_copy(self):
        """
        Saves a checkpoint CSV in the current working directory so that data can
        be restored if a long extraction is interrupted. Any other file in that
        directory whose name ends with 'local.csv' is removed afterwards.

        :return:
        pandas dataframe (a copy of the collected records with a fresh index and
        'date' formatted as 'YYYY-MM-DD' strings).
        """
        data = self.tweets_df.reset_index(drop=True)
        if len(data):
            data['date'] = data['date'].apply(lambda x: pd.to_datetime(x).strftime('%Y-%m-%d'))
            stamp = data['date'].max()
        else:
            # Avoid a meaningless "save_nan_local.csv" when nothing was collected.
            stamp = "empty"
        target = "save_{}_local.csv".format(stamp)
        # Write the new checkpoint first and only then drop stale ones, so a
        # failed write never leaves the directory without any checkpoint.
        data.to_csv(os.path.join(".", target), index=False)
        for filename in os.listdir("."):
            if filename.endswith('local.csv') and filename != target:
                os.remove(filename)
        return data

    def getTweets(self, start_date, end_date, keywords):
        """
        Extracts historical twitter data.

        :params:
        start_date - str in "YYYY-MM-DD" format (inclusive)
        end_date - str in "YYYY-MM-DD" format (inclusive)
        keywords - list (or tuple) of topics; each topic is a string or a
            tuple/list of strings that are joined with spaces into one query,
            e.g. [('recession',), ('football', 'worldcup', 'fifa'), ('war', 'ukraine')]
            e.g. ['recession']

        :raises:
        TypeError (an Exception subclass) when keywords is not a list or tuple.

        :return:
        pandas dataframe (as returned by save_copy) with features as:
         date: Tweet date
         tweet: tweet content
         lang: language classifer used by parent api
         retweetCount: tweet retweeted count
         likeCount: tweet like count
         replyCount: number of replies to original tweet
         username: user who tweeted
         user_followersCount: number of followers user has
         user_friendsCount: number of friends user has
         user_statusesCount: value of the user's statusesCount attribute
         user_favouritesCount: value of the user's favouritesCount attribute
         user_mediaCount: value of the user's mediaCount attribute
         longitude, latitude: tweet coordinates, -1 when the tweet has none
         verifiedStatus: If the user is Verified or not
         tweet_url: Link of original tweet
         hastags: hashtags used in the tweet, if any
         chr_count: length of the tweet content after stripping outer whitespace
         topic: search query the tweet was found with (kind of labels)
        """
        if not isinstance(keywords, (list, tuple)):
            raise TypeError("Incorrect Input Format! Please pass a list")

        one_day = pd.to_timedelta(1, unit='d')
        first_day = pd.to_datetime(start_date, format='%Y-%m-%d')
        # Exclusive upper bound: the day after end_date.
        stop_day = pd.to_datetime(end_date, format='%Y-%m-%d') + one_day

        for topic in keywords:
            # for saving local copies every buffer interval
            st_time = self._now()
            if isinstance(topic, (tuple, list)):
                topic = " ".join(topic)
            search_query = topic
            print("search_query:", search_query)

            date = first_day
            # "<" (not "!=") so a start_date after end_date simply yields no days
            # instead of looping forever.
            while date < stop_day:
                nxt_date = date + one_day
                content = '{} since:{} until:{}'.format(
                    search_query, date.strftime('%Y-%m-%d'), nxt_date.strftime('%Y-%m-%d'))
                print(content)

                # check for save buffer duration (set to 1 Hr by default);
                # total_seconds() does not wrap around after 24 hours.
                if (self._now() - st_time).total_seconds() >= self.save_buffer_duration:
                    self.save_copy()
                    # reset buffer
                    st_time = self._now()

                lst_tweets = []
                for counter, tweet in enumerate(sntwitter.TwitterSearchScraper(content).get_items()):
                    # The limit applies to results examined, not to results kept.
                    if counter >= self.min_tweet_count_perDay:
                        break
                    if self._passes_filters(tweet):
                        # ----------------------------------------------------------------
                        # Potential custom preprocessing module here:
                        # 1. Simple and short: https://www.kaggle.com/code/zenbird01/pranjalpathak-semantic-clustering-v1-0/notebook
                        # 2. Advanced: ./NLP_basics_preprocessing_vectorization_similarity.ipynb
                        # 3. Best: Check github - https://github.com/pranzell/NLP_Tools
                        # ----------------------------------------------------------------
                        lst_tweets.append(self._tweet_to_row(tweet, topic))

                if lst_tweets:
                    day_df = pd.DataFrame(lst_tweets, columns=self.tweets_df.columns)
                    # DataFrame.append no longer exists in pandas 2.x; use concat,
                    # and skip concatenating onto an empty frame.
                    if self.tweets_df.empty:
                        self.tweets_df = day_df
                    else:
                        self.tweets_df = pd.concat([self.tweets_df, day_df], ignore_index=True)
                date = nxt_date

        elapsed_minutes = (self._now() - self.start_timer).total_seconds() / 60.0
        print("\n\nTOTAL TIME TAKEN {} minutes".format(elapsed_minutes))
        return self.save_copy()

    def preprocess_shortText(self, text_col):
        """Placeholder; currently does nothing and returns None."""
        # refer to Preprocessing ipynb file
        # https://github.com/pranzell/NLP_Tools
        pass

## <ins>Execute</ins>

### Configuration

In [3]:
# :: settings ::
# These values are passed to ExtractTweets(...) in the run cell below.
minTweetCountPerDay=200
minRetweetCount=0
minLikeCount=0
minFollowersCount=0
VerifiedStatus=None
saveBufferDuration=3600  # in seconds

# Date range to extract, both in "YYYY-MM-DD" format.
start_date = "2020-01-01"
end_date = "2022-12-31"

# Intents: each tuple is joined with spaces into one search query.
keywords = [
    ('CUBoulder', 'football team'),
    ('CUBoulder', 'admission'),
    ('CUBoulder', 'online'),
    ('CUBoulder', 'ralphie'),  # fixed typo: was 'raplphie'
    ('CUBoulder', 'dining'),
    ('CUBoulder', 'night ride'),
    ('CUBoulder', 'hiking'),
    ('CUBoulder', 'night life'),
    ('CUBoulder', 'research'),
    ('CUBoulder', 'restaurants'),
]

### Run

In [None]:
# Build the extractor from the settings cell, then scrape every topic over the date range.
et = ExtractTweets(
    minTweetCountPerDay=minTweetCountPerDay,
    minRetweetCount=minRetweetCount,
    minLikeCount=minLikeCount,
    minFollowersCount=minFollowersCount,
    VerifiedStatus=VerifiedStatus,
    saveBufferDuration=saveBufferDuration,
)
twitter_data = et.getTweets(start_date=start_date, end_date=end_date, keywords=keywords)

search_query: CUBoulder football team
CUBoulder football team since:2020-01-01 until:2020-01-02
CUBoulder football team since:2020-01-02 until:2020-01-03
CUBoulder football team since:2020-01-03 until:2020-01-04


In [None]:
# Show (rows, columns) of the DataFrame returned by getTweets.
print(twitter_data.shape)

## Output

In [305]:
def read_copy(path="."):
    """Load a saved tweet CSV (file name ending in 'local.csv') from ``path``.

    The 'hastags' column is stored as the text form of a Python list; it is
    parsed back into real lists, with missing values mapped to None.

    :params:
    path - directory to search (defaults to the current directory)

    :return:
    pandas dataframe for the first matching file in sorted name order, or
    None when the directory contains no matching file.
    """
    # Spellings (compared case-insensitively) that mean "no hashtags".
    missing_markers = {'none', 'nan', 'np.nan', 'null', ''}

    def parse_hashtags(cell):
        if str(cell).strip().lower() in missing_markers:
            return None
        return literal_eval(cell)

    # Sorted so the choice is deterministic if several files match.
    for name in sorted(os.listdir(path)):
        if name.endswith('local.csv'):
            # Join with ``path``: a bare file name only resolves when path is the cwd.
            df = pd.read_csv(os.path.join(path, name), lineterminator='\n')
            df['hastags'] = df['hastags'].apply(parse_hashtags)
            return df
    return None

In [308]:
# Load the locally saved CSV copy from the current directory;
# the bare `df` on the last line makes the notebook display it.
df = read_copy(".")
df

Unnamed: 0,date,tweet,lang,retweetCount,likeCount,replyCount,username,user_followersCount,user_friendsCount,verifiedStatus,tweet_url,hastags,chr_count,topic
0,2022-12-01,@Jeffdc5 It hurts the middle class on down. No...,en,1,12,1,StephanieRade18,9559,9333,False,https://twitter.com/StephanieRade18/status/159...,,153,recession
1,2022-12-01,"The latest update for #Zendesk includes ""Retai...",en,0,0,0,opsmatters_uk,2892,2789,False,https://twitter.com/opsmatters_uk/status/15984...,"[Zendesk, customerservice, sales, helpdesk, co...",202,recession
2,2022-12-01,How would you react to another Depression? Mat...,en,0,0,0,speropictures,1282,1329,False,https://twitter.com/speropictures/status/15984...,,160,recession
3,2022-12-01,The #1 Thing Businesses Get Wrong During A Rec...,en,0,0,0,ABNewswire,1288,125,False,https://twitter.com/ABNewswire/status/15984668...,"[Australia, Business, MarketingSales, MediaCom...",158,recession
4,2022-12-01,@pnjaban Stop fighting #ClimateChange in a way...,en,0,0,1,_NewsView,273,820,False,https://twitter.com/_NewsView/status/159846678...,"[ClimateChange, malnutrition, inflation, reces...",288,recession
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2560,2023-01-20,"Leading up to the recent #FIFAWorldCup, a few ...",en,2,13,0,Nagarro,24397,502,False,https://twitter.com/Nagarro/status/16164270125...,"[FIFAWorldCup, ThinkingBreakthroughs]",272,football fifaworldcup
2561,2023-01-20,Messi is a king ....... goat of football \nHat...,en,0,1,1,mohdafsar_13,243,376,False,https://twitter.com/mohdafsar_13/status/161642...,,73,football fifaworldcup
2562,2023-01-20,The famous soccer player Dani #Alves has been ...,en,0,0,0,Twetter_Trends,251,72,False,https://twitter.com/Twetter_Trends/status/1616...,"[Alves, Spain, FIFA, Barcelona, football, Braz...",302,football fifaworldcup
2563,2023-01-20,Successfully hosted the Ancient8 FIFA World Cu...,en,0,2,1,Ancient8_gg,115767,153,False,https://twitter.com/Ancient8_gg/status/1616403...,,219,football fifaworldcup


### Citations

Credits to the awesome social media mining tool SNScrape (https://github.com/JustAnotherArchivist/snscrape)