In [1]:
### Importing required packages 
import os
from pprint import pprint
from dotenv import load_dotenv
import tweepy
import pandas as pd
import sqlite3
import re
import numpy as np

In [2]:
### Loading in the environment variables

load_dotenv()

### For Twitter API v2 
TWITTER_BEARER_TOKEN = os.getenv("TWITTER_BEARER_TOKEN", default = "OOPS")


In [4]:
### Establishing the client 

client = tweepy.Client(bearer_token = TWITTER_BEARER_TOKEN)

In [5]:
### Pulling in the most recent tweets from past 7 day
### Pulling in all tweets needs higher level access

### Pulling in only tweets and not retweets 
### The query will instruct the API to fetch the tweets needed. 
### For syntax and uses, refer to https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query#availability
query = '#COP26 lang:en'

### Maximum tweets that can be pulled in per request are 100(Use Paginator if more is required)
### When using expansions or tweet_fields, create list with no spaces and only separated by commas
tweets = client.search_recent_tweets(query = query, expansions=['author_id','attachments.media_keys','referenced_tweets.id','geo.place_id'], tweet_fields=['created_at','entities','context_annotations'], media_fields=['url','preview_image_url'], user_fields=['verified'], max_results = 100)


In [7]:
### Tweet Parser 

def parse_tweets_to_df(tweets) :
    """
    Function to parse through the tweets and return a dataframe with separate elements of the tweets.
    
    Arguments : 
    ~~~~~~~~~~~~~~~~
    tweets : tweepy response object containing the pulled in tweets
    
    Returns :
    ~~~~~~~~~~~~~~~~
    df : dataframe containing parsed tweets 
    
    """
    ### Intializing empty lists that will contain portions of parsed tweets 
    ### These lists will act as columns for the data frame 
    tweet_id = []
    tweet_text = []
    tweet_created_at = []
    user_id =[]
    user_name = []
    user_username = []
    user_verified = []
    tweet_links = []
    tweet_mentions = []
    tweet_mentions_id = []
    tweet_annotations = []
    tweet_media_types = []
    tweet_media_links = []
    tweet_hashtags = []
    tweet_reference_id = []
    tweet_reference_type = []
    tweet_entities_annotation_type = []
    tweet_entities_annotation_text = []
    tweet_context_annotations_domain_id = []
    tweet_context_annotations_domain_name =[]
    tweet_context_annotations_entity_id = []
    tweet_context_annotations_entity_name = []
    
    ### Getting the expanded tweets from includes dictionary for referenced tweets  
    Tweets = tweets.includes['tweets']
    expanded_tweets_id = [exp_tweet.id for exp_tweet in tweets.includes['tweets']]
    
    ### Getting the user details from includes dictionary
    Users = tweets.includes['users']
    
    ### Getting the media details from includes dictionary 
    Media = tweets.includes['media'] ## Media is a list of tweet response objects 

    ### Iterating across every scraped tweet 
    for tweet in tweets.data :
        tweet_id.append(tweet.id)
        
        ### If they are retweets the expanded tweets have to be accessed through the includes dictionary
        ### For referenced tweets 
        if 'referenced_tweets' in tweet : 
            referenced_tweet_type = []
            referenced_tweet_id = []
            for ref_tweet in tweet['referenced_tweets'] :
                referenced_tweet_id.append(ref_tweet.id) 
                referenced_tweet_type.append(ref_tweet.type)
                ### Getting the index of ref_tweet id in expanded tweet object 
                index = [i for i, values in enumerate(Tweets) if values.id == ref_tweet.id]
                
            tweet_text.append(Tweets[index[0]].text)    
            tweet_reference_id.append(referenced_tweet_id)
            tweet_reference_type.append(referenced_tweet_type)
        ### For original tweets 
        else :
            tweet_text.append(tweet.text)
            tweet_reference_id.append(np.NaN)
            tweet_reference_type.append(np.NaN)
   
        tweet_created_at.append(tweet.created_at)
        
        user_id.append(tweet.author_id)
        for User in Users :
            if User.id == tweet.author_id :
               user_name.append(User.name) 
               user_username.append(User.username) 
               user_verified.append(User.verified) 
               break
            else :
                continue
        
        
        ### For links in the tweet 
        if 'urls' in tweet.entities : 
            urls = [] ### Empty list to contain list of urls 
            for url in tweet.entities['urls'] : 
                urls.append(url['expanded_url'])
            tweet_links.append(urls) ### Appending to main list     
        else :
            tweet_links.append(np.NaN)
        
        ### For hashtags
        if 'hashtags' in tweet.entities :
            hashtags = [] ### Empty list to contain list of hashtags 
            for tag in tweet.entities['hashtags'] :
                hashtags.append(tag['tag'])
            tweet_hashtags.append(hashtags) 
        else :
            tweet_hashtags.append(np.NaN)
            
        ### For mentions 
        if 'mentions' in tweet.entities :
            mentions = [] ### Empty list to contain list of mentions
            mentions_id = [] ### Empty list to contain list of mention ids 
            for mention in tweet.entities['mentions'] :
                mentions.append(mention['username'])
                mentions_id.append(mention['id'])
            tweet_mentions.append(mentions)
            tweet_mentions_id.append(mentions_id)
        else :
            tweet_mentions.append(np.NaN)
            tweet_mentions_id.append(np.NaN)
        
        ### For annotations 
        if 'annotations' in tweet.entities :
            annotations_text = []
            annotations_type = []
            for annotation in tweet.entities['annotations'] :
                annotations_text.append(annotation['normalized_text'])
                annotations_type.append(annotation['type'])
            tweet_entities_annotation_text.append(annotations_text)
            tweet_entities_annotation_type.append(annotations_type)
        else :
            tweet_entities_annotation_text.append(np.NaN)
            tweet_entities_annotation_type.append(np.NaN)
            
        ### For media links
        if 'attachments' in tweet :
            media_types = [] ### Intializing empty list to contain the media types for a particular tweet 
            media_urls = [] ### Intializing empty list to contain the media urls for a particular tweet 
            media_keys_list = tweet['attachments']['media_keys'] ### List of media keys associated with the tweet
            ### Cross-referencing media keys with Media list to get the urls
            for key in media_keys_list :
                index = [i for i, values in enumerate(Media) if values.media_key == str(key)] ## Getting index from Media list
                media_types.append(Media[index[0]].type)
                media_urls.append(Media[index[0]].url)
            tweet_media_types.append(media_types)
            tweet_media_links.append(media_urls)
        else :
            tweet_media_types.append(np.NaN)
            tweet_media_links.append(np.NaN)
            
        
        ### For context annotations 
        if 'context_annotations' in tweet :
            domain_id = []
            domain_name = []
            entity_id = []
            entity_name = []
            for annotation in tweet.context_annotations :
                domain_id.append(annotation['domain']['id'])
                domain_name.append(annotation['domain']['name'])
                entity_id.append(annotation['entity']['id'])
                entity_name.append(annotation['entity']['name'])
            tweet_context_annotations_domain_id.append(domain_id)   
            tweet_context_annotations_domain_name.append(domain_name) 
            tweet_context_annotations_entity_id.append(entity_id)     
            tweet_context_annotations_entity_name.append(entity_name)     
        else :
            tweet_context_annotations_domain_id.append(np.NaN) 
            tweet_context_annotations_domain_name.append(np.NaN)     
            tweet_context_annotations_entity_id.append(np.NaN)     
            tweet_context_annotations_entity_name.append(np.NaN)     
                
                
                
    ### Dictionary to convert to dataframe 
    data = {'Tweet_ID' : tweet_id,
            'Text' : tweet_text,
            'Created_at' : tweet_created_at, 
            'User_ID' : user_id,
            'Name' : user_name,
            'Username' : user_username, 
            'Verified_User' : user_verified, 
            'Included_Links' : tweet_links,
            'Hashtags' : tweet_hashtags,
            'Mentions' : tweet_mentions,
            'Mentions_ID' : tweet_mentions_id,
            'Included_Media_Type' : tweet_media_types,
            'Media_Links' : tweet_media_links,
            'Explicit_Annotation_Text' : tweet_entities_annotation_text,
            'Explicit_Annotation_Type' : tweet_entities_annotation_type, 
            'Context_Annotation_Domain_ID' : tweet_context_annotations_domain_id,
            'Context_Annotation_Domain_Type' : tweet_context_annotations_domain_name,
            'Context_Annotation_Entity_ID' : tweet_context_annotations_entity_id,
            'Context_Annotation_Entity_Type' : tweet_context_annotations_entity_name,
            'Referenced_tweet_ID' : tweet_reference_id,
            'Referenced_Tweet_Type' : tweet_reference_type}   
    
    df = pd.DataFrame(data)
    return df


d = parse_tweets_to_df(tweets)
    
  

In [13]:
d.head(10)

Unnamed: 0,Tweet_ID,Text,Created_at,User_ID,Name,Username,Verified_User,Included_Links,Hashtags,Mentions,...,Included_Media_Type,Media_Links,Explicit_Annotation_Text,Explicit_Annotation_Type,Context_Annotation_Domain_ID,Context_Annotation_Domain_Type,Context_Annotation_Entity_ID,Context_Annotation_Entity_Type,Referenced_tweet_ID,Referenced_Tweet_Type
0,1543609475772420096,‘Catastrophic climate dislocation’. That’s wha...,2022-07-03 14:55:52+00:00,274253864,Tom Dougherty,cbcwatcher8,False,,"[cop26, carbontracker]",[lucysiegle],...,,,,,[29],[Events [Entity Service]],[1450764012677857294],[The UN Climate Change Conference 2021 has sta...,[1458332806728269824],[retweeted]
1,1543605473651429377,Taking action on climate change can deliver hu...,2022-07-03 14:39:58+00:00,392420981,JustBlazeIT™ .Iota🕸✨,JustBlazeDro,False,,,[PatConroy1],...,,,,,,,,,[1456556773763006466],[retweeted]
2,1543604939678695426,Brazilian Amazon saw a 64% jump in #deforestat...,2022-07-03 14:37:51+00:00,4149387017,Colleen❤️🇨🇦🇺🇦,gomindshine,False,,[deforestation],[SEIclimate],...,,,[Amazon],[Organization],,,,,[1543110473024946177],[retweeted]
3,1543604716072042496,Brazilian Amazon saw a 64% jump in #deforestat...,2022-07-03 14:36:57+00:00,1159498179324370944,Milagro Bravo 🆘 BoycottGold4Yanomami! 🌎 !!,BravoMilagro1,False,,[deforestation],[SEIclimate],...,,,[Amazon],[Organization],,,,,[1543110473024946177],[retweeted]
4,1543604279873007618,First of the many in the series of short and p...,2022-07-03 14:35:13+00:00,1184143605679542272,DataLoy,_DataLoy,False,,[Viz4climateaction],[VinodhDataArt],...,,,,,,,,,[1542612251189956610],[retweeted]
5,1543603764770312192,STORY\n\nBoris Johnson sent for ministerial je...,2022-07-03 14:33:10+00:00,923692803435257856,Sara Colman💙💚🟨🟥,SarahCo1167,False,[https://twitter.com/mikeysmith/status/1543284...,[COP26],[AlokSharma_RDG],...,,,,,"[10, 29, 35, 10, 35]","[Person, Events [Entity Service], Politician, ...","[1070735163343032320, 1450764012677857294, 107...","[Alok Sharma, The UN Climate Change Conference...",[1543284258227232768],[quoted]
6,1543603332417200129,Brazilian Amazon saw a 64% jump in #deforestat...,2022-07-03 14:31:27+00:00,379391810,Vote!! ☮✌,LouisaDvotes,False,,[deforestation],[SEIclimate],...,,,[Amazon],[Organization],,,,,[1543110473024946177],[retweeted]
7,1543602671617282053,Brazilian Amazon saw a 64% jump in #deforestat...,2022-07-03 14:28:50+00:00,1366706535758516226,Born a Princess,BornaPrincess2,False,,[deforestation],[SEIclimate],...,,,[Amazon],[Organization],,,,,[1543110473024946177],[retweeted]
8,1543601770714349568,Brazilian Amazon saw a 64% jump in #deforestat...,2022-07-03 14:25:15+00:00,39296373,Barbara Navarro #BoycottGold4Yanomami! 🆘 🌊,BarbaraNavarro,False,,[deforestation],[SEIclimate],...,,,[Amazon],[Organization],,,,,[1543110473024946177],[retweeted]
9,1543597317420331010,Akhtar told me this is the worst flood his fam...,2022-07-03 14:07:33+00:00,150202773,Marjan Nur,marjannur,False,,,[mattjcannell],...,,,[Akhtar],[Person],,,,,[1543597107344334848],[retweeted]
