## Load the required libraries

Using
- [jsonlines](https://jsonlines.readthedocs.io/en/latest/) to save data in `.jsonl` format.
- [tweepy](https://github.com/tweepy/tweepy) as the Twitter API wrapper.
- [pandas](https://pandas.pydata.org/) to create tabular data.

In [9]:
import os

import jsonlines
import pandas as pd
from tweepy import API, Cursor
from tweepy import OAuthHandler

## Credentials

You need to define your Twitter API credentials here.

In [2]:
# Twitter API credentials consumed by TwitterClient.
# Each value is read from the matching TWITTER_<KEY> environment variable
# (e.g. TWITTER_CONSUMER_KEY) so secrets do not have to be typed into the
# notebook; an unset variable falls back to the empty string.
CREDENTIALS = {
    key: os.environ.get(f'TWITTER_{key}', '')
    for key in ('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET', 'CONSUMER_KEY', 'CONSUMER_SECRET')
}

## Paths

Paths to save the files

In [3]:
# Output locations, both relative to the notebook's working directory.
PATH_JSONL, PATH_MIDAS_DF = (
    'midas_tweets.jsonl',  # raw tweets written by TwitterClient.dump_json
    'midas_tweets.csv',    # table written from the parsed tweets
)

## Twitter Client

In [4]:
class TwitterClient():
    """
    Authenticates against the Twitter API and fetches the tweets of a user.

    Attributes:
        auth: tweepy ``OAuthHandler`` built from the supplied credentials.
        twitter_client: tweepy ``API`` object created from ``auth``.
        tweets: ``None`` until ``fetch_user_tweets`` runs, then a list holding
            the ``_json`` payload of every fetched status.
    """
    def __init__(self, credentials, twitter_user=None):
        """
        Initialize the authentication process and twitter client.

        Args:
            credentials: mapping with the keys 'CONSUMER_KEY',
                'CONSUMER_SECRET', 'ACCESS_TOKEN' and 'ACCESS_TOKEN_SECRET'.
            twitter_user: not used by this class; kept so callers that
                already pass it keep working.
        """
        self.auth = None
        self.tweets = None
        self._authenticate(credentials)
        self.twitter_client = API(self.auth)

    def _authenticate(self, credentials):
        "Build the OAuth handler from the credentials and store it on self.auth"
        self.auth = OAuthHandler(credentials['CONSUMER_KEY'],
                                 credentials['CONSUMER_SECRET'])
        self.auth.set_access_token(credentials['ACCESS_TOKEN'],
                                   credentials['ACCESS_TOKEN_SECRET'])

    def fetch_user_tweets(self, user, ntweets=0):
        """
        Fetch tweets of a user into ``self.tweets`` and print how many were fetched.

        Args:
            user: forwarded to the ``user_timeline`` API method as ``id``.
            ntweets: item limit handed to ``Cursor.items``.
                NOTE(review): the default of 0 appears to mean "no limit" in
                the tweepy version this notebook was run with - confirm
                before upgrading tweepy.
        """
        self.tweets = []
        # presumably tweet_mode='extended' is what provides the 'full_text'
        # field read by the parsing step - verify against the Twitter API docs
        timeline = Cursor(self.twitter_client.user_timeline, tweet_mode='extended', id=user).items(ntweets)

        # Plain iteration stops when the cursor is exhausted; appending one
        # status at a time keeps partial results if a request fails midway.
        for status in timeline:
            self.tweets.append(status._json)

        print(f'[INFO]: Fetched {len(self.tweets)} tweets.')

    def dump_json(self, fpath):
        """
        Dump the fetched tweets to the .jsonl file defined by fpath.

        The complete list is written as ONE record (a single JSON array on
        one line), not one tweet per line; this is the layout that
        ``parse_tweets_from_json`` in this notebook accepts.
        """
        with jsonlines.open(fpath, 'w') as f:
            f.write(self.tweets)

## Parse tweets

In [5]:
def _count_photos(tweet):
    "Number of 'photo' media items in a tweet, or None when it has no 'extended_entities'"
    if 'extended_entities' not in tweet:
        return None
    return sum(1 for media in tweet['extended_entities']['media'] if media['type'] == 'photo')


def parse_tweets_from_json(fpath):
    """
    Extracts required info from .jsonl file and creates a dataframe.

    Two file layouts are accepted:
      * a single record holding the whole list of tweets (what
        ``TwitterClient.dump_json`` writes), or
      * one tweet object per line.
    A file without records yields an empty dataframe.

    Args:
        fpath: path of the .jsonl file to read.

    Returns:
        pandas DataFrame with one row per tweet and the columns 'text',
        'created_at', 'num_likes', 'num_retweets' and 'num_images'.
        'num_images' is missing (None/NaN) for tweets without
        'extended_entities'.

    Raises:
        KeyError: if a tweet lacks one of the fields read below.
    """

    # Read every record instead of only the first one, so that a
    # one-tweet-per-line file is not silently truncated to its first tweet.
    with jsonlines.open(fpath, 'r') as f:
        records = list(f)

    # A lone record that is itself a list is the whole timeline stored on
    # a single line; otherwise each record is already one tweet.
    if len(records) == 1 and isinstance(records[0], list):
        tweets = records[0]
    else:
        tweets = records

    # Creating a dataframe and saving the required info
    df = pd.DataFrame(
        data=[[tweet['full_text'], tweet['created_at'], tweet['favorite_count'], tweet['retweet_count']]
              for tweet in tweets],
        columns=['text', 'created_at', 'num_likes', 'num_retweets']
    )

    # Extracting the number of images from a tweet
    df['num_images'] = [_count_photos(tweet) for tweet in tweets]

    return df

## Workflow

In [6]:
# Creating the twitter client object (a Tweepy wrapper) from the credentials defined above
tw_client = TwitterClient(CREDENTIALS)

# Fetching the tweets of the 'midasIIITD' account; ntweets is left at its default
tw_client.fetch_user_tweets('midasIIITD')

# Saving the fetched tweets to the .jsonl file at PATH_JSONL
tw_client.dump_json(PATH_JSONL)

[INFO]: Fetched 333 tweets.


In [7]:
# Extracting the required fields from the saved .jsonl file and saving them to .csv
# (header row written, row index left out of the file)
midas_df = parse_tweets_from_json(PATH_JSONL)
midas_df.to_csv(PATH_MIDAS_DF, header=True, index=False)

In [8]:
# Reading the saved .csv file back (this rebinds midas_df to the re-loaded table)
midas_df = pd.read_csv(PATH_MIDAS_DF)
print(f'Size: {midas_df.shape}')
# Last expression of the cell: shows the first 7 rows as the cell output
midas_df.head(7)

Size: (333, 5)


Unnamed: 0,text,created_at,num_likes,num_retweets,num_images
0,"Other queries: ""none of the Tweeter Apis give ...",Sun Apr 07 06:55:19 +0000 2019,3,2,
1,"Other queries: ""do we have to make two differe...",Sun Apr 07 06:53:38 +0000 2019,3,1,
2,"Other queries: ""If using Twitter api, it does ...",Sun Apr 07 05:32:27 +0000 2019,4,1,
3,Response to some queries asked by students on ...,Sun Apr 07 05:29:40 +0000 2019,6,1,
4,RT @kdnuggets: Top 8 #Free Must-Read #Books on...,Sat Apr 06 17:11:29 +0000 2019,0,2,
5,@nupur_baghel @PennDATS Congratulation @nupur_...,Sat Apr 06 16:43:27 +0000 2019,14,3,1.0
6,We have emailed the task details to all candid...,Fri Apr 05 16:08:37 +0000 2019,10,1,
