## Load the required libraries

Using
- [jsonlines](https://jsonlines.readthedocs.io/en/latest/) to save data in `.jsonl` format.
- [tweepy](https://github.com/tweepy/tweepy) as the Twitter API wrapper.
- [pandas](https://pandas.pydata.org/) to create tabular data.

In [1]:
import os
from datetime import datetime, timezone

import jsonlines
import pandas as pd
from tweepy import API, Cursor
from tweepy import OAuthHandler

## Credentials

You need to define the Twitter API credentials here:

In [2]:
# Twitter API credentials.
# Each value is read from an environment variable so that secrets do not have
# to be typed into (and committed with) the notebook. A variable that is not
# set falls back to an empty string, which must be filled in before running.
CREDENTIALS = {
    'ACCESS_TOKEN': os.environ.get('TWITTER_ACCESS_TOKEN', ''),
    'ACCESS_TOKEN_SECRET': os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', ''),
    'CONSUMER_KEY': os.environ.get('TWITTER_CONSUMER_KEY', ''),
    'CONSUMER_SECRET': os.environ.get('TWITTER_CONSUMER_SECRET', '')
}

## Paths

Paths where the output files are saved:

In [3]:
# File the raw tweets are dumped to (passed to TwitterClient.dump_json)
PATH_JSONL = 'midas_tweets.jsonl'
# File the parsed dataframe is saved to as CSV
PATH_MIDAS_DF = 'midas_tweets.csv'

## Twitter Client

In [4]:
class TwitterClient():
    """
    Authenticates against the Twitter API and fetches the tweets of a user.

    Attributes:
        auth: tweepy ``OAuthHandler`` built from the supplied credentials.
        twitter_client: tweepy ``API`` object created from ``auth``.
        tweets: list with the raw JSON dict of every tweet returned by the
            latest ``fetch_user_tweets`` call (empty before the first call).
    """
    def __init__(self, credentials, twitter_user=None):
        """
        Initialize the authentication process and twitter client.

        Args:
            credentials: mapping with the keys 'CONSUMER_KEY',
                'CONSUMER_SECRET', 'ACCESS_TOKEN' and 'ACCESS_TOKEN_SECRET'.
            twitter_user: currently unused; kept in the signature so that
                existing callers passing it keep working.
        """
        self.auth = None
        # Start with an empty list (not None) so that dump_json() called
        # before any fetch writes an empty list instead of a null record.
        self.tweets = []
        self._authenticate(credentials)
        self.twitter_client = API(self.auth)

    def _authenticate(self, credentials):
        "Build the OAuth handler from the consumer and access credentials"
        self.auth = OAuthHandler(credentials['CONSUMER_KEY'],
                                 credentials['CONSUMER_SECRET'])
        self.auth.set_access_token(credentials['ACCESS_TOKEN'],
                                   credentials['ACCESS_TOKEN_SECRET'])

    def fetch_user_tweets(self, user, ntweets=0):
        """
        Fetch tweets of a user and store their raw JSON in ``self.tweets``.

        Args:
            user: account whose timeline is fetched; forwarded to tweepy's
                ``user_timeline`` as ``id``.
            ntweets: maximum number of tweets to fetch. 0 (the default),
                None or a negative value means "no explicit limit".
        """
        self.tweets = []
        cursor = Cursor(self.twitter_client.user_timeline, tweet_mode='extended', id=user)

        # Only pass a limit when one was actually requested and otherwise rely
        # on the cursor's own default.
        # NOTE(review): newer tweepy releases appear to treat an explicit
        # limit of 0 as "fetch nothing" - confirm against the installed version.
        if ntweets and ntweets > 0:
            statuses = cursor.items(ntweets)
        else:
            statuses = cursor.items()

        print(f'[INFO]: Fetching tweets of {user} ...')
        # Append one by one so that the tweets fetched before an error
        # (e.g. raised by the API) are still available in self.tweets.
        for status in statuses:
            self.tweets.append(status._json)

        print(f'[INFO]: Fetched {len(self.tweets)} tweets.')

    def dump_json(self, fpath):
        """
        Dump the fetched tweets to the .jsonl file defined by fpath.

        The whole list of tweets is written as ONE record, i.e. the file
        holds a single line containing a JSON array of tweets.
        """
        with jsonlines.open(fpath, 'w') as f:
            f.write(self.tweets)

## Parse tweets

In [5]:
def parse_tweets_from_json(fpath):
    """
    Extracts required info from .jsonl file and creates a dataframe.

    Args:
        fpath: path of the .jsonl file holding the tweets. Two layouts are
            accepted: a record that is itself a list of tweets, or one tweet
            object per line.

    Returns:
        A DataFrame with one row per tweet and the columns 'text',
        'created_at' (naive datetime expressed in UTC), 'num_likes',
        'num_retweets' and 'num_images' (None for tweets that have no
        'extended_entities').
    """
    # Read every record of the file; a record that is a list is flattened so
    # that `tweets` always ends up as a flat list of tweet dicts.
    tweets = []
    with jsonlines.open(fpath, 'r') as f:
        for record in f:
            if isinstance(record, list):
                tweets.extend(record)
            else:
                tweets.append(record)

    # Creating a dataframe and saving the required info
    df = pd.DataFrame(
        data=[[
            tweet['full_text'],
            _parse_created_at(tweet['created_at']),
            tweet['favorite_count'],
            tweet['retweet_count']
        ] for tweet in tweets],
        columns=['text', 'created_at', 'num_likes', 'num_retweets']
    )

    # Extracting the number of images from a tweet
    df['num_images'] = [_count_photos(tweet) for tweet in tweets]

    return df


def _parse_created_at(raw):
    "Parse a 'created_at' string such as 'Mon Apr 08 07:08:12 +0000 2019' into a naive UTC datetime"
    # %z parses the UTC offset properly; the value is then converted to UTC
    # and the tzinfo dropped so the result stays a naive datetime.
    parsed = datetime.strptime(raw, '%a %b %d %H:%M:%S %z %Y')
    return parsed.astimezone(timezone.utc).replace(tzinfo=None)


def _count_photos(tweet):
    "Count the media entries of type 'photo'; None when the tweet has no 'extended_entities'"
    if 'extended_entities' not in tweet:
        return None
    return sum(1 for media in tweet['extended_entities']['media'] if media['type'] == 'photo')

## Workflow

In [6]:
# Create the Tweepy-backed client, authenticating with CREDENTIALS
tw_client = TwitterClient(CREDENTIALS)

# Fetch the tweets of the 'midasIIITD' account (kept in tw_client.tweets)
tw_client.fetch_user_tweets('midasIIITD')

# Save the fetched tweets to the .jsonl file
tw_client.dump_json(PATH_JSONL)

[INFO]: Fetching tweets of midasIIITD ...
[INFO]: Fetched 338 tweets.


In [7]:
# Extracting the data from the .jsonl file and saving it to .csv (header row, no index column)
midas_df = parse_tweets_from_json(PATH_JSONL)
midas_df.to_csv(PATH_MIDAS_DF, header=True, index=False)

In [8]:
# Reading the saved .csv file back (this rebinds midas_df to the reloaded frame),
# then showing its shape and the first 7 rows
midas_df = pd.read_csv(PATH_MIDAS_DF)
print(f'Size: {midas_df.shape}')
midas_df.head(7)

Size: (338, 5)


Unnamed: 0,text,created_at,num_likes,num_retweets,num_images
0,"Many Congratulations to @midasIIITD student, S...",2019-04-08 07:08:12,12,2,1.0
1,@midasIIITD thanks all students who have appea...,2019-04-08 03:27:42,3,0,1.0
2,"@himanchalchandr Meanwhile, complete CV/NLP ta...",2019-04-07 14:17:29,0,0,
3,@sayangdipto123 Submit as per the guideline ag...,2019-04-07 14:17:09,0,0,
4,We request all students whose interview are sc...,2019-04-07 11:43:24,1,1,
5,"Other queries: ""none of the Tweeter Apis give ...",2019-04-07 06:55:19,5,2,
6,"Other queries: ""do we have to make two differe...",2019-04-07 06:53:38,4,1,
