In [127]:
# Twitter API credentials used by the cells below to make API calls.
# Do not paste real keys into the notebook (they would be saved and shared with it):
# export them as environment variables instead. Each entry falls back to an empty
# string when its variable is not set, so the dictionary always has all four keys.
import os

my_key = {
    'app_key': os.environ.get('TWITTER_APP_KEY', ''),
    'app_secret': os.environ.get('TWITTER_APP_SECRET', ''),
    'oauth_token': os.environ.get('TWITTER_OAUTH_TOKEN', ''),
    'oauth_token_secret': os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '')
}

In [3]:
# Run this to make sure you have tweepy installed in your environment
!pip install tweepy

Collecting tweepy
  Downloading https://files.pythonhosted.org/packages/d5/5f/daac4b4e9b30d7d2a6fdd16a880ff79f27918fe388e4dfc1983dec3a9876/tweepy-3.7.0-py2.py3-none-any.whl
Collecting requests-oauthlib>=0.7.0 (from tweepy)
  Downloading https://files.pythonhosted.org/packages/c2/e2/9fd03d55ffb70fe51f587f20bcf407a6927eb121de86928b34d162f0b1ac/requests_oauthlib-1.2.0-py2.py3-none-any.whl
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->tweepy)
[?25l  Downloading https://files.pythonhosted.org/packages/16/95/699466b05b72b94a41f662dc9edf87fda4289e3602ecd42d27fcaddf7b56/oauthlib-3.0.1-py2.py3-none-any.whl (142kB)
[K    100% |████████████████████████████████| 143kB 498kB/s ta 0:00:01
Installing collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-3.0.1 requests-oauthlib-1.2.0 tweepy-3.7.0


# Main Code

### Importing the libraries

All the libraries needed are imported here 

**tweepy** => the library used for accessing the twitter API

**numpy and pandas** => making the data in a presentable format and exporting to csv

In [128]:
import tweepy
from tweepy import OAuthHandler

import numpy as np
import pandas as pd

Now we need to attach our API key to each request we make with tweepy. For this we create an auth object, which handles the authorization, and then set up tweepy to use that auth object for every request.

The final object twitterAPI is our entry point to the twitter API using tweepy

In [129]:
# Build the OAuth handler from the application key/secret stored in my_key ...
auth = OAuthHandler(consumer_key=my_key['app_key'],
                    consumer_secret=my_key['app_secret'])
# ... and attach the user's access token pair to it
auth.set_access_token(my_key['oauth_token'],
                      my_key['oauth_token_secret'])

# tweepy client bound to the auth object; every request made below goes through it
twitterAPI = tweepy.API(auth)

##### getAllTweets Function
The function **getAllTweets** takes in the screen name (default = 'midasIIITD').

It starts from the most recent tweet and then keeps querying for older tweets until the tweets are exhausted. This way we finally get all the tweets in a list called alltweets, which we return.

In [141]:
def getAllTweets(screenName = 'midasIIITD'):
    """Download every tweet that ``twitterAPI.user_timeline`` serves for a user.

    The timeline is paged backwards: each request asks for up to 200 tweets whose
    id is at most ``oldest``, and ``oldest`` is then moved just below the last
    tweet received. Paging stops at the first empty page.

    Replies and retweets are NOT filtered here; ``extractAllDataFromTweetList``
    removes them afterwards.

    Parameters
    ----------
    screenName : str
        Screen name of the account to download (default ``'midasIIITD'``).

    Returns
    -------
    list
        The tweet objects in the order they were received. An empty list is
        returned when the very first page is empty (no IndexError is raised).
    """
    alltweets = []

    # Inclusive upper bound on the tweet id for the next page.
    # None means "no bound yet": the first request starts at the top of the timeline.
    oldest = None

    while True:
        # 200 is the maximum number of tweets that can be queried per request
        request = {'screen_name': screenName, 'count': 200}
        if oldest is not None:
            request['max_id'] = oldest

        tweets = twitterAPI.user_timeline(**request)

        # An empty page means there is nothing older left to fetch
        if not tweets:
            break

        alltweets.extend(tweets)
        # Ask only for tweets strictly older than the last one received
        oldest = tweets[-1].id - 1

    return alltweets

##### extractAllDataFromTweetList Function
The function **extractAllDataFromTweetList** takes in the tweet list as its input parameter.

It goes through the tweets one by one, skipping all retweets and replies. For each remaining tweet it counts the attached photos, if any media is present. Finally it puts all the desired data from the current tweet into a **data** list and appends that list to tweet_data.

It returns tweet_data, a list holding the desired data extracted from each tweet.

In [150]:
def _countPhotos(tweet):
    """Return the number of photos attached to ``tweet``, or None when there are none.

    ``extended_entities`` is consulted first and ``entities`` is the fallback:
    Twitter's v1.1 documentation states that ``entities`` lists only the first
    photo of a tweet and does not describe videos/GIFs properly.
    """
    for field in ('extended_entities', 'entities'):
        container = getattr(tweet, field, None)
        media = container.get('media') if isinstance(container, dict) else None
        if media:
            photos = sum(1 for medium in media
                         if isinstance(medium, dict) and medium.get('type') == 'photo')
            # Zero photos is reported as None, as required by the task
            return photos or None
    # No media information at all
    return None


def extractAllDataFromTweetList(tweet_list):
    """Turn a list of tweet objects into rows of plain data.

    Retweets (``tweet.retweeted`` is true or the text contains ``'RT @'``) and
    replies (``tweet.in_reply_to_status_id`` is not None) are skipped.

    Parameters
    ----------
    tweet_list : iterable
        Tweet objects exposing ``text``, ``retweeted``, ``in_reply_to_status_id``,
        ``created_at``, ``favorite_count``, ``retweet_count`` and, optionally,
        ``entities`` / ``extended_entities``.

    Returns
    -------
    list of list
        One ``[text, date, time, favorite_count, retweet_count, photo_count]``
        row per kept tweet; ``photo_count`` is None when the tweet has no photo.
    """
    tweet_data = []

    for tweet in tweet_list:
        # Removing any retweets
        if tweet.retweeted or 'RT @' in tweet.text:
            continue
        # Removing any replies
        if tweet.in_reply_to_status_id is not None:
            continue

        tweet_data.append([tweet.text,
                           tweet.created_at.date(),
                           tweet.created_at.time(),
                           tweet.favorite_count,
                           tweet.retweet_count,
                           _countPhotos(tweet)])

    return tweet_data

In [143]:
# Download the whole timeline of midasIIITD (the same value getAllTweets uses by default)
all_tweets = getAllTweets(screenName='midasIIITD')

In [148]:
# Extract the list of information rows from the downloaded tweets
tweets_data = extractAllDataFromTweetList(tweet_list=all_tweets)

In [151]:
# Converting the list of rows to a dataframe.
# The rows are passed directly (no numpy round-trip) so that an empty list still
# produces an empty frame with the six columns; dtype=object keeps every value
# exactly as extracted (integers stay integers, missing image counts stay None).
tweets_data_frame = pd.DataFrame(
    tweets_data,
    columns=['Text', 'Date', 'Time', 'Favorite Count', 'Retweets Count', 'No of Images'],
    dtype=object,
)

In [152]:
# Visualising the result: the bare expression makes the dataframe the cell's displayed output
tweets_data_frame

Unnamed: 0,Text,Date,Time,Favorite Count,Retweets Count,No of Images
0,"Many Congratulations to @midasIIITD student, S...",2019-04-08,07:08:12,13,2,
1,@midasIIITD thanks all students who have appea...,2019-04-08,03:27:42,5,0,
2,We request all students whose interview are sc...,2019-04-07,11:43:24,1,1,
3,"Other queries: ""none of the Tweeter Apis give ...",2019-04-07,06:55:19,5,2,
4,"Other queries: ""do we have to make two differe...",2019-04-07,06:53:38,4,1,
5,"Other queries: ""If using Twitter api, it does ...",2019-04-07,05:32:27,6,1,
6,Response to some queries asked by students on ...,2019-04-07,05:29:40,7,1,
7,We have emailed the task details to all candid...,2019-04-05,16:08:37,11,1,
8,Dear @midasIIITD internship candidates who hav...,2019-04-02,04:20:13,8,1,
9,Looking forward to your paper submission to @I...,2019-04-02,02:44:54,5,1,


In [155]:
# Save the data frame as a csv file, leaving out the row index
tweets_data_frame.to_csv(path_or_buf='midasIIITD_Tweets.csv', index=False)

In [156]:
# Check if reading the csv gives same output.
# The file name must match the one written above, including the .csv extension.
pd.read_csv('midasIIITD_Tweets.csv')

Unnamed: 0,Text,Date,Time,Favorite Count,Retweets Count,No of Images
0,"Many Congratulations to @midasIIITD student, S...",2019-04-08,07:08:12,13,2,
1,@midasIIITD thanks all students who have appea...,2019-04-08,03:27:42,5,0,
2,We request all students whose interview are sc...,2019-04-07,11:43:24,1,1,
3,"Other queries: ""none of the Tweeter Apis give ...",2019-04-07,06:55:19,5,2,
4,"Other queries: ""do we have to make two differe...",2019-04-07,06:53:38,4,1,
5,"Other queries: ""If using Twitter api, it does ...",2019-04-07,05:32:27,6,1,
6,Response to some queries asked by students on ...,2019-04-07,05:29:40,7,1,
7,We have emailed the task details to all candid...,2019-04-05,16:08:37,11,1,
8,Dear @midasIIITD internship candidates who hav...,2019-04-02,04:20:13,8,1,
9,Looking forward to your paper submission to @I...,2019-04-02,02:44:54,5,1,


## The scripts below are a commented-out version of the trial code

## The working behind the code

The code below is what I executed while building the running version of the script. It is a rough version of the code and should not be executed.

In [5]:
# import tweepy #https://github.com/tweepy/tweepy
# import csv
# from tweepy import OAuthHandler

In [13]:
# auth = OAuthHandler(auth_params['app_key'], auth_params['app_secret'])
# auth.set_access_token(auth_params['oauth_token'], auth_params['oauth_token_secret'])

In [60]:
# api = tweepy.API(auth, wait_on_rate_limit = True)

In [131]:
# alltweets = []

In [132]:
# tweets = api.user_timeline(screen_name = 'midasIIITD', exclude_replies = True, count = 1)

In [133]:
# alltweets.extend(tweets)

In [134]:
# #save the id of the oldest tweet less one
# oldest = alltweets[-1].id - 1

In [135]:
#  while len(tweets) > 0:
#     tweets = api.user_timeline(screen_name = 'midasIIITD',count=200,max_id=oldest)
#     alltweets.extend(tweets)
#     oldest = alltweets[-1].id - 1

In [136]:
# tweet_data = []

In [137]:
# for tweet in alltweets:
#     if (not tweet.retweeted) and ('RT @' not in tweet.text):
#         if tweet.in_reply_to_status_id is None:
#             try:
#                 count = 0
#                 for medium in tweet.entities['media']:
#                       if medium['type'] == 'photo':
#                           count += 1
                
#                 if(count == 0):
#                       count = None
#             except:
#                 count = None
#             finally:
#                 data = [tweet.text, tweet.created_at.date(), tweet.created_at.time(), tweet.favorite_count, tweet.retweet_count, count]
#                 tweet_data.append(data)

In [138]:
# import numpy as np
# import pandas as pd
# tweet_data = np.array(tweet_data)

In [139]:
# pd.DataFrame(tweet_data, columns=['Text', 'Date', 'Time', 'Favorite Count', 'Retweets Count', 'No of Images'])

Unnamed: 0,Text,Date,Time,Favorite Count,Retweets Count,No of Images
0,RT @Harvard: Professor Jelani Nelson founded A...,2019-04-09,05:04:27,0,33,
1,RT @emnlp2019: For anyone interested in submit...,2019-04-09,05:04:11,0,12,
2,RT @multimediaeval: Announcing the 2019 MediaE...,2019-04-08,19:38:09,0,15,
3,"Many Congratulations to @midasIIITD student, S...",2019-04-08,07:08:12,13,2,
4,@midasIIITD thanks all students who have appea...,2019-04-08,03:27:42,5,0,
5,We request all students whose interview are sc...,2019-04-07,11:43:24,1,1,
6,"Other queries: ""none of the Tweeter Apis give ...",2019-04-07,06:55:19,5,2,
7,"Other queries: ""do we have to make two differe...",2019-04-07,06:53:38,4,1,
8,"Other queries: ""If using Twitter api, it does ...",2019-04-07,05:32:27,6,1,
9,Response to some queries asked by students on ...,2019-04-07,05:29:40,7,1,
