# Python Task: Extracting Tweets from the midasIIITD Twitter Handle

In [1]:
# Initialising the tools required for the notebook
import requests
import bs4
from bs4 import BeautifulSoup
from requests_oauthlib import OAuth1
import pandas as pd

In [2]:
# Load the Twitter API credentials from the local JSON config file so that
# no keys are hard-coded in the notebook.
import json

with open("config.json", "r") as config_file:
    auth_params = json.loads(config_file.read())

In [3]:
# Build the tweepy OAuth handler from the loaded credentials and wrap it in
# an API client that the cells below use for every request.
from tweepy import OAuthHandler, API

auth1 = OAuthHandler(
    auth_params['app_key'],
    auth_params['app_secret'],
)
auth1.set_access_token(
    auth_params['oauth_token'],
    auth_params['oauth_token_secret'],
)
auth_api = API(auth1)

In [4]:
# Fetch each account's profile and print its headline details.
# NOTE: `target` and `item` are read again by later cells, so after this
# loop they refer to the last account in `account_list`.
account_list = ['midasIIITD']

for target in account_list:
    print("Getting data for " + target)
    item = auth_api.get_user(target)

    # Textual profile fields are printed as-is
    text_fields = (
        ("Name: ", item.name),
        ("Screen Name: ", item.screen_name),
        ("Description: ", item.description),
    )
    for label, text in text_fields:
        print(label + text)

    # Numeric counters need converting before concatenation
    count_fields = (
        ("Statuses Count: ", item.statuses_count),
        ("Friends Count: ", item.friends_count),
        ("Followers Count: ", item.followers_count),
    )
    for label, number in count_fields:
        print(label + str(number))

Getting data for midasIIITD
Name: MIDAS IIITD
Screen Name: midasIIITD
Description: MIDAS is a group of researchers at IIIT-Delhi who study, analyze, and build different multimedia systems for society leveraging multimodal information.
Statuses Count: 342
Friends Count: 43
Followers Count: 289


In [5]:
# Derive the account's age in days and its average posting rate from the
# profile object (`item`) fetched in the previous cell.
from datetime import datetime, date, time, timedelta

tweets = item.statuses_count
account_created_date = item.created_at

# Time elapsed between account creation and now (both treated as UTC)
delta = datetime.utcnow() - account_created_date
account_age_days = delta.days
print("Account age (in days): " + str(account_age_days))

# Skip the rate for accounts younger than one day to avoid dividing by zero
if account_age_days > 0:
    tweets_per_day = float(tweets) / float(account_age_days)
    print("Average tweets per day: " + "{:.2f}".format(tweets_per_day))

Account age (in days): 260
Average tweets per day: 1.32


In [6]:
# Walk the account's timeline and collect, for every tweet inside the
# look-back window, a summary record plus the hashtags and mentions it uses.
from tweepy import Cursor


def _entity_values(entities, group, field):
    """Return every non-None ``field`` value found in ``entities[group]``.

    ``entities`` is a tweet's entities dict; ``group`` is a key such as
    "hashtags" or "user_mentions"; ``field`` is the key to read from each
    entry. A missing group, None entries and entries without ``field`` are
    skipped.
    """
    return [
        ent[field]
        for ent in (entities.get(group) or [])
        if ent is not None and field in ent and ent[field] is not None
    ]


def _tweet_record(status):
    """Build the JSON-serialisable summary dict for one tweet.

    Returns a dict with the keys text, created_at (as a string),
    retweet_count, favorite_count and num_images.
    """
    # For a retweet the like count is read from the original tweet;
    # otherwise from the tweet itself.
    retweeted = getattr(status, "retweeted_status", None)
    if retweeted is not None:
        fav = retweeted.favorite_count
    else:
        fav = status.favorite_count

    # Number of media entries attached to the tweet; None (not 0) when the
    # tweet carries no media, matching the format already written to disk.
    media = (getattr(status, "entities", None) or {}).get("media")
    num = len(media) if media is not None else None

    return dict(
        text=status.text,
        created_at=str(status.created_at),
        retweet_count=status.retweet_count,
        favorite_count=fav,
        num_images=num,
    )


hashtags = []
mentions = []
tweet_count = 0
all_tweets = list()

# Determining the period for getting the tweet information
end_date = datetime.utcnow() - timedelta(days=265)

# `target` is the account name left over from the profile cell's loop.
for status in Cursor(auth_api.user_timeline, id=target).items():
    # Test the cutoff before recording anything, so a tweet older than the
    # window is neither counted nor stored.
    if status.created_at < end_date:
        break
    tweet_count += 1

    entities = getattr(status, "entities", None) or {}
    # Getting the hashtags used in the tweets
    hashtags.extend(_entity_values(entities, "hashtags", "text"))
    # Getting name of the users tagged in the tweets
    mentions.extend(_entity_values(entities, "user_mentions", "screen_name"))

    all_tweets.append(_tweet_record(status))

In [7]:
# Persist the collected tweets as JSON Lines: one JSON object per line.
with open('midas_tweets_data.jsonl', 'w') as jsonl_file:
    for record in all_tweets:
        jsonl_file.write(json.dumps(record))
        jsonl_file.write("\n")

In [8]:
# Summarise the ten most frequent mentions and hashtags collected from the
# timeline, then report how many tweets were processed.
from collections import Counter
import sys

# The loop variables are deliberately not named `item`: that name holds the
# account's user object from the profile cell and is still read by the
# account-statistics cell, so overwriting it with a string breaks re-runs.
print("Most mentioned Twitter users:")
for mentioned_user, count in Counter(mentions).most_common(10):
    print(mentioned_user + "\t" + str(count))
print("Most used hashtags:")
for tag, count in Counter(hashtags).most_common(10):
    print(tag + "\t" + str(count))
print("All done. Processed " + str(tweet_count) + " tweets.")

Most mentioned Twitter users:
midasIIITD	82
RatnRajiv	43
IIITDelhi	38
kdnuggets	31
ACMMM19	29
the_dhumketu	24
debanjanbhucs	22
mr2amc	19
RealAAAI	13
IEEEBigMM19	11
Most used hashtags:
KDN	8
DeepLearning	6
AI	5
MachineLearning	5
NLP	3
Free	2
Books	2
DataScience	2
nlp	2
datascience	2
All done. Processed 342 tweets.


In [9]:
# Read the JSON Lines dump back into a DataFrame (one row per tweet record)
with open('midas_tweets_data.jsonl', 'r') as tweets_file:
    df = pd.read_json(tweets_file, lines=True)

In [10]:
# Preview only the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,created_at,favorite_count,num_images,retweet_count,text
0,2019-04-09 16:45:07,41,,9,RT @IIITDelhi: We are delighted to share that ...
1,2019-04-09 05:04:27,95,,35,RT @Harvard: Professor Jelani Nelson founded A...
2,2019-04-09 05:04:11,37,,15,RT @emnlp2019: For anyone interested in submit...
3,2019-04-08 19:38:09,20,,15,RT @multimediaeval: Announcing the 2019 MediaE...
4,2019-04-08 07:08:12,17,,2,"Many Congratulations to @midasIIITD student, S..."
5,2019-04-08 03:27:42,5,,0,@midasIIITD thanks all students who have appea...
6,2019-04-07 14:17:29,0,,0,"@himanchalchandr Meanwhile, complete CV/NLP ta..."
7,2019-04-07 14:17:09,0,,0,@sayangdipto123 Submit as per the guideline ag...
8,2019-04-07 11:43:24,1,,1,We request all students whose interview are sc...
9,2019-04-07 06:55:19,5,,2,"Other queries: ""none of the Tweeter Apis give ..."
