# Twitter data Analysis

In [4]:
from twython import Twython
from collections import Counter
from geopy import Nominatim

with open("twitter_data_from_scratch.txt", "r") as f:
    line_text = [line.strip() for line in f]
    

CONSUMER_KEY         = line_text[0]
CONSUMER_SECRET      = line_text[1]
ACCESS_TOKEN         = line_text[2]
ACCESS_TOKEN_SECRET  = line_text[3]

### Making a search query

refer: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets for formatting the search query and understanding results format.  

Max. num of results restricted to 100 per search query so we loop over many times and make the same query.
But to avoid the results from repeating, we change the max_id of search results after each iteration

In [5]:
# User inputs
COUNT_OF_TWEETS_TO_BE_FETCHED = 1000
search_string                 = "avengers" # "" to search everything
type_of_result                = "all" # all, mixed, recent or popular
location_of_interest          = "London"
radius_of_interest_in_miles   = 50



# initialisation
twitter        = Twython(CONSUMER_KEY, CONSUMER_SECRET)
tweets         = []  

word_list             = []
hashtag_list          = []
retweet_count_list    = []
favorite_count_list   = []
tweet_url_list        = []

# Search area definition
geolocator        = Nominatim(user_agent='GoogleV3')
location          = geolocator.geocode(location_of_interest)
print(location,"\n")
geo_code = str(location.latitude) + "," + str(location.longitude) + "," + str(radius_of_interest_in_miles) + "mi"


num_results_per_query = min([COUNT_OF_TWEETS_TO_BE_FETCHED, 100])
MAX_ATTEMPTS          = max(50, COUNT_OF_TWEETS_TO_BE_FETCHED//num_results_per_query)
                           
for i in range(0,MAX_ATTEMPTS):
    if(COUNT_OF_TWEETS_TO_BE_FETCHED < len(tweets)):
        break # we got 500 tweets... !!

    #----------------------------------------------------------------#
    # STEP 1: Query Twitter
    # STEP 2: Save the returned tweets
    # STEP 3: Get the next max_id
    #----------------------------------------------------------------#

    # STEP 1: Query Twitter
    if(0 == i):
        # Query twitter for data. 
        results = twitter.search(q=search_string, count=str(num_results_per_query), geocode=geo_code, 
                                 result_type=type_of_result)
    else:
        # After the first call we should have max_id from result of previous call. Pass it in query.
        results = twitter.search(q=search_string,count=str(num_results_per_query), geocode=geo_code, 
                                 result_type=type_of_result,
                                 include_entities='true',max_id=next_max_id)

    # STEP 2: Save the returned tweets
    for status in results['statuses']:        
        user = status["user"]["screen_name"].encode("utf-8")
        user = user.decode("utf-8") # to convert the encoded byte type into string
        text = status["text"].encode("utf-8")
        text = text.decode("utf-8") # to convert the encoded byte type into string
        for word in text.split():
            word_list.append(word)
            
            if word.startswith("#"):
                hashtag_list.append(word)
        
        tweets.append(text) # Keep track of number of tweets
        favorite_count_list.append(status["favorite_count"])
        retweet_count_list.append(status["retweet_count"])
        tweet_url_list.append("https://twitter.com/i/web/status/"+status["id_str"])
        
    # STEP 3: Get the next max_id
    try:
        # Parse the data returned to get max_id to be passed in consequent call.
        next_results_url_params = results['search_metadata']['next_results']
        next_max_id = next_results_url_params.split('max_id=')[1].split('&')[0]
    except:
        # No more next pages
        break

print("...Done")

London, Greater London, England, SW1A 2DX, UK 

...Done


### Post Processing

In [6]:
print("Number of tweets fetched:", len(tweets))

print("\n Top Hashtags:")
c = Counter(hashtag_list)
for tags, count in c.most_common(5):
    print(tags,count)
    
# print("\n Most common words:")
# c = Counter(word_list)
# for tags, count in c.most_common(6):
#     print(tags,count)

print("\n")
max_retweet_index = sorted(range(len(retweet_count_list)), key=lambda x: -retweet_count_list[x])[0]
                           
most_retweeted    = tweets[max_retweet_index]
max_retweet_count = retweet_count_list[max_retweet_index]
max_retweet_url   = tweet_url_list[max_retweet_index]   
print("(most) Retweeted:", max_retweet_count, "\n", most_retweeted, "\n", max_retweet_url)

print("\n")
max_favorite_index = sorted(range(len(favorite_count_list)), key=lambda x: -favorite_count_list[x])[0]
most_favorite      = tweets[max_favorite_index]
max_favorite_count = favorite_count_list[max_favorite_index]
max_favorite_url   = tweet_url_list[max_favorite_index] 
                            
print("(most) Favorited:", max_favorite_count, "\n", most_favorite, "\n", max_favorite_url)

Number of tweets fetched: 1011

 Top Hashtags:
#AvengersEndgame 60
#Endgame 51
#ThankYouAvengers 16
#avengers 11
#Avengers 9


(most) Retweeted: 1972 
 RT @The_Shiznit: I sat through 12 minutes of the Avengers: Endgame credits and all I was rewarded with was a lousy sense of appreciation fo… 
 https://twitter.com/i/web/status/1122208241675636736


(most) Favorited: 316 
 The regressive Left’s depiction of old white men as evil, diseased oppressors and themselves as righteous avengers… https://t.co/VQOFzTIGuZ 
 https://twitter.com/i/web/status/1122186976541724674
