# Network Analysis 

### Problem:

1. Use Twitter API to collect 1000 tweets in which keyword ‘narendra modi’ appears, save the collected tweets in nm.txt

2. Convert the collected tweets into BoW vectors, find the cosine similarity of every pair of tweets, and print the top-10 most similar tweet pairs

3. Do the same using TF-IDF vectors

4. Find out unique users (N) who have posted these 1000 tweets. (N <= 1000)
    u1, u2, …, uN
5. Find the followers and followee of each user from the N users obtained before
    * u1 - [followers list] [followee list]
    * u2 - [followers list] [followee list]
    * .
    * .
    * uN - [followers list] [followee list]
6. Followers and followees are also users, so create a follower-followee directed graph among them, G. (ui → uj) iff ui is followed by uj
7. Find popular users in this G based on 
    * Degree centrality
    * Betweenness centrality
    * Closeness centrality


---

**Step 1** Use Twitter API to collect 1000 tweets in which keyword ‘narendra modi’ appears, save the collected tweets in nm.txt

In [8]:
# import libraries

import tweepy
import json
import csv
import pandas as pd
import numpy as np
import time

In [9]:
# Twitter API Credentials
# NOTE(review): the star import hides which names this cell defines; the cells
# below rely on consumer_key, consumer_secret, access_token and access_token_secret.
# NOTE(review): `secrets` is also the name of a Python standard-library module --
# presumably a local secrets.py is what gets imported here; confirm.

from secrets import *

In [10]:
# Establish connection with API

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [11]:
# Location to store data

data_path = './data/'

In [5]:
# function for searching tweets by keyword, ignoring retweets
# Attr: search_words: keyword(s) to be searched
#       max_tweets: maximum no. of tweets to be fetched

def search_tweets_by_keywords(search_words='narendra modi', max_tweets=1000):
    """Return a lazy iterator over tweets matching ``search_words``.

    Parameters
    ----------
    search_words : str
        Keyword(s) to search for.
    max_tweets : int
        Upper bound passed to the cursor's ``items()``.

    Returns
    -------
    The item iterator of a ``tweepy.Cursor`` over ``api.search``; nothing is
    fetched until the caller iterates over it.
    """
    # The separating space is required: without it the query string becomes
    # "narendra modi-filter:retweets", i.e. the operator is fused with the
    # last keyword instead of being its own search term.
    query = search_words + " -filter:retweets"
    return tweepy.Cursor(api.search, q=query).items(max_tweets)

In [5]:
# Dump the raw tweets to a JSON-formatted text file for future reference

def save_tweets_to_text_file(search_tweets, file_name):
    """Write the ``_json`` payload of every tweet to ``<data_path><file_name>.txt``.

    search_tweets: iterable of tweet objects exposing a ``_json`` attribute.
    file_name: output file name without the ``.txt`` extension.
    """
    # Collect the raw payloads first, then serialise them in one go
    raw_tweets = [tweet._json for tweet in search_tweets]
    serialized = json.dumps(raw_tweets, indent=4)

    target = data_path + file_name + '.txt'
    with open(target, 'w') as file:
        file.write(serialized)

In [6]:
# save id, text, creator's id, creator's name and the creator's
# follower/friend counts of the tweets stored in the text file,
# build a dataframe from them and save it in CSV format

def save_required_data_from_txt_to_csv(file_name):
    """Extract selected tweet fields from ``<file_name>.txt`` into ``<file_name>.csv``.

    Reads the JSON list stored at ``data_path + file_name + '.txt'`` and writes
    one CSV row per tweet with the columns ``tweet_id``, ``text``,
    ``creator_id``, ``created_by``, ``creator_followers`` and
    ``creator_friends``. An empty JSON list produces a header-only CSV.

    file_name: base name (without extension) shared by the input and output files.
    """
    columns = ['tweet_id', 'text', 'creator_id', 'created_by',
               'creator_followers', 'creator_friends']

    with open(data_path + file_name + '.txt', encoding='utf-8') as json_file:
        all_data = json.load(json_file)

    rows = []
    for each_dictionary in all_data:
        creator = each_dictionary['user']
        rows.append({'tweet_id': str(each_dictionary['id']),  # can also use id_str attr
                     'text': str(each_dictionary['text']),
                     'creator_id': str(creator['id']),
                     'created_by': str(creator['screen_name']),
                     'creator_followers': creator['followers_count'],
                     'creator_friends': creator['friends_count'],
                     })

    # Build the dataframe once, after the loop: rebuilding it per tweet was
    # wasted work and left the name unbound when there were no tweets at all.
    tweet_dataset = pd.DataFrame(rows, columns=columns)

    # Writing tweet dataset to csv file for future reference
    tweet_dataset.to_csv(data_path + file_name + '.csv', index=False)

In [8]:
# Collect 1000 tweets with keyword 'narendra modi'

tweets = search_tweets_by_keywords()

In [9]:
# save the tweets in nm.txt file and create dataframe from it

save_tweets_to_text_file(tweets, 'nm')

In [7]:
# Create CSV of required data

save_required_data_from_txt_to_csv('nm')

---

**Step 2** Convert the collected tweets into BoW vectors, find the cosine similarity of every pair of tweets, and print the top-10 most similar tweet pairs

In [12]:
# Read from CSV

df = pd.read_csv(data_path + 'nm.csv')

In [13]:
df.head()

Unnamed: 0,tweet_id,text,creator_id,created_by,creator_followers,creator_friends
0,1322297317639233546,@cxkeck If a friend of mine supported Nigel Fa...,1169681443481620486,nysuri,1299,1876
1,1322296152868401152,https://t.co/RvnZZWrxO2,3245455199,beckylynch81,137,115
2,1322295522154082304,I plan on checking this place out when visit I...,1019030227312246784,JosieEJung,57,1218
3,1322295207920979968,Just check the theatricals of Mr Narendra Modi...,1270180160537427969,A_K1992in,37,633
4,1322294563860434945,@parody_yadav @yadavtejashwi https://t.co/FWcq...,1319784298078105600,vman28428391,0,2


In [14]:
df.shape
df.describe()

Unnamed: 0,tweet_id,creator_id,creator_followers,creator_friends
count,1000.0,1000.0,1000.0,1000.0
mean,1.322185e+18,5.9386e+17,257482.6,874.819
std,39687200000000.0,5.744572e+17,1446822.0,2790.274798
min,1.322126e+18,5871672.0,0.0,0.0
25%,1.322153e+18,1014681000.0,47.75,68.0
50%,1.322182e+18,7.724101e+17,299.0,262.0
75%,1.322212e+18,1.185608e+18,2289.5,838.5
max,1.322297e+18,1.322239e+18,13899280.0,46692.0


### Bag of Words vector

In [14]:
# tokenize texts to get meaningful words
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# list of Tweet's text

text = list(df['text'])

In [16]:
vectorizer = CountVectorizer()

In [17]:
# tokenize and build vocab

vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [18]:
# summarize

print(vectorizer.vocabulary_)

{'இலவ': 3828, 'rahul': 2539, 'fatigue': 1084, 'discovered': 877, 'friends': 1155, 'butvbq3mki': 597, 'पर': 3615, 'wkvughqdqb': 3342, 'tzmdkavrxt': 3148, 'message': 1974, 'times': 3067, 'kesu': 1687, 'savarkar': 2755, 'और': 3542, 'stone': 2936, 'mungerkillings': 2044, 'expecting': 1041, 'arnab1go': 372, 'whose': 3324, 'kartikeyatanna': 1676, 'october': 2222, 'cactus': 609, 'y87hgg5o4l': 3397, 'whokilledsushant': 3322, 'जलपथ': 3575, 'would': 3357, 'back': 438, 'train': 3104, 'doland': 906, 'remember': 2604, 'clave': 680, 'ashoswai': 383, 'jaisa': 1580, 'claimed': 678, 'ರವ': 3912, 'folks': 1122, 'satisfaction': 2751, '5000rs': 107, 'reply': 2609, 'ship': 2816, 'stopped': 2938, 'bulletin': 589, 'सरद': 3699, 'fund': 1162, 'play': 2362, 'pride': 2417, 'hcsejyrlso': 1324, 'hsyrbyfplx': 1402, 'jqwywcbnik': 1637, 'biggest': 517, 'dswwiisdnd': 930, 'opposition': 2262, 'yuval': 3443, 'uvyds46tol': 3199, 'plz': 2369, 'lighting': 1827, 'apni': 354, 'రమ': 3878, 'rvnzzwrxo2': 2689, 'k3zqqtt1tw': 1656

In [19]:
# encode document

vector = vectorizer.transform(text)

In [20]:
# summarize encoded vector

print(vector.shape)

(1000, 3938)


In [21]:
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [22]:
print(vector.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [23]:
BoW_array = vector.toarray()

In [24]:
BoW_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### TF-IDF Vector

In [25]:
# tfidf vectorize

from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
tfidf_vectorizer = TfidfVectorizer()

In [27]:
tfidf_vector = tfidf_vectorizer.fit_transform(text)

In [28]:
tfidf_vector.shape

(1000, 3938)

In [29]:
tfidf_array = tfidf_vector.toarray()

In [30]:
tfidf_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Cosine Similarity

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Cosine Similarity function

def Cosine_Similarity(arr):
    """Return the pairwise cosine similarity between all rows of ``arr``.

    Parameters
    ----------
    arr : sequence of numeric vectors (e.g. a 2-D numpy array)
        One vector per tweet.

    Returns
    -------
    list of lists
        ``result[i][j]`` is the cosine similarity between row ``i`` and row
        ``j``. An empty input gives an empty list.
    """
    # Nothing to compare: keep returning an empty list for empty input.
    if len(arr) == 0:
        return []

    # Each entry is flattened to a single row vector, as the per-pair
    # reshape did before.
    rows = np.asarray(arr).reshape(len(arr), -1)

    # One vectorised call replaces n*n separate calls on single-row pairs.
    # NOTE(review): values may differ from the per-pair results in the last
    # floating-point digits -- confirm this is acceptable for tie ordering.
    return cosine_similarity(rows).tolist()

In [33]:
# Call Cosine Similarity function for BoW vector

BoW_similarity = Cosine_Similarity(BoW_array)

In [34]:
len(BoW_similarity)

1000

In [35]:
len(BoW_similarity[0])

1000

In [36]:
# Collect the similarity of every distinct tweet pair into a dataframe.
# A tweet is never paired with itself, and each unordered pair is listed
# only once, i.e. (a, b) is included but (b, a) is not.

def similar_pairs(similarity_array):
    """Flatten the upper triangle of a square similarity matrix into rows.

    similarity_array: square matrix (nested lists or 2-D array) indexed as
        ``similarity_array[i][j]``.

    Returns a DataFrame with the columns 'Tweet1 index', 'Tweet2 index' and
    'cosine similarity', one row per pair ``i < j`` in row-major order.
    """
    size = len(similarity_array)

    # Starting the second index at first + 1 visits exactly the cells
    # strictly above the diagonal.
    pair_rows = [
        [first, second, similarity_array[first][second]]
        for first in range(size)
        for second in range(first + 1, size)
    ]

    return pd.DataFrame(
        pair_rows,
        columns=['Tweet1 index', 'Tweet2 index', 'cosine similarity'],
    )

In [37]:
# Call similarity pair for BoW_similarity

BoW_similarity_df = similar_pairs(BoW_similarity)

In [38]:
# Top 10 similar tweets

top_10_BoW_df = BoW_similarity_df.nlargest(10, 'cosine similarity')

In [39]:
top_10_BoW_df.reset_index(drop=True)

Unnamed: 0,Tweet1 index,Tweet2 index,cosine similarity
0,65,67,1.0
1,206,212,1.0
2,493,494,1.0
3,293,849,0.971429
4,20,22,0.961538
5,20,23,0.961538
6,22,23,0.961538
7,96,137,0.96
8,902,914,0.958333
9,201,203,0.958333


In [40]:
df.iloc[398, 1]

'Nitish Kumar will talk about Tejaswi’s family. Narendra Modi will talk about my family. But neither of them will ta… https://t.co/DqkLkok9Gn'

## Similar tweets pair using BoW

In [41]:
# Show the text of both tweets for every row of a top-pairs dataframe

def print_tweet_pairs(top_10_df):
    """Print the two tweet texts behind each row of ``top_10_df``.

    top_10_df: DataFrame whose first two columns hold positional row indices
        into the notebook-level ``df``; the text is read from column
        position 1 of ``df``.
    """
    first_positions = top_10_df.iloc[:, 0]   # 'Tweet1 index' column
    second_positions = top_10_df.iloc[:, 1]  # 'Tweet2 index' column

    for pair_no, (t1, t2) in enumerate(zip(first_positions, second_positions), start=1):
        print("\nTweet pair ", pair_no)
        print("Tweet1")
        print(df.iloc[t1, 1])  # text at t1 index in df
        print("Tweet2")
        print(df.iloc[t2, 1])  # text at t2 index in df
        print("\n************************************************************")

In [42]:
print_tweet_pairs(top_10_BoW_df)


Tweet pair  1
Tweet1
@Ranbir_Crpf Ssc Gd me sabhi 85k Conidet ka Final Merit Banana Chaiye Ye Manyavar Narendra Modi G se Vinti h.
Tweet2
@Ranbir_Crpf Ssc Gd me sabhi 85k Conidet ka Final Merit Banana Chaiye, Ye Manyavar Narendra Modi G se Vinti h.

************************************************************

Tweet pair  2
Tweet1
Narendra modi
Tweet2
Narendra Modi

************************************************************

Tweet pair  3
Tweet1
PM Shri Narendra Modi launches Kevadia App.
Tweet2
PM Shri Narendra Modi launches Kevadia App.

************************************************************

Tweet pair  4
Tweet1
Keshubhai Patel, the BJP stalwart who not only helped lay the foundation of the party but also mentored the likes o… https://t.co/B9CdsagLOM
Tweet2
Keshubhai Patel, the BJP stalwart who not only helped lay the foundation of the party but also mentored the likes P… https://t.co/uw35bHPeks

************************************************************

Tweet pair  5
Tw

## Similar tweets pair using TF-IDF


In [43]:
# Same pipeline as for the BoW vectors, applied to the TF-IDF vectors
tfidf_similarity = cosine_similarity(tfidf_array)
tfidf_similarity_df = similar_pairs(tfidf_similarity)
top_10_tfidf_df = tfidf_similarity_df.nlargest(10, 'cosine similarity')
# reset_index returns a new frame; its result was previously discarded, which
# made the statement a no-op. Keep it so the top-10 frame is numbered 0..9.
top_10_tfidf_df = top_10_tfidf_df.reset_index(drop=True)
print_tweet_pairs(top_10_tfidf_df)


Tweet pair  1
Tweet1
Narendra modi
Tweet2
Narendra Modi

************************************************************

Tweet pair  2
Tweet1
@Ranbir_Crpf Ssc Gd me sabhi 85k Conidet ka Final Merit Banana Chaiye Ye Manyavar Narendra Modi G se Vinti h.
Tweet2
@Ranbir_Crpf Ssc Gd me sabhi 85k Conidet ka Final Merit Banana Chaiye, Ye Manyavar Narendra Modi G se Vinti h.

************************************************************

Tweet pair  3
Tweet1
PM Shri Narendra Modi launches Kevadia App.
Tweet2
PM Shri Narendra Modi launches Kevadia App.

************************************************************

Tweet pair  4
Tweet1
Please use the promocode "Narendra Modi" to receive 56% extra Jumla while receiving ₹15 lakh.

PS: This is yet anot… https://t.co/FUxr4OMT54
Tweet2
Please use the promocode "Narendra Modi" to receive 56% extra Jumla while receiving ₹15 lakh.

PS: This is yet anot… https://t.co/3IKUkV8prw

************************************************************

Tweet pair  5
Tw

### Find out unique users (N) who have posted these 1000 tweets. (N <= 1000) u1, u2, …, uN

In [21]:
# Keep only tweets whose author has at most 1000 followers and at most 1000 friends

df_user = df[(df['creator_followers'] <= 1000) & (df['creator_friends'] <= 1000)]
df_user.describe()

Unnamed: 0,tweet_id,creator_id,creator_followers,creator_friends
count,546.0,546.0,546.0,546.0
mean,1.322188e+18,8.128704e+17,139.498168,229.862637
std,39162220000000.0,5.472017e+17,197.454798,247.084715
min,1.322126e+18,31408970.0,0.0,0.0
25%,1.322159e+18,3330192000.0,7.0,42.25
50%,1.322186e+18,1.06947e+18,50.0,119.0
75%,1.322214e+18,1.27972e+18,190.0,336.25
max,1.322296e+18,1.322239e+18,996.0,999.0


In [22]:
# Distinct screen names among the authors kept in df_user
unique_creators = df_user.created_by.unique()
unique_creators = list(unique_creators)
# NOTE(review): not the last expression of the cell, so this count is never displayed
len(unique_creators)
# Only this value is displayed; the slice does not shorten unique_creators itself
len(unique_creators[:20])


20

In [10]:

# Trial run on two hard-coded accounts: screen names of the followers of one
# user and of the friends (followees) of another.
user1 = tweepy.Cursor(api.followers, 'beckylynch81', count=200).items()
followers = [account.screen_name for account in user1]

user2 = tweepy.Cursor(api.friends, 'A_K1992in', count=200).items()
friends = [account.screen_name for account in user2]

In [27]:
def limit_handled(cursor, list_name):
    """Yield items from ``cursor``, waiting out Twitter API rate limits.

    Parameters
    ----------
    cursor : iterator
        Item iterator, e.g. ``tweepy.Cursor(...).items()``.
    list_name : list
        Only used for progress reporting: its current length is printed
        whenever the rate limit is hit.

    Yields
    ------
    The items produced by ``cursor``, in order. Iteration ends when the
    cursor is exhausted or when a non-rate-limit TweepError occurs.
    """
    while True:
        try:
            yield next(cursor)
        # An exhausted cursor must end this generator explicitly: a
        # StopIteration escaping a generator body is turned into a
        # RuntimeError (PEP 479) instead of ending the caller's loop.
        except StopIteration:
            return
        # Catch Twitter API rate limit exception and wait for 15 minutes
        except tweepy.RateLimitError:
            print("Data points in list = {}".format(len(list_name)))
            print("Hit Twitter API rate limit.")
            for i in range(3, 0, -1):
                print("Wait for {} mins.".format(i*5))
                time.sleep(5*60)
        # Catch any other Twitter API exceptions
        except tweepy.error.TweepError:
            print('\nCaught TweepError exception')
            # Stop here rather than retrying forever inside `while True`;
            # the caller keeps whatever was yielded so far.
            return

In [None]:
# Create a list of followers and friends of these unique creators.
# user_list[i] is [followers, friends] (lists of screen names) for unique_creators[i].

user_list = []

for user in unique_creators:
    # count is the page size requested per API call; 200 is the documented
    # maximum for followers/list and friends/list, so 1000 was out of range.
    follower_cursor = tweepy.Cursor(api.followers, screen_name=user, count=200).items()
    followers = [follower.screen_name
                 for follower in limit_handled(follower_cursor, user_list)]

    friend_cursor = tweepy.Cursor(api.friends, screen_name=user, count=200).items()
    friends = [friend.screen_name
               for friend in limit_handled(friend_cursor, user_list)]

    ff_list = [followers, friends]
    user_list.append(ff_list)

### Create Network Graph using NetworkX

Followers and followees are also users, so create a follower-followee directed graph among them, G. (ui → uj) iff ui is followed by uj

In [None]:
#!pip install networkx

import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Instantiate Graph object (Directed graph)

g = nx.DiGraph()

In [None]:
# Add one directed edge per follow relation; an edge (ui -> uj) means
# "ui is followed by uj".
for idx, ff_lists in enumerate(user_list):
    # ff_lists[0]: followers of this creator -> creator points to each follower
    for follower_name in ff_lists[0]:
        g.add_edge(unique_creators[idx], follower_name)

    # ff_lists[1]: accounts this creator follows -> each of them points to the creator
    for friend_name in ff_lists[1]:
        g.add_edge(friend_name, unique_creators[idx])

In [None]:
# Draw Graph

nx.draw(g)