# Network Analysis 

### Problem:

1. Use Twitter API to collect 1000 tweets in which keyword ‘narendra modi’ appears, save the collected tweets in nm.txt

2. Convert the collected tweets into BoW vectors, compute the cosine similarity of every pair of tweets, and print the top-10 most similar tweet pairs

3. Do the same using TF-IDF vectors

4. Find the unique users (N) who posted these 1000 tweets (N <= 1000):
    u1, u2, …, uN
5. Find the followers and followees of each of the N users obtained before
    * u1 - [followers list] [followee list]
    * u2 - [followers list] [followee list]
    * .
    * .
    * uN - [followers list] [followee list]
6. Followers and followees are also users, so create a directed follower-followee graph G among them, with an edge (ui → uj) iff ui is followed by uj
7. Find popular users in this G based on 
    * Degree centrality
    * Betweenness centrality
    * Closeness centrality


---

**Step 1** Use Twitter API to collect 1000 tweets in which keyword ‘narendra modi’ appears, save the collected tweets in nm.txt

In [47]:
# import libraries

import tweepy
import json
import csv
import pandas as pd
import numpy as np

In [12]:
# Twitter API credentials
# NOTE(review): the wildcard import hides where names come from. The connection
# cell below reads consumer_key, consumer_secret, access_token and
# access_token_secret from it. Presumably this is a local secrets.py; the name
# also matches the standard-library `secrets` module, so confirm which one is
# actually picked up on sys.path.

from secrets import *

In [13]:
# Establish the connection with the Twitter API

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit=True: per the tweepy docs, the client waits for the
# rate-limit window to reset instead of raising an error.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [14]:
# Location to store data.
# File paths in this notebook are built by plain string concatenation
# (data_path + file_name + extension), so the trailing slash is required.

data_path = './data/'

In [42]:
# Search tweets by keyword, ignoring retweets

def search_tweets_by_keywords(search_words='narendra modi', max_tweets=1000):
    """Build a Twitter search for tweets matching ``search_words``.

    Retweets are excluded by appending the ``-filter:retweets`` operator to
    the query.

    Args:
        search_words: Keyword(s) to search for.
        max_tweets: Maximum number of tweets the returned iterator yields.

    Returns:
        The item iterator produced by ``tweepy.Cursor(api.search, ...)``,
        limited to ``max_tweets`` items.
    """
    # The operator must be separated from the keywords by a space; without it
    # the query reads 'narendra modi-filter:retweets' and the filter is glued
    # onto the last keyword instead of being a separate operator.
    query = search_words + " -filter:retweets"
    return tweepy.Cursor(api.search, q=query).items(max_tweets)

In [43]:
# Save the raw tweets as a JSON text file for future reference

def save_tweets_to_text_file(search_tweets, file_name):
    """Write the raw JSON payload of every tweet to ``<data_path><file_name>.txt``.

    The file is (over)written as a single JSON array, indented by 4 spaces.

    Args:
        search_tweets: Iterable of tweet objects exposing their raw payload
            as ``._json`` (e.g. the iterator returned by
            ``search_tweets_by_keywords``).
        file_name: Base name of the output file, without extension.
    """
    tweet_payloads = [tweet._json for tweet in search_tweets]

    # Write with an explicit encoding so the file matches the reader in
    # save_required_data_from_txt_to_csv, which opens it as UTF-8.
    with open(data_path + file_name + '.txt', 'w', encoding='utf-8') as file:
        file.write(json.dumps(tweet_payloads, indent=4))

In [44]:
# Save id, text, creator's id and creator's name of the tweets
# from the text file into a dataframe and store it in CSV format

def save_required_data_from_txt_to_csv(file_name):
    """Extract the key tweet fields from ``<file_name>.txt`` into ``<file_name>.csv``.

    Reads the JSON array stored at ``data_path + file_name + '.txt'`` and
    writes a CSV next to it with the columns ``tweet_id``, ``text``,
    ``creator_id`` and ``created_by`` (all stored as strings, no index column).

    Args:
        file_name: Base name (without extension) shared by the input
            ``.txt`` file and the output ``.csv`` file.

    Raises:
        KeyError: If a tweet lacks ``id``, ``text`` or the ``user`` fields
            ``id`` / ``screen_name``.
    """
    columns = ['tweet_id', 'text', 'creator_id', 'created_by']

    with open(data_path + file_name + '.txt', encoding='utf-8') as json_file:
        all_data = json.load(json_file)

    records = [
        {
            'tweet_id': str(tweet['id']),  # can also use the id_str attr
            'text': str(tweet['text']),
            'creator_id': str(tweet['user']['id']),
            'created_by': str(tweet['user']['screen_name']),
        }
        for tweet in all_data
    ]

    # Build the frame once, after the loop. Rebuilding it per tweet was
    # quadratic and left the variable unbound when the input held no tweets;
    # an empty input now yields a header-only CSV.
    tweet_dataset = pd.DataFrame(records, columns=columns)

    # Writing tweet dataset to csv file for future reference
    tweet_dataset.to_csv(data_path + file_name + '.csv', index=False)

In [45]:
# Collect up to 1000 tweets with keyword 'narendra modi'
# (both values are the defaults of search_tweets_by_keywords)

tweets = search_tweets_by_keywords()

In [46]:
# Save the raw JSON of the collected tweets in nm.txt (under data_path)

save_tweets_to_text_file(tweets, 'nm')

In [48]:
# Create nm.csv from nm.txt, keeping only tweet id, text,
# creator id and creator screen name

save_required_data_from_txt_to_csv('nm')

---

**Step 2** Convert the collected tweets into BoW vectors, compute the cosine similarity of every pair of tweets, and print the top-10 most similar tweet pairs

In [17]:
# Read the tweet dataset back from the CSV written in Step 1

df = pd.read_csv(data_path + 'nm.csv')

In [18]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0,tweet_id,text,creator_id,created_by
0,1321587869807341569,@jadafromnola @FriedrichPieter What an amazing...,1279132550380654592,BHARATIYASEEKER
1,1321585634440458240,@JoshCastelino Oh rice bag converts you ppl ha...,396353624,RecepErdoggann
2,1321580847825264640,PM Narendra Modi dubs Tejashwi ‘jungle raj ke ...,134758540,timesofindia
3,1321577699861647360,Hindu hi to tha mar gaya \nMaar diya police ne...,1184757165828927488,randm_indianguy
4,1321570785706733568,गरीब के नाम पर राजनीति करने वालों ने देश में ऐ...,1274235804525539328,BhagwanSankla


In [19]:
# (rows, columns) of the loaded dataset
df.shape

(1000, 4)

### Bag of Words vector

In [22]:
# tokenize texts to get meaningful words
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Plain Python list of the tweet texts, in the same row order as df

text = list(df['text'])

In [24]:
# Bag-of-Words vectorizer with scikit-learn's default settings
vectorizer = CountVectorizer()

In [25]:
# Tokenize the tweets and build the vocabulary (no encoding happens yet)

vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [26]:
# Summarize: print the learned vocabulary_ mapping
# NOTE(review): this prints the entire vocabulary at once.

print(vectorizer.vocabulary_)



In [27]:
# Encode every tweet as a Bag-of-Words vector using the fitted vocabulary

vector = vectorizer.transform(text)

In [29]:
# Summarize the encoded matrix: (number of tweets, vocabulary size)

print(vector.shape)

(1000, 4307)


In [31]:
# Show the container type returned by transform()
print(type(vector))

<class 'scipy.sparse.csr.csr_matrix'>


In [32]:
# Convert the matrix to a dense array just to display it
print(vector.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [33]:
# NOTE(review): repeats the vocabulary print from the earlier "summarize" cell
print(vectorizer.vocabulary_)



In [52]:
# Dense Bag-of-Words matrix; this is the input passed to Cosine_Similarity below
BoW_array = vector.toarray()

In [53]:
# Display the dense Bag-of-Words matrix
BoW_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### TF-IDF Vector

In [36]:
# tfidf vectorize

from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# TF-IDF vectorizer with scikit-learn's default settings
tfidf_vectorizer = TfidfVectorizer()

In [48]:
# Fit the vectorizer on the tweets and encode them in a single step
tfidf_vector = tfidf_vectorizer.fit_transform(text)

In [49]:
# (number of tweets, vocabulary size) of the TF-IDF matrix
tfidf_vector.shape

(1000, 4307)

In [50]:
# Dense TF-IDF matrix. The matrix produced by fit_transform above is named
# `tfidf_vector`; the previous `tfidfVector` spelling is never defined in this
# notebook and raises NameError on a fresh kernel.
tfidf_array = tfidf_vector.toarray()

In [51]:
# Display the dense TF-IDF matrix
tfidf_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Cosine Similarity

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

In [59]:
# Cosine Similarity function

def Cosine_Similarity(arr):
    """Compute the cosine similarity between every pair of rows of ``arr``.

    Args:
        arr: Array-like with one vector per row (e.g. the dense BoW or
            TF-IDF matrix). Each row is flattened before comparison.

    Returns:
        A list of ``len(arr)`` lists, where ``result[i][j]`` is the cosine
        similarity between row ``i`` and row ``j``. An empty input yields
        an empty list.
    """
    # The nested loops this replaces produced an empty list for empty input,
    # whereas scikit-learn rejects arrays with zero samples.
    if len(arr) == 0:
        return []

    rows = np.asarray(arr)
    rows = rows.reshape(len(rows), -1)  # one flat vector per row

    # A single pairwise call computes the whole matrix at once, instead of
    # invoking cosine_similarity separately for each of the n * n row pairs.
    similarity_matrix = cosine_similarity(rows)

    # Return nested lists rather than an ndarray, as callers received before.
    return [list(row) for row in similarity_matrix]

In [60]:
# Call the Cosine Similarity function on the dense BoW matrix;
# BoW_similarity[i][j] is the similarity between tweet i and tweet j

BoW_similarity = Cosine_Similarity(BoW_array)

In [61]:
# Number of rows in the similarity matrix (one per tweet)
len(BoW_similarity)

1000

In [64]:
# Number of columns in the first row (one per tweet)
len(BoW_similarity[0])

1000