## Prepare Data 

- Read Tweet Retweet Network
- Filter for all news tweets, real news tweets and fake news tweets 

In [1]:
#import libraries
import pandas as pd
import igraph as ig
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read tweet retweet network

df = pd.read_csv('../tweet_retweet_network.csv')
df

Unnamed: 0,tweet_id,user_id_retweet_df,user_id_tweet_df,news_id,is_fake_news
0,1033706162695356417,7.873112e+08,4.335085e+07,politifact99,0.0
1,1033706162695356417,7.873112e+08,4.335085e+07,politifact340,0.0
2,1035580865160638464,3.338247e+09,1.629771e+07,politifact99,0.0
3,1035580865160638464,3.338247e+09,1.629771e+07,politifact340,0.0
4,934206237708865537,7.543102e+17,5.820642e+06,politifact99,0.0
...,...,...,...,...,...
573632,813192381583466496,,3.001448e+09,politifact13600,1.0
573633,812814063763918848,,5.256945e+08,politifact13600,1.0
573634,918510122363744257,,1.525687e+09,politifact14621,1.0
573635,917522953390223366,,7.889922e+17,politifact14621,1.0


In [3]:
# filter for fake and real news

df_fake = df[df['is_fake_news'] == 1.0]
df_real = df[df['is_fake_news'] == 0.0]

In [4]:
# clean edge lists

df_net_all = df[['user_id_retweet_df', 'user_id_tweet_df']].dropna()
df_net_fake = df_fake[['user_id_retweet_df', 'user_id_tweet_df']].dropna()
df_net_real = df_real[['user_id_retweet_df', 'user_id_tweet_df']].dropna()

In [5]:
# all news edge list

df_net_all

Unnamed: 0,user_id_retweet_df,user_id_tweet_df
0,7.873112e+08,4.335085e+07
1,7.873112e+08,4.335085e+07
2,3.338247e+09,1.629771e+07
3,3.338247e+09,1.629771e+07
4,7.543102e+17,5.820642e+06
...,...,...
89022,1.496571e+09,7.960258e+17
89023,9.816488e+08,7.848257e+17
89024,7.200451e+08,2.435832e+07
89025,8.990504e+08,1.342402e+08


In [6]:
# fake news edge list

df_net_fake

Unnamed: 0,user_id_retweet_df,user_id_tweet_df
4125,8.856920e+17,3.974499e+07
4161,1.917218e+09,8.277065e+07
4235,2.559283e+08,2.147477e+07
4329,8.222716e+17,8.247972e+17
4379,1.759702e+07,1.109180e+08
...,...,...
89022,1.496571e+09,7.960258e+17
89023,9.816488e+08,7.848257e+17
89024,7.200451e+08,2.435832e+07
89025,8.990504e+08,1.342402e+08


In [7]:
# real news edge list

df_net_real

Unnamed: 0,user_id_retweet_df,user_id_tweet_df
0,7.873112e+08,43350851.0
1,7.873112e+08,43350851.0
2,3.338247e+09,16297707.0
3,3.338247e+09,16297707.0
4,7.543102e+17,5820642.0
...,...,...
68198,2.163463e+07,860585294.0
68199,3.227421e+07,48608766.0
68200,8.013800e+05,18251414.0
68201,2.404211e+08,18251414.0


### Graph Creation

- Create graph for all news, fake news and real news.
- Create dictionary which stores network metrics for the graphs created. Chosen metrics are degree, hub score, authority score and betweenness.

In [8]:
# create all news graph

g_all = ig.Graph.TupleList(df_net_all.itertuples(index=False), directed=True)

In [9]:
# create real news graph

g_real = ig.Graph.TupleList(df_net_real.itertuples(index=False), directed=True)

In [10]:
# sort all news nodes by degree

df_all_degree = pd.DataFrame({'name': list(g_all.vs['name']), 'degree':g_all.degree(), 'hub_score':g_all.hub_score(), 'authority_score':g_all.authority_score(), 'betweenness':g_all.betweenness()} )


In [11]:
# create fake news graph

g_fake = ig.Graph.TupleList(df_net_fake.itertuples(index=False), directed=True)

In [12]:
df_all_degree = df_all_degree.sort_values('betweenness', ascending = False).reset_index(drop = True)
df_all_degree.head(10)

Unnamed: 0,name,degree,hub_score,authority_score,betweenness
0,3271255000.0,136,1.742001e-08,1.34032e-10,976.0
1,16190480.0,17,3.634147e-15,5.765948e-06,705.0
2,36700660.0,11,2.513345e-15,0.0,608.0
3,346257900.0,40,8.556069e-15,0.0,311.333333
4,4184335000.0,31,6.65278e-15,0.0,251.5
5,7.327676e+17,15,3.206343e-15,0.0,233.333333
6,58869220.0,28,8.333437e-15,0.0,224.0
7,450941700.0,63,1.743691e-08,4.739218e-08,219.0
8,543780300.0,22,4.694925e-15,9.069482e-14,210.0
9,617619900.0,19,5.524339e-15,0.0,210.0


In [13]:
df_all_degree.mean()

name               2.146357e+17
degree             1.462080e+00
hub_score          3.001362e-05
authority_score    8.451216e-06
betweenness        1.982344e-01
dtype: float64

In [14]:
# sort fake news nodes by degree

df_fake_degree = pd.DataFrame({'name': list(g_fake.vs['name']), 'degree':g_fake.degree() } )
df_fake_degree = df_fake_degree.sort_values('degree', ascending = False).reset_index(drop = True)
df_fake_degree.head(10)

Unnamed: 0,name,degree
0,1911303000.0,153
1,21032570.0,119
2,224653800.0,83
3,2417844000.0,56
4,14294850.0,50
5,2767681000.0,44
6,1355218000.0,42
7,8953122.0,38
8,2309297000.0,37
9,52572720.0,33


## Leading Eigenvector Community Detection

For fake news network:

- Detect communities with Leading Eigenvector Algorithm
- Calculate modularity
- Find communities with the largest number of nodes

In [15]:
#leading eigenvector algorithm

comm_leading_eigenvector= g_fake.community_leading_eigenvector()

  membership, _, q = GraphBase.community_leading_eigenvector(


In [16]:
# calculate modularity with LEA algorithm

modularity = g_fake.modularity(comm_leading_eigenvector)
print(modularity)

0.9946281435098392


In [17]:
# community dataframe

df_leading_eigenvector = pd.DataFrame({'name': list(g_fake.vs['name']), 'community':comm_leading_eigenvector.membership } )
df_leading_eigenvector

Unnamed: 0,name,community
0,8.856920e+17,0
1,3.974499e+07,0
2,1.917218e+09,1
3,8.277065e+07,1
4,2.559283e+08,2
...,...,...
31504,3.995662e+09,1339
31505,6.957517e+17,11622
31506,9.816488e+08,8781
31507,7.200451e+08,10428


In [18]:
# Find largest communities

df_leading_eigenvector['community'].value_counts()

11642    386
11622    287
214      203
11627    198
11641    163
        ... 
9605       2
6340       2
4293       2
2246       2
2039       2
Name: community, Length: 11647, dtype: int64

## Get the network metrics for each community

For each community:

- Prepare a DataFrame to store the calculated metrics for each node.
- For each node, 4 network metrics will be calculated: Degree, Hub Score, Authority Score and Betweenness.

### Community 1

In [19]:
# get node list for community 1 (11642)

df_le_com1 = df_leading_eigenvector[df_leading_eigenvector['community']==11642]
df_le_com1 

Unnamed: 0,name,community
638,1.440617e+09,11642
639,3.317085e+07,11642
678,3.453434e+08,11642
679,3.760202e+08,11642
990,3.817328e+09,11642
...,...,...
31283,3.381218e+09,11642
31287,3.805432e+08,11642
31296,3.845993e+08,11642
31337,2.159220e+09,11642


In [20]:
# convert node list to list of nodes

le_com1_list = df_le_com1['name'].tolist()

In [21]:
# create edge list

df_fake_com1 = df_net_fake[df_net_fake['user_id_retweet_df'].isin(le_com1_list)]
df_fake_com1 = df_fake_com1[df_fake_com1['user_id_tweet_df'].isin(le_com1_list)]
df_fake_com1

Unnamed: 0,user_id_retweet_df,user_id_tweet_df
68209,1.440617e+09,3.317085e+07
68230,3.453434e+08,3.760202e+08
68402,3.817328e+09,1.341686e+09
68496,5.028776e+07,7.631806e+17
68551,2.287367e+09,5.329788e+07
...,...,...
88825,3.845993e+08,1.584306e+07
88869,2.159220e+09,9.389488e+08
88925,1.921811e+07,5.831230e+08
88926,4.898620e+07,9.617064e+08


In [22]:
# get node and id for graphframe format

com1_node_list = list(set(df_fake_com1['user_id_retweet_df'].tolist() + df_fake_com1['user_id_tweet_df'].tolist()))

com1_nodes = pd.DataFrame({'id':com1_node_list, 'node':com1_node_list})
com1_nodes

Unnamed: 0,id,node
0,9.388038e+17,9.388038e+17
1,9.337378e+17,9.337378e+17
2,7.197561e+17,7.197561e+17
3,7.872964e+17,7.872964e+17
4,8.232031e+17,8.232031e+17
...,...,...
381,1.711509e+07,1.711509e+07
382,2.755170e+07,2.755170e+07
383,1.322291e+08,1.322291e+08
384,5.553970e+07,5.553970e+07


In [23]:
# get edge list as src and dst

df_fake_com1 = df_fake_com1.rename(columns = {'user_id_retweet_df':'dst', 'user_id_tweet_df':'src'})
df_fake_com1 = df_fake_com1[['src','dst']]
df_fake_com1

Unnamed: 0,src,dst
68209,3.317085e+07,1.440617e+09
68230,3.760202e+08,3.453434e+08
68402,1.341686e+09,3.817328e+09
68496,7.631806e+17,5.028776e+07
68551,5.329788e+07,2.287367e+09
...,...,...
88825,1.584306e+07,3.845993e+08
88869,9.389488e+08,2.159220e+09
88925,5.831230e+08,1.921811e+07
88926,9.617064e+08,4.898620e+07


In [24]:
# view community degrees

df_com1_degree = pd.DataFrame({'name': list(g_com1.vs['name']), 'degree':g_com1.degree(), 'hub_score':g_com1.hub_score(), 'authority_score':g_com1.authority_score(), 'betweenness':g_com1.betweenness()} )

NameError: name 'g_com1' is not defined

In [None]:
df_com1_degree = df_com1_degree.sort_values('betweenness', ascending = False).reset_index(drop = True)
df_com1_degree.head(10)

In [None]:
df_com1_degree.mean()

In [None]:
#com1_nodes.to_csv('le_com1_nodes.csv', index = False)
#df_fake_com1.to_csv('le_com1_edges.csv', index = False)

### Community 2

In [None]:
# get node list for communtiiy 2 (11622)

df_le_com2 = df_leading_eigenvector[df_leading_eigenvector['community']==11622]
df_le_com2 

In [None]:
# get list of nodes

le_com2_list = df_le_com2['name'].tolist()

In [None]:
# get edge list

df_fake_com2 = df_net_fake[df_net_fake['user_id_retweet_df'].isin(le_com2_list)]
df_fake_com2 = df_fake_com2[df_fake_com2['user_id_tweet_df'].isin(le_com2_list)]
df_fake_com2

In [None]:
# get name and id for graphframes

com2_node_list = list(set(df_fake_com2['user_id_retweet_df'].tolist() + df_fake_com2['user_id_tweet_df'].tolist()))

com2_nodes_df = pd.DataFrame({'id':com2_node_list, 'node':com2_node_list})
com2_nodes_df

In [None]:
# get src and dst for graphframes

df_fake_com2 = df_fake_com2.rename(columns = {'user_id_retweet_df':'dst', 'user_id_tweet_df':'src'})
df_fake_com2 = df_fake_com2[['src','dst']]
df_fake_com2

In [None]:
# view degrees for com2

df_com2_degree = pd.DataFrame({'name': list(g_com2.vs['name']), 'degree':g_com2.degree(), 'hub_score':g_com2.hub_score(), 'authority_score':g_com2.authority_score(), 'betweenness':g_com2.betweenness()} )

In [None]:
df_com2_degree = df_com2_degree.sort_values('betweenness', ascending = False).reset_index(drop = True)
df_com2_degree.head(10)

In [None]:
df_com2_degree.mean()

## Community 3

In [None]:
# get node list for community 3 (214)

df_le_com3 = df_leading_eigenvector[df_leading_eigenvector['community']==214]
df_le_com3 

In [None]:
# get list of nodes for community 3

le_com3_list = df_le_com3['name'].tolist()

In [None]:
# get edge list for community 3

df_fake_com3 = df_net_fake[df_net_fake['user_id_retweet_df'].isin(le_com3_list)]
df_fake_com3 = df_fake_com3[df_fake_com3['user_id_tweet_df'].isin(le_com3_list)]
df_fake_com3

In [None]:
# get name and id for graphframe format

com3_node_list = list(set(df_fake_com3['user_id_retweet_df'].tolist() + df_fake_com3['user_id_tweet_df'].tolist()))

com3_nodes_df = pd.DataFrame({'id':com3_node_list, 'node':com3_node_list})
com3_nodes_df

In [None]:
# get src and dst for graphframes format

df_fake_com3 = df_fake_com3.rename(columns = {'user_id_retweet_df':'dst', 'user_id_tweet_df':'src'})
df_fake_com3 = df_fake_com3[['src','dst']]
df_fake_com3

In [None]:
# get degrees for nodes in community 3

df_com3_degree = pd.DataFrame({'name': list(g_com3.vs['name']), 'degree':g_com3.degree(), 'hub_score':g_com3.hub_score(), 'authority_score':g_com3.authority_score(), 'betweenness':g_com3.betweenness()} )
df_com3_degree = df_com3_degree.sort_values('degree', ascending = False).reset_index(drop = True)
df_com3_degree.head(10)

In [None]:
df_com3_degree.mean()

### Community 4

In [None]:
# get node list for community 4 (11627)

df_le_com4 = df_leading_eigenvector[df_leading_eigenvector['community']==11627]
df_le_com4 

In [None]:
# get list of nodes for community 4

le_com4_list = df_le_com4['name'].tolist()

In [None]:
# get edge list for community 4

df_fake_com4 = df_net_fake[df_net_fake['user_id_retweet_df'].isin(le_com4_list)]
df_fake_com4 = df_fake_com4[df_fake_com4['user_id_tweet_df'].isin(le_com4_list)]
df_fake_com4

In [None]:
# get node and id for graphframe format

com4_node_list = list(set(df_fake_com4['user_id_retweet_df'].tolist() + df_fake_com4['user_id_tweet_df'].tolist()))

com4_nodes_df = pd.DataFrame({'id':com4_node_list, 'node':com4_node_list})
com4_nodes_df

In [None]:
# get src and dst for graphframe format

df_fake_com4 = df_fake_com4.rename(columns = {'user_id_retweet_df':'dst', 'user_id_tweet_df':'src'})
df_fake_com4 = df_fake_com4[['src','dst']]
df_fake_com4

In [None]:
# get degrees for all nodes in community 4

df_com4_degree = pd.DataFrame({'name': list(g_com4.vs['name']), 'degree':g_com4.degree(), 'hub_score':g_com4.hub_score(), 'authority_score':g_com4.authority_score(), 'betweenness':g_com4.betweenness()} )

In [None]:
df_com4_degree = df_com4_degree.sort_values('betweenness', ascending = False).reset_index(drop = True)
df_com4_degree.head(10)

In [None]:
df_com4_degree.mean()

### Community 5

In [None]:
# get node list for community 5 (11641)

df_le_com5 = df_leading_eigenvector[df_leading_eigenvector['community']==11641]
df_le_com5 

In [None]:
# get list of nodes

le_com5_list = df_le_com5['name'].tolist()

In [None]:
# get edge list for community 5

df_fake_com5 = df_net_fake[df_net_fake['user_id_retweet_df'].isin(le_com5_list)]
df_fake_com5 = df_fake_com5[df_fake_com5['user_id_tweet_df'].isin(le_com5_list)]
df_fake_com5

In [None]:
# get id and node for graphframe format

com5_node_list = list(set(df_fake_com5['user_id_retweet_df'].tolist() + df_fake_com5['user_id_tweet_df'].tolist()))

com5_nodes_df = pd.DataFrame({'id':com5_node_list, 'node':com5_node_list})
com5_nodes_df

In [None]:
# get src and dst for graphframe format

df_fake_com5 = df_fake_com5.rename(columns = {'user_id_retweet_df':'dst', 'user_id_tweet_df':'src'})
df_fake_com5 = df_fake_com5[['src','dst']]
df_fake_com5

In [None]:
# get degrees for all nodes in community 5

df_com5_degree = pd.DataFrame({'name': list(g_com5.vs['name']), 'degree':g_com5.degree(), 'hub_score':g_com5.hub_score(), 'authority_score':g_com5.authority_score(), 'betweenness':g_com5.betweenness()} )
df_com5_degree = df_com5_degree.sort_values('degree', ascending = False).reset_index(drop = True)
df_com5_degree.head(10)

In [None]:
df_com5_degree.mean()