# Script for some network analysis

In [5]:
import pandas as pd
import ast

network_data = pd.read_csv('data/network_stats/complete.csv')
network_data_rt = pd.read_csv('data/network_stats/rt_only.csv')
tweets = pd.read_csv('data/cleaned_w_rt/cleaned_rt.csv')

tweets['clean_text'] = tweets['clean_text'].apply(ast.literal_eval)

network_data = network_data.rename(columns = {'Id':'username'})

print('Number of users in the network: ', network_data.username.nunique())
print('Number of users in the tweets: ', tweets.username.nunique())

Number of users in the network:  3054
Number of users in the tweets:  5432


In [6]:
# Find the sizes of each cluster in the network
network_data['modularity_class'].value_counts(normalize=True) * 100

2    48.297315
1    25.867714
3    11.755075
0     8.677145
4     5.402750
Name: modularity_class, dtype: float64

In [36]:
# Sort users based on pagerank
network_data.sort_values('pageranks').tail(n=10)

Unnamed: 0,username,Label,timeset,componentnumber,strongcompnum,indegree,outdegree,Degree,weighted indegree,weighted outdegree,...,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Authority,Hub,pageranks,clustering,eigencentrality,modularity_class
18,erikpjverweij,erikpjverweij,,0,634,109,3,112,154,3,...,15,0.124773,0.137056,136860.272338,0.096377,0.004986,0.007534,0.006306,0.302282,0
171,NOS,NOS,,0,17,134,0,134,174,0,...,0,0.0,0.0,0.0,0.128279,0.0,0.008438,0.004152,0.481444,2
13,ShashiRoopram,ShashiRoopram,,0,634,90,1,91,106,2,...,14,0.140561,0.155039,689806.071053,0.088789,2e-06,0.009376,0.007814,0.347346,0
8,SylvanaBIJ1,SylvanaBIJ1,,0,109,152,0,152,185,0,...,0,0.0,0.0,0.0,0.195546,0.0,0.009433,0.001394,0.447703,1
22,jesseklaver,jesseklaver,,0,13,147,0,147,205,0,...,0,0.0,0.0,0.0,0.168528,0.0,0.009438,0.003867,0.649081,2
621,WitdeBerna,WitdeBerna,,0,634,4,6,10,6,8,...,13,0.163446,0.188076,701754.834849,3.5e-05,0.000958,0.009465,0.013889,0.074021,2
43,HartvNL,HartvNL,,0,27,190,0,190,237,0,...,0,0.0,0.0,0.0,0.21636,0.0,0.01233,0.002005,0.534336,2
46,TheRebelThePoet,TheRebelThePoet,,0,25,213,1,214,316,1,...,1,1.0,1.0,1148.623016,0.498295,0.002671,0.0144,0.004366,0.813891,1
1145,volkskrant,volkskrant,,0,24,45,0,45,65,0,...,0,0.0,0.0,0.0,0.053298,0.0,0.014708,0.028283,0.285381,4
32,kozwartepiet,kozwartepiet,,0,634,273,18,291,471,22,...,19,0.08995,0.117547,379874.383167,0.611762,0.064434,0.024833,0.005387,1.0,1


In [38]:
# Sort users based on how often they are retweeted
to_search = list(network_data_rt.sort_values('weighted indegree').tail(n=10)['Id'])
network_data[network_data['username'].isin(to_search)]

Unnamed: 0,username,Label,timeset,componentnumber,strongcompnum,indegree,outdegree,Degree,weighted indegree,weighted outdegree,...,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Authority,Hub,pageranks,clustering,eigencentrality,modularity_class
8,SylvanaBIJ1,SylvanaBIJ1,,0,109,152,0,152,185,0,...,0,0.0,0.0,0.0,0.195546,0.0,0.009433,0.001394,0.447703,1
18,erikpjverweij,erikpjverweij,,0,634,109,3,112,154,3,...,15,0.124773,0.137056,136860.272338,0.096377,0.004986,0.007534,0.006306,0.302282,0
22,jesseklaver,jesseklaver,,0,13,147,0,147,205,0,...,0,0.0,0.0,0.0,0.168528,0.0,0.009438,0.003867,0.649081,2
32,kozwartepiet,kozwartepiet,,0,634,273,18,291,471,22,...,19,0.08995,0.117547,379874.383167,0.611762,0.064434,0.024833,0.005387,1.0,1
43,HartvNL,HartvNL,,0,27,190,0,190,237,0,...,0,0.0,0.0,0.0,0.21636,0.0,0.01233,0.002005,0.534336,2
46,TheRebelThePoet,TheRebelThePoet,,0,25,213,1,214,316,1,...,1,1.0,1.0,1148.623016,0.498295,0.002671,0.0144,0.004366,0.813891,1
112,frankvdlinde,frankvdlinde,,0,19,111,5,116,151,8,...,1,1.0,1.0,3823.845238,0.129578,0.015771,0.006045,0.007946,0.327386,3
171,NOS,NOS,,0,17,134,0,134,174,0,...,0,0.0,0.0,0.0,0.128279,0.0,0.008438,0.004152,0.481444,2
196,NHNieuws,NHNieuws,,0,30,89,0,89,131,0,...,0,0.0,0.0,0.0,0.118986,0.0,0.004943,0.004852,0.272249,3
1402,ADnl,ADnl,,0,0,95,0,95,111,0,...,0,0.0,0.0,0.0,0.105295,0.0,0.005337,0.002688,0.26836,2


In [39]:
# Group clusters together and give a mean summary
grouped_network_data = network_data.groupby('modularity_class')
grouped_network_data.mean()

Unnamed: 0_level_0,timeset,componentnumber,strongcompnum,indegree,outdegree,Degree,weighted indegree,weighted outdegree,Weighted Degree,Eccentricity,closnesscentrality,harmonicclosnesscentrality,betweenesscentrality,Authority,Hub,pageranks,clustering,eigencentrality
modularity_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,,0.0,1250.962264,2.532075,2.430189,4.962264,3.316981,3.264151,6.581132,8.743396,0.310897,0.328087,4755.785319,0.001826,0.005775,0.00034,0.133431,0.008227
1,,0.0,1234.805063,3.143038,3.196203,6.339241,4.194937,4.24557,8.440506,10.056962,0.324614,0.345212,3766.410852,0.00482,0.019186,0.000353,0.166089,0.013549
2,,0.0,1186.558644,2.648136,2.621017,5.269153,3.148475,3.110508,6.258983,7.168814,0.420766,0.439778,4094.944563,0.002145,0.006119,0.000305,0.089895,0.008904
3,,0.0,1234.022284,2.392758,2.454039,4.846797,3.103064,3.194986,6.29805,5.91922,0.431282,0.452481,1755.95979,0.002144,0.0057,0.000274,0.107406,0.007105
4,,0.0,1065.939394,2.733333,2.751515,5.484848,3.981818,3.963636,7.945455,7.90303,0.335641,0.351205,6288.215287,0.002261,0.005766,0.000433,0.142996,0.011411


In [8]:
# Filter tweets on users appearing in the final network, result is data used for R.
workable_data = tweets[tweets["username"].isin(network_data['username'])]
workable_data['clean_text_string'] = [' '.join(map(str, l)) for l in workable_data['clean_text']]
workable_data = workable_data.merge(network_data.loc[:,['username','modularity_class']], on="username", how='inner')
workable_data[['clean_text_string', 'modularity_class']].to_csv('data/R_data/workable_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workable_data['clean_text_string'] = [' '.join(map(str, l)) for l in workable_data['clean_text']]
