## **Signed Network Intercommunity Analysis**

In [1]:
import pandas as pd

# **Load the Dataset**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
title_links = pd.read_csv("/content/drive/MyDrive/ENS/soc-redditHyperlinks-title.tsv", sep="\t")
body_links = pd.read_csv("/content/drive/MyDrive/ENS/soc-redditHyperlinks-body.tsv", sep="\t")

In [4]:
title_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,rddtgaming,rddtrust,1u4pzzs,2013-12-31 16:39:18,1,"25.0,23.0,0.76,0.0,0.44,0.12,0.12,4.0,4.0,0.0,..."
1,xboxone,battlefield_4,1u4tmfs,2013-12-31 17:59:11,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
2,ps4,battlefield_4,1u4tmos,2013-12-31 17:59:40,1,"100.0,88.0,0.78,0.02,0.08,0.13,0.07,16.0,16.0,..."
3,fitnesscirclejerk,leangains,1u50xfs,2013-12-31 19:01:56,1,"49.0,43.0,0.775510204082,0.0,0.265306122449,0...."
4,fitnesscirclejerk,lifeprotips,1u51nps,2013-12-31 21:02:28,1,"14.0,14.0,0.785714285714,0.0,0.428571428571,0...."


In [5]:
body_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


**The data is presented as follows:**

- SOURCE_SUBREDDIT: The subreddit where the link originates.
- TARGET_SUBREDDIT: The subreddit where the link ends.
- POST_ID: The post in the source subreddit that starts the link.
- TIMESTAMP: The time of the post.
- POST_LABEL: A label indicating if the source post is explicitly negative towards the target post. The value is -1 if the source is negative towards the target, and 1 if it is neutral or positive. This label is created using crowd-sourcing and training a text-based classifier, providing better accuracy than simple sentiment analysis.
- POST_PROPERTIES: A vector representing the text properties of the source post, including various metrics such as number of characters, fractions of alphabetic characters, digits, uppercase characters, stopwords, and sentiments calculated by VADER.

POST_PROPERTIES were constructed to assign a sign (-1 or 1) to the directed link between the SOURCE_SUBREDDIT and TARGET_SUBREDDIT, which is reflected in the POST_LABEL. Thus, these three features are primarily needed for analysis purposes.

In [6]:
## Drop unwanted features
title_links = title_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT"]]
body_links = body_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT"]]

In [7]:
title_links.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,LINK_SENTIMENT
0,rddtgaming,rddtrust,1
1,xboxone,battlefield_4,1
2,ps4,battlefield_4,1
3,fitnesscirclejerk,leangains,1
4,fitnesscirclejerk,lifeprotips,1


In [8]:
title_source_subreddits = title_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the title links dataset:\t {len(title_source_subreddits)}')
title_source_set = set(title_source_subreddits)
print(f'The number of unique source subreddits: \t {len(title_source_set)}')
title_subreddits = set(title_links.SOURCE_SUBREDDIT.values.tolist() + title_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the title dataset:\t {len(title_subreddits)}')

The number of links in the title links dataset:	 571927
The number of unique source subreddits: 	 43695
the set of all subreddits in the title dataset:	 54075


In [9]:
body_source_subreddits = body_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the body links dataset: \t {len(body_source_subreddits)}')
body_source_set = set(body_source_subreddits)
print(f'The number of unique source subreddits : \t {len(body_source_set)}')
body_subreddits = set(body_links.SOURCE_SUBREDDIT.values.tolist() + body_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the body dataset :\t {len(body_subreddits)}')

The number of links in the body links dataset: 	 286561
The number of unique source subreddits : 	 27863
the set of all subreddits in the body dataset :	 35776


In [10]:
Data = pd.concat([title_links,body_links])
Data

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,LINK_SENTIMENT
0,rddtgaming,rddtrust,1
1,xboxone,battlefield_4,1
2,ps4,battlefield_4,1
3,fitnesscirclejerk,leangains,1
4,fitnesscirclejerk,lifeprotips,1
...,...,...,...
286556,negareddit,debatefascism,1
286557,mildlynomil,justnomil,1
286558,mmorpg,blackdesertonline,1
286559,electricskateboards,askreddit,1


In [11]:
source_subreddits = Data.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in all data: \t {len(source_subreddits)}')
source_set = set(source_subreddits)
print(f'The number of unique source subreddits:  {len(source_set)}')
subreddits = set(Data.SOURCE_SUBREDDIT.values.tolist() + Data.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits :\t\t {len(subreddits)}')

The number of links in all data: 	 858488
The number of unique source subreddits:  55863
the set of all subreddits :		 67180


In [12]:
number_pos_links = Data.where(Data.LINK_SENTIMENT==1).count()[0]
number_neg_links = Data.where(Data.LINK_SENTIMENT==-1).count()[0]
print(f'There is {number_pos_links} positive links and {number_neg_links} negative links')

There is 776278 positive links and 82210 negative links


In [13]:
print(f'REDDIT : Percentage of (+)edges is {round(number_pos_links*100/len(Data),1)}% and Percentage of (-)edges is {round(number_neg_links*100/len(Data),1)}%')

REDDIT : Percentage of (+)edges is 90.4% and Percentage of (-)edges is 9.6%


In [14]:
import networkx as nx

In [15]:
complete_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = "LINK_SENTIMENT",create_using=nx.DiGraph)

In [16]:
nbr_nodes = complete_graph.number_of_nodes()
nbr_edges = complete_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["LINK_SENTIMENT"]==1 else 0 for (_,_,w) in complete_graph.edges(data=True))
sum_of_neg = sum(1 if w["LINK_SENTIMENT"]==-1 else 0 for (_,_,w) in complete_graph.edges(data=True))
perc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
perc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Percentage of (+)edges is {pos}% and Percentage of (-)edges is {neg}%".format(pos = perc_of_pos, neg= perc_of_neg))

REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Percentage of (+)edges is 92.5% and Percentage of (-)edges is 7.5%


In [17]:
triads = nx.triadic_census(complete_graph)

In [18]:
total_triads = triads['030T']+triads['030C']

In [19]:
total_triads= total_triads+(2*(triads['120U']+triads['120D']+triads['120C']))

In [20]:
total_triads= total_triads+(4*triads['210'])+(8*triads['300'])

In [21]:
total_triads

4077337

The method described above considers only the weight of the last edge when there are multiple edges between the same two nodes.

In contrast, our implementation here introduces a method that considers all weights, where the weight of multiple edges between the same two nodes is stored in a dictionary containing all weights.

In [22]:
import numpy as np

In [23]:
## We groupby the data using pairs (source,target) and collect all link signs between same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: list(x)).to_frame()
groupedBySource_Target_pair.LINK_SENTIMENT = groupedBySource_Target_pair.LINK_SENTIMENT.apply(lambda x: dict(zip(np.arange(len(x)), x)))
groupedBySource_Target_pair

Unnamed: 0_level_0,Unnamed: 1_level_0,LINK_SENTIMENT
SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Unnamed: 2_level_1
007,daniel_craig,{0: 1}
07scape,osrstranscripts,"{0: 1, 1: 1}"
07thexpansion,visualnovels,{0: 1}
098f6bcd4621d373,askreddit,{0: 1}
0________0,todayilearned,{0: 1}
...,...,...
zyramains,threshmains,{0: 1}
zyzz,bodybuilding,{0: 1}
zyzz,fitness,{0: 1}
zyzz,girlsmirin,{0: 1}


In [24]:
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe
listed_source = [ elem[0] for elem in groupedBySource_Target_pair.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_pair.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_pair.values ]

data_df = pd.DataFrame({"source":listed_source,"target":listed_target,"sign":listed_sign})

In [25]:
graph_dict = nx.from_pandas_edgelist(data_df,source='source', target="target", edge_attr = "sign",create_using=nx.DiGraph)

In [26]:
nbr_nodes_dict = graph_dict.number_of_nodes()
nbr_edges_dict = graph_dict.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes_dict," Edges =",nbr_edges_dict)

REDDIT:    Nodes = 67180  Edges = 339643


In [27]:
def compute_triads(g):
    triads = nx.triadic_census(g)
    total_triads = triads['030T']+triads['030C']
    total_triads= total_triads+(2*(triads['120U']+triads['120D']+triads['120C']))
    total_triads= total_triads+(4*triads['210'])+(8*triads['300'])
    return total_triads

In [28]:
total_triads_dict = compute_triads(graph_dict)

# This method computes the total weight connecting identical pairs of nodes and designates it as a distinct singular weight.

In [29]:
## Here we group by (source,target) but instead of keeping all weights, we sum them into one weight
groupedBySource_Target_summed = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: sum(x)).to_frame()
groupedBySource_Target_summed

Unnamed: 0_level_0,Unnamed: 1_level_0,LINK_SENTIMENT
SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Unnamed: 2_level_1
007,daniel_craig,1
07scape,osrstranscripts,2
07thexpansion,visualnovels,1
098f6bcd4621d373,askreddit,1
0________0,todayilearned,1
...,...,...
zyramains,threshmains,1
zyzz,bodybuilding,1
zyzz,fitness,1
zyzz,girlsmirin,1


In [30]:
groupedBySource_Target_summed.LINK_SENTIMENT.unique()

array([  1,   2,   5,  39,   8,   6,   7,   3,  -1,   4,  21,   0,  22,
        54,  11,   9,  12,  25,  62, 103,  14,  15,  50, 105,  13,  10,
        56,  16,  19,  65,  33,  -2,  75,  49,  31,  17,  58,  69,  81,
        36,  26,  23,  24,  34,  18,  41,  47, 187,  53,  38,  -3,  29,
        -4,  40,  20,  -6,  -8,  89,  35,  48,  27,  64,  28,  59,  30,
        32,  37,  71,  52,  72,  61, 172,  63, -22,  -7,  43, 160,  94,
        46,  51,  44,  42, 114,  60,  98,  -5,  45,  70, 210,  83, 223,
        55,  99, 113, 106, 116, 158,  93, 119, 194,  90,  77, 117, 138,
       208, 147, 135, 140,  67, 118,  76,  66,  68, 165, 154, 108,  91,
       127,  80,  74, 125, 112,  87, 142, 133, 120,  79, 121, 206, 175,
        85, 159, 124,  92, 149,  73, 243, 134, 146, 168,  84, 107,  57,
        86,  82,  96, 109, 104,  78, 129, 150, 130, 144, 128,  88, 161,
        -9, 137, 151, 157, -15, -13, 179, 155, 101, 211, 139,  95, 215,
       110, 180, 299,  97, 100, 164, 184, 220, 181, -25, 166, 17

In [31]:
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe
listed_source = [ elem[0] for elem in groupedBySource_Target_summed.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_summed.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_summed.values ]

data_summed = pd.DataFrame({"source":listed_source,"target":listed_target,"sign":listed_sign})

In [32]:
graph_summed = nx.from_pandas_edgelist(data_summed,source='source', target="target", edge_attr = "sign",create_using=nx.DiGraph)

In [33]:
nbr_nodes_summed = graph_summed.number_of_nodes()
nbr_edges_summed = graph_summed.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)


sum_of_pos_summed = sum(1 if w["sign"]>0 else 0 for (_,_,w) in graph_summed.edges(data=True))
sum_of_neg_summed = sum(1 if w["sign"]<0 else 0 for (_,_,w) in graph_summed.edges(data=True))
perc_of_pos_summed = round(100 *sum_of_pos_summed/nbr_edges_summed,1)
perc_of_neg_summed = round(100 *sum_of_neg_summed/nbr_edges_summed,1)
print("REDDIT : Percentage of (+)edges is {pos}% and percentage of (-)edges is {neg}%".format(pos = perc_of_pos_summed, neg= perc_of_neg_summed))

REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Percentage of (+)edges is 92.6% and percentage of (-)edges is 5.7%


In [34]:
total_triads_summed = compute_triads(graph_summed)

# Computing the Mean of the weights

In [35]:
## Here we group by (source,target) but instead of keeping all weights, we sum them into one weight
groupedBySource_Target_mean = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).LINK_SENTIMENT.apply(lambda x: np.mean(x)).to_frame()
groupedBySource_Target_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,LINK_SENTIMENT
SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Unnamed: 2_level_1
007,daniel_craig,1.0
07scape,osrstranscripts,1.0
07thexpansion,visualnovels,1.0
098f6bcd4621d373,askreddit,1.0
0________0,todayilearned,1.0
...,...,...
zyramains,threshmains,1.0
zyzz,bodybuilding,1.0
zyzz,fitness,1.0
zyzz,girlsmirin,1.0


In [36]:
groupedBySource_Target_mean.LINK_SENTIMENT.unique()

array([1.        , 0.33333333, 0.5       , ..., 0.61643836, 0.43820225,
       0.96039604])

In [37]:
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe
listed_source = [ elem[0] for elem in groupedBySource_Target_mean.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_mean.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_mean.values ]

data_mean = pd.DataFrame({"source":listed_source,"target":listed_target,"sign":listed_sign})

graph_mean = nx.from_pandas_edgelist(data_mean,source='source', target="target", edge_attr = "sign",create_using=nx.DiGraph)

In [38]:
nbr_nodes_mean = graph_mean.number_of_nodes()
nbr_edges_mean = graph_mean.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)


sum_of_pos_mean = sum(1 if w["sign"]>0 else 0 for (_,_,w) in graph_mean.edges(data=True))
sum_of_neg_mean = sum(1 if w["sign"]<0 else 0 for (_,_,w) in graph_mean.edges(data=True))
perc_of_pos_mean = round(100 *sum_of_pos_mean/nbr_edges_mean,1)
perc_of_neg_mean = round(100 *sum_of_neg_mean/nbr_edges_mean,1)
print("REDDIT : Percentage of (+)edges is {pos}% and percentage of (-)edges is {neg}%".format(pos = perc_of_pos_mean, neg= perc_of_neg_mean))


REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Percentage of (+)edges is 92.6% and percentage of (-)edges is 5.7%


In [39]:
total_triads_mean = compute_triads(graph_mean)

In [40]:
import pandas as pd

In [41]:

# Assuming you have these variables defined: nbr_nodes_dict, nbr_nodes_summed, nbr_nodes_mean,
# nbr_edges_dict, nbr_edges_summed, nbr_edges_mean,
# perc_of_pos_summed, perc_of_pos_mean,
# perc_of_neg_summed, perc_of_neg_mean,
# total_triads_dict, total_triads_summed, total_triads_mean

# Create an empty DataFrame with the specified columns
output_df = pd.DataFrame(columns=["Dictionary weights", "Summed_weights", "mean_weights"])

# Define the data to be appended as a list of lists
data_to_append = [
    ["{:,}".format(nbr_nodes_dict), "{:,}".format(nbr_nodes_summed), "{:,}".format(nbr_nodes_mean)],
    ["{:,}".format(nbr_edges_dict), "{:,}".format(nbr_edges_summed), "{:,}".format(nbr_edges_mean)],
    [str(None) + '%', str(perc_of_pos_summed) + '%', str(perc_of_pos_mean) + '%'],
    [str(None) + '%', str(perc_of_neg_summed) + "%", str(perc_of_neg_mean) + '%'],
    ["{:,}".format(total_triads_dict), "{:,}".format(total_triads_summed), "{:,}".format(total_triads_mean)]
]

# Create a DataFrame from the data with the specified columns
data_df = pd.DataFrame(data_to_append, columns=["Dictionary weights", "Summed_weights", "mean_weights"])

# Concatenate the original DataFrame with the new DataFrame
output_df = pd.concat([output_df, data_df])

# Display the resulting DataFrame
print(output_df)


  Dictionary weights Summed_weights mean_weights
0             67,180         67,180       67,180
1            339,643        339,643      339,643
2              None%          92.6%        92.6%
3              None%           5.7%         5.7%
4          4,077,337      4,077,337    4,077,337


In [42]:
Graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = "LINK_SENTIMENT",create_using=nx.MultiDiGraph)

In [43]:
len(Graph.edges())

858488

In [45]:
total_triads_ = compute_triads(Graph)

In [46]:
total_triads

4077337

# **Generating Communities**

In [47]:
## Imports

import pandas as pd
import networkx as nx
import itertools
import numpy as np

In [48]:
title_links = pd.read_csv("/content/drive/MyDrive/ENS/soc-redditHyperlinks-title.tsv", sep="\t")
body_links = pd.read_csv("/content/drive/MyDrive/ENS/soc-redditHyperlinks-body.tsv", sep="\t")

In [49]:
## Drop unwanted features
title_links = title_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT","TIMESTAMP"]]
body_links = body_links[["SOURCE_SUBREDDIT","TARGET_SUBREDDIT","LINK_SENTIMENT","TIMESTAMP"]]


## Print some characteristics for Title links
print(20*'=')
print('Title links :')
title_source_subreddits = title_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the title links dataset:\t {len(title_source_subreddits)}')
title_source_set = set(title_source_subreddits)
print(f'The number of unique source subreddits: \t {len(title_source_set)}')
title_subreddits = set(title_links.SOURCE_SUBREDDIT.values.tolist() + title_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the title dataset:\t {len(title_subreddits)}')
print(20*'=')

## Print some characteristics for Body links
print(20*'=')
print("Body links :")
body_source_subreddits = body_links.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in the body links dataset: \t {len(body_source_subreddits)}')
body_source_set = set(body_source_subreddits)
print(f'The number of unique source subreddits : \t {len(body_source_set)}')
body_subreddits = set(body_links.SOURCE_SUBREDDIT.values.tolist() + body_links.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits in the body dataset :\t {len(body_subreddits)}')
print(20*'=')

Title links :
The number of links in the title links dataset:	 571927
The number of unique source subreddits: 	 43695
the set of all subreddits in the title dataset:	 54075
Body links :
The number of links in the body links dataset: 	 286561
The number of unique source subreddits : 	 27863
the set of all subreddits in the body dataset :	 35776


In [50]:
## Here we concatenate the both dataframes to create a unique dataset of connexions between subreddits.
Data = pd.concat([title_links,body_links])
Data = Data.rename({"LINK_SENTIMENT":"Sign","TIMESTAMP":'Date'},axis=1)
print("All subreddit links :")
source_subreddits = Data.SOURCE_SUBREDDIT.values.tolist()
print(f'The number of links in all data: \t {len(source_subreddits)}')
source_set = set(source_subreddits)
print(f'The number of unique source subreddits:  {len(source_set)}')
subreddits = set(Data.SOURCE_SUBREDDIT.values.tolist() + Data.TARGET_SUBREDDIT.values.tolist())
print(f'the set of all subreddits :\t\t {len(subreddits)}')

All subreddit links :
The number of links in all data: 	 858488
The number of unique source subreddits:  55863
the set of all subreddits :		 67180


In [51]:
## Let's see the number and percentages of positive and negatives links
number_pos_links = Data.where(Data.Sign==1).count()[0]
number_neg_links = Data.where(Data.Sign==-1).count()[0]
print(f'There is {number_pos_links} positive links and {number_neg_links} negative links')

print(f'REDDIT : Percentage of (+)edges is {round(number_pos_links*100/len(Data),1)}% and percentage of (-)edges is {round(number_neg_links*100/len(Data),1)}%')

There is 776278 positive links and 82210 negative links
REDDIT : Percentage of (+)edges is 90.4% and percentage of (-)edges is 9.6%


Reddit's significantly lower number of negative links seem to primarily signify conflicts rather than merely expressing negative opinions or votes, with conflicts being less frequent occurrences.

In [52]:
complete_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = ["Sign","Date"],create_using=nx.DiGraph)

nbr_nodes = complete_graph.number_of_nodes()
nbr_edges = complete_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["Sign"]==1 else 0 for (_,_,w) in complete_graph.edges(data=True))
sum_of_neg = sum(1 if w["Sign"]==-1 else 0 for (_,_,w) in complete_graph.edges(data=True))
perc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
perc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Percentage of (+)edges is {pos}% and percentage of (-)edges is {neg}%".format(pos = perc_of_pos, neg= perc_of_neg))

REDDIT:    Nodes = 67180  Edges = 339643
REDDIT : Percentage of (+)edges is 92.5% and percentage of (-)edges is 7.5%


**Given the prevalence of duplicate edges in our dataset, some of which may consist entirely of positive or negative connections, or a mix of both, constructing our directed graph directly from these edges would overlook these duplicates. Instead, it would consider only their most recent occurrences, effectively capturing only the last sign of the link between the respective subreddits. This approach alters our perspectives, particularly in scenarios where all links, except the last one, between subreddit A and subreddit B were negative. In such cases, the generated graph would retain only the positive link, resulting in the loss of significant information.**

In [53]:
complete_multi_graph = nx.from_pandas_edgelist(Data,source='SOURCE_SUBREDDIT', target="TARGET_SUBREDDIT", edge_attr = ["Sign",'Date'],create_using=nx.MultiDiGraph)

nbr_nodes = complete_multi_graph.number_of_nodes()
nbr_edges = complete_multi_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes," Edges =",nbr_edges)

sum_of_pos = sum(1 if w["Sign"]==1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
sum_of_neg = sum(1 if w["Sign"]==-1 else 0 for (_,_,w) in complete_multi_graph.edges(data=True))
perc_of_pos = round(100 *sum_of_pos/nbr_edges,1)
perc_of_neg = round(100 *sum_of_neg/nbr_edges,1)
print("REDDIT : Percentage of (+)edges is {pos}% and percentage of (-)edges is {neg}%".format(pos = perc_of_pos, neg= perc_of_neg))

REDDIT:    Nodes = 67180  Edges = 858488
REDDIT : Percentage of (+)edges is 90.4% and percentage of (-)edges is 9.6%


**Now, it's evident that this graph provides more comprehensive insights into the data compared to the previous one. It's complete, encompassing all edges and nodes, thereby offering a more thorough representation of the dataset.**

In [54]:
## Here we group by (source,target) but instead of keeping all weights, we sum them into one weight
groupedBySource_Target_mean = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).Sign.apply(lambda x: np.mean(x)).to_frame()
## The groupby function makes the pair (source,target) as index.
# To construct the graph, we need to have list of all sources , targets and signs in a dataframe
listed_source = [ elem[0] for elem in groupedBySource_Target_mean.index]
listed_target = [ elem[1] for elem in groupedBySource_Target_mean.index]
listed_sign = [ elem[0] for elem in groupedBySource_Target_mean.values ]

In [55]:
## We groupby the data using pairs (source,target) and collect all link signs between same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT','TARGET_SUBREDDIT']).Date.apply(lambda x: list(x)).to_frame()
groupedBySource_Target_pair.Date = groupedBySource_Target_pair.Date.apply(lambda x: x[len(x)-1])


In [56]:
data_mean = pd.DataFrame({"source":listed_source,"target":listed_target,"Sign":listed_sign,'Date':groupedBySource_Target_pair.Date.apply(lambda x: x[:10].replace('-','/'))}).reset_index().drop(columns=["SOURCE_SUBREDDIT","TARGET_SUBREDDIT"])

graph_mean = nx.from_pandas_edgelist(data_mean,source='source', target="target", edge_attr = ["Sign",'Date'],create_using=nx.DiGraph)


In [57]:
print(f'This meaned graph contains {len(data_mean)} signed edges.')
print(f'The number of edges which mean is equal to 1 is {len(data_mean[data_mean.Sign==1])}')
print(f'The number of edges which mean is equal to -1 is {len(data_mean[data_mean.Sign==-1])}')


This meaned graph contains 339643 signed edges.
The number of edges which mean is equal to 1 is 298473
The number of edges which mean is equal to -1 is 18104


**We observe that the mean sign of the edges predominantly falls into two categories: 1 or -1.**

Based on this observation, we have chosen to define "FRIENDS Communities" as those where multiple (i.e., two or more) edges are exclusively positive, and "ENEMIES Communities" as those where multiple (i.e., two or more) edges are exclusively negative.

**To implement this, we isolate communities that have multiple edges connecting them. Simply having a single negative link between two communities does not suffice to classify them as enemies; instead, we focus on communities with multiple connections.**

In [58]:
# Group by the data using pairs (source, target) and collect all link sentiments between the same pair in one attribute (a dictionary)
groupedBySource_Target_pair = Data.groupby(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])['Sign'].apply(list).reset_index()
groupedBySource_Target_pair['Sign'] = groupedBySource_Target_pair['Sign'].apply(lambda x: dict(enumerate(x)))

# Construct the DataFrame with columns 'source', 'target', and 'sign'
data_dict_df = pd.DataFrame({
    "source": groupedBySource_Target_pair['SOURCE_SUBREDDIT'],
    "target": groupedBySource_Target_pair['TARGET_SUBREDDIT'],
    "sign": groupedBySource_Target_pair['Sign']
})


In [59]:
indices =[]
for i,elem in zip(range(len(data_dict_df.sign.values)),data_dict_df.sign.values):
    if len(elem)>=4: indices.append(i)

In [60]:
data_multiple_edges = data_dict_df.loc[indices]

In [61]:
data_multiple_edges.head()

Unnamed: 0,source,target,sign
22,100daysofketo,keto,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}"
25,100daysofrejection,30daystosortmylifeout,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1}"
27,100daysofrejection,howtonotgiveafuck,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: ..."
30,100movies365days,movieaweek,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1}"
31,100movies365days,movieclub,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1}"


In [62]:
len(set(data_multiple_edges.source.unique()) or set(data_multiple_edges.target.unique()))

9760

In [63]:
multiple_edges_meaned_data = data_mean.loc[indices].copy()

In [64]:
multiple_edges_meaned_data

Unnamed: 0,source,target,Sign,Date
22,100daysofketo,keto,1.0,2016/05/07
25,100daysofrejection,30daystosortmylifeout,1.0,2015/12/01
27,100daysofrejection,howtonotgiveafuck,1.0,2015/11/14
30,100movies365days,movieaweek,1.0,2014/07/12
31,100movies365days,movieclub,1.0,2014/07/12
...,...,...,...,...
339586,zorceror44,writingprompts,1.0,2016/08/25
339596,zreviews,audiophile,1.0,2016/06/25
339634,zylooxwrites,writingprompts,1.0,2015/11/05
339635,zyramains,leagueoflegends,1.0,2016/11/16


In [65]:
enemies_df = multiple_edges_meaned_data[multiple_edges_meaned_data.Sign==-1].copy()
enemies_df

Unnamed: 0,source,target,Sign,Date
36987,bestof,veterans,-1.0,2016/08/10
38789,bestoftldr,news,-1.0,2017/02/11
62725,circlejerkcopypasta,shitamericanssay,-1.0,2016/03/10
80951,deadpoolhatersclub,overwatch,-1.0,2016/09/28
90863,drama,biggestproblem,-1.0,2016/09/26
91629,drama,pokemon,-1.0,2017/01/27
91785,drama,skyrim,-1.0,2015/04/23
94739,ebolahoax,videos,-1.0,2015/02/10
99863,enoughlibertarianspam,relationships,-1.0,2016/07/18
122314,gamerghazi,magictcg,-1.0,2015/11/04


In [66]:
set(enemies_df.source.unique()) -  (set(Data[Data.Sign==1].SOURCE_SUBREDDIT.unique()) or set(Data[Data.Sign==1].TARGET_SUBREDDIT.unique()))


{'mongolianhatewatch', 'the_iowa'}

**Here, we observe that certain subreddits exclusively initiate negative links to other subreddits.**

In [67]:
Data[Data.SOURCE_SUBREDDIT=="mongolianhatewatch"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
235198,mongolianhatewatch,european,-1,2015-08-30 07:52:35
238207,mongolianhatewatch,european,-1,2015-08-30 07:52:35
242467,mongolianhatewatch,european,-1,2015-08-30 07:52:35
273497,mongolianhatewatch,european,-1,2015-11-02 17:55:45
277230,mongolianhatewatch,topmindsofreddit,-1,2015-11-02 22:14:04
277237,mongolianhatewatch,againsthatesubreddits,-1,2015-11-02 22:14:04
279497,mongolianhatewatch,european,-1,2015-11-02 22:14:04


In [68]:
Data[Data.SOURCE_SUBREDDIT=="the_iowa"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
543601,the_iowa,iowa,-1,2017-03-18 06:16:24
544359,the_iowa,iowa,-1,2017-03-19 06:18:24
546131,the_iowa,iowa,-1,2017-03-22 09:52:00
547385,the_iowa,iowa,-1,2017-03-24 10:27:57


In [69]:
Data[Data.SOURCE_SUBREDDIT=="shittyokcupid2"]

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
467277,shittyokcupid2,okcupid,-1,2016-11-09 03:58:04
472275,shittyokcupid2,okcupid,-1,2016-11-17 08:19:39
475311,shittyokcupid2,okcupid,-1,2016-11-24 06:57:51


**Negative links tend to repeat within a short timeframe, typically within a maximum of one month, suggesting a recurring conflict.**

In [70]:
print(multiple_edges_meaned_data.columns)


Index(['source', 'target', 'Sign', 'Date'], dtype='object')


In [71]:
friends_df = multiple_edges_meaned_data[multiple_edges_meaned_data['Sign'] == 1].copy()

In [72]:
friends_df

Unnamed: 0,source,target,Sign,Date
22,100daysofketo,keto,1.0,2016/05/07
25,100daysofrejection,30daystosortmylifeout,1.0,2015/12/01
27,100daysofrejection,howtonotgiveafuck,1.0,2015/11/14
30,100movies365days,movieaweek,1.0,2014/07/12
31,100movies365days,movieclub,1.0,2014/07/12
...,...,...,...,...
339586,zorceror44,writingprompts,1.0,2016/08/25
339596,zreviews,audiophile,1.0,2016/06/25
339634,zylooxwrites,writingprompts,1.0,2015/11/05
339635,zyramains,leagueoflegends,1.0,2016/11/16


In [73]:
print(f'number of source subreddits from which originates the friendships : {len(friends_df.source.unique())}')
print(f'number of source subreddits from which originates the conflicts : {len(enemies_df.source.unique())}')


number of source subreddits from which originates the friendships : 8153
number of source subreddits from which originates the conflicts : 27


In [74]:
indices_source_enemies = []
for i, elem in enumerate(Data.SOURCE_SUBREDDIT.values):
    if elem in enemies_df.source.unique():
        indices_source_enemies.append(i)

In [75]:
print(Data.index)


Index([     0,      1,      2,      3,      4,      5,      6,      7,      8,
            9,
       ...
       286551, 286552, 286553, 286554, 286555, 286556, 286557, 286558, 286559,
       286560],
      dtype='int64', length=858488)


In [76]:
# Convert the set valid_indices to a list
valid_indices_list = list(indices_source_enemies)

# Select the subset of data using only valid indices
edges_originated_from_enemies_source = Data.iloc[valid_indices_list].copy()


In [77]:
print(edges_originated_from_enemies_source.columns)

Index(['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'Sign', 'Date'], dtype='object')


In [78]:
# Filter out rows with NaN values
edges_originated_from_enemies_source = edges_originated_from_enemies_source.dropna()

# Filter negative edges based on the 'Sign' column
negative_edges_originated_from_enemies_source = edges_originated_from_enemies_source[edges_originated_from_enemies_source['Sign'] == -1]


In [79]:
negative_edges_originated_from_enemies_source

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
16,subredditdrama,nfl,-1,2013-12-31 21:59:32
135,subredditdrama,cringe,-1,2014-01-01 14:45:00
143,subredditdrama,mildlyinteresting,-1,2014-01-01 15:01:57
193,subredditdrama,trees,-1,2014-01-01 19:58:38
194,subredditdrama,canada,-1,2014-01-01 19:58:38
...,...,...,...,...
285867,subredditdrama,livestreamfail,-1,2017-04-27 21:26:23
285962,subredditdrama,relationships,-1,2017-04-28 08:57:12
286398,subredditcancer,the_donald,-1,2017-04-30 06:06:55
286450,subredditdrama,vive,-1,2017-04-30 10:10:03


In [80]:
all_negative_signs = Data[Data.Sign==-1].copy()
all_negative_signs

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,Sign,Date
16,subredditdrama,nfl,-1,2013-12-31 21:59:32
20,libertyworldproblems,bitcoin,-1,2014-01-01 13:56:53
52,shitstatistssay,foodforthought,-1,2014-01-01 07:01:10
69,childfree,adviceanimals,-1,2014-01-01 11:06:32
80,levantinewar,worldpolitics,-1,2014-01-01 10:48:53
...,...,...,...,...
286475,badpolitics,bannedfromthe_donald,-1,2017-04-30 11:57:36
286491,tipofmytongue,deathcore,-1,2017-04-30 12:34:09
286501,soundcloud,procss,-1,2017-04-30 12:16:13
286523,enoughtrumpspam,humansbeingbros,-1,2017-04-30 14:25:18


In [81]:
len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) or set(all_negative_signs.TARGET_SUBREDDIT.unique()))


8698

**Here, we calculate the percentage of negative edges originating from the subset of source subreddits that led to more than 2 conflicts, compared to the total number of negative links.**

In [82]:
100 * len(negative_edges_originated_from_enemies_source) / len(all_negative_signs)


21.718769006203626

**Here, we calculate the percentage of source subreddits that led to more than 3 conflicts compared to the total set of source subreddits from which a negative link originated.**

In [83]:
100 * len(enemies_df.source.unique()) / len(all_negative_signs.SOURCE_SUBREDDIT.unique())


0.3104161876293401

In [84]:
indices = []
for i,elem in zip(range(len(all_negative_signs)),all_negative_signs.SOURCE_SUBREDDIT.values):
    if elem in set(enemies_df.source.unique()):
        indices.append(i)

In [85]:
# Check for missing indices
missing_indices = [idx for idx in indices if idx not in all_negative_signs.index]
if missing_indices:
    print("Missing indices:", missing_indices)
    # Handle missing indices, either by removing them or correcting them

# Filter out only the valid indices
valid_indices = [idx for idx in indices if idx in all_negative_signs.index]

# Access the DataFrame using only valid indices
negative_links_originated_from_enemies_df = all_negative_signs.loc[valid_indices].copy()


Missing indices: [0, 10, 11, 13, 14, 19, 22, 25, 26, 27, 35, 36, 39, 41, 42, 51, 56, 61, 63, 67, 71, 77, 79, 82, 98, 101, 102, 104, 113, 117, 124, 131, 133, 139, 141, 147, 148, 152, 155, 157, 161, 162, 171, 173, 174, 175, 176, 179, 190, 192, 196, 200, 201, 203, 205, 207, 212, 214, 217, 220, 231, 232, 235, 236, 239, 241, 247, 249, 251, 258, 260, 263, 269, 272, 279, 283, 287, 293, 304, 306, 311, 314, 317, 318, 320, 322, 323, 324, 332, 335, 345, 347, 348, 353, 356, 359, 363, 366, 367, 372, 374, 375, 378, 381, 386, 389, 391, 393, 397, 398, 403, 412, 425, 430, 436, 438, 443, 445, 448, 449, 450, 451, 453, 454, 460, 464, 468, 469, 470, 474, 475, 477, 481, 485, 489, 490, 508, 509, 511, 514, 517, 519, 530, 533, 534, 535, 536, 542, 546, 553, 554, 555, 560, 561, 564, 571, 572, 575, 577, 578, 581, 583, 584, 589, 593, 596, 602, 603, 610, 611, 615, 617, 618, 623, 625, 635, 643, 646, 648, 650, 656, 658, 661, 663, 664, 666, 667, 672, 677, 681, 683, 685, 688, 689, 692, 693, 695, 697, 700, 704, 708, 723

**Here, we calculate the percentage of negative links originating from subreddits with more than 2 conflicts, comparing it to the total set of negative links.**

In [86]:
100*len(negative_links_originated_from_enemies_df)/len(all_negative_signs)

4.06519888091473

In [87]:
indices = []
for i,source,target in zip(range(len(Data)),Data.SOURCE_SUBREDDIT.values, Data.TARGET_SUBREDDIT.values):
    if (source in enemies_df.source.unique() or target in enemies_df.source.unique()):
        indices.append(i)

In [88]:
# Check for missing indices
missing_indices = [idx for idx in indices if idx not in Data.index]
if missing_indices:
    print("Missing indices:", missing_indices)
    # Handle missing indices, either by removing them or correcting them

# Filter out only the valid indices
valid_indices = [idx for idx in indices if idx in Data.index]

# Access the DataFrame using only valid indices and drop NaN values
links_sourceORtarget_sourceConflicts = Data.loc[valid_indices].dropna()


Missing indices: [571940, 571960, 572004, 572026, 572053, 572057, 572062, 572067, 572077, 572118, 572123, 572153, 572197, 572199, 572210, 572220, 572237, 572285, 572298, 572300, 572304, 572305, 572345, 572346, 572392, 572438, 572442, 572448, 572479, 572530, 572579, 572606, 572607, 572628, 572657, 572661, 572698, 572712, 572723, 572724, 572738, 572752, 572756, 572757, 572767, 572776, 572781, 572792, 572799, 572802, 572811, 572842, 572920, 572926, 572927, 572948, 572974, 573000, 573001, 573027, 573085, 573086, 573113, 573118, 573140, 573147, 573152, 573166, 573169, 573210, 573221, 573231, 573248, 573269, 573301, 573339, 573497, 573607, 573621, 573644, 573690, 573736, 573762, 573764, 573775, 573776, 573783, 573823, 573853, 573928, 573957, 573997, 574045, 574066, 574069, 574085, 574095, 574097, 574098, 574155, 574159, 574173, 574231, 574235, 574263, 574267, 574287, 574294, 574299, 574300, 574339, 574352, 574353, 574445, 574495, 574550, 574561, 574573, 574594, 574600, 574625, 574672, 574678

In [89]:
count = 0
for elem in set(all_negative_signs.SOURCE_SUBREDDIT.unique()):
    if elem in set(all_negative_signs.TARGET_SUBREDDIT.unique()):
        count+=1

In [90]:
100*len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) and set(all_negative_signs.TARGET_SUBREDDIT.unique())) / len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()))


76.2819038859508

We can observe that if a subreddit initiates a negative link to another subreddit, it will be involved as a target in 76% of conflicts.

In [91]:
100*len(set(all_negative_signs.SOURCE_SUBREDDIT.unique()) or set(all_negative_signs.TARGET_SUBREDDIT.unique())) / len(set(Data.SOURCE_SUBREDDIT.unique()) or set(Data.TARGET_SUBREDDIT.unique()))

15.570234323255105

We can conclude that conflicts between subreddit communities stem from a mere 15.5% of the total nodes.

**Triads in the two different graphs**

In [101]:
def compute_count_wanted_triads(graph):
    triads = nx.triadic_census(graph)
    total_triads = triads['030T']+triads['030C']
    total_triads= total_triads+(2*(triads['120U']+triads['120D']+triads['120C']))
    total_triads=total_triads+(4*triads['210'])+(8*triads['300'])
    return total_triads

In [102]:
total_multigraph_triads = compute_count_wanted_triads(complete_multi_graph)
total_meanedgraph_triads = compute_count_wanted_triads(graph_mean)

In [103]:
print(20*'=')
print('MEANED WEIGHTS DIRECTED GRAPH')
nbr_nodes_mean = graph_mean.number_of_nodes()
nbr_edges_mean = graph_mean.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes_mean," Edges =",nbr_edges_mean)

MEANED WEIGHTS DIRECTED GRAPH
REDDIT:    Nodes = 67180  Edges = 339643


In [104]:
# Assuming the key is "Sign" instead of "sign"
sum_of_pos_mean = sum(1 if w["Sign"] > 0 else 0 for (_, _, w) in graph_mean.edges(data=True))
sum_of_neg_mean = sum(1 if w["Sign"] < 0 else 0 for (_, _, w) in graph_mean.edges(data=True))

perc_of_pos_mean = round(100 * sum_of_pos_mean / nbr_edges_mean, 1)
perc_of_neg_mean = round(100 * sum_of_neg_mean / nbr_edges_mean, 1)
print("REDDIT: Percentage of (+) edges is {pos}% and percentage of (-) edges is {neg}%".format(pos=perc_of_pos_mean, neg=perc_of_neg_mean))


REDDIT: Percentage of (+) edges is 92.6% and percentage of (-) edges is 5.7%


In [105]:
print(20*'=')
print('MULTI EDGED DIRECTED GRAPH')
nbr_nodes_multi_graph = complete_multi_graph.number_of_nodes()
nbr_edges_multi_graph = complete_multi_graph.number_of_edges()
print("REDDIT:    Nodes =",nbr_nodes_multi_graph," Edges =",nbr_edges_multi_graph)

MULTI EDGED DIRECTED GRAPH
REDDIT:    Nodes = 67180  Edges = 858488


In [106]:
sum_of_pos_multi_graph = sum(1 if w["Sign"]>0 else 0 for (_,_,w) in graph_mean.edges(data=True))
sum_of_neg_multi_graph = sum(1 if w["Sign"]<0 else 0 for (_,_,w) in graph_mean.edges(data=True))

perc_of_pos_multi_graph = round(100 *sum_of_pos_multi_graph/nbr_edges_multi_graph,1)
perc_of_neg_multi_graph = round(100 *sum_of_neg_multi_graph/nbr_edges_multi_graph,1)
print("REDDIT : Percentage of (+)edges is {pos}% and percentage of (-)edges is {neg}%".format(pos = perc_of_pos_multi_graph, neg= perc_of_neg_multi_graph))

REDDIT : Percentage of (+)edges is 36.6% and percentage of (-)edges is 2.3%


In [108]:
output_data = [
    ["{:,}".format(nbr_nodes_multi_graph), "{:,}".format(nbr_nodes_mean)],
    ["{:,}".format(nbr_edges_multi_graph), "{:,}".format(nbr_edges_mean)],
    [str(perc_of_pos_multi_graph) + '%', str(perc_of_pos_mean) + '%'],
    [str(perc_of_neg_multi_graph) + '%', str(perc_of_neg_mean) + '%'],
    ["{:,}".format(total_multigraph_triads), "{:,}".format(total_meanedgraph_triads)]
]

output_df = pd.DataFrame(output_data, columns=["MultiGraph", "meaned_weights"], index=["Nodes", "Edges", "+ edges", "- edges", "Triads"])


In [109]:
output_df

Unnamed: 0,MultiGraph,meaned_weights
Nodes,67180,67180
Edges,858488,339643
+ edges,36.6%,92.6%
- edges,2.3%,5.7%
Triads,4077337,4077337
