# Social Network Analysis - Python Handson

In [1]:
import numpy as np
from networkx import nx

import pandas as pd
import datetime
import matplotlib.pyplot as plt
from numpy.linalg import matrix_power


## The Data

The data set is provided by Der Standard, one of the top Austrian newspapers.
In the online Standard people can post comments below articles and up/down vote comments.
The data set used in this handson and further in the project part of the course will consider a sample of those articles, comments, and votes. 

In [2]:
date_cols = ["PostingCreatedAt","ArticlePublishingDate"]

df1 = pd.read_csv('../data/Postings_01052019_15052019.csv',usecols=["ID_CommunityIdentity", "ID_Posting", "PostingCreatedAt", "ArticleTitle",'ArticleChannel' ,"ArticleRessortName","ArticlePublishingDate"],parse_dates=date_cols, sep=';')
df2 = pd.read_csv('../data/Postings_16052019_31052019.csv', usecols=["ID_CommunityIdentity", "ID_Posting","PostingCreatedAt", "ArticleTitle",'ArticleChannel' ,"ArticleRessortName","ArticlePublishingDate"], parse_dates=date_cols,sep=';')
df=df1.append(df2, ignore_index=True)
df.shape

(739094, 7)

In [3]:
df = df[(df.ArticleChannel == "Inland") & (~df.ArticleRessortName.isin([ "Pensionen", "Eurofighter","Off-Topic"]))]
#df.head()

In [4]:
df.shape

(185509, 7)

There are different entities in the data set: 
* Users - identified by *ID_CommunityIdentity* (or *UserCommunityName*)
* Postings - identified by *ID_Posting*
* Articles - identified by *ID_Article*

Thus, there are different possibilities to build networks based on voting and posting data. 
We will concentrate now on the ***votes-to-network***. 


In [5]:
date_cols = ["VoteCreatedAt","UserCreatedAt"]
votes1 = pd.read_csv('../data/Votes_01052019_15052019.csv',parse_dates=date_cols, sep=';')
votes2 = pd.read_csv('../data/Votes_16052019_31052019.csv', parse_dates=date_cols,sep=';')
votes=votes1.append(votes2, ignore_index=True)
#votes.head()

In [6]:
PostAndVotes=pd.merge(df,votes,on="ID_Posting")
#PostAndVotes.head()
PostAndVotes.shape

(1046528, 14)

In [7]:
# filter off (= 1)
PostAndVotes_less=PostAndVotes.groupby('ID_Posting').filter(lambda x : len(x)>1).copy()
split_date= datetime.datetime(2019,5,17)

PostAndVotes_before = PostAndVotes_less.loc[PostAndVotes_less['PostingCreatedAt'] <= split_date]
PostAndVotes_after = PostAndVotes_less.loc[PostAndVotes_less['PostingCreatedAt'] > split_date]
#PostAndVotes_after.head()

In [8]:
print('Before shape: ' + str(PostAndVotes_before.shape))
print('After shape: ' + str(PostAndVotes_after.shape))


Before shape: (326124, 14)
After shape: (694606, 14)


A line in the table above shows that a user (i.e., *ID_CommunityIdentiy*) posted a comment. Every post has its own uniqe identifier (i.e., *ID_Posting*). If a user votes for a posting then the vote is identified by the *ID_Posting* the voting was for, the *ID_CommunityIdentiy* from the voter. Next, it is also recorded, if the vote was negative or positive. This informtion is saved in  *VoteNegative* and *VotePositive* respectively.  

We want to bring the structure above into following format: 
* source, i.e., the voting user
* target, i.e., the post creator
* weight, i.e., how often the source voted for the target (postive and negative)

In other words, we are aiming for a *weighted edge-list*.

### Edges

In [9]:
edgeListBefore= PostAndVotes_before.groupby(["ID_CommunityIdentity_x","ID_CommunityIdentity_y"]).agg({"VoteNegative": [("votes_neg_count","sum")], "VotePositive":[("votes_pos_count","sum")]})
edgeListAfter= PostAndVotes_after.groupby(["ID_CommunityIdentity_x","ID_CommunityIdentity_y"]).agg({"VoteNegative": [("votes_neg_count","sum")], "VotePositive":[("votes_pos_count","sum")]})


In [10]:
edgeListBefore.columns=edgeListBefore.columns.droplevel()
edgeListAfter.columns=edgeListAfter.columns.droplevel()

In [11]:
#edgeListBefore.loc[edgeListBefore["votes_neg_count"]>2]
#edgeListBefore.describe()

### Weight calculation
PLEASE DEFINE YOUR WEIGHT BELOW 

In [12]:
## original
#edgeListBefore["weight"]=edgeListBefore["votes_pos_count"]-edgeListBefore["votes_neg_count"]
#edgeListAfter["weight"]=edgeListAfter["votes_pos_count"]-edgeListAfter["votes_neg_count"]

## pos > neg
#edgeListBefore["weight"]=np.where(edgeListBefore["votes_pos_count"] >= edgeListBefore["votes_neg_count"], 1, -1) 
#edgeListAfter["weight"]=np.where(edgeListAfter["votes_pos_count"] >= edgeListAfter["votes_neg_count"], 1, -1) 

In [13]:
## v1
# edgeListBefore["weight"]= (1+edgeListBefore["votes_pos_count"])/(1+edgeListBefore["votes_neg_count"])
# edgeListAfter["weight"]=(1+edgeListAfter["votes_pos_count"])/(1+edgeListAfter["votes_neg_count"])

## v2: (-1 if any_neg_vote else 1)
edgeListBefore["weight"]= np.where(edgeListBefore["votes_neg_count"] > 0, -1, 1) 
edgeListAfter["weight"]= np.where(edgeListAfter["votes_neg_count"] > 0, -1, 1) 

In [14]:
edgeListBefore.rename_axis(['source', 'target'], inplace=True)
edgeListAfter.rename_axis(['source', 'target'], inplace=True)
edgeListAfter.describe()

Unnamed: 0,votes_neg_count,votes_pos_count,weight
count,599530.0,599530.0,599530.0
mean,0.226669,0.931915,0.601171
std,0.53239,0.854299,0.799121
min,0.0,0.0,-1.0
25%,0.0,1.0,1.0
50%,0.0,1.0,1.0
75%,0.0,1.0,1.0
max,61.0,114.0,1.0


In [15]:
edgesBefore = edgeListBefore.drop(['votes_neg_count', 'votes_pos_count'], axis=1).reset_index()
edgesAfter = edgeListAfter.drop(['votes_neg_count', 'votes_pos_count'], axis=1).reset_index()
#edgesAfter

In [16]:
edgesBefore.to_csv("../data/edges_before.csv", index=False)
edgesAfter.to_csv("../data/edges_after.csv", index=False)

### Quickstart point #1

In [1]:
## quickstart
import numpy as np
from networkx import nx

import pandas as pd
import datetime
import matplotlib.pyplot as plt
from numpy.linalg import matrix_power

edgesBefore = pd.read_csv("../data/edges_before.csv")
edgesAfter = pd.read_csv("../data/edges_after.csv")

### Graph

We use the *networkx* library.
Since we build a *votes-to-network* we have *source* nodes and *target* nodes. 
Thus, the network is directed.
Therefore, we use *nx.Digraph()*

In [2]:
G = nx.from_pandas_edgelist(edgesAfter, 
                            source='source', 
                            target='target', 
                            edge_attr = 'weight',
                            create_using=nx.DiGraph())
A = nx.to_numpy_matrix(G)

### Create Adjacency Matrix from diGraph, take A^3

In [5]:
A3 = matrix_power(A,3)

In [7]:
print(np.trace(A3))
## shape = (20181, 20181)

819600.0

### Make a dictionary:  A3di from subset of elements where A3[i,i]>0
diagonal & directed & its a dictionary!

In [113]:
A3di = {}
for i in range(len(A3)):
    if (A3[i,i] > 0):
        A3di[i] = A3[i,i] 
len(A3di.keys())

5864

#### Write dictionary to file

In [115]:
# write after
A3di_after_df = pd.DataFrame.from_dict(A3di, orient='index')
A3di_after_df.to_csv("../data/A3di_after.csv")

# read after
A3di_after_df = pd.read_csv("../data/A3di_after.csv", index_col=0)
A3di = A3di_after_df['0'].to_dict()

len(A3di.keys())

5864

### Repeat for before

In [108]:
G_b = nx.from_pandas_edgelist(edgesBefore, 
                            source='source', 
                            target='target', 
                            edge_attr = 'weight',
                            create_using=nx.DiGraph())
Adj_b = nx.to_numpy_matrix(G_b)

In [109]:
A3_b = matrix_power(Adj_b,3)

In [110]:
A3di_b = {}
for i in range(len(A3_b)):
    if (A3_b[i,i] > 0):
        A3di_b[i] = A3_b[i,i] 
len(A3di_b.keys())

3369

In [111]:
# write before
A3di_before_df = pd.DataFrame.from_dict(A3di_b, orient='index')
A3di_before_df.to_csv("../data/A3di_before.csv")

# read before
A3di_before_df = pd.read_csv("../data/A3di_before.csv", index_col=0)
A3di_b = A3di_before_df['0'].to_dict()
len(A3di_b.keys())

3369

# Quickstart here
### Now use this to create a subgraph which should be highly connected.
Easily filter with values (which are A3[key,key])

In [116]:
# read before
A3di_before_df = pd.read_csv("../data/A3di_before.csv", index_col=0)
A3di_b = A3di_before_df['0'].to_dict()
# read after
A3di_after_df = pd.read_csv("../data/A3di_after.csv", index_col=0)
A3di = A3di_after_df['0'].to_dict()
print(len(A3di.keys()), len(A3di_b.keys()))

5864 3369


In [121]:
Gmini = G.subgraph(A3di.keys()).copy()
## possibly filter value > k

In [122]:
nx.triadic_census(Gmini)

{'003': 412398,
 '012': 6275,
 '102': 227,
 '021D': 41,
 '021U': 135,
 '021C': 94,
 '111D': 29,
 '111U': 7,
 '030T': 8,
 '030C': 0,
 '201': 1,
 '120D': 3,
 '120U': 1,
 '120C': 1,
 '210': 0,
 '300': 0}

### ...

## Previously...

In [None]:
edgesBefore = pd.read_csv("../data/votes_to_comments_before.csv")
edgesAfter = pd.read_csv("../data/votes_to_comments_after.csv")

In [None]:
G = nx.from_pandas_edgelist(edgesAfter, 
                            source='source', 
                            target='target', 
                            edge_attr = 'weight',
                            create_using=nx.DiGraph())

In [3]:
UG = G.to_undirected()
#count=0
for node in G:
    for ngbr in nx.neighbors(G, node):
        if node in nx.neighbors(G, ngbr):
            UG.edges[node, ngbr]['weight'] = (np.where( 
                G.edges[node, ngbr]['weight'] + G.edges[ngbr, node]['weight'] >=0,1,-1))
           # if np.sign(G.edges[node, ngbr]['weight'])==np.sign(G.edges[ngbr, node]['weight']):
                                       #       count=count+1

In [None]:
A = nx.to_numpy_matrix(UG)
A3 = matrix_power(A,3)

In [None]:
A_absolut_3 = matrix_power(abs(A),3)
triangle_index = (np.trace(A3) + np.trace(A_absolut_3)) / (2*np.trace(A_absolut_3))
triangle_index

## Before

In [None]:
#edgesBefore = pd.read_csv("../data/votes_to_comments_before.csv")
G_2 = nx.from_pandas_edgelist(edgesBefore, 
                            source='source', 
                            target='target', 
                            edge_attr = 'weight',
                            create_using=nx.DiGraph())

In [None]:
UG_2 = G_2.to_undirected()
for node in G_2:
    for ngbr in nx.neighbors(G_2, node):
        if node in nx.neighbors(G_2, ngbr):
            UG_2.edges[node, ngbr]['weight'] = (np.where( 
                G_2.edges[node, ngbr]['weight'] + G_2.edges[ngbr, node]['weight'] >=0,1,-1))
           # if np.sign(G.edges[node, ngbr]['weight'])==np.sign(G.edges[ngbr, node]['weight']):
                                       #       count=count+1

In [None]:
A = nx.to_numpy_matrix(UG_2)
A3 = matrix_power(A,3)


In [None]:
A_absolut_3 = matrix_power(abs(A),3)
triangle_index_2 = (np.trace(A3) + np.trace(A_absolut_3)) / (2*np.trace(A_absolut_3))
triangle_index_2

__________________________________________

## Triadic Census

In [None]:
nx.algorithms.triads.triadic_census(G)

In [None]:
from nxtriads import triads_by_type
### Copied source code instead of updating to NetworkX == 2.5
### see file nxtriads.py
## __all__ = ["triadic_census", "is_triad", "all_triplets", "all_triads",
## "triads_by_type", "triad_type", "random_triad", ]

# convenience
from networkx import nx
import pandas as pd

In [None]:
edges5d = pd.read_csv("../data/votes_to_comments_5days.csv")
G5 = nx.from_pandas_edgelist(edges5d, 
                            source='source', 
                            target='target', 
                            edge_attr = 'weight',
                            create_using=nx.DiGraph())
print(nx.info(G5))

In [None]:
## kills kernel every time (pushes RAM >>10gb even on tiny graph)
triads_5 = triads_by_type(G5)

In [None]:

#triads_5.keys()