In [None]:
from IPython.display import HTML, display

def set_css():
  """Emit a <style> element that makes <pre> blocks wrap long lines."""
  wrap_rule = HTML('''
  <style>
    pre {
      white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)

# Registered as a 'pre_run_cell' callback, so the style is emitted again before every cell execution.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
import pandas as pd
import numpy as np
import os
import networkx as nx
from tqdm import tqdm 
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import random
from sklearn.metrics import roc_auc_score,f1_score,recall_score, mean_absolute_error, mean_squared_error, r2_score
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import *

!pip install dgl
import dgl
# from dgl.data import DGLDataset
from dgl.nn import GraphConv,GATConv,SAGEConv,HeteroGraphConv


import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.function as fn

print('Cuda Enabled:', torch.cuda.is_available())
# NOTE(review): the trailing `and False` makes this condition always False, so `device`
# is always "cpu" even when CUDA is available. Presumably deliberate: run_model() calls
# `.detach().numpy()` on tensors without `.cpu()`, which only works for CPU tensors — confirm
# before removing the `and False`.
device = torch.device("cuda" if torch.cuda.is_available() and False else "cpu")
print(device)

# Mount Google Drive at /content/gdrive; the data files and helper modules used below live under it.
from google.colab import drive
drive.mount('/content/gdrive')



Using backend: pytorch


Cuda Enabled: False
cpu
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Import external python file

In [None]:
import sys
sys.path.append('/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code')

import features
import models

In [None]:
# All input tables live in one Drive folder; the 506-news subset list sits next to the code.
data_path = "/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/Data/"
df_user = pd.read_csv(f"{data_path}politifact_shu_user_data_formated.csv")
df_news = pd.read_csv(f"{data_path}all_features_for_title_and_text_shu_936_news.csv")
df_source = pd.read_csv(f"{data_path}politifact_shu_source_data_formated.csv")
df_followers_sample = pd.read_csv(f"{data_path}politifact_shu_follower_data_formated_with_features_only.csv")
df_all_user_news_pair = pd.read_csv(f"{data_path}politifact_shu_user_news_pair_data_formated.csv")
df_all_source_news_pair = pd.read_csv(f"{data_path}politifact_shu_source_news_pair_data_formated.csv")
df_506_news = pd.read_csv(
    "/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/"
    "code_for_516_news/code_for_news_with_atleast_3_tweets_recompute_user_label/506_news_with_atleast_3_tweets.csv"
)

In [None]:
# Convert the news label to numeric: 1 for any of the verdicts listed below, 0 for everything else.
df_news.drop_duplicates(subset=['news_id'], inplace=True)
df_news.rename(columns={'label': 'news_label'}, inplace=True)
df_news.news_label = df_news.news_label.apply(
    lambda verdict: int(verdict in ["fake", "Fake", '1', 1, 'false',
                                    'half-true', 'pants-fire',
                                    'barely-true', 'full-flop'])
)

In [None]:
# Keep only the news whose news_id appears in df_506_news (news with at least 3 tweets).
df_news = df_news.loc[df_news['news_id'].isin(df_506_news['news_id'])]

In [None]:
# Rescale factual_score: divide by 0.16, then subtract 1.
# NOTE: re-running this cell rescales the already-rescaled column again.
df_source.factual_score = df_source.factual_score.div(0.16).sub(1)

In [None]:
df_user.shape, df_news.shape, df_source.shape

((1135, 246), (506, 140), (88, 5))

# select only those news shared by users with features

In [None]:
# Cast both sides to str so the isin() comparison below is string-to-string.
df_all_user_news_pair = df_all_user_news_pair.astype(str)
df_user['user_id'] = df_user['user_id'].astype(str)
# Keep only the pairs whose user appears in df_user.
df_user_news_pair = df_all_user_news_pair.loc[df_all_user_news_pair['user_id'].isin(df_user['user_id'])]


In [None]:
df_user_news_pair.shape, df_user_news_pair.user_id.nunique(), df_user_news_pair.news_id.nunique()

((15511, 3), 1135, 544)

In [None]:
# Keep only the news that appear in at least one retained user-news pair.
df_news = df_news.loc[df_news['news_id'].isin(df_user_news_pair['news_id'])]
df_news.shape, df_news['news_id'].nunique()

((506, 140), 506)

select only those pairs where news are shared by user and have features

In [None]:
# Keep only the pairs whose news is still present in df_news.
# Take an explicit copy: later cells assign columns on this frame (user_id, news_id,
# newsNode_id, userNode_id), which on a bare boolean-filter result triggers pandas'
# SettingWithCopyWarning (currently hidden by warnings.filterwarnings("ignore")).
df_user_news_pair = df_user_news_pair[df_user_news_pair.news_id.isin(df_news.news_id)].copy()

In [None]:
df_user_news_pair.shape, df_user_news_pair.user_id.nunique(), df_user_news_pair.news_id.nunique()

((14309, 3), 1135, 506)

In [None]:
# Keep only the source-news pairs whose news is present in df_news.
# Take an explicit copy: later cells add publisherNode_id / newsNode_id columns to this frame,
# which on a bare boolean-filter result triggers pandas' SettingWithCopyWarning
# (currently hidden by warnings.filterwarnings("ignore")).
df_source_news_pair = df_all_source_news_pair[df_all_source_news_pair.news_id.isin(df_news.news_id)].copy()
df_source_news_pair.shape, df_source_news_pair.news_id.nunique()

((560, 2), 506)

# Add source id and source bias to news

In [None]:
# Left-join each news row with its source_id, then with that source's bias.
# A news with several source rows is repeated once per source row.
df_news_source_added = (
    df_news
    .merge(df_source_news_pair, on='news_id', how='left')
    .merge(df_source[['source_id', 'bias']], on='source_id', how='left')
)
df_news_source_added.shape

(560, 142)

In [None]:
df_news_source_added.columns

Index(['news_id', 'news_label', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone',
       'WPS', 'Sixltr', 'Dic',
       ...
       'Sadness', 'Surprise', 'Trust', 'Objective', 'compound', 'neg', 'neu',
       'pos', 'source_id', 'bias'],
      dtype='object', length=142)

In [None]:
# Keep a single row per (news_id, source_id) pair.
# The previous first statement of this cell selected duplicated (news_id, bias) rows but its
# result was neither displayed nor stored, and it inspected a different key than the one
# de-duplicated here, so it has been removed.
df_news_source_added.drop_duplicates(subset=['news_id', 'source_id'], inplace=True)

In [None]:
df_news_source_added.shape, df_news_source_added.news_id.nunique(), df_news_source_added.source_id.nunique()

((506, 142), 506, 197)

# select only those sources that published the 506 selected news

In [None]:
# Keep only the sources referenced by at least one row of df_news_source_added.
df_source = df_source.loc[df_source['source_id'].isin(df_news_source_added['source_id'])]

In [None]:
df_source.shape, df_source.source_id.nunique()

((71, 5), 71)

In [None]:
def recompute_user_label(df_user, df_news, df_user_news, fold=0, train_test_path=None):
  """Recompute `user_label` for the training users of one fold from the news they shared.

  For every training user, the share of their rows in `df_user_news` whose `label` is
  'fake' is averaged; the user gets label 1 if that average is > 0.6, 0 if it is < 0.4,
  and 2 otherwise. Test users keep whatever `user_label` they already have in `df_user`.

  The previous implementation looped over 4 folds but returned from inside the loop, so
  only fold 0 was ever used. That behaviour is kept as the default and made explicit via
  the `fold` parameter.

  Args:
    df_user: user table with a string `user_id` column.
    df_news: unused; kept so existing callers do not break.
    df_user_news: user-news pairs with `user_id` and `label` columns. Not modified.
    fold: index of the fold whose train/test split is read (default 0).
    train_test_path: directory containing `fold<k>/train.csv` and `fold<k>/test.csv`,
      each with `ids` and `entity_label` columns. Defaults to the project's Drive folder.

  Returns:
    DataFrame of the fold's training users (with recomputed `user_label`) followed by
    the fold's test users. Training users with no row in `df_user_news` are dropped by
    the inner merge.
  """
  if train_test_path is None:
    train_test_path = '/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/code_for_516_news/4_fold/train_test/'
  fold_dir = os.path.join(train_test_path, "fold" + str(fold))
  train = pd.read_csv(os.path.join(fold_dir, "train.csv"))
  test = pd.read_csv(os.path.join(fold_dir, "test.csv"))

  # ids are compared as strings because df_user.user_id is a string column
  train_user_ids = train.loc[train.entity_label == "user", 'ids'].astype(str)
  test_user_ids = test.loc[test.entity_label == "user", 'ids'].astype(str)
  user_train = df_user[df_user.user_id.isin(train_user_ids)]
  user_test = df_user[df_user.user_id.isin(test_user_ids)]

  # fraction of 'fake' shares per training user (computed on new objects, the input is untouched)
  shares = df_user_news[df_user_news.user_id.isin(user_train.user_id)]
  is_fake = (shares['label'] == 'fake').astype(int)
  df_avg = is_fake.groupby(shares['user_id']).mean().reset_index(name='avg')
  df_avg['user_label'] = [1 if avg > 0.6 else (0 if avg < 0.4 else 2) for avg in df_avg.avg]

  # On a name clash the old column gets the "_x" suffix and is dropped, so the recomputed
  # user_label wins. Note that any other column ending in "_x" is dropped as well.
  user_train_recomputed = user_train.merge(df_avg[['user_id', 'user_label']], on=['user_id'], suffixes=["_x", ""])
  stale_columns = [col for col in user_train_recomputed if col.endswith("_x")]
  user_train_recomputed = user_train_recomputed.drop(columns=stale_columns)

  df_user_recomputed = pd.concat([user_train_recomputed, user_test])
  print(df_user_recomputed.shape)
  return df_user_recomputed

df_user = recompute_user_label(df_user, df_news, df_user_news_pair)

  0%|          | 0/4 [00:00<?, ?it/s]

(1135, 246)





# Get features

In [None]:
# features.get_features is defined in the external `features` module (not visible here).
# Each call returns a frame plus a second value that later code uses as a list of feature
# column names (len(...) and DataFrame column selection) — presumably the frame is the input
# after optional imputation/scaling; confirm against features.py.
# Note that df_user, df_news and df_source are rebound to the returned frames.
df_user, user_features = features.get_features(df_user, node_type='user', impute=True, scale=True)
df_news, news_features = features.get_features(df_news, node_type='news', impute=True, scale=True)
# Sources are the only node type processed with impute=False.
df_source, source_features = features.get_features(df_source, node_type='source', impute=False, scale=True)
df_news_source_added, news_source_features = features.get_features(df_news_source_added, node_type='news_source', impute=True, scale=True)

no of features for user 99
no of features for news 71
no of features for source 1
no of features for news_source 72


In [None]:
df_user.shape, df_news.shape, df_source.shape, df_all_user_news_pair.shape, df_all_source_news_pair.shape

((1135, 246), (506, 140), (71, 5), (401715, 3), (1081, 2))

In [None]:
"""# Split user followes user and followers follow user pairs"""
df_followers_sample = df_followers_sample.astype(str)
df_user['user_id'] = df_user['user_id'].astype(str)

# A follower that is itself in df_user has features; every other follower does not.
follower_has_features = df_followers_sample['followers'].isin(df_user['user_id'])
df_user_user_pair = df_followers_sample[follower_has_features]
df_user_follower_pair = df_followers_sample[~follower_has_features]

# Number of unique entities per table, printed as one line per entity kind.
for entity_name, *unique_counts in [
    ("source", df_source['source_id'].nunique(), df_source_news_pair['source_id'].nunique()),
    ("news", df_news['news_id'].nunique(), df_source_news_pair['news_id'].nunique(), df_user_news_pair['news_id'].nunique()),
    ("users", df_user['user_id'].nunique(), df_user_news_pair['user_id'].nunique(), df_user_user_pair['user_id'].nunique()),
    ("followers", df_user_follower_pair['user_id'].nunique(), df_user_follower_pair['followers'].nunique()),
]:
  print(entity_name, *unique_counts)


source 71 197
news 506 506 506
users 1135 1135 357
followers 0 0


In [None]:
Counter(df_source.factual_score)

Counter({0.0: 5, 1.0: 16, 2.0: 10, 3.0: 9, 4.0: 26, 5.0: 5})

# Assign id to each node as heterograph requires int id starting from 0

## assign ids for entities

In [None]:
def _index_ids(values):
  """Map each distinct value to a consecutive int id (0..n-1), in order of first appearance.

  The mappings used to be built from `set(...)` iteration order. For string ids that order
  changes between interpreter runs (string hashing is randomised), so node ids — and with
  them everything derived from node order — differed after every kernel restart.
  `dict.fromkeys` de-duplicates while keeping insertion order, which makes ids reproducible.
  """
  return {value: idx for idx, value in enumerate(dict.fromkeys(values))}


# id for all source
sources = list(df_source.source_id.unique())+list(df_source_news_pair.source_id.unique())
mapping_source_id = _index_ids(sources)
print("#sources = ", len(mapping_source_id))

# id for all news
news = list(df_news.news_id.unique())+list(df_source_news_pair.news_id.unique())+list(df_user_news_pair.news_id.unique())
mapping_news_id = _index_ids(news)
print("#news = ", len(mapping_news_id))

# id for all users including followers with features
# (ids are cast to str first so the same user is not counted once as int and once as str)
df_user.user_id = df_user.user_id.astype(str)
df_user_news_pair.user_id = df_user_news_pair.user_id.astype(str)
df_user_user_pair = df_user_user_pair.astype(str)
df_user_follower_pair = df_user_follower_pair.astype(str)

users = list(df_user.user_id.unique())+list(df_user_news_pair.user_id.unique())\
+list(df_user_user_pair.followers.unique())+list(df_user_user_pair.user_id.unique())\
+list(df_user_follower_pair.user_id.unique())
mapping_user_id = _index_ids(users)
print("#users", len(mapping_user_id))

# id for followers without feature (separate id space from users)
followers = list(df_user_follower_pair.followers.unique())
mapping_follower_id = _index_ids(followers)
print("#followers", len(mapping_follower_id))

#sources =  197
#news =  506
#users 1135
#followers 0


In [None]:
# Translate raw ids into the integer node ids built above.
# A raw id missing from its mapping raises KeyError.

# source nodes
df_source['publisherNode_id'] = df_source['source_id'].apply(mapping_source_id.__getitem__)
df_source_news_pair['publisherNode_id'] = df_source_news_pair['source_id'].apply(mapping_source_id.__getitem__)

# news nodes
df_news['newsNode_id'] = df_news['news_id'].apply(mapping_news_id.__getitem__)
df_source_news_pair['newsNode_id'] = df_source_news_pair['news_id'].apply(mapping_news_id.__getitem__)
df_user_news_pair['newsNode_id'] = df_user_news_pair['news_id'].apply(mapping_news_id.__getitem__)

# user nodes; followers that have features share the user id space
df_user['userNode_id'] = df_user['user_id'].apply(mapping_user_id.__getitem__)
df_user_news_pair['userNode_id'] = df_user_news_pair['user_id'].apply(mapping_user_id.__getitem__)
df_user_user_pair['userNode_id'] = df_user_user_pair['user_id'].apply(mapping_user_id.__getitem__)
df_user_user_pair['followerWFeatNode_id'] = df_user_user_pair['followers'].apply(mapping_user_id.__getitem__)

# followers without features use their own id space
df_user_follower_pair['userNode_id'] = df_user_follower_pair['user_id'].apply(mapping_user_id.__getitem__)
df_user_follower_pair['followerWoFeatNode_id'] = df_user_follower_pair['followers'].apply(mapping_follower_id.__getitem__)

In [None]:
df_user.shape, df_news.shape, df_source.shape, df_source_news_pair.shape, df_user_news_pair.shape,\
df_user_user_pair.shape, df_user_follower_pair.shape

((1135, 247), (506, 141), (71, 6), (560, 4), (14309, 5), (4118, 4), (0, 4))

# save entity pairs

In [None]:
# Make sure the join keys are strings in both the node tables and the pair table.
df_user['user_id'] = df_user['user_id'].astype(str)
df_user_news_pair['user_id'] = df_user_news_pair['user_id'].astype(str)
df_news['news_id'] = df_news['news_id'].astype(str)
df_user_news_pair['news_id'] = df_user_news_pair['news_id'].astype(str)

In [None]:
# Pairs whose news is in df_news
df_news_source = df_source_news_pair[df_source_news_pair['news_id'].isin(df_news['news_id'])]
print(df_news_source['source_id'].nunique(),'sources published',df_news_source['news_id'].nunique(),"news")

# Pairs whose source is in df_source
df_source_news = df_source_news_pair[df_source_news_pair['source_id'].isin(df_source['source_id'])]
print(df_source_news['source_id'].nunique(),"sources published", df_source_news['news_id'].nunique(),"news")

print("bidirectional connection between source and news")
# (sources, news) present in both of the selections above
(
    len(set(df_news_source['source_id']) & set(df_source_news['source_id'])),
    len(set(df_source_news['news_id']) & set(df_news_source['news_id'])),
)

197 sources published 506 news
71 sources published 171 news
bidirectional connection between source and news


(71, 171)

In [None]:
# Pairs whose news is in df_news
df_news_user = df_user_news_pair[df_user_news_pair['news_id'].isin(df_news['news_id'])]
print(df_news_user['user_id'].nunique(),'users shared',df_news_user['news_id'].nunique(),"news")

# Pairs whose user is in df_user
df_user_news = df_user_news_pair[df_user_news_pair['user_id'].isin(df_user['user_id'])]
print(df_user_news['user_id'].nunique(),"users shared", df_user_news['news_id'].nunique(),"news")

print("bidirectional connection between user and news")
# (users, news) present in both of the selections above
(
    len(set(df_news_user['user_id']) & set(df_user_news['user_id'])),
    len(set(df_news_user['news_id']) & set(df_user_news['news_id'])),
)

1135 users shared 506 news
1135 users shared 506 news
bidirectional connection between user and news


(1135, 506)

In [None]:
# How many users are followed, and by how many distinct followers, per follower kind.
n_followed, n_followers = df_user_user_pair['user_id'].nunique(), df_user_user_pair['followers'].nunique()
print(f"{n_followed} users followedby {n_followers} users with feature")
n_followed, n_followers = df_user_follower_pair['user_id'].nunique(), df_user_follower_pair['followers'].nunique()
print(f"{n_followed} users followedby {n_followers} users with no feature")


357 users followedby 513 users with feature
0 users followedby 0 users with no feature


In [None]:
def get_label_and_feat(node, feature, node_type, padding=True):
  """Return `(label, feat)` for one graph node.

  The node is looked up by its integer node id in the module-level table for its type
  (`df_user`, `df_news` or `df_source`). A node with no matching row — and every
  'follower' node — gets the placeholder label 100 and an all-zero feature vector of
  length `len(feature)`.

  Args:
    node: object whose `.tolist()` yields the integer node id (e.g. a 0-dim tensor).
      Not read for 'follower' nodes.
    feature: list of feature column names to read for this node type.
    node_type: one of 'user', 'news', 'source', 'follower'.
    padding: if True, embed the features in a vector of length
      len(user_features) + len(news_features) + len(source_features), laid out as
      [user | news | source], with zeros in the slots of the other node types.

  Returns:
    (label, feat): label is the value of the type's label column (or 100), feat a
    1-D numpy array.

  Raises:
    ValueError: if `node_type` is not one of the four supported values.
  """
  if node_type == 'user':
    id_col, label_col, table = 'userNode_id', 'user_label', df_user
  elif node_type == 'news':
    id_col, label_col, table = 'newsNode_id', 'news_label', df_news
  elif node_type == 'source':
    id_col, label_col, table = 'publisherNode_id', 'factual_score', df_source
  elif node_type == 'follower':
    # followers have no feature table at all
    id_col = label_col = table = None
  else:
    # previously this fell through to an UnboundLocalError further down
    raise ValueError("unknown node_type: %r" % (node_type,))

  rows = None if table is None else table[table[id_col] == node.tolist()]
  if rows is None or rows.shape[0] == 0:
    # node without features/label: placeholder label and zero features
    label = 100
    feat = np.zeros(len(feature))
  else:
    label = rows[label_col].values[0]
    feat = rows[feature].values[0]

  if padding:
    # Previously 'follower' + padding crashed with UnboundLocalError because no slot was
    # assigned. Followers are treated as users without features here (the commented-out
    # caller passes user_features for them), so their zeros go into the user slot.
    slot = 'user' if node_type == 'follower' else node_type
    user_f = feat if slot == 'user' else np.zeros(len(user_features))
    news_f = feat if slot == 'news' else np.zeros(len(news_features))
    source_f = feat if slot == 'source' else np.zeros(len(source_features))
    feat = np.array(list(user_f)+list(news_f)+list(source_f))
  return label, feat

In [None]:
def create_heterograph(df_source_news_pair, df_user_news_pair, df_user_user_pair, df_user_follower_pair, verbose=True,padding=True):
  """Build the source/news/user DGL heterograph and attach 'feat' and 'label' node data.

  Every relation is added in both directions under its own edge type. Source-news and
  user-news edges are de-duplicated; user-user follow edges are taken as they are.
  `df_user_follower_pair` is accepted but not used: the 'follower' node type is
  currently disabled.

  Node features and labels come from get_label_and_feat(); `padding` is forwarded to it.
  If `verbose` is True the graph and its node/edge types are printed.
  """
  def edge_list(frame, src_col, dst_col, unique):
    # (src, dst) tuples in row order; optionally collapsed to distinct pairs
    pairs = [tuple(row) for row in frame[[src_col, dst_col]].to_numpy()]
    return list(set(pairs)) if unique else pairs

  # Create heterogenous edges (insertion order of the relations is kept as before)
  relations = {
      ('source', 'publish', 'news'): edge_list(df_source_news_pair, 'publisherNode_id', 'newsNode_id', True),
      ('news', 'publishedby', 'source'): edge_list(df_source_news_pair, 'newsNode_id', 'publisherNode_id', True),
      ('news', 'shareby', 'user'): edge_list(df_user_news_pair, 'newsNode_id', 'userNode_id', True),
      ('user', 'share', 'news'): edge_list(df_user_news_pair, 'userNode_id', 'newsNode_id', True),
      ('user', 'followedby', 'user'): edge_list(df_user_user_pair, 'userNode_id', 'followerWFeatNode_id', False),
      ('user', 'follow', 'user'): edge_list(df_user_user_pair, 'followerWFeatNode_id', 'userNode_id', False),
  }

  # create heterogenous graph
  hetero_graph = dgl.heterograph(relations)
  if verbose:
    print(hetero_graph)
    print('Node types:', hetero_graph.ntypes)
    print('Edge types:', hetero_graph.etypes)
    print('Canonical edge types:', hetero_graph.canonical_etypes)

  # Collect label and feature vector of every node, one node type at a time
  feature_names_by_type = (('user', user_features), ('news', news_features), ('source', source_features))
  labels_by_type = {}
  feats_by_type = {}
  for ntype, feature_names in feature_names_by_type:
    labels, feats = [], []
    for node in hetero_graph.nodes(ntype):
      node_label, node_feat = get_label_and_feat(node, feature_names, ntype, padding=padding)
      labels.append(node_label)
      feats.append(node_feat)
    labels_by_type[ntype] = labels
    feats_by_type[ntype] = feats

  # Attach features first, then labels, as tensors on the configured device
  for ntype, _ in feature_names_by_type:
    hetero_graph.nodes[ntype].data['feat'] = torch.as_tensor(np.array(feats_by_type[ntype])).to(device)
  for ntype, _ in feature_names_by_type:
    hetero_graph.nodes[ntype].data['label'] = torch.as_tensor(np.array(labels_by_type[ntype])).to(device)

  return hetero_graph


In [None]:
def run_model(hetero_graph, node_type, news_train, news_test, user_train, user_test, source_train, source_test, model,add_layer=False, baseline=False):
  """Train a heterogeneous GNN for 50 epochs on one train/test split and collect predictions.

  Args:
    hetero_graph: graph with 'user', 'news' and 'source' nodes carrying 'feat' and 'label'
      node data. 'train_mask'/'test_mask' node data are written onto it by this function.
    node_type: 'news', 'user' or 'source' — the node type whose predictions are collected.
    news_train, news_test, user_train, user_test, source_train, source_test: node indices
      used to set the boolean train/test masks of each node type.
    model: callable invoked as
      model(user_dim, news_dim, source_dim, hidden_dim, n_user_classes, n_news_classes,
            n_source_classes, etypes)
      to build the network. The resulting object, called as model(graph, node_features),
      must return a pair `(logits, logit1)`, both indexable by node type.
    add_layer: not referenced anywhere in this function.
    baseline: if True the random forest is fitted on the raw input features of `node_type`;
      otherwise on `logit1[node_type]`.

  Returns:
    (train_label, train_pred, test_label, test_pred, train_pred_rf, test_pred_rf,
     loss_, df_score, df_score_rf). The label/prediction lists are extended every epoch,
    so they hold the values of all 50 epochs back to back. `loss_` has one entry per epoch.
    `df_score` / `df_score_rf` are the concatenation of the 50 per-epoch classification
    reports (GNN and random forest respectively).
  """
  # node counts per type, used to size the masks
  n_users = len(hetero_graph.nodes('user'))
  n_news = len(hetero_graph.nodes('news'))
  n_source = len(hetero_graph.nodes('source'))
  # n_follower = len(hetero_graph.nodes('follower'))

  # mask train test for news
  news_train_mask = torch.zeros(n_news, dtype=torch.bool)
  news_test_mask = torch.zeros(n_news, dtype=torch.bool)
  news_train_mask[news_train] = True
  news_test_mask[news_test] = True

  # mask train test for user
  user_train_mask = torch.zeros(n_users, dtype=torch.bool)
  user_test_mask = torch.zeros(n_users, dtype=torch.bool)
  user_train_mask[user_train] = True
  user_test_mask[user_test] = True

  # mask train test for source
  source_train_mask = torch.zeros(n_source, dtype=torch.bool)
  source_test_mask = torch.zeros(n_source, dtype=torch.bool)
  source_train_mask[source_train] = True
  source_test_mask[source_test] = True


  # The masks are stored on the graph, overwriting any masks left by a previous call.
  hetero_graph.nodes['user'].data['train_mask'] = user_train_mask
  hetero_graph.nodes['user'].data['test_mask'] = user_test_mask
  hetero_graph.nodes['news'].data['train_mask'] = news_train_mask
  hetero_graph.nodes['news'].data['test_mask'] = news_test_mask
  hetero_graph.nodes['source'].data['train_mask'] = source_train_mask
  hetero_graph.nodes['source'].data['test_mask'] = source_test_mask

  train_mask_n = hetero_graph.nodes['news'].data['train_mask']
  test_mask_n = hetero_graph.nodes['news'].data['test_mask']
  train_mask_u = hetero_graph.nodes['user'].data['train_mask']
  test_mask_u = hetero_graph.nodes['user'].data['test_mask']
  train_mask_s = hetero_graph.nodes['source'].data['train_mask']
  test_mask_s = hetero_graph.nodes['source'].data['test_mask']



  labels_n = hetero_graph.nodes['news'].data['label']
  # labels_n = labels_n.to(torch.float32)
  labels_u = hetero_graph.nodes['user'].data['label']
  # labels_u = labels_u.to(torch.float32)
  # only the source labels are cast to integer class indices here
  labels_s = hetero_graph.nodes['source'].data['label'].long()
  # labels_s = labels_s.to(torch.float32)

  u_feats = hetero_graph.nodes['user'].data['feat'].float()
  n_feats = hetero_graph.nodes['news'].data['feat'].float()
  s_feats = hetero_graph.nodes['source'].data['feat'].float()
  # f_feats = hetero_graph.nodes['follower'].data['feat'].float()


  node_features = {'user': u_feats, 'news': n_feats, 'source': s_feats}#,'follower':f_feats}


  user_dim = len(user_features)
  news_dim = len(news_features)
  source_dim = len(source_features)
  hidden_dim = 150


  #in_features should be the same as the dimensionality of the input node features to your GNN model. 
  #It can either be your initial node feature size or the output of an initial MLP that projects your initial node features.
  # in_features corresponds to the size of your input features.
  # out_features corresponds to the size of your output, usually the number of classes for classification or 1 for regression.
  # hidden_features corresponds to the size of your hidden state, where you set it as a hyperparameter.
  # NOTE(review): n_hetero_features is assigned but never read in this function.
  n_hetero_features= hidden_dim #len(user_features+news_features+source_features)

  # Number of classes = number of distinct label values over ALL nodes of the type, i.e.
  # including any placeholder label carried by nodes without a real label.
  # NOTE(review): confirm the model's output width is meant to include such placeholders.
  n_user_classes = len(hetero_graph.nodes['user'].data['label'].unique())
  n_news_classes = len(hetero_graph.nodes['news'].data['label'].unique())
  n_source_classes = len(hetero_graph.nodes['source'].data['label'].unique())
  # From here on `model` refers to the constructed network, not the factory that was passed in.
  model = model(user_dim, news_dim, source_dim, hidden_dim, n_user_classes,n_news_classes, n_source_classes, hetero_graph.etypes)
  opt = torch.optim.Adam(model.parameters(), lr=0.01)
  # print(model)


  # per-epoch accumulators (extended/appended inside the loop)
  loss_ = []
  train_pred = []
  train_label = []
  test_pred = []
  test_label = []
  train_pred_rf = []
  test_pred_rf = []

  reports = []
  reports_rf = []
  for epoch in range(50):
      model.train()

      # forward propagation by using all nodes and extracting the news embeddings
      logits, logit1 = model(hetero_graph, node_features)
      logits_u = logits['user']
      logits_n = logits['news']
      logits_s = logits['source']

      # logit1= logit1[node_type]

      # compute loss: sum of the cross-entropies over the training nodes of all three types
      loss_n = F.cross_entropy(logits_n[train_mask_n], labels_n[train_mask_n])
      loss_u = F.cross_entropy(logits_u[train_mask_u], labels_u[train_mask_u])
      loss_s = F.cross_entropy(logits_s[train_mask_s], labels_s[train_mask_s])
      loss = loss_u+loss_n+loss_s

      # zero the parameter gradients
      opt.zero_grad()
      # backward propagation
      # NOTE(review): retain_graph=True looks unnecessary — the forward pass is recomputed
      # every epoch and backward() is called once per forward pass. Confirm and drop.
      loss.backward(retain_graph=True)
      opt.step()


      #GNN
      # Predictions come from the logits computed BEFORE this epoch's optimizer step,
      # with the model in train() mode.
      pred = logits[node_type].argmax(1)
      if node_type=='news':
        train_m = train_mask_n
        test_m = test_mask_n
        label = labels_n
      elif node_type=='user':
        train_m = train_mask_u
        test_m = test_mask_u
        label = labels_u
      elif node_type=='source':
        train_m = train_mask_s
        test_m = test_mask_s
        label = labels_s

      # .numpy() requires CPU tensors; this works because `device` is forced to CPU.
      label_train = label[train_m].detach().numpy()
      label_test = label[test_m].detach().numpy()
      pred_train = pred[train_m].detach().numpy()
      pred_test = pred[test_m].detach().numpy()

      # Random forest classifier (despite the variable name `reg`), refitted from scratch
      # every epoch on either the raw features or the model's second output.
      reg = RandomForestClassifier(class_weight="balanced", random_state=0)
      if baseline:
        feat = node_features[node_type]
      else:
        feat= logit1[node_type]
      reg.fit(feat[train_m].detach().numpy(), label_train)
      pred_train_rf = reg.predict(feat[train_m].detach().numpy())
      pred_test_rf = reg.predict(feat[test_m].detach().numpy())

      train_label.extend(label_train.tolist())
      train_pred.extend(pred_train.tolist())
      test_label.extend(label_test.tolist())
      test_pred.extend(pred_test.tolist())
      train_pred_rf.extend(pred_train_rf.tolist())
      test_pred_rf.extend(pred_test_rf.tolist())

      loss_.append(loss.item())
      # print(classification_report(label_test, pred_test_rf, digits=3))
      report = pd.DataFrame(classification_report(label_test, pred_test, digits=3, output_dict=True))
      reports.append(report)

      report_rf = pd.DataFrame(classification_report(label_test, pred_test_rf, digits=3, output_dict=True))
      reports_rf.append(report_rf)

  df_score = pd.concat(reports)
  # NOTE(review): the result of this groupby/mean is discarded, so the returned df_score is
  # the raw concatenation of all per-epoch reports, not their average. Same for df_score_rf.
  df_score.groupby(df_score.index).mean()
  df_score_rf = pd.concat(reports_rf)
  df_score_rf.groupby(df_score_rf.index).mean()

  return train_label, train_pred, test_label, test_pred,train_pred_rf, test_pred_rf, loss_, df_score, df_score_rf

      


In [None]:
def run_exp(hetero_graph, df_source, df_news, df_user, node_type, model, add_layer=False, baseline=False):
  """Run the 4-fold experiment for one node type and return mean score tables.

  For every fold the train/test id lists are read from CSV, translated to graph
  node ids and handed to `run_model`. Spearman and Pearson correlations between
  labels and predictions are printed per fold, then once more over the test
  predictions pooled across all folds. The un-averaged score tables are written
  to CSV under the module-level `data_path`.

  Args:
    hetero_graph: graph object, forwarded unchanged to `run_model`.
    df_source: frame with `source_id` and `publisherNode_id` columns.
    df_news: frame with `news_id` and `newsNode_id` columns.
    df_user: frame with `user_id` and `userNode_id` columns. NOTE: `user_id`
      is cast to `str` in place on the caller's frame.
    node_type: node type forwarded to `run_model`; also used as the prefix of
      the output CSV file names.
    model: forwarded to `run_model`.
    add_layer: forwarded to `run_model`.
    baseline: forwarded to `run_model`.

  Returns:
    Tuple `(gnn_scores, rf_scores)`: the per-fold score frames for the GNN and
    for the random forest, concatenated and averaged by index label.
  """
  fold_results = []
  df_scores = []
  df_scores_rf = []
  # The fold files' ids are compared as strings, so the user ids must be strings
  # too. This deliberately keeps the original in-place cast on the caller's frame.
  df_user['user_id'] = df_user['user_id'].astype(str)

  def node_ids(split, entity, frame, id_col, node_col):
    # Graph node ids of the `entity` rows of `frame` that are listed in `split`.
    wanted = split[split.entity_label == entity]['ids']
    return frame[frame[id_col].isin(wanted)][node_col].values

  train_test_path = '/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/code_for_516_news/4_fold/train_test/'
  for i in tqdm(range(4)):
    train = pd.read_csv(train_test_path+"fold"+str(i)+"/train.csv")
    test = pd.read_csv(train_test_path+"fold"+str(i)+"/test.csv")

    train['ids'] = train['ids'].astype(str)
    test['ids'] = test['ids'].astype(str)

    news_train = node_ids(train, "news", df_news, 'news_id', 'newsNode_id')
    news_test = node_ids(test, "news", df_news, 'news_id', 'newsNode_id')
    user_train = node_ids(train, "user", df_user, 'user_id', 'userNode_id')
    user_test = node_ids(test, "user", df_user, 'user_id', 'userNode_id')
    source_train = node_ids(train, "source", df_source, 'source_id', 'publisherNode_id')
    source_test = node_ids(test, "source", df_source, 'source_id', 'publisherNode_id')
    print("actual number in train", len(set(news_train)), len(set(user_train)), len(set(source_train)))
    print("actual number in test", len(set(news_test)), len(set(user_test)), len(set(source_test)))

    train_label, train_pred, test_label, test_pred, train_pred_rf, test_pred_rf, loss_, df_score, df_score_rf = run_model(
        hetero_graph, node_type, news_train, news_test, user_train, user_test, source_train, source_test, model, add_layer, baseline)

    # Rank (Spearman) and linear (Pearson) correlation, GNN first, then RF.
    print("For fold ", str(i))
    print("Train", stats.spearmanr(train_label, train_pred))
    print("Test", stats.spearmanr(test_label, test_pred))
    print("Train", stats.pearsonr(train_label, train_pred))
    print("Test", stats.pearsonr(test_label, test_pred))

    print("Train RF", stats.spearmanr(train_label, train_pred_rf))
    print("Test RF", stats.spearmanr(test_label, test_pred_rf))
    print("Train RF", stats.pearsonr(train_label, train_pred_rf))
    print("Test RF", stats.pearsonr(test_label, test_pred_rf))

    # Collect per-split frames and concatenate once after the loop;
    # DataFrame.append was removed in pandas 2.0.
    for split_name, labels, preds, preds_rf in (
        ('train', train_label, train_pred, train_pred_rf),
        ('test', test_label, test_pred, test_pred_rf)):
      fold_results.append(pd.DataFrame({'label': labels,
                                        'pred': preds,
                                        'pred_rf': preds_rf,
                                        'type': split_name,
                                        'fold': str(i)}))
    df_scores.append(df_score)
    df_scores_rf.append(df_score_rf)

  df_result = pd.concat(fold_results, ignore_index=True)
  df_result_test = df_result[df_result['type']=='test']
  print("Test overall",stats.spearmanr(df_result_test.label, df_result_test.pred))
  print("Test overall",stats.pearsonr(df_result_test.label, df_result_test.pred))
  print("Test overall RF",stats.spearmanr(df_result_test.label, df_result_test.pred_rf))
  print("Test overall RF",stats.pearsonr(df_result_test.label, df_result_test.pred_rf))

  df_scores = pd.concat(df_scores)
  df_scores_rf = pd.concat(df_scores_rf)
  # Persist the un-averaged tables; only the index-wise means are returned.
  df_scores.to_csv(data_path+node_type+'GNN_scores_for_modified_RGCN.csv')
  df_scores_rf.to_csv(data_path+node_type+'GNN_scores_for_modified_RGCN_plus_RF.csv')
  return df_scores.groupby(df_scores.index).mean(), df_scores_rf.groupby(df_scores_rf.index).mean()
 

In [None]:
# Build the heterogeneous graph from the four edge-pair tables.
hetero_graph = create_heterograph(
    df_source_news_pair,
    df_user_news_pair,
    df_user_user_pair,
    df_user_follower_pair,
    verbose=True,
    padding=False,
)

Graph(num_nodes={'news': 506, 'source': 197, 'user': 1135},
      num_edges={('news', 'publishedby', 'source'): 506, ('news', 'shareby', 'user'): 14309, ('source', 'publish', 'news'): 506, ('user', 'follow', 'user'): 4118, ('user', 'followedby', 'user'): 4118, ('user', 'share', 'news'): 14309},
      metagraph=[('news', 'source', 'publishedby'), ('news', 'user', 'shareby'), ('source', 'news', 'publish'), ('user', 'user', 'follow'), ('user', 'user', 'followedby'), ('user', 'news', 'share')])
Node types: ['news', 'source', 'user']
Edge types: ['publishedby', 'shareby', 'publish', 'follow', 'followedby', 'share']
Canonical edge types: [('news', 'publishedby', 'source'), ('news', 'shareby', 'user'), ('source', 'publish', 'news'), ('user', 'follow', 'user'), ('user', 'followedby', 'user'), ('user', 'share', 'news')]


In [None]:
# Reminder kept from the original: if the "conlayer" is not added, change `padding` to True.
# NOTE(review): `padding` is an argument of create_heterograph (currently False), not of
# run_exp - confirm it is consistent with add_layer=False below.
df_GNN_score_news, df_RF_score_news = run_exp(
    hetero_graph=hetero_graph,
    df_source=df_source,
    df_news=df_news,
    df_user=df_user,
    node_type='news',
    model=models.RGCN2_combine_losses,
    add_layer=False,
    baseline=False,
)

  0%|          | 0/4 [00:00<?, ?it/s]

actual number in train 381 851 53
actual number in test 125 284 18


 25%|██▌       | 1/4 [00:39<01:59, 39.99s/it]

For fold  0
Train SpearmanrResult(correlation=0.8689161690467991, pvalue=0.0)
Test SpearmanrResult(correlation=0.7654132604295895, pvalue=0.0)
Train (0.8689161690468012, 0.0)
Test (0.7654132604295935, 0.0)
Train RF SpearmanrResult(correlation=0.9999999999999998, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8215691918881317, pvalue=0.0)
Train RF (1.0, 0.0)
Test RF (0.8215691918881268, 0.0)
actual number in train 380 851 53
actual number in test 126 284 18


 50%|█████     | 2/4 [01:05<01:03, 31.71s/it]

For fold  1
Train SpearmanrResult(correlation=0.8624829319115594, pvalue=0.0)
Test SpearmanrResult(correlation=0.7401269660235616, pvalue=0.0)
Train (0.8624829319115559, 0.0)
Test (0.7401269660235615, 0.0)
Train RF SpearmanrResult(correlation=0.994514178333822, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8226614072455686, pvalue=0.0)
Train RF (0.994514178333825, 0.0)
Test RF (0.8226614072455698, 0.0)
actual number in train 378 851 54
actual number in test 128 284 17


 75%|███████▌  | 3/4 [01:31<00:28, 28.82s/it]

For fold  2
Train SpearmanrResult(correlation=0.8823483782661135, pvalue=0.0)
Test SpearmanrResult(correlation=0.7899839513091972, pvalue=0.0)
Train (0.8823483782661086, 0.0)
Test (0.7899839513091969, 0.0)
Train RF SpearmanrResult(correlation=0.9945341876050766, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8509875597598032, pvalue=0.0)
Train RF (0.9945341876050822, 0.0)
Test RF (0.8509875597598078, 0.0)
actual number in train 379 852 53
actual number in test 127 283 18


100%|██████████| 4/4 [01:57<00:00, 29.49s/it]

For fold  3
Train SpearmanrResult(correlation=0.862049362675302, pvalue=0.0)
Test SpearmanrResult(correlation=0.7816998860891662, pvalue=0.0)
Train (0.8620493626753057, 0.0)
Test (0.7816998860891661, 0.0)
Train RF SpearmanrResult(correlation=0.9944391038583434, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8674326505088518, pvalue=0.0)
Train RF (0.994439103858348, 0.0)
Test RF (0.8674326505088514, 0.0)
Test overall SpearmanrResult(correlation=0.7690784189067515, pvalue=0.0)
Test overall (0.7690784189067559, 0.0)
Test overall RF SpearmanrResult(correlation=0.8408000457175777, pvalue=0.0)
Test overall RF (0.840800045717576, 0.0)





In [None]:
# Reminder kept from the original: if the "conlayer" is not added, change `padding` to True.
# NOTE(review): `padding` is an argument of create_heterograph (currently False), not of
# run_exp - confirm it is consistent with add_layer=False below.
df_GNN_score_user, df_RF_score_user = run_exp(
    hetero_graph=hetero_graph,
    df_source=df_source,
    df_news=df_news,
    df_user=df_user,
    node_type='user',
    model=models.RGCN2_combine_losses,
    add_layer=False,
    baseline=False,
)

  0%|          | 0/4 [00:00<?, ?it/s]

actual number in train 381 851 53
actual number in test 125 284 18


 25%|██▌       | 1/4 [00:36<01:50, 36.94s/it]

For fold  0
Train SpearmanrResult(correlation=0.7863686204555417, pvalue=0.0)
Test SpearmanrResult(correlation=0.669538325907357, pvalue=0.0)
Train (0.7493383682959656, 0.0)
Test (0.6180114668191792, 0.0)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.7468537307124546, pvalue=0.0)
Train RF (0.9999999999999856, 0.0)
Test RF (0.6889907951987809, 0.0)
actual number in train 380 851 53
actual number in test 126 284 18


 50%|█████     | 2/4 [01:16<01:16, 38.46s/it]

For fold  1
Train SpearmanrResult(correlation=0.7529325229904958, pvalue=0.0)
Test SpearmanrResult(correlation=0.6779916312174727, pvalue=0.0)
Train (0.7115708990404643, 0.0)
Test (0.6237621698573838, 0.0)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.7682164520679581, pvalue=0.0)
Train RF (1.0, 0.0)
Test RF (0.7116565788052028, 0.0)
actual number in train 378 851 54
actual number in test 128 284 17


 75%|███████▌  | 3/4 [01:54<00:38, 38.29s/it]

For fold  2
Train SpearmanrResult(correlation=0.722452968922362, pvalue=0.0)
Test SpearmanrResult(correlation=0.7191320926502177, pvalue=0.0)
Train (0.6761558397001135, 0.0)
Test (0.6671612270596656, 0.0)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8098299410514916, pvalue=0.0)
Train RF (1.0, 0.0)
Test RF (0.759257125482157, 0.0)
actual number in train 379 852 53
actual number in test 127 283 18


100%|██████████| 4/4 [02:33<00:00, 38.39s/it]

For fold  3
Train SpearmanrResult(correlation=0.7465608950774011, pvalue=0.0)
Test SpearmanrResult(correlation=0.7710692805806264, pvalue=0.0)
Train (0.7003612987115346, 0.0)
Test (0.7199913891980201, 0.0)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.7960738267367903, pvalue=0.0)
Train RF (0.9999999999999873, 0.0)
Test RF (0.7375071751738764, 0.0)


100%|██████████| 4/4 [02:33<00:00, 38.28s/it]


Test overall SpearmanrResult(correlation=0.7090088286385078, pvalue=0.0)
Test overall (0.6562123694204105, 0.0)
Test overall RF SpearmanrResult(correlation=0.7793431603852324, pvalue=0.0)
Test overall RF (0.7224329187741307, 0.0)


In [None]:
# Reminder kept from the original: if the "conlayer" is not added, change `padding` to True.
# NOTE(review): `padding` is an argument of create_heterograph (currently False), not of
# run_exp - confirm it is consistent with add_layer=False below.
df_GNN_score_source, df_RF_score_source = run_exp(
    hetero_graph=hetero_graph,
    df_source=df_source,
    df_news=df_news,
    df_user=df_user,
    node_type='source',
    model=models.RGCN2_combine_losses,
    add_layer=False,
    baseline=False,
)

  0%|          | 0/4 [00:00<?, ?it/s]

actual number in train 381 851 53
actual number in test 125 284 18


 25%|██▌       | 1/4 [00:20<01:01, 20.35s/it]

For fold  0
Train SpearmanrResult(correlation=0.915146312471668, pvalue=0.0)
Test SpearmanrResult(correlation=0.5983536784176596, pvalue=1.6685944243023892e-88)
Train (0.9203610293984033, 0.0)
Test (0.6119534470124726, 1.4053041242786732e-93)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.6565310332996657, pvalue=4.379151160827349e-112)
Train RF (0.9999999999999986, 0.0)
Test RF (0.6694709125014785, 4.59105217361328e-118)
actual number in train 380 851 53
actual number in test 126 284 18


 50%|█████     | 2/4 [00:40<00:40, 20.39s/it]

For fold  1
Train SpearmanrResult(correlation=0.861498236305609, pvalue=0.0)
Test SpearmanrResult(correlation=0.8468216155767944, pvalue=1.8865643266154333e-248)
Train (0.8641889288506653, 0.0)
Test (0.8208214035293877, 1.2479913413586281e-220)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.9093304955004751, pvalue=0.0)
Train RF (0.9999999999999989, 0.0)
Test RF (0.8928064542556872, 2.75871154e-313)
actual number in train 378 851 54
actual number in test 128 284 17


 75%|███████▌  | 3/4 [01:01<00:20, 20.42s/it]

For fold  2
Train SpearmanrResult(correlation=0.8723472007081186, pvalue=0.0)
Test SpearmanrResult(correlation=0.8138906702741059, pvalue=3.627361978670711e-202)
Train (0.8860643384333327, 0.0)
Test (0.8290336765735398, 2.850323038718969e-216)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.8372372801202077, pvalue=1.6592335208707603e-224)
Train RF (1.0, 0.0)
Test RF (0.8433917969908942, 5.427653591352418e-231)
actual number in train 379 852 53
actual number in test 127 283 18


100%|██████████| 4/4 [01:21<00:00, 20.43s/it]

For fold  3
Train SpearmanrResult(correlation=0.8957987032576421, pvalue=0.0)
Test SpearmanrResult(correlation=0.5419438346044276, pvalue=7.680087428411145e-70)
Train (0.8989435459229425, 0.0)
Test (0.5801170748501133, 4.5369894546648673e-82)
Train RF SpearmanrResult(correlation=1.0, pvalue=0.0)
Test RF SpearmanrResult(correlation=0.6244351554813555, pvalue=1.85287060276398e-98)
Train RF (0.9999999999999982, 0.0)
Test RF (0.6638158571751495, 2.050964761441322e-115)
Test overall SpearmanrResult(correlation=0.6917407347163648, pvalue=0.0)
Test overall (0.700412903797793, 0.0)
Test overall RF SpearmanrResult(correlation=0.7410750612300299, pvalue=0.0)
Test overall RF (0.7560103503814847, 0.0)





In [None]:
# Mean GNN scores for news nodes, transposed so each row is a class or aggregate.
df_GNN_score_news.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.848801,0.861253,0.853099,51.5
1,0.900552,0.907979,0.912279,75.0
accuracy,0.888759,0.888759,0.888759,0.888759
macro avg,0.874676,0.884616,0.882689,126.5
weighted avg,0.879777,0.888951,0.888759,126.5


In [None]:
# Mean random-forest scores for news nodes, transposed so each row is a class or aggregate.
df_RF_score_news.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.90534,0.903149,0.909073,51.5
1,0.934797,0.938305,0.932013,75.0
accuracy,0.92298,0.92298,0.92298,0.92298
macro avg,0.920069,0.920727,0.920543,126.5
weighted avg,0.922904,0.92388,0.92298,126.5


In [None]:
# Mean GNN scores for user nodes, transposed so each row is a class or aggregate.
df_GNN_score_user.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.851252,0.877577,0.859816,96.25
1,0.926948,0.903532,0.961511,164.25
2,0.340087,0.388703,0.323507,23.25
accuracy,0.874345,0.874345,0.874345,0.874345
macro avg,0.706096,0.723271,0.714945,283.75
weighted avg,0.853288,0.85303,0.874345,283.75


In [None]:
# Mean random-forest scores for user nodes, transposed so each row is a class or aggregate.
df_RF_score_user.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.941669,0.929949,0.954495,96.25
1,0.956928,0.947197,0.96742,164.25
2,0.430829,0.520083,0.384583,23.25
accuracy,0.914614,0.914614,0.914614,0.914614
macro avg,0.776475,0.799076,0.768833,283.75
weighted avg,0.9085,0.906851,0.914614,283.75


In [None]:
# Mean GNN scores for source nodes, transposed so each row is a class or aggregate.
df_GNN_score_source.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.193152,0.37741,0.150826,2.066116
1,0.792657,0.791704,0.856,4.0
2,0.209333,0.220833,0.2775,2.5
3,0.093846,0.075086,0.154167,2.25
4,0.583151,0.578608,0.664792,6.5
5,0.006,0.003958,0.0125,1.25
accuracy,0.509755,0.509755,0.509755,0.509755
macro avg,0.323129,0.340002,0.367253,17.75
weighted avg,0.478179,0.510295,0.509755,17.75
6,0.0,0.0,0.0,0.0


In [None]:
# Mean random-forest scores for source nodes, transposed so each row is a class or aggregate.
df_RF_score_source.transpose()

Unnamed: 0,f1-score,precision,recall,support
0,0.248598,0.383178,0.214953,2.336449
1,0.826712,0.811726,0.9075,4.0
2,0.298413,0.317833,0.400417,2.5
3,0.070702,0.0545,0.11,2.25
4,0.578097,0.570051,0.641458,6.5
5,0.025,0.025,0.025,1.25
accuracy,0.518448,0.518448,0.518448,0.518448
macro avg,0.348371,0.357958,0.393785,17.75
weighted avg,0.491482,0.514519,0.518448,17.75
