In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from numpy.random import seed
import csv
# # seed the random number generator
# seed(seed=10)

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, SGDRegressor, LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import *
import string
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

!pip install dgl
from google.colab import drive
drive.mount('/content/gdrive')


Collecting dgl
  Downloading dgl-0.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.7 MB/s 
Installing collected packages: dgl
Successfully installed dgl-0.6.1
Mounted at /content/gdrive


## Import external python file

In [None]:
import sys
sys.path.append('/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code')

import features
import models

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


Using backend: pytorch


In [None]:
data_path = "/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/Data/"
df_user = pd.read_csv(data_path+"politifact_shu_user_data_formated_recomputed_training_user_label_4_fold.csv")
df_news = pd.read_csv(data_path+"all_features_for_title_and_text_shu_936_news.csv")
df_source = pd.read_csv(data_path+"politifact_shu_source_data_formated.csv")
df_followers_sample = pd.read_csv(data_path+"politifact_shu_follower_data_formated_with_features_only.csv")
df_all_user_news_pair = pd.read_csv(data_path+"politifact_shu_user_news_pair_data_formated.csv")
df_all_source_news_pair = pd.read_csv(data_path+"politifact_shu_source_news_pair_data_formated.csv")
df_506_news = pd.read_csv("/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/\
code_for_516_news/code_for_news_with_atleast_3_tweets_recompute_user_label/506_news_with_atleast_3_tweets.csv")

In [None]:
#change news label to numeric
df_news.drop_duplicates(subset=['news_id'], inplace=True)
df_news.rename(columns={'label':'news_label'}, inplace=True)
df_news.news_label = df_news.news_label.apply(lambda x: 1 if x in ["fake","Fake",'1',1,'false',
                                                   'half-true', 'pants-fire', 
                                                   'barely-true','full-flop'] else 0)

In [None]:
#select news with atleast 3 tweets
df_news = df_news[df_news.news_id.isin(df_506_news.news_id)]

In [None]:
#convert labels to 0 to 5
df_source.factual_score = (df_source.factual_score/0.16)-1

In [None]:
df_user.shape, df_news.shape, df_source.shape

((1135, 246), (506, 140), (88, 5))

# select only those news shared by users with features

In [None]:
df_all_user_news_pair = df_all_user_news_pair.astype(str)
df_user.user_id = df_user.user_id.astype(str)
df_user_news_pair = df_all_user_news_pair[df_all_user_news_pair.user_id.isin(df_user.user_id)]

In [None]:
df_user_news_pair.shape, df_user_news_pair.user_id.nunique(), df_user_news_pair.news_id.nunique()

((15511, 3), 1135, 544)

In [None]:
df_news = df_news[df_news.news_id.isin(df_user_news_pair.news_id)]
df_news.shape, df_news.news_id.nunique()

((506, 140), 506)

select only those pairs where news are shared by user and have features

In [None]:
df_user_news_pair = df_user_news_pair[df_user_news_pair.news_id.isin(df_news.news_id)]

In [None]:
df_user_news_pair.shape, df_user_news_pair.user_id.nunique(), df_user_news_pair.news_id.nunique()

((14309, 3), 1135, 506)

In [None]:
df_source_news_pair = df_all_source_news_pair[df_all_source_news_pair.news_id.isin(df_news.news_id)]
df_source_news_pair.shape, df_source_news_pair.news_id.nunique()

((560, 2), 506)

# Add source bias and news

In [None]:
df_news_source_added = df_news.merge(df_all_source_news_pair, on='news_id', how='left')
df_news_source_added = df_news_source_added.merge(df_source[['source_id','bias']], on='source_id', how='left')
df_news_source_added.shape

(560, 142)

In [None]:
df_news_source_added.columns

Index(['news_id', 'news_label', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone',
       'WPS', 'Sixltr', 'Dic',
       ...
       'Sadness', 'Surprise', 'Trust', 'Objective', 'compound', 'neg', 'neu',
       'pos', 'source_id', 'bias'],
      dtype='object', length=142)

In [None]:
df_news_source_added[df_news_source_added.duplicated(subset=['news_id','bias'])]
df_news_source_added.drop_duplicates(subset=['news_id','source_id'], inplace=True)

In [None]:
df_news_source_added.shape, df_news_source_added.news_id.nunique(), df_news_source_added.source_id.nunique()

((506, 142), 506, 197)

# select only those sources who published 516 news

In [None]:
df_source = df_source[df_source.source_id.isin(df_news_source_added.source_id)]

In [None]:
df_source.shape, df_source.source_id.nunique()

((71, 5), 71)

In [None]:
def recompute_user_label(df_user, df_news, df_user_news):
  train_test_path = '/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/code_for_516_news/4_fold/train_test/'
  for i in tqdm(range(4)):
    train = pd.read_csv(train_test_path+"fold"+str(i)+"/train.csv")
    test = pd.read_csv(train_test_path+"fold"+str(i)+"/test.csv")

    train.ids = train.ids.astype(str)
    test.ids = test.ids.astype(str)
    news_train = df_news[df_news.news_id.isin(train[train.entity_label=="news"]['ids'])]
    user_train = df_user[df_user.user_id.isin(train[train.entity_label=="user"]['ids'])]
    user_test = df_user[df_user.user_id.isin(test[test.entity_label=="user"]['ids'])]
    #recompute user label
    df_user_train = df_user_news[df_user_news.user_id.isin(user_train.user_id)]
    df_user_train['label'] = df_user_train['label'].apply(lambda x: 1 if x=='fake' else 0)
    df_avg = df_user_train.groupby(['user_id']).label.mean().reset_index(name='avg')
    df_avg['user_label'] = [ 1 if avg>0.6 else(0 if avg<0.4 else 2) for avg in df_avg.avg]
    user_train_recomputed = user_train.merge(df_avg[['user_id','user_label']], on=['user_id'], suffixes=["_x",""])
    user_train_recomputed.drop(columns=[col for col in user_train_recomputed if col.endswith("_x")], inplace=True)
    df_user_recomputed = pd.concat([user_train_recomputed, user_test])
    print(df_user_recomputed.shape)
    return df_user_recomputed

df_user = recompute_user_label(df_user, df_news, df_user_news_pair)

  0%|          | 0/4 [00:01<?, ?it/s]

(1135, 246)





# Get features

In [None]:
df_user, user_features = features.get_features(df_user, node_type='user', impute=True, scale=True)
df_news, news_features = features.get_features(df_news, node_type='news', impute=True, scale=True)
df_source, source_features = features.get_features(df_source, node_type='source', impute=False, scale=True)
df_news_source_added, news_source_features = features.get_features(df_news_source_added, node_type='news_source', impute=True, scale=True)

no of features for user 99
no of features for news 71
no of features for source 1
no of features for news_source 72


In [None]:
df_user.shape, df_news.shape, df_source.shape

((1135, 246), (506, 140), (71, 5))

In [None]:
df_followers_sample.user_id.nunique(), df_followers_sample.followers.nunique()

(357, 513)

In [None]:
Counter(df_user.user_label), Counter(df_news.news_label),Counter(df_source.factual_score)

(Counter({0: 385, 1: 657, 2: 93}),
 Counter({0: 206, 1: 300}),
 Counter({0.0: 5, 1.0: 16, 2.0: 10, 3.0: 9, 4.0: 26, 5.0: 5}))

In [None]:
"""# Split user followes user and followers follow user pairs"""
df_followers_sample = df_followers_sample.astype(str)
df_user.user_id = df_user.user_id.astype(str)

df_user_user_pair = df_followers_sample[df_followers_sample.followers.isin(df_user.user_id)]
df_user_follower_pair = df_followers_sample[~df_followers_sample.followers.isin(df_user.user_id)]

#check number of unique entities
print("source", df_source.source_id.nunique(), df_source_news_pair.source_id.nunique())
print("news", df_news.news_id.nunique(), df_source_news_pair.news_id.nunique(), df_user_news_pair.news_id.nunique())
print("users", df_user.user_id.nunique(), df_user_news_pair.user_id.nunique(),df_user_user_pair.user_id.nunique())
print("followers", df_user_follower_pair.user_id.nunique(), df_user_follower_pair.followers.nunique())


source 71 197
news 506 506 506
users 1135 1135 357
followers 0 0


In [None]:
def run_model(df, features, node_type, model, regression=True):
  if node_type=='news':
    id = 'news_id'
    label = 'news_label'
  elif node_type == 'user':
    id = 'user_id'
    label = 'user_label'
  elif node_type=='source':
    id = 'source_id'
    label= 'factual_score'

  train_R2 = []
  test_R2 = []
  train_MAE = []
  test_MAE = []
  train_RMSE = []
  test_RMSE = []
  pearsons_coeff = []
  spearsman_coeff = []

  precision_list=[]
  recall_list=[]
  f1_list=[]
  accuracy_list=[]
  Avg_precision_list=[]
  AUROC_list=[]
  reports = []

  train_test_path = '/content/gdrive/My Drive/Loopy_belief_propagation/Graph_neural_network/code/code_for_516_news/4_fold/train_test/'
  for i in range(4):
    train = pd.read_csv(train_test_path+"fold"+str(i)+"/train.csv")
    test = pd.read_csv(train_test_path+"fold"+str(i)+"/test.csv")

    train.ids = train.ids.astype(str)
    test.ids = test.ids.astype(str)
    df[id] = df[id].astype(str)

    assert train.ids.dtype == object
    assert test.ids.dtype == object
    assert df[id].dtype == object

    train_ = df[df[id].isin(train[train.entity_label== node_type]['ids'])]
    test_ = df[df[id].isin(test[test.entity_label== node_type]['ids'])]
     
    X_train = train_[features].values
    X_test = test_[features].values
    y_train = train_[label].values
    y_test = test_[label].values
    print(X_train.shape, X_test.shape)

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    if regression:
      # print("Performing regression")
      test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
      test_mae = mean_absolute_error(y_test, y_pred)
      test_r2 = r2_score(y_test, y_pred)

      test_R2.append(test_r2)
      test_MAE.append(test_mae)
      test_RMSE.append(test_rmse)  

      spearsman_coeff.append(stats.spearmanr(y_test, y_pred)[0])
      pearsons_coeff.append(stats.pearsonr(y_test, y_pred)[0])
      df_score = pd.DataFrame()
      return np.mean(test_RMSE),np.mean(test_MAE),np.mean(test_R2), np.mean(spearsman_coeff), np.mean(pearsons_coeff),df_score 
    else:
      # print("Performing Classification")   
      accuracy = accuracy_score(y_pred,y_test)
      precision = precision_score(y_test, y_pred, average='macro')
      recall = recall_score(y_test, y_pred, average='macro')
      f1 = f1_score(y_test, y_pred,average='micro')
      accuracy = accuracy_score(y_pred, y_test)
      # print(classification_report(y_test, y_pred, digits=3))
      report = pd.DataFrame(classification_report(y_test, y_pred, digits=3, output_dict=True))
      reports.append(report)
    
      precision_list.append(precision)
      recall_list.append(recall)
      f1_list.append(f1)
      accuracy_list.append(accuracy)
     
      spearsman_coeff.append(stats.spearmanr(y_test, y_pred)[0])
      pearsons_coeff.append(stats.pearsonr(y_test, y_pred)[0])
      df_score = pd.concat(reports)
      df_score.groupby(df_score.index).mean()
      return np.mean(accuracy_list),np.mean(precision_list),np.mean(recall_list),np.mean(f1_list),np.mean(spearsman_coeff), np.mean(pearsons_coeff), df_score

In [None]:
def run_exp(df, features, node_type):
  result = pd.DataFrame()
  Regs = [LinearRegression(),
          HuberRegressor(),
            DecisionTreeRegressor(random_state=0),
            RandomForestRegressor(random_state=0),
          Lasso(),
          Ridge(),
          SGDRegressor(),
          LogisticRegression(class_weight="balanced", random_state=0)]

  clfs = [svm.LinearSVC(class_weight='balanced', random_state=0),
            LogisticRegression(class_weight="balanced", random_state=0),
            RandomForestClassifier(class_weight="balanced", random_state=0)]

  i=0
  for reg in Regs:
    result.at[i,"Regressor"] = str(reg).split("(")[0]
    result.at[i,"features"] = node_type
    result.at[i,"RMSE"],result.at[i,"MAE"],result.at[i,"R2"]\
    ,result.at[i,"spearsman"],result.at[i,"pearsons"],df_score = run_model(df, features, node_type, reg, regression=True)
    i+=1

  for clf in clfs:
    result.at[i,"Classifier"] = str(clf).split("(")[0]
    result.at[i,"features"] = node_type
    result.at[i,"Acc"],result.at[i,"Precision"],result.at[i,"Recall"],result.at[i,"F1"]\
    ,result.at[i,"spearsman"],result.at[i,"pearsons"],df_score = run_model(df, features, node_type, clf, regression=False)
    i+=1

  
  return result,df_score


In [None]:
result_news,df_score_news= run_exp(df_news, news_features, node_type='news')
result_news

(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)
(381, 71) (125, 71)


Unnamed: 0,Regressor,features,RMSE,MAE,R2,spearsman,pearsons,Classifier,Acc,Precision,Recall,F1
0,LinearRegression,news,0.490672,0.342508,-0.010167,0.617176,0.496222,,,,,
1,HuberRegressor,news,0.445343,0.338368,0.167853,0.590382,0.536144,,,,,
2,DecisionTreeRegressor,news,0.409878,0.168,0.295113,0.652158,0.652158,,,,,
3,RandomForestRegressor,news,0.315169,0.23536,0.58323,0.747267,0.774851,,,,,
4,Lasso,news,0.488609,0.481008,-0.001691,,,,,,,
5,Ridge,news,0.428575,0.330125,0.229336,0.622172,0.55109,,,,,
6,SGDRegressor,news,0.423825,0.330068,0.246327,0.632163,0.548552,,,,,
7,LogisticRegression,news,0.409878,0.168,0.295113,0.64892,0.64892,,,,,
8,,news,,,,0.630773,0.630773,LinearSVC,0.824,0.815387,0.815387,0.824
9,,news,,,,0.64892,0.64892,LogisticRegression,0.832,0.823333,0.825591,0.832


In [None]:
df_score_news.T

NameError: ignored

In [None]:
result_user, df_score_user= run_exp(df_user, user_features, node_type='user')
result_user

(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)
(851, 99) (284, 99)


Unnamed: 0,Regressor,features,RMSE,MAE,R2,spearsman,pearsons,Classifier,Acc,Precision,Recall,F1
0,LinearRegression,user,0.502319,0.318521,0.341034,0.604711,0.590873,,,,,
1,HuberRegressor,user,0.499311,0.279916,0.348904,0.615682,0.616782,,,,,
2,DecisionTreeRegressor,user,0.590416,0.235915,0.089626,0.567307,0.51934,,,,,
3,RandomForestRegressor,user,0.463226,0.24419,0.439611,0.630749,0.664006,,,,,
4,Lasso,user,0.618965,0.524495,-0.000541,,,,,,,
5,Ridge,user,0.499653,0.315645,0.348012,0.606344,0.594871,,,,,
6,SGDRegressor,user,0.493446,0.308786,0.364109,0.614402,0.606339,,,,,
7,LogisticRegression,user,0.578366,0.207746,0.126408,0.634468,0.568244,,,,,
8,,user,,,,0.621298,0.543504,LinearSVC,0.859155,0.702571,0.698291,0.859155
9,,user,,,,0.634468,0.568244,LogisticRegression,0.855634,0.720059,0.722983,0.855634


In [None]:
df_score_user.T

Unnamed: 0,precision,recall,f1-score,support
0,0.864078,0.908163,0.885572,98.0
1,0.872928,1.0,0.932153,158.0
2,0.0,0.0,0.0,28.0
accuracy,0.869718,0.869718,0.869718,0.869718
macro avg,0.579002,0.636054,0.605909,284.0
weighted avg,0.783811,0.869718,0.824177,284.0


In [None]:
result_source, df_score_source = run_exp(df_source, source_features, node_type='source')
result_source

(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)
(53, 1) (18, 1)


Unnamed: 0,Regressor,features,RMSE,MAE,R2,spearsman,pearsons,Classifier,Acc,Precision,Recall,F1
0,LinearRegression,source,1.938657,1.507627,-0.499654,-0.250316,-0.169165,,,,,
1,HuberRegressor,source,2.413568,1.692772,-1.324385,-0.250316,-0.169165,,,,,
2,DecisionTreeRegressor,source,0.96688,0.839815,0.626978,0.800028,0.853132,,,,,
3,RandomForestRegressor,source,0.971004,0.839208,0.623789,0.800028,0.854479,,,,,
4,Lasso,source,1.709709,1.429769,-0.166362,,,,,,,
5,Ridge,source,1.933005,1.505531,-0.490922,-0.250316,-0.169165,,,,,
6,SGDRegressor,source,1.884806,1.478348,-0.417498,-0.250316,-0.169165,,,,,
7,LogisticRegression,source,1.130388,0.833333,0.490148,0.807288,0.842123,,,,,
8,,source,,,,0.836404,0.84459,LinearSVC,0.444444,0.241162,0.375,0.444444
9,,source,,,,0.807288,0.842123,LogisticRegression,0.388889,0.236111,0.333333,0.388889


In [None]:
df_score_source.T

Unnamed: 0,precision,recall,f1-score,support
0.0,0.75,0.75,0.75,4.0
1.0,0.5,0.5,0.5,2.0
2.0,0.666667,0.5,0.571429,4.0
3.0,0.0,0.0,0.0,3.0
4.0,0.285714,0.5,0.363636,4.0
5.0,0.0,0.0,0.0,1.0
accuracy,0.444444,0.444444,0.444444,0.444444
macro avg,0.367063,0.375,0.364177,18.0
weighted avg,0.433862,0.444444,0.430014,18.0


In [None]:
result_news_source, df_score_news_source = run_exp(df_news_source_added, news_source_features, node_type='news')
result_news_source

(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)
(381, 72) (125, 72)


Unnamed: 0,Regressor,features,RMSE,MAE,R2,spearsman,pearsons,Classifier,Acc,Precision,Recall,F1
0,LinearRegression,news,0.479695,0.336655,0.034527,0.636704,0.508096,,,,,
1,HuberRegressor,news,0.434197,0.333271,0.208987,0.610364,0.561215,,,,,
2,DecisionTreeRegressor,news,0.481664,0.232,0.026584,0.5086,0.5086,,,,,
3,RandomForestRegressor,news,0.317174,0.24056,0.57791,0.754752,0.774579,,,,,
4,Lasso,news,0.488609,0.481008,-0.001691,,,,,,,
5,Ridge,news,0.417566,0.326239,0.268423,0.637612,0.568058,,,,,
6,SGDRegressor,news,0.410113,0.327754,0.294304,0.644425,0.569509,,,,,
7,LogisticRegression,news,0.409878,0.168,0.295113,0.64892,0.64892,,,,,
8,,news,,,,0.628436,0.628436,LinearSVC,0.824,0.816694,0.811762,0.824
9,,news,,,,0.64892,0.64892,LogisticRegression,0.832,0.823333,0.825591,0.832


In [None]:
df_score_news_source.T

Unnamed: 0,precision,recall,f1-score,support
0,0.902439,0.755102,0.822222,49.0
1,0.857143,0.947368,0.9,76.0
accuracy,0.872,0.872,0.872,0.872
macro avg,0.879791,0.851235,0.861111,125.0
weighted avg,0.874899,0.872,0.869511,125.0
