author: "Anu Shrestha"

# Run classification with three different classifiers

This code will run classification with three classifiers namely Linear SVM, Logistic Regression, Random Forest to generate the table 4, 5 and 6 used in paper.

# Load required packages

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from numpy.random import seed
# seed the random number generator
seed(1)

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import *
import string
from tqdm import tqdm
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/gdrive')

import sys
sys.path.append('/content/gdrive/My Drive/ECIR 2021 Reproducibility/Code/code_to_publish')

import statistical_tests

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Defining required functions

In [31]:


def classify(df_final, final_features, feature_selection, k, clf):    
    y = df_final["label"].values

    #feature selection
    if feature_selection:
      if type(final_features)!=list:
        final_features = final_features.to_list()
      X, selected_features = statistical_tests.stat_sig_test(df_final[final_features+['label']], k)
      
    else:
      X = df_final[final_features].values
      selected_features = final_features

  
    if X.shape[1]==0:
      return (0,0,0,)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    precision_list=[]
    recall_list=[]
    f1_list=[]
    accuracy_list=[]
    Avg_precision_list=[]
    AUROC_list=[]

    for tr_ind, tst_ind in skf.split(X,y):
        X_train = X[tr_ind]
        X_test = X[tst_ind]
        y_train = y[tr_ind]
        y_test = y[tst_ind]

        clf.fit(X_train,y_train)
        y_pred = clf.predict(X_test)
        try:
          proba = clf.predict_proba(X_test)
        except:
          proba = clf.decision_function(X_test)
        # print("finish predicting")

        accuracy = accuracy_score(y_pred,y_test)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        accuracy = accuracy_score(y_pred, y_test)
        try:
          AP = average_precision_score(y_test, proba[:,1])
          AUROC = roc_auc_score(y_test, proba[:,1])
        except:
          AP = average_precision_score(y_test, proba)
          AUROC = roc_auc_score(y_test, proba)

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        accuracy_list.append(accuracy)
        Avg_precision_list.append(AP)
        AUROC_list.append(AUROC)
    
    return(
          np.round(np.mean(AUROC_list),3),
          # np.round(np.mean(f1_list),3),
          np.round(np.mean(Avg_precision_list),3),
          # np.round(np.mean(accuracy_list),3),
         selected_features)

In [3]:
def format_df(df,title_or_text):
  '''
  function to convert label to make it uniform accross the datasets
  '''
  df.label = df.label.apply(lambda x: 1 if x in ["fake","Fake",'1',1] else 0)
  new_names = [(i,title_or_text+"_"+i) for i in df.iloc[:, 3:].columns.values]
  df.rename(columns = dict(new_names), inplace=True)
  return df

def merge_title_text(df_title, df_text):
  '''
  Merge all title and text features to form single dataframe
  '''
  df_final = pd.merge(df_title,df_text, on=["news_id","label"], how='left',suffixes=['', '_y'])
  df_final.drop([x for x in df_final if x.endswith('_y')], axis=1, inplace=True)
  return df_final


def get_features(df_final, title_or_text):
  '''
  function to get only those features used in paper of Horne and Adali plus emotion features
  '''
  
  features_used_in_paper = ['Analytic','insight','cause','discrep','tentat','certain','differ','affiliation','power','reward','risk','work','leisure',
                            'money','relig','Tone','affect','WC','WPS','num_nouns','num_propernouns','num_personalnouns','num_ppssessivenouns',
                            'num_whpronoun','num_determinants','num_whdeterminants','num_cnum','num_adverb','num_interjections','num_verb','num_adj',
                            'num_vbd','num_vbg','num_vbn','num_vbp','num_vbz','focuspast','focusfuture','i','we','you','shehe','quant','compare','Exclam',
                            'negate','swear','netspeak','interrog','count_uppercased','percentage_stopwords','AllPunc','Quote', 'lexical_diversity','wlen',
                            'gunning_fog_index','smog_index','flesch_kincaid_grade_level']

  features_used_in_paper_ = [title_or_text+'_'+x for x in features_used_in_paper]

  emotion_features = [title_or_text+'_'+x for x in ['Anger','Anticipation','Disgust','Fear','Joy', 
                                                    'Sadness', 'Surprise', 'Trust','neg','pos','posemo','negemo','anx']]

  
  return features_used_in_paper_+emotion_features 
 

def classification_result(df, text, title, feature_selection):
  '''
  function to automate the classification using multiple classifiers

  df: input dataframe
  text: news body features to be considered if not empty
  title: news title features to be considered if not empty
  feature_selection: if True selects statistically significant (pvalue < 0.05) features up to sqrt of training set 
  '''
  drop_features = ['lexicon_count','neu','compound','adverb', 'verb', 'adj', 'Objective','anger','sad']
  df.drop(columns=['text_'+x for x in drop_features if 'text_'+x in list(df.columns)], inplace=True)
  df.drop(columns=['title_'+x for x in drop_features if 'title_'+x in list(df.columns)], inplace=True)

  result = pd.DataFrame()
  clfs = [svm.LinearSVC(class_weight='balanced', random_state=0),
            LogisticRegression(class_weight="balanced", random_state=0),
            RandomForestClassifier(class_weight="balanced", random_state=0)]

  i=0
  for n in tqdm([int(np.sqrt(df.shape[0]*0.8))]):
    for clf in clfs:
      if text == [] and title == [] :
        text_features = get_features(df, 'text')
        result.at[i,"classifier"] = str(clf).split("(")[0]
        result.at[i,"features"] = "text"
        result.at[i,"AUROC"],result.at[i,"AvgP"],\
        selected_text_features = classify(df,text_features, feature_selection, n, clf)

        title_features = get_features(df, 'title')
        result.at[i+1,"classifier"] = str(clf).split("(")[0]
        result.at[i+1,"features"] = "title"
        result.at[i+1,"AUROC"],result.at[i+1,"AvgP"],\
        selected_title_features = classify(df,title_features,feature_selection,n, clf)
      else:
        result.at[i,"classifier"] = str(clf).split("(")[0]
        result.at[i,"features"] = "text"
        result.at[i,"AUROC"],result.at[i,"AvgP"],\
        selected_text_features = classify(df,text, feature_selection, n, clf)

        result.at[i+1,"classifier"] = str(clf).split("(")[0]
        result.at[i+1,"features"] = "title"
        result.at[i+1,"AUROC"],result.at[i+1,"AvgP"],\
        selected_title_features = classify(df,title,feature_selection,n, clf)
      i+=2
  return result, selected_text_features, selected_title_features

def groupwise_features(selected_features, title_or_text, group):
  '''
  function to get features within each group 
  '''
  complexity_features = ['lexical_diversity','wlen','gunning_fog_index','smog_index','flesch_kincaid_grade_level']

  psychology_features = ['Analytic','insight','cause','discrep','tentat','certain','differ','affiliation','power','reward','risk','work','leisure','money','relig',
                         'Tone','affect','Anger','Anticipation','Disgust','Fear','Joy','Sadness','Surprise','Trust','neg','pos','posemo','negemo','anx']

  stylistic_features = ['WC','WPS','num_nouns','num_propernouns','num_personalnouns','num_ppssessivenouns','num_whpronoun','num_determinants',
                        'num_whdeterminants','num_cnum','num_adverb','num_interjections','num_verb','num_adj','num_vbd','num_vbg','num_vbn','num_vbp','num_vbz',
                        'focuspast','focusfuture','i','we','you','shehe','quant','compare','Exclam','negate','swear','netspeak','interrog','count_uppercased',
                        'percentage_stopwords','AllPunc','Quote']

  selected_features_= [x for x in selected_features]
  if group == 'complexity':
    return [x for x in selected_features_ if x.strip(title_or_text+'_') in complexity_features]
  elif group == 'stylistic':
    return [x for x in selected_features_ if x.strip(title_or_text+'_') in stylistic_features]
  elif group == 'psychology':
    return [x for x in selected_features_ if x.strip(title_or_text+'_') in psychology_features]


# Load data that consists of all the features including Stylistic, Psychology and Complexity features.

In [4]:
df_title_gossipcop = pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_title_gossipcop.pkl")
df_title_politifact= pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_title_politifact.pkl")
df_text_gossipcop= pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_text_gossipcop.pkl")
df_text_politifact= pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_text_politifact.pkl")
df_title_buzzfeed= pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_title_buzzfeed.pkl")
df_text_buzzfeed= pd.read_pickle("/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data/Generated_features/all_features_for_text_buzzfeed.pkl")

# Format the dataset so that we can add 'title' or 'text' as prefix on the feature names.

In [5]:
df_title_gossipcop = format_df(df_title_gossipcop, 'title')
df_title_politifact = format_df(df_title_politifact, 'title')
df_title_buzzfeed = format_df(df_title_buzzfeed, 'title')
df_text_buzzfeed = format_df(df_text_buzzfeed, 'text')
df_text_gossipcop = format_df(df_text_gossipcop, 'text')
df_text_politifact = format_df(df_text_politifact, 'text')

# Combine title and text features for each dataset

In [6]:
df_final_g = merge_title_text(df_title_gossipcop, df_text_gossipcop)
df_final_p = merge_title_text(df_title_politifact, df_text_politifact)
df_final_b = merge_title_text(df_title_buzzfeed, df_text_buzzfeed)

# Run classification for each dataset using all three classifiers namely, linear SVM, Logistic Regression and Random Forest. 

# Results for PolitiFact

In [7]:
result_p, selected_text_feat_p, selected_title_feat_p = classification_result(df_final_p,[],[], True)

100%|██████████| 1/1 [00:05<00:00,  5.17s/it]


In [8]:
result_p

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.583,0.466
1,LinearSVC,title,0.833,0.804
2,LogisticRegression,text,0.855,0.809
3,LogisticRegression,title,0.849,0.813
4,RandomForestClassifier,text,0.911,0.878
5,RandomForestClassifier,title,0.867,0.823


# Results for BuzzFeed

In [9]:
result_b, selected_text_feat_b, selected_title_feat_b = classification_result(df_final_b,[],[], True)

100%|██████████| 1/1 [00:07<00:00,  7.49s/it]


In [10]:
result_b

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.614,0.257
1,LinearSVC,title,0.669,0.317
2,LogisticRegression,text,0.728,0.351
3,LogisticRegression,title,0.787,0.423
4,RandomForestClassifier,text,0.785,0.417
5,RandomForestClassifier,title,0.812,0.424


# Results for Gossipcop

In [11]:
result_g, selected_text_feat_g, selected_title_feat_g = classification_result(df_final_g,[],[], True)

100%|██████████| 1/1 [01:37<00:00, 97.96s/it]


In [12]:
result_g

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.623,0.327
1,LinearSVC,title,0.588,0.309
2,LogisticRegression,text,0.703,0.437
3,LogisticRegression,title,0.663,0.38
4,RandomForestClassifier,text,0.782,0.63
5,RandomForestClassifier,title,0.715,0.49


# Run classification for each group of features.
Now we have the result of classification, lets see which group of feature played important role in classification.

# PolitiFact Stylistic Features Only

In [13]:
result_style, text_style, title_style = classification_result(df_final_p, groupwise_features(selected_text_feat_p, 'text', 'stylistic'), 
                      groupwise_features(selected_title_feat_p, 'title', 'stylistic'), False)

100%|██████████| 1/1 [00:03<00:00,  3.19s/it]


In [14]:
result_style

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.634,0.519
1,LinearSVC,title,0.81,0.785
2,LogisticRegression,text,0.803,0.739
3,LogisticRegression,title,0.823,0.79
4,RandomForestClassifier,text,0.882,0.838
5,RandomForestClassifier,title,0.819,0.729


# PolitiFact Psychology Features Only

In [15]:
result_psych, text_psych, title_psych = classification_result(df_final_p, groupwise_features(selected_text_feat_p, 'text', 'psychology'), 
                      groupwise_features(selected_title_feat_p, 'title', 'psychology'), False)

100%|██████████| 1/1 [00:02<00:00,  2.53s/it]


In [16]:
result_psych

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.693,0.643
1,LinearSVC,title,0.778,0.695
2,LogisticRegression,text,0.668,0.622
3,LogisticRegression,title,0.761,0.675
4,RandomForestClassifier,text,0.723,0.662
5,RandomForestClassifier,title,0.791,0.691


# PolitiFact Complexity Features Only

In [17]:
result_comp, text_comp, title_comp = classification_result(df_final_p, groupwise_features(selected_text_feat_p, 'text', 'complexity'), 
                      groupwise_features(selected_title_feat_p, 'title', 'complexity'), False)

100%|██████████| 1/1 [00:02<00:00,  2.02s/it]


In [18]:
result_comp

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.62,0.493
1,LinearSVC,title,0.588,0.505
2,LogisticRegression,text,0.62,0.493
3,LogisticRegression,title,0.588,0.505
4,RandomForestClassifier,text,0.804,0.708
5,RandomForestClassifier,title,0.583,0.486


# BuzzFeed Stylistic Features Only

In [20]:
result_style, text_style, title_style = classification_result(df_final_b, groupwise_features(selected_text_feat_b, 'text', 'stylistic'), 
                      groupwise_features(selected_title_feat_b, 'title', 'stylistic'), False)

100%|██████████| 1/1 [00:04<00:00,  4.46s/it]


In [21]:
result_style

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.566,0.232
1,LinearSVC,title,0.764,0.412
2,LogisticRegression,text,0.661,0.315
3,LogisticRegression,title,0.772,0.423
4,RandomForestClassifier,text,0.753,0.382
5,RandomForestClassifier,title,0.805,0.433


# BuzFeed Psychology Features Only

In [22]:
result_psych, text_psych, title_psych = classification_result(df_final_b, groupwise_features(selected_text_feat_b, 'text', 'psychology'), 
                      groupwise_features(selected_title_feat_b, 'title', 'psychology'), False)

100%|██████████| 1/1 [00:04<00:00,  4.37s/it]


In [23]:
result_psych

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.621,0.29
1,LinearSVC,title,0.585,0.268
2,LogisticRegression,text,0.641,0.318
3,LogisticRegression,title,0.62,0.28
4,RandomForestClassifier,text,0.681,0.319
5,RandomForestClassifier,title,0.645,0.32


# BuzzFeed Complexity Features Only

In [24]:
result_comp, text_comp, title_comp = classification_result(df_final_b, groupwise_features(selected_text_feat_b, 'text', 'complexity'), 
                      groupwise_features(selected_title_feat_b, 'title', 'complexity'), False)

100%|██████████| 1/1 [00:03<00:00,  3.21s/it]


In [25]:
result_comp

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.598,0.268
1,LinearSVC,title,0.544,0.239
2,LogisticRegression,text,0.602,0.261
3,LogisticRegression,title,0.544,0.239
4,RandomForestClassifier,text,0.63,0.285
5,RandomForestClassifier,title,0.555,0.257


# GossipCop Stylistic Features Only

In [26]:
result_style, text_style, title_style = classification_result(df_final_g, groupwise_features(selected_text_feat_g, 'text', 'stylistic'), 
                      groupwise_features(selected_title_feat_g, 'title', 'stylistic'), False)

100%|██████████| 1/1 [00:50<00:00, 50.38s/it]


In [27]:
result_style

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.581,0.283
1,LinearSVC,title,0.581,0.296
2,LogisticRegression,text,0.652,0.355
3,LogisticRegression,title,0.628,0.343
4,RandomForestClassifier,text,0.752,0.59
5,RandomForestClassifier,title,0.634,0.365


# GossipCop Psychology Features Only

In [28]:
result_psych, text_psych, title_psych = classification_result(df_final_g, groupwise_features(selected_text_feat_g, 'text', 'psychology'), 
                      groupwise_features(selected_title_feat_g, 'title', 'psychology'), False)

100%|██████████| 1/1 [01:01<00:00, 61.38s/it]


In [29]:
result_psych

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.62,0.328
1,LinearSVC,title,0.566,0.286
2,LogisticRegression,text,0.645,0.357
3,LogisticRegression,title,0.613,0.328
4,RandomForestClassifier,text,0.713,0.509
5,RandomForestClassifier,title,0.651,0.407


# GossipCop Complexity Features Only

In [32]:
result_comp, text_comp, title_comp = classification_result(df_final_g, groupwise_features(selected_text_feat_g, 'text', 'complexity'), 
                      groupwise_features(selected_title_feat_g, 'title', 'complexity'), False)

100%|██████████| 1/1 [00:12<00:00, 12.13s/it]


In [33]:
result_comp

Unnamed: 0,classifier,features,AUROC,AvgP
0,LinearSVC,text,0.0,0.0
1,LinearSVC,title,0.517,0.248
2,LogisticRegression,text,0.0,0.0
3,LogisticRegression,title,0.538,0.259
4,RandomForestClassifier,text,0.0,0.0
5,RandomForestClassifier,title,0.553,0.287


Here, we can see 0.0 AUROC and Average precision scores for news body text classification using complexity features. This 

# Run classification with only four features used by Horne and Adali

In [34]:
text = ['text_num_nouns', 'text_lexical_diversity', 'text_WC','text_Quote']
title = ['title_flesch_kincaid_grade_level','title_num_nouns','title_percentage_stopwords','title_wlen']
classification_result(df_final_p, text, title, False)

100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


(               classifier features  AUROC   AvgP
 0               LinearSVC     text  0.544  0.445
 1               LinearSVC    title  0.649  0.531
 2      LogisticRegression     text  0.754  0.663
 3      LogisticRegression    title  0.643  0.530
 4  RandomForestClassifier     text  0.861  0.803
 5  RandomForestClassifier    title  0.735  0.612,
 ['text_num_nouns', 'text_lexical_diversity', 'text_WC', 'text_Quote'],
 ['title_flesch_kincaid_grade_level',
  'title_num_nouns',
  'title_percentage_stopwords',
  'title_wlen'])