author: "Anu Shrestha"

# Combining all features including LIWC features
This file contains the code that merges the psychological features from the LIWC tool with all remaining features (complexity, stylistic, and emotion features) used in the paper.


In [1]:

# Notebook setup: download NLTK resources, import libraries, and mount Google Drive.
import nltk
# Download the NLTK data packages (tokenizer models, stopword list, WordNet).
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
# English stopword list; must be loaded after the 'stopwords' download above.
stops = stopwords.words('english')
# Download the data package for the averaged-perceptron POS tagger.
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk import sent_tokenize
import re, os

import pandas as pd
import numpy as np
from collections import Counter
import string

# Mount Google Drive at /content/gdrive (Colab only); the data path used by
# the merge cell further down points inside this mount.
from google.colab import drive
drive.mount('/content/gdrive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
Mounted at /content/gdrive


In [8]:
def merge_liwc_and_remaining_features(path, filename_remaining, filename_liwc):
  '''
  Merge the LIWC features with all other features and save the result.

  Reads "Generated_features/features_for_<filename_remaining>_noLIWC.pkl" and
  "Generated_features/<filename_liwc>.csv" below `path`, joins them on the
  `news_id` column (inner join) and writes the merged frame to
  "Generated_features/all_features_for_<filename_remaining>.pkl". The merged
  column names are printed. Nothing is returned.

  path: path to the folder where Generated_features folder is saved
  filename_remaining: name of pickle file that contains all other features except LIWC
  filename_liwc: name of file (without ".csv") that contains LIWC features

  Two LIWC layouts are handled:
    * a file with `Filename` and `Segment` columns, where `news_id` is derived
      from `Filename` by removing a trailing ".txt";
    * any other file, which is expected to already carry `news_id` and a
      `label` column; `label` is dropped so it does not clash on merge.

  Raises AssertionError if the merged frame does not have the expected number
  of columns, or if the row counts of the inputs and the result differ.
  '''
  # path = '/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data'
  features_dir = os.path.join(path, "Generated_features")
  df_remaining_features = pd.read_pickle(
      os.path.join(features_dir, "features_for_" + filename_remaining + "_noLIWC.pkl"))
  df_liwc = pd.read_csv(os.path.join(features_dir, filename_liwc + ".csv"))

  suffix = '.txt'
  try:
    # Format the LIWC dataframe and remove unwanted columns.
    # Slice the suffix off instead of using str.strip('.txt'): strip() removes
    # any of the characters '.', 't', 'x' from BOTH ends, which corrupts ids
    # such as "text1.txt" (-> "ext1") and silently breaks the merge.
    df_liwc['news_id'] = df_liwc.Filename.apply(
        lambda x: x[:-len(suffix)] if x.endswith(suffix) else x)
    df_liwc_ = df_liwc.drop(columns=['Segment', 'Filename'])
  except (AttributeError, KeyError):
    # AttributeError: no usable `Filename` column; KeyError: `Segment` or
    # `Filename` missing on drop. Either way this is not a LIWC export with
    # file names, so fall back to the layout that already has `news_id`.
    df_liwc_ = df_liwc.drop(columns=['label'])

  # Remove duplicate rows if any (the last occurrence of a news_id wins).
  df_remaining_features = df_remaining_features.drop_duplicates(subset='news_id', keep='last')

  # Merge both dfs by news_id.
  df_merged_features = df_remaining_features.merge(df_liwc_, on='news_id', how='inner')

  # Verify the merged dataframe has the required number of rows and columns.
  # Raised explicitly (rather than via `assert`) so the checks still run
  # under `python -O`; the exception type is unchanged.
  expected_columns = df_remaining_features.shape[1] + df_liwc_.shape[1] - 1
  if df_merged_features.shape[1] != expected_columns:
    raise AssertionError(
        "merged frame has %d columns, expected %d"
        % (df_merged_features.shape[1], expected_columns))
  if not (df_merged_features.shape[0] == df_remaining_features.shape[0] == df_liwc_.shape[0]):
    raise AssertionError(
        "row count mismatch: merged=%d, remaining=%d, liwc=%d"
        % (df_merged_features.shape[0], df_remaining_features.shape[0], df_liwc_.shape[0]))

  df_merged_features.to_pickle(
      os.path.join(features_dir, "all_features_for_" + filename_remaining + ".pkl"))
  print(list(df_merged_features.columns))
  print("Merged and saved all features in file")



In [9]:
# Folder in which the Generated_features directory is saved -- replace with your own path.
path = '/content/gdrive/My Drive/ECIR 2021 Reproducibility/Data'

# One (feature file name, LIWC file name) pair per dataset part -- replace with your filenames.
_feature_liwc_pairs = [
    ('title_politifact', 'LIWC2015 Results (title_politifact (836 files))'),
    ('text_politifact', 'LIWC2015 Results (text_politifact (836 files))'),
    ('title_buzzfeed', 'title_buzzfeed_liwc'),
    ('text_buzzfeed', 'text_buzzfeed_liwc'),
    ('title_gossipcop', 'LIWC2015 Results (title_gossipcop (19759 files))'),
    ('text_gossipcop', 'LIWC2015 Results (text_gossipcop (19759 files))'),
]
# Keep the two parallel lists available under their original names.
filenames = [feature_name for feature_name, _ in _feature_liwc_pairs]
LIWC_filenames = [liwc_name for _, liwc_name in _feature_liwc_pairs]

# Merge and save the combined feature file for every dataset part in turn.
for filename_remaining, filename_liwc in _feature_liwc_pairs:
  merge_liwc_and_remaining_features(path, filename_remaining, filename_liwc)

['news_id', 'news_title', 'label', 'smog_index', 'flesch_reading_ease', 'flesch_kincaid_grade_level', 'coleman_liau_index', 'gunning_fog_index', 'ari_index', 'lix_index', 'dale_chall_score', 'dale_chall_known_fraction', 'num_nouns', 'num_propernouns', 'num_personalnouns', 'num_ppssessivenouns', 'num_whpronoun', 'num_determinants', 'num_whdeterminants', 'num_cnum', 'num_adverb', 'num_interjections', 'num_verb', 'num_adj', 'num_vbd', 'num_vbg', 'num_vbn', 'num_vbp', 'num_vbz', 'percentage_stopwords', 'count_uppercased', 'wlen', 'lexical_diversity', 'syllable_count', 'sentence_count', 'Anger', 'Anticipation', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust', 'Objective', 'compound', 'neg', 'neu', 'pos', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect', 'posemo',