author: "Anu Shrestha"

This Python file contains the code for extracting LIWC features from text.
To extract the psychological features, we use the LIWC tool (2015) from http://liwc.wpengine.com

Psychological features can be extracted in two ways:

* Either by using the software to compute the features from text
* Or by using the dictionary file provided with the purchased LIWC tool.

This file contains the code to extract features using the downloaded dictionary.

In [None]:
!pip install liwc


import nltk
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stops = stopwords.words('english')
nltk.download('averaged_perceptron_tagger')
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk import sent_tokenize
import re
import liwc

from textblob import TextBlob
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import *
import string
from matplotlib import pyplot as plt
%matplotlib inline


# If the LIWC features are to be computed from the dictionary, use the following code as an example for extracting the required features.
def tokenize(text):
    """Lazily yield every run of word characters found in ``text``.

    Punctuation and whitespace act as separators and are never yielded;
    the original casing of each token is kept.
    """
    word_pattern = re.compile(r'\w+', re.UNICODE)
    yield from (hit.group() for hit in word_pattern.finditer(text))

def compute_liwc_from_dict(df, col):
    """Add LIWC dictionary features for the text in ``df[col]``.

    Each distinct text is tokenized with ``tokenize`` and every lowercased
    token is looked up in the LIWC dictionary. The resulting feature columns
    are merged back onto ``df`` by the text value.

    Args:
        df: DataFrame holding the texts to score.
        col: Name of the column in ``df`` that contains the text.

    Returns:
        The result of ``df.merge(features, on=col)``, where ``features`` has
        one row per distinct text with the columns:

        * ``WC``: number of space-separated chunks in the text,
        * ``WPS``: mean number of space-separated chunks per sentence
          (0.0 when no sentence is found),
        * one column per LIWC category matched anywhere in the column,
          expressed as a percentage of ``WC`` (0 where a text has no match).

        An empty ``df`` is returned as an unmodified copy.
    """
    parse, _category_names = liwc.load_token_parser('./LIWC2015_English.dic')  # path of LIWC dictionary

    rows = []
    # Score each distinct text once: duplicate keys on the right-hand side of
    # the merge below would multiply the matching rows of ``df``.
    for text in df[col].drop_duplicates():
        # The LIWC lexicon only matches lowercase strings, so lowercase each
        # token before looking it up.
        category_counts = Counter(
            category
            for token in tokenize(text)
            for category in parse(token.lower())
        )

        word_count = len(text.split(' '))
        sentences = sent_tokenize(text)
        words_per_sentence = (
            sum(len(sentence.split(' ')) for sentence in sentences) / len(sentences)
            if sentences
            else 0.0
        )

        # Key the row by ``col`` (not a fixed name) so the merge works for any
        # column, and fill it even when no category matched so that such
        # texts are not dropped by the inner merge.
        row = {col: text, 'WC': word_count, 'WPS': words_per_sentence}
        for category, count in category_counts.items():
            # Keep only the part of the category name before the first comma
            # or space as the column name.
            row[category.split(",")[0].split(' ')[0]] = (count / word_count) * 100
        rows.append(row)

    if not rows:
        # Nothing to score; pd.DataFrame([]) would have no ``col`` to merge on.
        return df.copy()

    df_liwc = pd.DataFrame(rows)
    # A category that never occurs in a text accounts for 0% of its words.
    feature_columns = [name for name in df_liwc.columns if name != col]
    df_liwc[feature_columns] = df_liwc[feature_columns].fillna(0)
    return df.merge(df_liwc, on=col)