In [1]:
import numpy as np
import pandas as pd
import re
from scipy.stats import zscore



## Pre-processing

In [4]:
df = pd.read_csv("dataframes/liwc_test_prob.csv")
df.head(5)

Unnamed: 0,article_index,outlet,original_text,processed_text,probability_0,probability_1,extremity_0,extremity_1,label,Segment,...,Perception,motion,space,visual,auditory,feeling,time,focuspast,focuspresent,focusfuture
0,766,Daily Caller,Journalists were quick to take to Twitter and ...,journalists were quick to take to twitter and ...,0.999892,0.000108,0.499892,0.499892,0,1,...,7.52,0.33,4.58,1.63,0.65,0.0,4.58,4.9,1.96,0.33
1,1554,NYTimes,"In June, Josue, a 21-year-old Honduran, reache...","in june, josue, a 21 - year - old honduran, re...",2.3e-05,0.999977,0.499977,0.499977,1,1,...,12.89,3.58,9.79,0.24,0.0,0.0,4.06,3.1,2.63,0.48
2,1807,HuffPost,A Massachusetts county sheriff has proposed se...,a massachusetts county sheriff has proposed se...,3.9e-05,0.999961,0.499961,0.499961,1,1,...,8.66,0.5,8.17,0.25,0.0,0.0,3.22,2.72,2.72,1.73
3,255,FoxNews,We’ve got a question for all these people clai...,we ’ ve got a question for all these people cl...,0.999971,2.9e-05,0.499971,0.499971,0,1,...,11.14,1.94,7.02,1.45,0.0,0.0,4.36,4.6,2.91,0.24
4,920,CNN,President Donald Trump won the White House in ...,president donald trump won the white house in ...,2.8e-05,0.999972,0.499972,0.499972,1,1,...,11.62,1.21,9.2,0.48,0.24,0.0,3.87,3.39,5.33,0.48


In [35]:
df = df.iloc[:, :9]

In [36]:
df.head(5)

Unnamed: 0,article_index,outlet,original_text,processed_text,probability_0,probability_1,extremity_0,extremity_1,label
0,766,Daily Caller,Journalists were quick to take to Twitter and ...,journalists were quick to take to twitter and ...,0.999892,0.000108,0.499892,0.499892,0
1,1554,NYTimes,"In June, Josue, a 21-year-old Honduran, reache...","in june, josue, a 21 - year - old honduran, re...",2.3e-05,0.999977,0.499977,0.499977,1
2,1807,HuffPost,A Massachusetts county sheriff has proposed se...,a massachusetts county sheriff has proposed se...,3.9e-05,0.999961,0.499961,0.499961,1
3,255,FoxNews,We’ve got a question for all these people clai...,we ’ ve got a question for all these people cl...,0.999971,2.9e-05,0.499971,0.499971,0
4,920,CNN,President Donald Trump won the White House in ...,president donald trump won the white house in ...,2.8e-05,0.999972,0.499972,0.499972,1


In [37]:
df.rename(columns={'probability_0':'Prob_conservative',
                   'probability_1':'Prob_liberal',
                   'extremity_0':'Extremity'}, inplace=True)

In [38]:
df.drop(['article_index', 'extremity_1', 'Prob_conservative'], axis=1, inplace=True)

In [39]:
df.dtypes[df.dtypes != 'float64']

outlet            object
original_text     object
processed_text    object
label              int64
dtype: object

In [40]:
df["label"] = df["label"].astype('category')

In [41]:
df.to_csv('dataframes/test_prob.csv', index=False) 

## Post-LIWC

In [42]:
df = pd.read_csv("dataframes/LIWC-22 Results.csv")
df.head(5)

Unnamed: 0,outlet,original_text,processed_text,Prob_liberal,Extremity,label,Segment,WC,number,quantity,...,Perception,motion,space,visual,auditory,feeling,time,focuspast,focuspresent,focusfuture
0,Daily Caller,Journalists were quick to take to Twitter and ...,journalists were quick to take to twitter and ...,0.000108,0.499892,0,1,306,0.65,1.96,...,7.52,0.33,4.58,1.63,0.65,0.0,4.58,4.9,1.96,0.33
1,NYTimes,"In June, Josue, a 21-year-old Honduran, reache...","in june, josue, a 21 - year - old honduran, re...",0.999977,0.499977,1,1,419,2.86,6.44,...,12.89,3.58,9.79,0.24,0.0,0.0,4.06,3.1,2.63,0.48
2,HuffPost,A Massachusetts county sheriff has proposed se...,a massachusetts county sheriff has proposed se...,0.999961,0.499961,1,1,404,2.72,4.46,...,8.66,0.5,8.17,0.25,0.0,0.0,3.22,2.72,2.72,1.73
3,FoxNews,We’ve got a question for all these people clai...,we ’ ve got a question for all these people cl...,2.9e-05,0.499971,0,1,413,1.94,4.36,...,11.14,1.94,7.02,1.45,0.0,0.0,4.36,4.6,2.91,0.24
4,CNN,President Donald Trump won the White House in ...,president donald trump won the white house in ...,0.999972,0.499972,1,1,413,2.66,4.6,...,11.62,1.21,9.2,0.48,0.24,0.0,3.87,3.39,5.33,0.48


In [43]:
df.drop(['Segment'], axis=1, inplace=True)

In [44]:
df.dtypes[df.dtypes != 'float64']

outlet            object
original_text     object
processed_text    object
label              int64
WC                 int64
dtype: object

In [45]:
df["label"] = df["label"].astype('category')

## Adding moral-emotional Dics

In [46]:
affect_dict_path = '../../data/dictionaries/only_affect_dict.txt'
with open(affect_dict_path, 'r') as file:
    affect_dict = file.readlines()
    # Remove newline characters
    affect_dict = [line.strip() for line in affect_dict]
    
affect_dict = [root.rstrip('*') for root in affect_dict]

In [47]:
moral_dict_path = '../../data/dictionaries/only_moral_dict.txt'
with open(moral_dict_path, 'r') as file:
    moral_dict = file.readlines()
    # Remove newline characters
    moral_dict = [line.strip() for line in moral_dict]
    
moral_dict = [root.rstrip('*') for root in moral_dict]

In [48]:
me_dict_path = '../../data/dictionaries/moral_emotional_dict.txt'
with open(me_dict_path, 'r') as file:
    me_dict = file.readlines()
    # Remove newline characters
    me_dict = [line.strip() for line in me_dict]
    
me_dict = [root.rstrip('*') for root in me_dict]

In [49]:
# Define the function to calculate percentages of words
def calculate_word_percentage(text, word_roots):
    text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    found_words = [word for word in words if any(root in word for root in word_roots)]
    percentage = (len(found_words) / len(words)) * 100
    return percentage

# Apply the function to the DataFrame
df['UniquelyMoral'] = df['processed_text'].apply(lambda x: calculate_word_percentage(x, moral_dict))
df['UniquelyEmotional'] = df['processed_text'].apply(lambda x: calculate_word_percentage(x, affect_dict))
df['MoralEmotional'] = df['processed_text'].apply(lambda x: calculate_word_percentage(x, me_dict))

In [50]:
df.to_csv('dataframes/Dict_Analysis.csv', index=False)

In [51]:
df.head(5)

Unnamed: 0,outlet,original_text,processed_text,Prob_liberal,Extremity,label,WC,number,quantity,Drives,...,visual,auditory,feeling,time,focuspast,focuspresent,focusfuture,UniquelyMoral,UniquelyEmotional,MoralEmotional
0,Daily Caller,Journalists were quick to take to Twitter and ...,journalists were quick to take to twitter and ...,0.000108,0.499892,0,306,0.65,1.96,4.9,...,1.63,0.65,0.0,4.58,4.9,1.96,0.33,5.882353,5.882353,1.633987
1,NYTimes,"In June, Josue, a 21-year-old Honduran, reache...","in june, josue, a 21 - year - old honduran, re...",0.999977,0.499977,1,419,2.86,6.44,3.1,...,0.24,0.0,0.0,4.06,3.1,2.63,0.48,5.48926,9.785203,2.386635
2,HuffPost,A Massachusetts county sheriff has proposed se...,a massachusetts county sheriff has proposed se...,0.999961,0.499961,1,404,2.72,4.46,10.15,...,0.25,0.0,0.0,3.22,2.72,2.72,1.73,5.693069,8.168317,0.990099
3,FoxNews,We’ve got a question for all these people clai...,we ’ ve got a question for all these people cl...,2.9e-05,0.499971,0,413,1.94,4.36,7.02,...,1.45,0.0,0.0,4.36,4.6,2.91,0.24,4.842615,10.411622,2.179177
4,CNN,President Donald Trump won the White House in ...,president donald trump won the white house in ...,0.999972,0.499972,1,413,2.66,4.6,7.02,...,0.48,0.24,0.0,3.87,3.39,5.33,0.48,4.358354,12.590799,0.726392


In [52]:
df.columns

Index(['outlet', 'original_text', 'processed_text', 'Prob_liberal',
       'Extremity', 'label', 'WC', 'number', 'quantity', 'Drives',
       'affiliation', 'achieve', 'power', 'cogproc', 'insight', 'cause',
       'discrep', 'tentat', 'certitude', 'differ', 'emo_pos', 'emo_neg',
       'emo_anx', 'emo_anger', 'emo_sad', 'Social', 'family', 'friend',
       'female', 'male', 'leisure', 'home', 'work', 'money', 'relig',
       'Physical', 'health', 'wellness', 'sexual', 'food', 'death', 'reward',
       'risk', 'Perception', 'motion', 'space', 'visual', 'auditory',
       'feeling', 'time', 'focuspast', 'focuspresent', 'focusfuture',
       'UniquelyMoral', 'UniquelyEmotional', 'MoralEmotional'],
      dtype='object')

In [53]:
df.drop(['outlet', 'original_text', 'processed_text', 'Prob_liberal'], axis=1, inplace=True)

In [54]:
df.to_csv('liwc_data.csv', index=False)

In [55]:
df

Unnamed: 0,Extremity,label,WC,number,quantity,Drives,affiliation,achieve,power,cogproc,...,visual,auditory,feeling,time,focuspast,focuspresent,focusfuture,UniquelyMoral,UniquelyEmotional,MoralEmotional
0,0.499892,0,306,0.65,1.96,4.90,0.98,0.65,3.27,7.84,...,1.63,0.65,0.00,4.58,4.90,1.96,0.33,5.882353,5.882353,1.633987
1,0.499977,1,419,2.86,6.44,3.10,0.95,0.24,1.91,6.68,...,0.24,0.00,0.00,4.06,3.10,2.63,0.48,5.489260,9.785203,2.386635
2,0.499961,1,404,2.72,4.46,10.15,1.73,1.73,6.93,9.16,...,0.25,0.00,0.00,3.22,2.72,2.72,1.73,5.693069,8.168317,0.990099
3,0.499971,0,413,1.94,4.36,7.02,3.15,1.69,2.66,8.96,...,1.45,0.00,0.00,4.36,4.60,2.91,0.24,4.842615,10.411622,2.179177
4,0.499972,1,413,2.66,4.60,7.02,1.45,1.45,4.12,8.96,...,0.48,0.24,0.00,3.87,3.39,5.33,0.48,4.358354,12.590799,0.726392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,0.499933,1,437,3.43,4.81,4.81,0.92,2.97,1.37,9.38,...,0.23,0.00,0.23,3.66,3.43,3.43,1.37,5.949657,12.128146,1.144165
176,0.495781,1,387,2.58,2.58,7.49,1.81,0.52,4.91,5.17,...,0.00,0.00,0.00,3.10,3.36,2.07,0.26,12.144703,7.493540,4.392765
177,0.499833,1,413,5.08,8.72,3.87,1.21,0.00,2.66,9.69,...,0.48,0.24,0.00,3.63,8.96,0.48,0.24,1.937046,12.348668,2.905569
178,0.499985,1,418,2.39,5.26,10.05,2.63,0.48,6.94,4.78,...,0.48,0.00,0.00,4.31,1.44,5.02,0.24,7.655502,9.808612,2.153110


In [24]:
df.columns

Index(['Extremity', 'WC', 'number', 'quantity', 'Drives', 'affiliation',
       'achieve', 'power', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certitude', 'differ', 'emo_pos', 'emo_neg', 'emo_anx', 'emo_anger',
       'emo_sad', 'Social', 'family', 'friend', 'female', 'male', 'leisure',
       'home', 'work', 'money', 'relig', 'Physical', 'health', 'wellness',
       'sexual', 'food', 'death', 'reward', 'risk', 'Perception', 'motion',
       'space', 'visual', 'auditory', 'feeling', 'time', 'focuspast',
       'focuspresent', 'focusfuture', 'UniquelyMoral', 'UniquelyEmotional',
       'MoralEmotional'],
      dtype='object')

## Z-Scores

Index(['outlet', 'original_text', 'processed_text', 'Prob_liberal',
       'Extremity', 'label', 'WC', 'number', 'quantity', 'Drives',
       'affiliation', 'achieve', 'power', 'cogproc', 'insight', 'cause',
       'discrep', 'tentat', 'certitude', 'differ', 'emo_pos', 'emo_neg',
       'emo_anx', 'emo_anger', 'emo_sad', 'Social', 'family', 'friend',
       'female', 'male', 'leisure', 'home', 'work', 'money', 'relig',
       'Physical', 'health', 'wellness', 'sexual', 'food', 'death', 'reward',
       'risk', 'Perception', 'motion', 'space', 'visual', 'auditory',
       'feeling', 'time', 'focuspast', 'focuspresent', 'focusfuture',
       'UniquelyMoral', 'UniquelyEmotional', 'MoralEmotional'],
      dtype='object')

In [None]:
selected_columns = df.iloc[:, 7:23]

In [None]:
z_scores = selected_columns.apply(zscore)

In [None]:
# Add the z-scores back to the DataFrame
for i, col in enumerate(selected_columns.columns):
    df[f'{col}_zscore'] = z_scores.iloc[:, i]

In [None]:
df.columns[:39]

In [None]:
columns_to_drop = ['original_text', 'processed_text' ,
                   'Probability_con', 'Probability_lib',
                   'Drives', 'Cognition','emo_pos',
                   'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad',
                   'Social', 'Lifestyle', 'Physical', 'focuspast',
                   'focuspresent', 'focusfuture', 'Moral', 'Affect',
                   'Moral-Emotional',]

In [None]:
df_z = df.drop(columns=columns_to_drop)

In [None]:
df_z.to_csv('z_score_extremity.csv', index=False) 

In [None]:
df.describe()