In [None]:
#Suppress all warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
import re

import nltk
import pandas as pd
from nltk import pos_tag
from nltk import word_tokenize
from nltk.stem import PorterStemmer

# Project root directory. Set the PUBLICSPHERE_PATH environment variable to run
# the notebook on another machine; the original location remains the default.
path = os.environ.get(
    'PUBLICSPHERE_PATH',
    'C:/Users/shrim/Desktop/Research/Social Media Analysis/publicsphere',
)

### Load comments content

In [None]:
# Load the manually coded comment sample
df = pd.read_csv(f'{path}/data/sample/Data_ReadyForAnalysis_WithComments&MetaInfo.csv') # change file here
# Keep the comment key and its raw text. The key column is 'commentId': it is the
# column the manual-coding frame is merged and de-duplicated on further down.
# .copy() makes `data` an independent frame so the derived columns added in
# later cells are written to it reliably.
data = df[['commentId','commentText']].copy()

In [None]:
def processing_no_stemming(text):
    """Normalise a comment without stemming.

    The input is coerced to ``str`` and split on whitespace. Each piece is
    lower-cased, then URLs (``http...``) and every character other than ASCII
    letters, digits and ``@`` are replaced by a space. The pieces are joined
    back together with runs of spaces collapsed and the ends trimmed.

    Returns the cleaned string.
    """
    noise = r'http\S+|[^a-zA-Z0-9@]|\''
    cleaned_words = []
    for word in str(text).split():
        cleaned_words.append(re.sub(noise, ' ', word.lower()))
    # After cleaning, spaces are the only whitespace left, so split/join both
    # trims the ends and squeezes repeated spaces into one.
    return ' '.join(' '.join(cleaned_words).split())

def processing_w_stemming(text):
    """Normalise a comment and Porter-stem every resulting token.

    The input is coerced to ``str`` and split on whitespace. Each piece is
    lower-cased, then URLs (``http...``) and every character other than ASCII
    letters, digits and ``@`` are replaced by a space. The cleaned text is
    then re-split into tokens and each token is stemmed individually.

    Returns the stemmed tokens joined by single spaces.
    """
    # define stemmer
    ps = PorterStemmer()
    cleaned = ' '.join(re.sub(r'http\S+|[^a-zA-Z0-9@]|\'', ' ', w.lower())
                       for w in str(text).split())
    # Stem only after re-splitting the cleaned text. Cleaning turns punctuation
    # into spaces, so stemming the un-split piece hands the stemmer strings such
    # as "apologize " or "that s", which it cannot stem correctly.
    return ' '.join(ps.stem(token) for token in cleaned.split())

In [None]:
# Stemmed version of every comment
data['CommentsStemmed'] = data['commentText'].apply(processing_w_stemming)

# Cleaned but unstemmed version of every comment, stored in its own column
data['CommentsNonStemmed'] = data['commentText'].apply(processing_no_stemming)

### Tokenization and POS tagging

Note that POS tags are assigned using NLTK's universal tagset so that they align with the categories in the formality score equation.

In [None]:
# Split each cleaned (unstemmed) comment into word tokens
data['tokens'] = data['CommentsNonStemmed'].apply(word_tokenize)

# Label every token list with universal-tagset part-of-speech tags
data['POS'] = data['tokens'].apply(
    lambda token_list: pos_tag(token_list, tagset="universal")
)

### Calculating language formality


The formality score contrasts the frequency of nouns, adjectives, articles and prepositions with the frequency of pronouns, adverbs, verbs and interjections.

The equation, using the corresponding tags in NLTK's universal POS tagset, is as follows:

<font size = "3"><center> $\frac{{{NOUN}+{ADJ}+{DET}+{PRT}-{PRON}-{ADV}-{VERB}-{CONJ}}+100}{2}$ </center></font> 
(Heylighen & Dewaele, 2002)

In [None]:
# Tally how often each POS tag occurs within every comment
data['POS_count'] = data['POS'].apply(
    lambda tagged_tokens: nltk.FreqDist(pair[1] for pair in tagged_tokens)
)

data['POS_count'].head(20)

0     {'ADV': 3, 'NOUN': 8, 'VERB': 5, 'DET': 5, 'AD...
1     {'NOUN': 16, 'VERB': 11, 'ADV': 8, 'ADP': 9, '...
2                                           {'NOUN': 2}
3            {'DET': 1, 'NOUN': 2, 'VERB': 3, 'ADV': 1}
4                       {'ADP': 1, 'NUM': 1, 'NOUN': 1}
5     {'NOUN': 10, 'VERB': 14, 'DET': 3, 'PRON': 6, ...
6     {'NOUN': 10, 'VERB': 14, 'DET': 3, 'PRON': 6, ...
7     {'ADJ': 2, 'NOUN': 21, 'ADP': 12, 'DET': 16, '...
8     {'NOUN': 8, 'PRON': 3, 'VERB': 4, 'CONJ': 1, '...
9     {'PRON': 2, 'VERB': 2, 'ADJ': 3, 'DET': 3, 'NO...
10    {'PRON': 2, 'VERB': 2, 'ADJ': 3, 'DET': 3, 'NO...
11    {'PRON': 1, 'VERB': 2, 'DET': 2, 'NOUN': 4, 'A...
12    {'PRON': 1, 'VERB': 2, 'DET': 2, 'NOUN': 4, 'A...
13    {'NOUN': 5, 'VERB': 2, 'ADV': 1, 'ADP': 1, 'DE...
14    {'PRON': 4, 'VERB': 6, 'DET': 4, 'ADJ': 2, 'NO...
15            {'VERB': 2, 'ADJ': 1, 'ADV': 1, 'PRT': 1}
16    {'DET': 8, 'NOUN': 11, 'VERB': 9, 'ADJ': 5, 'A...
17    {'DET': 8, 'NOUN': 11, 'VERB': 9, 'ADJ': 5

In [None]:
#Calculating formality score
# Tags that raise the score (numerator) and tags that lower it (denominator)
numerator = ['NOUN','ADJ','DET','PRT']
denominator = ['PRON','ADV','VERB','CONJ']

def formality_from_counts(pos_count):
    """Return the formality score for one comment's POS tag tally.

    pos_count maps a POS tag to its count in the comment; absent tags count
    as 0. The score is (sum of numerator tags - sum of denominator tags + 100) / 2,
    using every tag in both lists.
    """
    formal = sum(pos_count.get(tag, 0) for tag in numerator)
    contextual = sum(pos_count.get(tag, 0) for tag in denominator)
    return (formal - contextual + 100) / 2

# NOTE(review): this uses raw tag counts. The "+100 ... /2" scaling in the cited
# equation looks designed for percentage frequencies - confirm whether counts
# should be divided by the comment's token count first. Also confirm that 'PRT'
# (rather than 'ADP') is the intended tag for prepositions.

# One score per comment, assigned as a whole column. This avoids chained
# element-wise assignment and gives the column a float dtype from the start.
data['formality'] = data['POS_count'].apply(formality_from_counts)

### Comparing scores

As with the other deliberative qualities, the formality score is compared against both the summed rationality score and its binary (dummy) version.

In [None]:
#Getting manually coded score
# The four manually coded indicators that make up the rationality score
indicator_cols = ['TopicRelevance','Reasoning','BackgroundInfo','ExternalEvidence']
# .copy() so the derived columns below are added to an independent frame
manual_rationality = df[['commentId', *indicator_cols, 'ExternalEvidence_1_TEXT']].copy()

#create rationality score by summing the 4 indicators
# Sum only the indicator columns so no other numeric column can leak into the score
manual_rationality['rationality_score'] = manual_rationality[indicator_cols].sum(axis = 1, numeric_only=True)

#create dummy variable
# 1 when at least one indicator was coded (score != 0), else 0
manual_rationality['dummy'] = (manual_rationality['rationality_score'] != 0).astype(int)

manual_rationality

#merge dataframes
# Keep the first row per commentId on both sides before merging. This yields the
# same rows as merging first and de-duplicating afterwards, without the
# many-to-many expansion that repeated ids would otherwise cause.
manual_unique = manual_rationality.drop_duplicates(subset=['commentId'])
data = data.drop_duplicates(subset=['commentId'], ignore_index = True)
data = data.merge(manual_unique, on='commentId')

data

Unnamed: 0,commentId,commentText,CommentsStemmed,CommentsNonStemmed,tokens,POS,POS_count,formality,TopicRelevance,Reasoning,BackgroundInfo,ExternalEvidence,ExternalEvidence_1_TEXT,rationality_score,dummy
0,UgwtCALfP60D8ZvhHOp4AaABAg,Only thing needed is the roasts of the fuckers...,onli thing need is the roast of the fucker tha...,only thing needed is the roasts of the fuckers...,"[only, thing, needed, is, the, roasts, of, the...","[(only, ADV), (thing, NOUN), (needed, VERB), (...","{'ADV': 3, 'NOUN': 8, 'VERB': 5, 'DET': 5, 'AD...",53.0,1,1,0,0,,2,1
1,Ugw2XXTMmSzbW49HvD14AaABAg.8v7l4MgiomY8v7oDw3r2nq,"Saehar Bokhari how about u expand on that, bec...",saehar bokhari how about u expand on that beca...,saehar bokhari how about u expand on that beca...,"[saehar, bokhari, how, about, u, expand, on, t...","[(saehar, NOUN), (bokhari, VERB), (how, ADV), ...","{'NOUN': 16, 'VERB': 11, 'ADV': 8, 'ADP': 9, '...",54.5,1,0,1,0,,2,1
2,Ugiz8nfgau9byHgCoAEC,Pure evil,pure evil,pure evil,"[pure, evil]","[(pure, NOUN), (evil, NOUN)]",{'NOUN': 2},51.0,1,0,0,0,,1,1
3,Ugiw24w2DD-8EXgCoAEC,the beek didn't even apologize. amazing,the beek didn t even apologize amaz,the beek didn t even apologize amazing,"[the, beek, didn, t, even, apologize, amazing]","[(the, DET), (beek, NOUN), (didn, NOUN), (t, V...","{'DET': 1, 'NOUN': 2, 'VERB': 3, 'ADV': 1}",49.5,0,0,0,0,,0,0
4,UggcaoEIJpYwe3gCoAEC,Under 301 club!,under 301 club,under 301 club,"[under, 301, club]","[(under, ADP), (301, NUM), (club, NOUN)]","{'ADP': 1, 'NUM': 1, 'NOUN': 1}",50.5,0,0,0,0,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3221,UgynUnOlxIjSh3ARy4F4AaABAg.8pMRfMuIMsc8pMahcyl8w-,@LAST CALL Oh that's what you use yourself?,@last call oh that what you use yourself,@last call oh that s what you use yourself,"[@, last, call, oh, that, s, what, you, use, y...","[(@, NOUN), (last, ADJ), (call, NOUN), (oh, VE...","{'NOUN': 2, 'ADJ': 1, 'VERB': 3, 'ADP': 1, 'PR...",48.5,0,0,0,0,,0,0
3222,UgwhaLzvJAORa5kfved4AaABAg,ABC is owned by the Bilderberg Group.,abc is own by the bilderberg group,abc is owned by the bilderberg group,"[abc, is, owned, by, the, bilderberg, group]","[(abc, NOUN), (is, VERB), (owned, VERB), (by, ...","{'NOUN': 3, 'VERB': 2, 'ADP': 1, 'DET': 1}",51.0,0,0,0,0,,0,0
3223,UgwNbLpnlTffTnh9dTJ4AaABAg,lol both these guys look like brothers . One i...,lol both these guy look like brother one is mu...,lol both these guys look like brothers one is ...,"[lol, both, these, guys, look, like, brothers,...","[(lol, ADJ), (both, DET), (these, DET), (guys,...","{'ADJ': 4, 'DET': 2, 'NOUN': 3, 'VERB': 3, 'AD...",53.0,0,0,0,0,,0,0
3224,Ugy7xo56npDOiGcxqNp4AaABAg.8e5xq7bbujt8e8VLuajhuv,yeah no one seems to remember he said that.......,yeah no one seem to rememb he said that i reme...,yeah no one seems to remember he said that i r...,"[yeah, no, one, seems, to, remember, he, said,...","[(yeah, ADV), (no, DET), (one, NOUN), (seems, ...","{'ADV': 3, 'DET': 5, 'NOUN': 5, 'VERB': 9, 'PR...",50.0,0,0,0,0,,0,0


In [None]:
# Pairwise correlations between the formality score and both rationality measures
score_columns = ['formality','rationality_score','dummy']
data[score_columns].corr()

Unnamed: 0,formality,rationality_score,dummy
formality,1.0,0.203761,0.118002
rationality_score,0.203761,1.0,0.819416
dummy,0.118002,0.819416,1.0
