# Comparing dictionary and manual coding: Rationality
Data: *Data_FixedCodingMistakes_noICR_ReadyForAnalysis_with_comments*. 

This notebook derives rationality scores using the following methods:
1. Manual coding result: RATIONALITY_DUMMY
2. Flesch-Kincaid index via function `textstat.flesch_reading_ease`
3. Nithyanand et al. (2017b): a list of websites that is related to fake news
4. Munger and Phillips (2019): “alternative information networks”

In [1]:
import pandas as pd
import os
import textstat
import numpy as np
import re

### Read the manual coding and get the score
Only use columns about meta-information and about rationality.

In [2]:
# Load the full dataset from CSV and preview its first rows.
compare_rationality = pd.read_csv('data/full_data.csv')
compare_rationality.head()

Unnamed: 0,StartDate,RecordedDate,IPAddress,Finished,Coder,ID,Mark_ID,Genre,topiccode,Platform,...,dislikeCount_video,likeCount_video,date_difference,commentCount_video,replyCount_comment,topic,subscribers,HATELIST_FOCUSED_DUMMY,Time_comment_year,Time_video_year
0,5/30/2021 13:03:17,5/30/2021 13:04:17,62.194.51.29,1,6,UgyPHwv8G0cDE6-wEgl4AaABAg.8_0ZjJKSJty8_0kXGkAd2U,119,0,0,1,...,,,,,,,,0,2017,2017.0
1,10/11/2021 10:34:05,10/11/2021 10:36:46,213.127.109.191,1,6,Ugx2WXq9UdV8mPPjejJ4AaABAg.8yHCKV0Boe58yYRxEQEF45,282,1,2,1,...,195.0,3817.0,743.0,1748.0,,economy,3630000.0,0,2019,2019.0
2,9/9/2021 18:49:48,9/9/2021 18:51:32,213.127.110.0,1,6,1110578710648890000,372,2,4,2,...,,,,,,,,0,2019,
3,6/6/2021 16:12:46,6/6/2021 16:16:16,213.127.76.145,1,6,UgwUPFScjJ0MCeaP2F54AaABAg.8lvp3fc9Euf8lvvgsUgEgV,769,0,0,1,...,,,,,,,,0,2018,2018.0
4,6/13/2021 13:25:49,6/13/2021 13:27:28,213.127.82.232,1,6,UgwWKCWtSJdFvjGHvTp4AaABAg.8kUC5dGrQ2H8kUDRihE2f3,1206,0,0,1,...,,,,,,,,0,2018,2018.0


In [3]:
# Number of rows in the loaded dataset.
len(compare_rationality)

3862

Double check dummy variable calculation

In [4]:
def mannual_concrete(compare_rationality):
    """Recompute the manual rationality sum and dummy from the coded indicators.

    Adds two float columns to ``compare_rationality`` (in place) and returns
    the same frame:

    * ``mannual_sum``   -- row-wise sum of ``Reasoning``, ``BackgroundInfo``
      and ``ExternalEvidence``.
    * ``mannual_dummy`` -- 1.0 where that sum is at least 1, otherwise 0.0.

    The computation is vectorized rather than written cell by cell with
    ``.loc`` inside ``iterrows``. Besides being much faster, this stays
    correct when index labels are not unique (a label-based write would
    overwrite every row sharing the label).
    """
    indicator_columns = ['Reasoning', 'BackgroundInfo', 'ExternalEvidence']
    # skipna=False keeps plain-addition semantics: a missing indicator makes
    # the sum NaN, and NaN >= 1 is False, so such rows get a dummy of 0.
    allsum = (
        compare_rationality[indicator_columns]
        .sum(axis=1, skipna=False)
        .astype(float)
    )
    compare_rationality['mannual_sum'] = allsum
    compare_rationality['mannual_dummy'] = (allsum >= 1).astype(float)
    return compare_rationality

In [5]:
# Add the recomputed mannual_sum / mannual_dummy columns to the dataset.
compare_rationality = mannual_concrete(compare_rationality)

In [6]:
# Sanity check: correlation between the dataset's RATIONALITY_DUMMY and the
# recomputed mannual_dummy.
f = compare_rationality[['RATIONALITY_DUMMY','mannual_dummy']]
f.corr()

Unnamed: 0,RATIONALITY_DUMMY,mannual_dummy
RATIONALITY_DUMMY,1.0,1.0
mannual_dummy,1.0,1.0


### Calculate FK score

In [7]:
def F_K_score(compare_rationality):
    """Add an ``FK_SCORE`` column with the Flesch reading-ease score per comment.

    For each value of ``commentText`` the score is
    ``textstat.flesch_reading_ease(str(text))``. Missing comments get NaN.
    The column is written into ``compare_rationality`` in place and the same
    frame is returned.

    Missing values are detected with ``isinstance(..., float)`` plus explicit
    ``None`` / ``pd.NA`` checks. An exact ``type(x) != float`` comparison
    misses ``numpy.float64`` NaN (what ``iterrows`` yields when a row holds
    only floats) and ``None``, which would then be scored as the literal
    strings "nan" / "None".
    """
    def _score(comment):
        # Floats in a text column are treated as missing, as before.
        if isinstance(comment, float) or comment is None or comment is pd.NA:
            return np.nan
        return textstat.flesch_reading_ease(str(comment))

    # Assign the whole column positionally in one step: faster than per-row
    # .loc writes and safe when index labels are not unique.
    compare_rationality['FK_SCORE'] = np.array(
        [_score(comment) for comment in compare_rationality['commentText']],
        dtype=float,
    )
    return compare_rationality

In [8]:
# Add the FK_SCORE column. F_K_score writes into the frame it receives and
# returns it; pd.DataFrame(...) re-wraps that returned frame.
compare_rationality = pd.DataFrame(F_K_score(compare_rationality))

### Find false sources

In [9]:
def detect_false_sources(text, sources=None):
    """Count host names in ``text`` whose domain is in the false-source list.

    Parameters
    ----------
    text : str
        Comment text to scan.
    sources : collection of str, optional
        Lower-case domains (e.g. ``'dailykos.com'``) to match against.
        Defaults to the notebook-level ``false_sources`` list.

    Returns
    -------
    int
        Number of ``.com`` / ``.org`` host names found in ``text`` whose last
        two labels appear in ``sources``. Each matched domain is printed.

    Notes
    -----
    The earlier pattern wrapped ``www`` in square brackets, which is a
    character class: it excluded the single letter "w" (plus space, ':', '/',
    '(' and ')'), so every domain containing a "w" was cut short
    (``infowars.com`` became ``ars.com``) and could never match. Host names
    are now matched as dot-separated labels, case-insensitively, and the TLD
    must end the host (``foo.computer`` is no longer read as ``foo.com``).
    """
    if sources is None:
        sources = false_sources
    count = 0
    # Scan for .com hosts first, then .org, keeping the original report order.
    for tld in ('com', 'org'):
        pattern = r'(?:[A-Za-z0-9-]+\.)+' + tld + r'(?![A-Za-z0-9-])'
        for host in re.findall(pattern, text, flags=re.IGNORECASE):
            # Reduce e.g. "www.example.com" to its registrable part "example.com".
            name = '.'.join(host.lower().split('.')[-2:])
            if name in sources:
                print(repr(name))
                count += 1
    return count

In [10]:
def detect_ain(text, titles=None):
    """Count mentions in ``text`` of titles from the AIN dictionary.

    Parameters
    ----------
    text : str
        Comment text to scan.
    titles : iterable of str, optional
        Titles to look for. Defaults to the notebook-level ``ain`` list.

    Returns
    -------
    int
        Total number of (case-sensitive) occurrences of any title in
        ``text``. The occurrences found for each matching title are printed.

    Notes
    -----
    The count previously grew by ``len(ain)`` (the size of the whole
    dictionary) per matching title instead of by the number of matches.
    Titles are now escaped, so characters such as ``+`` or ``(`` are matched
    literally rather than interpreted as regex syntax.
    """
    if titles is None:
        titles = ain
    count = 0
    for title in titles:
        # Skip blanks / non-text entries: an empty pattern matches everywhere.
        if not isinstance(title, str) or not title:
            continue
        hits = re.findall(re.escape(title), text)
        if hits:
            print(hits)
            count += len(hits)
    return count

In [11]:
# Reference lists read by detect_false_sources and detect_ain.
# false_sources: first (unnamed) column of the opensources CSV, fetched over
# the network, so this cell needs internet access.
false_sources=pd.read_csv('https://raw.githubusercontent.com/BigMcLargeHuge/opensources/master/sources/sources.csv')["Unnamed: 0"].tolist()
# ain: the 'title' column of the local labeled AIN dictionary.
ain = pd.read_csv('data/dictionaries/labeled_ain.csv')['title'].tolist()
# Report the size of both lists.
print(len(false_sources),len(ain))

833 51


In [12]:
# Per comment: number of false-source domains plus AIN title mentions.
# Missing comments (read by pandas as float NaN) get NaN.
# The column is assigned once instead of up to three label-based .loc writes
# per row while iterating; the redundant initial 0 is dropped. The helpers
# are still called in the same order (domains, then AIN) for each comment,
# and the column stays float.
compare_rationality['FALSE_SCOURCES'] = np.array(
    [
        np.nan
        if isinstance(comment, float)
        else detect_false_sources(comment) + detect_ain(comment)
        for comment in compare_rationality['commentText']
    ],
    dtype=float,
)

'dailykos.com'
['Project Veritas']


As the output shows, only two comments mention a false source, so this method may not be sensitive enough to detect rationality.

### Check descriptive result

In [13]:
# Inspect comments with an FK_SCORE below 10, alongside their RATIONALITY_DUMMY.
compare_rationality.loc[compare_rationality['FK_SCORE']<10,['commentText','RATIONALITY_DUMMY','FK_SCORE']]

Unnamed: 0,commentText,RATIONALITY_DUMMY,FK_SCORE
72,Hahaha...karma bitch!!!,0,-6.70
97,@Fernando Margellan BODEGA BABIES!!!,0,8.20
164,Exactly,0,-47.99
182,beautiful!,0,-47.99
205,"""Comedy""",0,-47.99
...,...,...,...
3725,I LOVE YOUUUUUUUUUUUUUUUU,0,-278.44
3779,Hahahahahhahahahhaha you don't have a clue.,0,6.17
3782,Hahahahahahaha. They are going to federal jail...,0,-22.27
3792,HAHAHAHahahahah...hah...huh?,0,-555.59


In [14]:
# Summary statistics of the FK_SCORE column.
compare_rationality['FK_SCORE'].describe()

count    3862.000000
mean       73.362890
std        33.888733
min      -555.590000
25%        60.820000
50%        77.740000
75%        91.780000
max       206.840000
Name: FK_SCORE, dtype: float64

The FK score may not work here because the comments are short: the [FK score](https://en.wikipedia.org/wiki/Flesch–Kincaid_readability_tests) formula is not suitable for short texts.

In [15]:
# Summary statistics of RATIONALITY_DUMMY; for a 0/1 column the mean is the
# share of rows coded 1.
compare_rationality['RATIONALITY_DUMMY'].describe()

count    3862.000000
mean        0.180476
std         0.384634
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: RATIONALITY_DUMMY, dtype: float64

In [16]:
# Display the frame, now including mannual_sum, mannual_dummy, FK_SCORE and
# FALSE_SCOURCES.
# NOTE(review): consider .head() here to keep the rendered output short.
compare_rationality

Unnamed: 0,StartDate,RecordedDate,IPAddress,Finished,Coder,ID,Mark_ID,Genre,topiccode,Platform,...,replyCount_comment,topic,subscribers,HATELIST_FOCUSED_DUMMY,Time_comment_year,Time_video_year,mannual_sum,mannual_dummy,FK_SCORE,FALSE_SCOURCES
0,5/30/2021 13:03:17,5/30/2021 13:04:17,62.194.51.29,1,6,UgyPHwv8G0cDE6-wEgl4AaABAg.8_0ZjJKSJty8_0kXGkAd2U,119,0,0,1,...,,,,0,2017,2017.0,0.0,0.0,121.22,0.0
1,10/11/2021 10:34:05,10/11/2021 10:36:46,213.127.109.191,1,6,Ugx2WXq9UdV8mPPjejJ4AaABAg.8yHCKV0Boe58yYRxEQEF45,282,1,2,1,...,,economy,3630000.0,0,2019,2019.0,0.0,0.0,88.74,0.0
2,9/9/2021 18:49:48,9/9/2021 18:51:32,213.127.110.0,1,6,1110578710648890000,372,2,4,2,...,,,,0,2019,,0.0,0.0,68.77,0.0
3,6/6/2021 16:12:46,6/6/2021 16:16:16,213.127.76.145,1,6,UgwUPFScjJ0MCeaP2F54AaABAg.8lvp3fc9Euf8lvvgsUgEgV,769,0,0,1,...,,,,0,2018,2018.0,1.0,1.0,84.68,0.0
4,6/13/2021 13:25:49,6/13/2021 13:27:28,213.127.82.232,1,6,UgwWKCWtSJdFvjGHvTp4AaABAg.8kUC5dGrQ2H8kUDRihE2f3,1206,0,0,1,...,,,,0,2018,2018.0,0.0,0.0,89.75,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3857,8/19/2021 14:50:13,8/19/2021 14:54:28,62.194.51.29,1,6,1152219467579100000,10000695,0,4,2,...,,,,0,2019,,0.0,0.0,70.29,0.0
3858,8/19/2021 15:10:27,8/19/2021 15:12:21,62.194.51.29,1,6,1085362296472430000,10007008,1,4,2,...,,,,0,2019,,0.0,0.0,74.86,0.0
3859,10/6/2021 16:08:39,10/6/2021 16:10:42,213.127.113.113,1,6,UghFY3QJ6nmT_ngCoAEC.7-H0Z7--wxd8goqpaPs-bl,20000102,0,3,1,...,,east,6740000.0,0,2018,2010.0,0.0,0.0,93.14,0.0
3860,10/15/2021 18:30:04,10/15/2021 18:35:40,213.127.109.191,1,6,UgyWabsmmnq3zam4DgZ4AaABAg,20000418,2,3,1,...,0.0,east,6800000.0,0,2018,2015.0,0.0,0.0,96.18,0.0


In [17]:
# Write all rows, including the added columns, to CSV without the index.
# Assumes outputs/automated_results/ already exists (to_csv does not create
# directories) -- TODO confirm.
compare_rationality.to_csv("outputs/automated_results/Rationality_FK&fake.csv",index=False)