In [1]:
import pandas as pd
import numpy as np

In [4]:
import ssl

# Switch the ssl module's default HTTPS context factory to the unverified one,
# so clients that build their context through ssl._create_default_https_context
# skip certificate verification.
# NOTE(review): this is a process-wide setting and removes protection against
# man-in-the-middle attacks - confirm it is still required for the target feeds.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# Otherwise this Python build has no ssl._create_unverified_context and nothing is changed.

import feedparser
# For text preprocessing
import xml.sax.saxutils as saxutils
import ast
import re
# For fetching pages and extracting ClaimReview markup
from bs4 import BeautifulSoup
import requests
    
# Get the list of article links from a website's feed
def get_articles_url(url):
    """Return the ``link`` of every entry in the feed located at ``url``.

    Args:
        url: Feed address handed to ``feedparser.parse``.

    Returns:
        A list with one link per feed entry, in feed order.
    """
    feed = feedparser.parse(url)
    return [entry.link for entry in feed.entries]

# Save dataset to tsv file
def save_tsv_pandas(data, file_name):
    """Write ``data`` to ``./<file_name>.tsv`` as tab-separated text.

    The index is written as the first column.

    Args:
        data: DataFrame to persist.
        file_name: Target file name without the ``.tsv`` extension.
    """
    destination = "./" + file_name + ".tsv"
    data.to_csv(destination, index=True, sep='\t')

# Load dataset from tsv file
def load_tsv_pandas(file_name):
    """Read ``<file_name>.tsv`` into a DataFrame, using its first column as the index.

    Args:
        file_name: Path of the file without the ``.tsv`` extension.

    Returns:
        The DataFrame parsed from the tab-separated file.
    """
    source = file_name + ".tsv"
    frame = pd.read_csv(source, index_col=0, sep='\t')
    return frame

# Update dataset: add the new entries (indexed by URL) and drop rows that are exact duplicates.
def update_dataset(dataset, new_entries):
    """Return ``dataset`` extended with ``new_entries``, without duplicated rows.

    Uses ``pd.concat`` because ``DataFrame.append`` was removed in pandas 2.0.

    Args:
        dataset: Existing DataFrame.
        new_entries: DataFrame with the rows to add (same columns as ``dataset``).

    Returns:
        A new DataFrame; neither input is modified. Duplicates are detected
        with ``DataFrame.drop_duplicates``, which compares column values only
        (the index is not part of the comparison) and keeps the first occurrence.
    """
    combined = pd.concat([dataset, new_entries])
    return combined.drop_duplicates()

def re_char(str):
    """Remove every character that is not in the allowed set.

    Allowed: ASCII letters and digits, the space character, the punctuation
    ``! @ # $ % & * : , . ; - _ " ' ] [ } { + |`` and the lowercase accented
    letters listed in the pattern. Anything else (including uppercase accented
    letters, which are not listed) is deleted.

    Args:
        str: Text to clean. The parameter name shadows the builtin and is kept
            only so existing callers are unaffected.

    Returns:
        The text with all disallowed characters removed.
    """
    # Raw string: the pattern relies on backslash escapes such as \! and \@,
    # which are invalid escape sequences in a regular string literal.
    return re.sub(r'[^A-Za-z0-9 \!\@\#\$\%\&\*\:\,\.\;\:\-\_\"\'\]\[\}\{\+\á\à\é\è\í\ì\ó\ò\ú\ù\ã\õ\â\ê\ô\ç\|]+', '', str)

# Text Preprocessing
def text_pre_proc(str):
    """Clean the text of a JSON-LD script block and parse it into a Python object.

    Steps: drop ``&quot;`` entities, unescape the remaining XML entities,
    delete characters outside the allowed set (same pattern as ``re_char``),
    then parse the result.

    Args:
        str: Raw text of the script block. The parameter name shadows the
            builtin and is kept only so existing callers are unaffected.

    Returns:
        The parsed object (a dict for a JSON object).

    Raises:
        ValueError, SyntaxError: When the cleaned text can be parsed neither
            as a Python literal nor as JSON; the exception raised is the one
            from ``ast.literal_eval``.
    """
    import json  # function-scope import: only needed by the fallback parser below

    aux = saxutils.unescape(str.replace('&quot;', ''))
    # Remove characters that are not allowed. Raw string: the pattern relies on
    # backslash escapes that are invalid in a regular string literal.
    aux = re.sub(r'[^A-Za-z0-9 \!\@\#\$\%\&\*\:\,\.\;\:\-\_\"\'\]\[\}\{\+\á\à\é\è\í\ì\ó\ò\ú\ù\ã\õ\â\ê\ô\ç\|]+', '', aux)
    try:
        return ast.literal_eval(aux)
    except (ValueError, SyntaxError) as literal_error:
        # literal_eval rejects the JSON literals true/false/null; give the
        # JSON parser a chance before giving up on the block.
        try:
            return json.loads(aux)
        except ValueError:
            raise literal_error from None

# Get ClaimReview
def get_claimReview(url):
    """Download a page and build one row per usable ClaimReview JSON-LD block.

    Every ``<script type="application/ld+json">`` element is parsed with
    ``text_pre_proc``. A block that cannot be parsed, or that lacks any of the
    required fields, is skipped.

    Args:
        url: Address of the article page.

    Returns:
        A list of rows. Each row is a list holding, in this order: url,
        author url, datePublished, claimReviewed, reviewBody (or description,
        or the string 'Empty'), cleaned page title, ratingValue, bestRating,
        alternateName and the ``@type`` of itemReviewed.

    Raises:
        Whatever ``requests.get`` raises when the request fails or times out.
    """
    response = requests.get(url, timeout=30)
    content = BeautifulSoup(response.content, "html.parser")
    claimList = []
    for claimR in content.find_all('script', attrs={"type": "application/ld+json"}):
        try:
            my_dict = text_pre_proc(claimR.get_text(strip=True))
            rating = my_dict['reviewRating']
            linha = [
                url,
                my_dict['author']['url'],
                my_dict['datePublished'],
                my_dict['claimReviewed'],
                # Prefer reviewBody, fall back to description, then to a placeholder.
                my_dict.get('reviewBody', my_dict.get('description', 'Empty')),
                re_char(content.title.get_text().replace('<title>','').replace('</title>','')),
                rating['ratingValue'],
                rating['bestRating'],
                rating['alternateName'],
                my_dict['itemReviewed']['@type'],
            ]
        except Exception:
            # Best effort: blocks that are not complete ClaimReview records are
            # ignored. Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still stop the run.
            continue
        claimList.append(linha)
    return claimList

# Main Function
def main():
    """Scrape the latest fact-check articles and merge them into the stored dataset.

    Reads the feeds of the configured websites, extracts ClaimReview rows from
    every linked article, merges them with ``factCkBr.tsv`` and writes the
    result to ``new_factCkBR.tsv``.
    """
    websites = ["https://aosfatos.org/noticias/feed/", "https://apublica.org/tag/truco/feed/", "https://piaui.folha.uol.com.br/lupa/feed/"]
    toprow = ['URL', 'Author', 'datePublished', 'claimReviewed', 'reviewBody', 'title', 'ratingValue', 'bestRating', 'alternativeName', 'contentType']

    # Step 1 - collect the article links published in each feed
    linksList = [link for feed_url in websites for link in get_articles_url(feed_url)]
    total = len(linksList)
    print ("Numero de links: {}".format(total))

    # Step 2 - extract the ClaimReview rows of every article
    claimList = []
    for count, url in enumerate(linksList, start=1):
        print ("{} de {} > ".format(count, total) + url)
        claimList.extend(get_claimReview(url))

    # Step 3 - build a DataFrame with the new entries, indexed by URL
    new_entries = pd.DataFrame(claimList, columns=toprow).set_index('URL')

    # Step 4 - load the previous dataset, merge the new entries and save
    dataset = load_tsv_pandas('factCkBr')
    save_tsv_pandas(update_dataset(dataset, new_entries), 'new_factCkBR')

In [5]:
# Run the scrape-and-merge pipeline defined above: it fetches the feeds and
# article pages over the network and writes new_factCkBR.tsv.
main()

Numero de links: 70
1 de 70 > https://www.aosfatos.org/noticias/em-pronunciamento-bolsonaro-infla-acoes-do-governo-durante-pandemia/
2 de 70 > https://www.aosfatos.org/noticias/esculturas-de-olhos-sangrando-nao-foram-expostas-em-recife-mas-no-chile-em-2019/
3 de 70 > https://www.aosfatos.org/noticias/montagem-surgida-em-2018-e-atribuida-em-posts-protestos-recentes-contra-bolsonaro/
4 de 70 > https://www.aosfatos.org/noticias/e-falso-que-vacina-contra-covid-19-pode-criar-novas-variantes-do-coronavirus/
5 de 70 > https://www.aosfatos.org/noticias/e-falso-que-papa-francisco-disse-que-apresentara-um-lider-global-em-2022/
6 de 70 > https://www.aosfatos.org/noticias/ao-defender-tratamento-precoce-heinze-falseia-fatos-e-distorce-dados-na-cpi-da-covid-19/
7 de 70 > https://www.aosfatos.org/noticias/virologista-nao-disse-que-vacinados-contra-covid-19-morrerao-em-dois-anos/
8 de 70 > https://www.aosfatos.org/noticias/na-cpi-nise-yamaguchi-defende-tratamento-precoce-com-informacoes-enganosas/
9 d

69 de 70 > https://piaui.folha.uol.com.br/lupa/2021/06/01/dia-imprensa-bolsonaro-ataque/
70 de 70 > https://piaui.folha.uol.com.br/lupa/2021/05/31/verificamos-aeroporto-bolsonaro-equador/


In [3]:
# Read the merged dataset back from disk and turn the URL index into an ordinary column.
df = pd.read_csv("new_factCkBR.tsv", index_col=0, sep='\t')
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   URL              1313 non-null   object 
 1   Author           1313 non-null   object 
 2   datePublished    1313 non-null   object 
 3   claimReviewed    1300 non-null   object 
 4   reviewBody       1301 non-null   object 
 5   title            1313 non-null   object 
 6   ratingValue      1309 non-null   float64
 7   bestRating       1313 non-null   int64  
 8   alternativeName  1309 non-null   object 
 9   contentType      0 non-null      float64
dtypes: float64(2), int64(1), object(7)
memory usage: 102.7+ KB


In [7]:
# Number of rows per fact-checking outlet.
df["Author"].value_counts()

https:piaui.folha.uol.com.brlupa    528
https:apublica.org                  415
https:www.aosfatos.org              370
Name: Author, dtype: int64

In [8]:
# Where the reviewed claim is missing, use the page title as the claim text.
missing_claim = df["claimReviewed"].isna()
df.loc[missing_claim, 'claimReviewed'] = df.loc[missing_claim, 'title']

In [9]:
# Check that no claim text is missing any more.
df["claimReviewed"].isna().value_counts()

False    1313
Name: claimReviewed, dtype: int64

In [10]:
# Distribution of the verdict names used by the outlets.
df["alternativeName"].value_counts()

Falso                      615
falso                      328
Verdadeiro                 119
Exagerado                   87
Sem contexto                42
distorcido                  29
Distorcido                  25
Impossível provar           20
Discutível                  12
Ainda é cedo para dizer      6
Subestimado                  6
insustentável                5
exagerado                    4
Verdadeiro, mas              4
De olho                      3
impreciso                    2
outros                       1
verdadeiro                   1
Name: alternativeName, dtype: int64

In [11]:
# Verdict names grouped by the binary label they are mapped to.
_verdicts_fake = (
    'Falso',
    'falso',
    'distorcido',
    'Distorcido',
    'Impossível provar',
    'insustentável',
    'impreciso',
)
_verdicts_not_fake = (
    'Verdadeiro',
    'Exagerado',
    'Sem contexto',
    'Discutível',
    'Subestimado',
    'Ainda é cedo para dizer',
    'Verdadeiro, mas',
    'exagerado',
    'De olho',
    'verdadeiro',
    'outros',
)

# verdict name -> True when the claim is labelled as fake
fake = dict.fromkeys(_verdicts_fake, True)
fake.update(dict.fromkeys(_verdicts_not_fake, False))

# Verdicts without an entry in the mapping (including missing ones) become NaN.
df['label'] = df['alternativeName'].map(fake)

In [12]:
# Rows that received no label from the verdict mapping.
df[df["label"].isna()]

Unnamed: 0,URL,Author,datePublished,claimReviewed,reviewBody,title,ratingValue,bestRating,alternativeName,contentType,label
471,https://apublica.org/2018/08/truco-erros-e-ace...,https:apublica.org,2018-08-15,'Ninguém tem qualquer comprovação de que os 12...,"Jair Bolsonaro PSL, no programa Mariana Godoy ...",Erros e acertos dos candidatos em 5 frases sob...,,8,,,
550,https://apublica.org/2018/05/truco-em-artigo-e...,https:apublica.org,2018-05-25,Os investimentos em pesquisa desabaram.,"Em artigo no 'Le Monde', o ex-presidente Luiz ...","Em artigo escrito da prisão, Lula distorce dad...",,8,,,
559,https://apublica.org/2018/05/truco-temer-minim...,https:apublica.org,2018-05-09,Temer minimiza desemprego com dados falsos - A...,,Temer minimiza desemprego com dados falsos - A...,,8,,,
748,https://apublica.org/2016/06/truco-verdades-e-...,https:apublica.org,2016-06-03,Neste momento em que temos um déficit foi apr...,O deputado federal Nelson Marchezan Jr. PSDB-R...,Verdades e mentiras sobre o pacotão de reajust...,,8,,,


In [13]:
# Rows without a mapped verdict are labelled True (fake).
# Assign the result back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: that chained in-place call does not update `df`
# when pandas Copy-on-Write is active. After filling, no value is missing, so
# the column can be stored as plain booleans.
df['label'] = df['label'].fillna(True).astype(bool)

In [14]:
# Expose the claim text under the column name 'text', the name used when the
# label/text pair is exported further down.
df.rename(columns={'claimReviewed': 'text'}, inplace=True)

In [15]:
# Preview the first two rows, including the renamed 'text' column and the new 'label'.
df.head(2)

Unnamed: 0,URL,Author,datePublished,text,reviewBody,title,ratingValue,bestRating,alternativeName,contentType,label
0,https://aosfatos.org/noticias/governo-bolsonar...,https:www.aosfatos.org,2019-07-22,Espaço dedicado para os eleitores do Bolsonaro...,Publicações que circulam nas redes sociais vej...,Governo Bolsonaro não suspendeu distribuição d...,1.0,5,falso,,True
1,https://aosfatos.org/noticias/nao-e-miriam-lei...,https:www.aosfatos.org,2019-07-22,Vos apresento a funcionária protegida pela Glo...,Uma foto de um treinamento de defesa contra as...,Não é Miriam Leitão quem segura fuzil ao lado ...,1.0,5,falso,,True


In [16]:
# Export only the label and the claim text, without the index column.
df.to_csv('factck.br.csv', columns=['label', 'text'], index=False)

In [1]:
import pandas as pd

# Reload the exported label/text file and show its (rows, columns) shape.
df = pd.read_csv('factck.br.csv')
df.shape

(1313, 2)

In [2]:
# Check that the exported labels contain no missing values.
df["label"].isna().value_counts()

False    1313
Name: label, dtype: int64

In [3]:
# Class balance of the final dataset.
df["label"].value_counts()

True     1028
False     285
Name: label, dtype: int64