# Conversão de Todos os Arquivos

In [251]:
import pandas as pd
from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Default DPI applied to figures created after this point.
plt.rcParams['figure.dpi']= 150

# One table per news outlet, read from CSV files under tables/ (relative path).
# NOTE(review): judging by the rendered Veja output further down, each row holds
# a `date` plus ten "N - Título / N - URL / N - Tema" column triples -- confirm
# that the other three files share that layout.
veja = pd.read_csv('tables/veja.csv')
estadao = pd.read_csv('tables/estadao.csv')
folha = pd.read_csv('tables/folha.csv')
uol = pd.read_csv('tables/uol.csv')

def getColumnsWithThemes(table):
    """Return the names of the columns of `table` that contain 'Tema'.

    The result is a list in the same order as `table.columns`.
    """
    return [name for name in table.columns if 'Tema' in name]

def getAllThemes(table):
    """Count every value found in the theme columns of `table`.

    Returns a plain dict mapping each raw theme value to the number of
    times it occurs across all columns selected by `getColumnsWithThemes`.
    """
    occurrences = Counter(
        theme
        for column in getColumnsWithThemes(table)
        for theme in table[column]
    )
    return dict(occurrences)

def filterTableByYear(table, year):
    """Return the rows of `table` whose `date` string contains `year`.

    `year` is handed to `Series.str.contains`, so it is matched anywhere
    inside the date text.
    """
    matchesYear = table['date'].str.contains(year)
    return table[matchesYear]

def get2018Table(table):
    """Return the rows of `table` whose `date` contains '2018'."""
    targetYear = '2018'
    return filterTableByYear(table, targetYear)
def get2017Table(table):
    """Return the rows of `table` whose `date` contains '2017'."""
    targetYear = '2017'
    return filterTableByYear(table, targetYear)
    

# Raw theme frequencies per outlet, computed over each full table (no year
# filter). These names are not referenced again in the visible part of this
# file; they appear to be kept for interactive inspection.
vejaThemes = getAllThemes(veja)
estadaoThemes = getAllThemes(estadao)
folhaThemes = getAllThemes(folha)
uolThemes = getAllThemes(uol)

# Canonical category names. `matchBaseTheme` checks them in this order, so the
# order decides which one wins when a raw theme contains more than one.
baseThemes = [
    'saude',
    'entretenimento',
    'politica',
    'esporte',
    'ciencia',
    'brasil',
    'educacao',
    'tecnologia',
    'blog',
    'economia',
    'outros',
    'internacional',
]

def matchBaseTheme(s):
    """Map `s` to the first entry of `baseThemes` that occurs inside it.

    Returns `(baseTheme, s)` on a match, `('nan', s)` when `s` is exactly
    the string 'nan', and None otherwise.
    """
    firstMatch = next((base for base in baseThemes if base in s), None)
    if firstMatch is not None:
        return (firstMatch, s)

    return ('nan', s) if s == 'nan' else None

def createConversionTable(themes, cleanUpFunction):
    """Build a lookup from original theme text to its cleaned category.

    `cleanUpFunction` is called once per key of `themes` and must return a
    `(category, originalTheme)` pair; the result maps `originalTheme` to
    `category`.
    """
    rawThemes = list(themes.keys())
    return {
        original: category
        for category, original in (cleanUpFunction(raw) for raw in rawThemes)
    }

def countThemes(themes, conversionTable):
    """Aggregate raw theme counts into cleaned categories.

    Each key of `themes` is converted to `str` and looked up in
    `conversionTable`; the quantities of themes sharing a category are
    summed. Returns an OrderedDict sorted from most to least frequent.
    """
    totals = Counter()

    for rawTheme, amount in themes.items():
        category = conversionTable[str(rawTheme)]
        totals[category] = totals[category] + amount

    ranked = totals.most_common()
    return OrderedDict(ranked)

def printTable(table, cleanupFunction, filename, title):
    """Save a bar chart of cleaned theme counts for `table`.

    The raw themes of `table` are counted, mapped to categories with
    `cleanupFunction` and plotted from most to least frequent. The image is
    written to 'results/' + `filename`; nothing is returned.

    Parameters:
        table: DataFrame accepted by `getAllThemes`.
        cleanupFunction: callable returning `(category, originalTheme)`.
        filename: file name of the image, relative to 'results/'.
        title: title drawn above the chart.
    """
    themesTable = getAllThemes(table)
    conversionTable = createConversionTable(themesTable, cleanupFunction)
    themeCounts = countThemes(themesTable, conversionTable)

    # Set the DPI *before* the first plotting call: the figure picks up
    # rcParams when it is created, so setting it after plt.bar() only affected
    # the next figure and left the first saved chart at the previous DPI.
    plt.rcParams['figure.dpi'] = 300

    # Materialize the dict views so matplotlib receives plain sequences.
    labels = list(themeCounts.keys())
    heights = list(themeCounts.values())
    positions = np.arange(len(labels))

    plt.bar(positions, heights)
    plt.xlabel('Tema', fontsize=10)
    plt.ylabel('Quantidade', fontsize=10)
    plt.xticks(positions, labels, fontsize=8, rotation=50)
    plt.title(title)
    plt.savefig('results/' + filename, bbox_inches='tight')
    # Closing the figure discards its axes and contents, so no separate
    # cla()/clf() is needed before the next call starts a fresh figure.
    plt.close()
    

def cleanVejaThemes(themeString):
    """Classify a raw Veja theme into a category.

    `themeString` is converted to `str` and tested against the substring
    rules below, in order; the first rule with a matching marker wins. When
    no rule matches, `matchBaseTheme` is tried, and as a last resort the raw
    string is used as its own category.

    Returns a `(category, themeString)` tuple.
    """
    themeString = str(themeString)

    # (category, markers) pairs; order matters because the specific
    # 'blog/...' rules must be tried before the generic 'blog' rule.
    orderedRules = (
        ('esporte', ('esporte', 'placar')),
        ('brasil', ('blog/rio-grande-do-sul', 'blog/parana', 'blog/bahia')),
        ('politica', (
            'blog/parlatorio',
            'blog/noblat',
            'blog/sergio-praca',
            'blog/felipe-moura-brasil',
            'blog/mailson-da-nobrega',
            'blog/desvendados',
        )),
        ('economia', ('blog/dora-kramer',)),
        ('blog', ('blog',)),
        ('outros', (
            'galeria-fotos',
            'especiais',
            'gastronomia',
            'mundo',
            'ideias',
            '?p=2643144',
            'revista-veja',
            '?p=1833069',
            'tveja',
        )),
    )

    for category, markers in orderedRules:
        if any(marker in themeString for marker in markers):
            return (category, themeString)

    baseMatch = matchBaseTheme(themeString)
    if baseMatch is not None:
        return baseMatch

    return (themeString, themeString)


def cleanUolThemes(themeString):
    """Classify a raw UOL theme into a category.

    `themeString` is converted to `str` and tested against the substring
    rules below, in order; the first rule with a matching marker wins. When
    no rule matches, `matchBaseTheme` is tried, and anything still unmatched
    falls into 'outros'.

    Returns a `(category, themeString)` tuple.
    """
    themeString = str(themeString)

    # (category, markers) pairs checked top to bottom. The order is part of
    # the behavior: e.g. the sports rule runs before the generic 'blog' rule.
    # Repeated markers from the earlier if-chain ('chicobarney', 'cenapop')
    # are listed once; duplicates could never change the outcome.
    orderedRules = (
        ('esporte', (
            'sport',
            'espn',
            'andrerocha',
            'copadomundo',
            'placar',
            'marcelrizzo',
            'corneta',
        )),
        ('blog', ('blog', 'liabock', 'paulosampaio')),
        ('politica', (
            'jamilchade',
            'poder',
            'carlosmelo',
            'congressoemfoco',
            'josiasdesouza',
            'eleicoes',
        )),
        ('entretenimento', (
            'noticiasdatv',
            'glamurama',
            'estilo',
            'maxima',
            'chicobarney',
            'revistatrip',
            'mensageirosideral',
            'humor',
            'tvfama',
            # NOTE(review): looks like a misspelling of 'comidasebebidas'
            # (listed below); kept in case it matches real data -- confirm.
            'comidasebebeidas',
            'paginacinco',
            'superpop',
            'memes',
            'turismo',
            'virgula',
            'carnaval',
            'tvefamosos',
            'f5',
            'ilustrada',
            'comidasebebidas',
            'cinema',
            'musica',
            'mauriciostycer',
            'caras',
            'cenapop',
            'amauryjr',
            'natelinha',
            'recreio',
        )),
        # NOTE(review): 'ciencia' is sent to 'tecnologia' here even though
        # 'ciencia' is itself a base theme -- confirm this is intended.
        ('tecnologia', ('gizmodo', 'ciencia')),
        ('outros', (
            'mundo',
            'jornalismo',
            'carros',
            'paranormal',
            'universa',
            'paranaportal',
            'aovivo',
            'uol',
            'piaui',
            'zip',
            'aventurasnahistoria',
            'viagem',
            'cotidiano',
        )),
        ('blog', ('colunas', 'opiniao')),
        ('economia', ('mercado', 'todosabordo')),
        ('saude', ('vivabem',)),
    )

    for category, markers in orderedRules:
        if any(marker in themeString for marker in markers):
            return (category, themeString)

    # Evaluate the base-theme lookup once instead of once to test and once
    # more to return.
    baseMatch = matchBaseTheme(themeString)
    if baseMatch is not None:
        return baseMatch

    return ('outros', themeString)

def cleanEstadaoThemes(themeString):
    """Classify a raw Estadão theme into a category.

    `themeString` is converted to `str`. The substring rules below are tried
    in order, then `matchBaseTheme`, then the generic 'galeria' rule; anything
    still unmatched falls into 'outros'.

    Returns a `(category, themeString)` tuple.
    """
    themeString = str(themeString)

    # (marker, category) pairs tried before the base-theme lookup, in order.
    rulesBeforeBaseThemes = (
        ('gadget', 'tecnologia'),
        ('sao-paulo', 'brasil'),
        ('opiniao', 'blog'),
        ('emais', 'entretenimento'),
        ('cultura', 'entretenimento'),
        ('paladar', 'entretenimento'),
        ('inovacao', 'tecnologia'),
        ('fotos-galerias-cidade', 'brasil'),
    )
    for marker, category in rulesBeforeBaseThemes:
        if marker in themeString:
            return (category, themeString)

    baseMatch = matchBaseTheme(themeString)
    if baseMatch is not None:
        return baseMatch

    # Galleries are only treated as entertainment when no base theme matched.
    category = 'entretenimento' if 'galeria' in themeString else 'outros'
    return (category, themeString)


def cleanFolhaThemes(themeString):
    """Classify a raw Folha theme into a category.

    `themeString` is converted to `str` and tested against the substring
    rules below, in order. When none matches, `matchBaseTheme` is tried, and
    anything still unmatched falls into 'outros'.

    Returns a `(category, themeString)` tuple.
    """
    themeString = str(themeString)

    # (category, markers) pairs; the first rule with a matching marker wins.
    orderedRules = (
        ('politica', ('poder',)),
        ('economia', ('mercado',)),
        ('blog', ('colunas', 'colunista', 'opiniao')),
    )
    for category, markers in orderedRules:
        if any(marker in themeString for marker in markers):
            return (category, themeString)

    baseMatch = matchBaseTheme(themeString)
    if baseMatch is not None:
        return baseMatch

    return ('outros', themeString)

# Veja

In [252]:
def quantityOfThemes(table, cleanupFunction):
    """Return the cleaned category counts for `table`.

    Raw themes are counted with `getAllThemes`, mapped to categories through
    `cleanupFunction`, and summed per category by `countThemes`.
    """
    rawCounts = getAllThemes(table)
    categoryOf = createConversionTable(rawCounts, cleanupFunction)
    return countThemes(rawCounts, categoryOf)



def _countsByYear(table, cleanupFunction):
    """Return a DataFrame of category counts with one column per year."""
    yearlyCounts = [
        quantityOfThemes(get2017Table(table), cleanupFunction),
        quantityOfThemes(get2018Table(table), cleanupFunction),
    ]
    return pd.DataFrame(yearlyCounts, index=['2017', '2018']).transpose()


vejaPd = _countsByYear(veja, cleanVejaThemes)
uolPd = _countsByYear(uol, cleanUolThemes)
estadaoPd = _countsByYear(estadao, cleanEstadaoThemes)
folhaPd = _countsByYear(folha, cleanFolhaThemes)


# Display one outlet's table as the cell output (estadaoPd is an alternative).
vejaPd



#print(quantityOfThemes(get2017Table(folha), cleanFolhaThemes))

Unnamed: 0,2017,2018
entretenimento,850,587
brasil,689,325
blog,577,705
outros,363,260
economia,328,384
politica,253,729
saude,168,69
esporte,165,189
ciencia,75,31
educacao,24,23


In [None]:
def quantityOfThemes(table, cleanupFunction):
    """Return the cleaned category counts for `table`.

    Raw themes are counted with `getAllThemes`, mapped to categories through
    `cleanupFunction`, and summed per category by `countThemes`.
    """
    rawCounts = getAllThemes(table)
    categoryOf = createConversionTable(rawCounts, cleanupFunction)
    return countThemes(rawCounts, categoryOf)

def _countsByYear(table, cleanupFunction):
    """Return a DataFrame of category counts with one column per year."""
    yearlyCounts = [
        quantityOfThemes(get2017Table(table), cleanupFunction),
        quantityOfThemes(get2018Table(table), cleanupFunction),
    ]
    return pd.DataFrame(yearlyCounts, index=['2017', '2018']).transpose()


vejaPd = _countsByYear(veja, cleanVejaThemes)
uolPd = _countsByYear(uol, cleanUolThemes)
estadaoPd = _countsByYear(estadao, cleanEstadaoThemes)
folhaPd = _countsByYear(folha, cleanFolhaThemes)



# Print, for each outlet, every category present in its table
names = ['veja', 'uol', 'estadao', 'folha']
pds = [vejaPd, uolPd, estadaoPd, folhaPd]
for name, currentPd in zip(names, pds):
    print(name)
    # The row labels of each table are its categories.
    for category in currentPd.index:
        print(category)
    print()

In [None]:
def quantityOfThemes(table, cleanupFunction):
    """Return the cleaned category counts for `table`.

    Raw themes are counted with `getAllThemes`, mapped to categories through
    `cleanupFunction`, and summed per category by `countThemes`.
    """
    rawCounts = getAllThemes(table)
    categoryOf = createConversionTable(rawCounts, cleanupFunction)
    return countThemes(rawCounts, categoryOf)

# Veja category counts for each year; these feed the 2017-vs-2018 comparison.
data2017 = quantityOfThemes(get2017Table(veja), cleanVejaThemes)
data2018 = quantityOfThemes(get2018Table(veja), cleanVejaThemes)

# 2017 category counts for every outlet: one row per outlet.
# The index needs one label per row. The previous list fused 'veja' and 'uol'
# into the single string 'veja, uol', giving three labels for four rows, which
# pandas rejects. This table also used to be stored in `data2017`, which then
# could not be combined with the `data2018` dict below.
outlets2017 = pd.DataFrame(
    [
        data2017,
        quantityOfThemes(get2017Table(uol), cleanUolThemes),
        quantityOfThemes(get2017Table(estadao), cleanEstadaoThemes),
        quantityOfThemes(get2017Table(folha), cleanFolhaThemes),
    ],
    index=['veja', 'uol', 'estadao', 'folha'],
)

print(outlets2017)

# Rows are categories, columns are the two years (Veja only).
df = pd.DataFrame([data2017, data2018], index=['2017', '2018']).transpose()
print(df)

percentagesOf2017 = df['2017'] * 100 / df['2017'].sum()
percentagesOf2018 = df['2018'] * 100 / df['2018'].sum()

print(percentagesOf2017)
print(percentagesOf2018)

# NOTE(review): a category present in only one year shows up as NaN in `df`;
# confirm that cannot happen before trusting the test below.
print(stats.chi2_contingency(df))
print('chi square:')
# NOTE(review): this runs the test on percentages, not on observed counts;
# the result is stored but never printed -- confirm whether it is still needed.
contigency = stats.chi2_contingency([percentagesOf2017, percentagesOf2018])


# Append per-category and per-year totals for display.
df['Total (categoria)'] = df.sum(axis=1)
df = df.transpose()
df['Total (ano)'] = df.sum(axis=1)


print(df.transpose())

# contigency

# The two numbers are the Veja 'entretenimento' counts for 2017 and 2018.
print(stats.chisquare([850, 587]))
# NOTE(review): at this point `df` also contains the total row and column
# added above, so they take part in this test -- confirm that is intended.
print(stats.chisquare(df))


In [None]:
# Display the rows of the Veja table whose date contains '2017'.
get2017Table(veja)

In [11]:
# Save one bar chart of Veja category counts per year.
for selectYear, filename, title in (
    (get2017Table, 'veja-2017-total.png', 'Noticias Veja 2017'),
    (get2018Table, 'veja-2018-total.png', 'Noticias Veja 2018'),
):
    printTable(selectYear(veja), cleanVejaThemes, filename, title)

Unnamed: 0,date,1 - Título,1 - URL,1 - Tema,2 - Título,2 - URL,2 - Tema,3 - Título,3 - URL,3 - Tema,...,7 - Tema,8 - Título,8 - URL,8 - Tema,9 - Título,9 - URL,9 - Tema,10 - Título,10 - URL,10 - Tema
1,"""2017-01-01T21:06:58.000Z""","Homem mata filho, ex-mulher e mais dez pessoas...",http://web.archive.org/web/20170101220658/http...,brasil,Seis vencedores vão dividir prêmio da Mega Sen...,http://web.archive.org/web/20170101220658/http...,brasil,Leia carta deixada por autor de chacina em Cam...,http://web.archive.org/web/20170101220658/http...,brasil,...,brasil,Viúva do embaixador recebia amante em casa enq...,http://web.archive.org/web/20170101220658/http...,brasil,Mariah Carey larga show antes do fim na Times ...,http://web.archive.org/web/20170101220658/http...,entretenimento,Matthew McConaughey: 'Sou louco pela novela Vi...,http://web.archive.org/web/20170101220658/http...,entretenimento
2,"""2017-01-03T17:24:44.000Z""",Leia carta deixada por autor de chacina em Cam...,http://web.archive.org/web/20170103182444/http...,brasil,"Terremoto de magnitude 4,6 atinge o Maranhão",http://web.archive.org/web/20170103182444/http...,brasil,Trump diz à Coreia do Norte que 'não haverá ar...,http://web.archive.org/web/20170103182444/http...,mundo,...,mundo,Azealia Banks ofende Brasil nas redes sociais ...,http://web.archive.org/web/20170103182444/http...,entretenimento,Anvisa proíbe lote de arroz por encontrar feze...,http://web.archive.org/web/20170103182444/http...,economia,Fugitivo de Manaus que postou foto em redes é ...,http://web.archive.org/web/20170103182444/http...,brasil
3,"""2017-01-04T18:17:04.000Z""",Acidente do filho tira Bonner do 'Jornal Nacio...,http://web.archive.org/web/20170104191704/http...,entretenimento,O 'xerife' do massacre em Manaus,http://web.archive.org/web/20170104191704/http...,brasil,Ana Furtado substitui Fátima e comenta acident...,http://web.archive.org/web/20170104191704/http...,entretenimento,...,politica,"Massacrada, Ronda recebe 45 dias de suspensão ...",http://web.archive.org/web/20170104191704/http...,esporte,Vídeo mostra presos decapitados e rivais comem...,http://web.archive.org/web/20170104191704/http...,?p=1833069,Conheça o mesentério: o novo órgão do corpo hu...,http://web.archive.org/web/20170104191704/http...,saude
4,"""2017-01-05T19:22:09.000Z""",Monica Iozzi e globais causam com nude coletivo,http://web.archive.org/web/20170105202209/http...,entretenimento,Deputado quer erradicar o vício em masturbação,http://web.archive.org/web/20170105202209/http...,politica,"Com lesão na coluna, amigo do filho de Bonner ...",http://web.archive.org/web/20170105202209/http...,entretenimento,...,entretenimento,Trump cancela projetos no Brasil,http://web.archive.org/web/20170105202209/http...,economia,As companhias aéreas mais seguras em 2016 – e ...,http://web.archive.org/web/20170105202209/http...,economia,"Após Soninha ‘perder a hora’, Doria vai multar...",http://web.archive.org/web/20170105202209/http...,brasil
5,"""2017-01-06T20:23:41.000Z""",Ao menos 33 presos são mortos em RR; ministro ...,http://web.archive.org/web/20170106212341/http...,brasil,Zilu passa mal ao volante e bate carro em Alph...,http://web.archive.org/web/20170106212341/http...,entretenimento,Governo sabia que prisão tinha até metralhador...,http://web.archive.org/web/20170106212341/http...,brasil,...,mundo,Racha em facção que se aliou ao PCC deixa cade...,http://web.archive.org/web/20170106212341/http...,brasil,Gestão Doria 'envelopa' área em viaduto com mo...,http://web.archive.org/web/20170106212341/http...,brasil,Vera Fischer critica Globo e novela de Gloria ...,http://web.archive.org/web/20170106212341/http...,entretenimento
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,"""2017-12-27T20:44:38.000Z""",Sandra Annenberg revela idade ao vivo e públic...,http://web.archive.org/web/20171227214438/http...,entretenimento,Pabllo Vittar desmente que receberá R$ 5 milhõ...,http://web.archive.org/web/20171227214438/http...,entretenimento,Relembre 26 artistas que saíram de cena em 2017,http://web.archive.org/web/20171227214438/http...,entretenimento,...,blog,"Solto, Garotinho afirma ter documentos provand...",http://web.archive.org/web/20171227214438/http...,brasil,A cada salsicha consumida você perde 15 minuto...,http://web.archive.org/web/20171227214438/http...,saude,Caso Miller: Gilmar diz que 'não aceita' ser c...,http://web.archive.org/web/20171227214438/http...,politica
351,"""2017-12-28T21:27:34.000Z""",Anvisa proíbe a venda de quatro marcas de azeite,http://web.archive.org/web/20171228222734/http...,saude,Relembre 26 artistas que saíram de cena em 2017,http://web.archive.org/web/20171228222734/http...,entretenimento,Acertar a Mega é 500 mil vezes mais difícil qu...,http://web.archive.org/web/20171228222734/http...,economia,...,economia,Polícia Federal descobriu fraude bilionária na...,http://web.archive.org/web/20171228222734/http...,blog,Gloria Perez homenageia Daniella: Filho não se...,http://web.archive.org/web/20171228222734/http...,entretenimento,Juíza foi presa dirigindo bêbada e agrediu pol...,http://web.archive.org/web/20171228222734/http...,blog
352,"""2017-12-29T21:51:23.000Z""","Quatro anos depois de acidente, o que se sabe ...",http://web.archive.org/web/20171229225123/http...,esporte,Os números mais sorteados da história da Mega-...,http://web.archive.org/web/20171229225123/http...,economia,Raquel e Carmen,http://web.archive.org/web/20171229225123/http...,blog,...,politica,Ex-mulher de Gilmar Mendes agrediu repórter em...,http://web.archive.org/web/20171229225123/http...,blog,Salário mínimo será de R$ 954 a partir de 1° d...,http://web.archive.org/web/20171229225123/http...,economia,Suplemento de cálcio e vitamina D não reduz fr...,http://web.archive.org/web/20171229225123/http...,saude
353,"""2017-12-30T22:33:44.000Z""","Quatro anos depois de acidente, o que se sabe ...",http://web.archive.org/web/20171230233344/http...,esporte,Mega da Virada: saiba até quando apostar e com...,http://web.archive.org/web/20171230233344/http...,economia,"Sem Lula, Dilma aparece bem em pesquisas eleit...",http://web.archive.org/web/20171230233344/http...,blog,...,entretenimento,Cinco passos simples para organizar suas finan...,http://web.archive.org/web/20171230233344/http...,tveja,Japonês da PF esperou Odebrecht sair da prisão...,http://web.archive.org/web/20171230233344/http...,blog,Morar em Portugal: o novo sonho da classe médi...,http://web.archive.org/web/20171230233344/http...,tveja


# UOL

In [9]:
# Save one bar chart of UOL category counts per year.
for selectYear, filename, title in (
    (get2017Table, 'uol-2017-total.png', 'Noticias UOL 2017'),
    (get2018Table, 'uol-2018-total.png', 'Noticias UOL 2018'),
):
    printTable(selectYear(uol), cleanUolThemes, filename, title)

# Estadão

In [5]:
# Save one bar chart of Estadão category counts per year.
for selectYear, filename, title in (
    (get2017Table, 'estadao-2017-total.png', 'Noticias Estadão 2017'),
    (get2018Table, 'estadao-2018-total.png', 'Noticias Estadão 2018'),
):
    printTable(selectYear(estadao), cleanEstadaoThemes, filename, title)

# Folha

In [7]:
# Save one bar chart of Folha category counts per year.
for selectYear, filename, title in (
    (get2017Table, 'folha-2017-total.png', 'Noticias Folha 2017'),
    (get2018Table, 'folha-2018-total.png', 'Noticias Folha 2018'),
):
    printTable(selectYear(folha), cleanFolhaThemes, filename, title)