In [None]:
import string

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from nltk import FreqDist, Text, word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud

## Reading and cleaning data

In [None]:
# Load the news CSV: fields are ";"-separated and the "data" column is parsed
# as day-first dates.
df = pd.read_csv(
    "../input/brazilian-tax-news/brtaxnews.csv",
    sep=";",
    parse_dates=["data"],
    dayfirst=True,
)

In [None]:
# Preview the first rows to check how the CSV was parsed.
df.head()

In [None]:
# Count the rows whose "data" (publication date) value is missing.
df["data"].isnull().sum()

In [None]:
# Replace missing values with empty strings so the title/description joins
# further down never meet NaN. Datetime columns are deliberately left alone:
# writing "" into the parsed "data" column can degrade it to object dtype and
# break the `.dt` accessors used by the date-based charts. Rebinding `df`
# (instead of `inplace=True`) keeps this cell safe to re-run.
non_date_columns = df.select_dtypes(exclude=["datetime", "datetimetz"]).columns
df = df.fillna({column: "" for column in non_date_columns})

In [None]:
# Preview the first rows again, after the missing values were filled.
df.head()

## First visualizations

In [None]:
# Number of news items per portal, with the count written above each bar.
ax = df["portal"].value_counts().plot.bar(
    title="Quantidade de notícias em cada Portal", rot=0, figsize=(8, 6)
)
for bar in ax.patches:
    # Anchor the label at the horizontal centre of the bar, right on top of it.
    # (Scaling the x coordinate by a factor shifts labels inconsistently: bars
    # left of zero move left, bars at zero do not move at all.)
    ax.annotate(
        f"{bar.get_height():.0f}",  # counts are whole numbers; never show "123.0"
        (bar.get_x() + bar.get_width() / 2, bar.get_height()),
        ha="center",
        va="bottom",
    )

In [None]:
# Count rows per calendar day, then display the day(s) holding the highest
# count in the "portal" column.
calendar_day = df["data"].dt.date
df_dias = df.groupby(calendar_day).count()
peak_count = df_dias["portal"].max()
df_dias[df_dias["portal"] == peak_count]

In [None]:
# Show the news published on the busiest day(s). The day is looked up in
# `df_dias` (per-day counts computed above) instead of being typed in by hand,
# so the cell stays correct if the dataset changes. Comparing on `.dt.date`
# also matches rows regardless of any time-of-day component.
busiest_days = df_dias.index[df_dias["portal"] == df_dias["portal"].max()]
df[df["data"].dt.date.isin(busiest_days)]

In [None]:
# How many news items fall on each day of the month.
counts_by_month_day = df.groupby(df["data"].dt.day).count()
df_dias_mes = counts_by_month_day["data"]
ax = df_dias_mes.plot.bar(
    title="Quantidade de notícias tributárias publicadas em cada dia do mês",
    rot=0,
    figsize=(16, 8),
)
# Hide the automatic x-axis label; the title already describes the axis.
ax.xaxis.label.set_visible(False);

In [None]:
# News items per weekday (pandas weekday: 0 = Monday ... 6 = Sunday).
# Reindexing to all seven weekdays keeps a zero-height bar for any weekday
# without news, so the seven tick labels below always line up with seven bars
# (otherwise labels would be shifted or matplotlib would reject the list).
df_dias_sem = (
    df.groupby(df["data"].dt.weekday)
    .count()["data"]
    .reindex(range(7), fill_value=0)
)
ax = df_dias_sem.plot.bar(figsize=(14,7), rot=0, title="Quantidade de notícias tributárias publicadas em cada dia da semana")
ax.set_xticklabels(["Segunda", "Terça", "Quarta", "Quinta", "Sexta", "Sábado", "Domingo"])
# Hide the automatic x-axis label; the tick labels are self-explanatory.
ax.axes.get_xaxis().get_label().set_visible(False);

## Exploring texts using nltk

In [None]:
# One sub-frame per portal, so each outlet can be explored on its own.
df_jota = df.loc[df["portal"].eq("Jota")]
df_valor = df.loc[df["portal"].eq("Valor Econômico")]

In [None]:
def df2list(df, columns):
    """Flatten the selected columns of ``df`` into one row-major list.

    The result holds, for each row in order, the value of every requested
    column in the order given:
    ``[row0[col0], row0[col1], ..., row1[col0], row1[col1], ...]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from.
    columns : iterable of column labels
        Columns to extract, in output order.

    Returns
    -------
    list
        Flat list of cell values; empty when ``df`` has no rows or no
        columns are requested.
    """
    # itertuples walks the column arrays directly: it is much faster than
    # iterrows (no Series built per row) and does not coerce every row to a
    # single common dtype.
    selected = df[list(columns)]
    return [
        value
        for row in selected.itertuples(index=False, name=None)
        for value in row
    ]

In [None]:
# Build one corpus string from every title and description, separated by ". ",
# and peek at its first 300 characters.
all_fields = df2list(df, ["titulo", "descricao"])
noticias = ". ".join(all_fields)
noticias[:300]

In [None]:
# Same corpus string, restricted to the Jota sub-frame; show the first 300 characters.
jota_fields = df2list(df_jota, ["titulo", "descricao"])
noticias_jota = ". ".join(jota_fields)
noticias_jota[:300]

In [None]:
# Same corpus string, restricted to the Valor Econômico sub-frame; show the first 300 characters.
valor_fields = df2list(df_valor, ["titulo", "descricao"])
noticias_valor = ". ".join(valor_fields)
noticias_valor[:300]

In [None]:
# Tokenize each corpus and wrap it in an nltk Text for exploration.
# word_tokenize uses the English Punkt sentence model unless told otherwise;
# the news are written in Portuguese, so the Portuguese model is requested
# explicitly to get sentence boundaries (and therefore tokens) right.
tokens = word_tokenize(noticias, language="portuguese")
text = Text(tokens)

tokens_jota = word_tokenize(noticias_jota, language="portuguese")
text_jota = Text(tokens_jota)

tokens_valor = word_tokenize(noticias_valor, language="portuguese")
text_valor = Text(tokens_valor)

### Grouped

In [None]:
# Print the word pairs that nltk reports as collocations across all news.
text.collocations()

In [None]:
# Print words that appear in contexts similar to "tributo" across all news.
text.similar("tributo")

In [None]:
# Print words that appear in contexts similar to "economia" across all news.
text.similar("economia")

In [None]:
# Print occurrences of "covid" together with their surrounding words.
text.concordance("covid")

### Jota

In [None]:
# Print the word pairs that nltk reports as collocations in the Jota news only.
text_jota.collocations()

In [None]:
# Print words that appear in contexts similar to "tributo" in the Jota news.
text_jota.similar("tributo")

In [None]:
# Print words that appear in contexts similar to "economia" in the Jota news.
text_jota.similar("economia")

In [None]:
# Print words that appear in contexts similar to "pandemia" in the Jota news.
text_jota.similar("pandemia")

In [None]:
# Print words that appear in contexts similar to "bolsonaro" in the Jota news.
text_jota.similar("bolsonaro")

### Valor

In [None]:
# Print the word pairs that nltk reports as collocations in the Valor Econômico news only.
text_valor.collocations()

In [None]:
# Print words that appear in contexts similar to "tributo" in the Valor Econômico news.
text_valor.similar("tributo")

In [None]:
# Print words that appear in contexts similar to "economia" in the Valor Econômico news.
text_valor.similar("economia")

In [None]:
# Print words that appear in contexts similar to "pandemia" in the Valor Econômico news.
text_valor.similar("pandemia")

In [None]:
# Print words that appear in contexts similar to "icms" in the Valor Econômico news.
text_valor.similar("icms")

### FreqDist

In [None]:
# Tokens to leave out of the frequency counts, the word2vec vocabulary and the
# word clouds: nltk's Portuguese stopwords, ASCII punctuation, the characters
# ’ ‘ ' and r, plus a few extra high-frequency words.
# Kept as a set so every `in stop` test is a constant-time lookup rather than a
# scan of the whole list; a set is also the type WordCloud documents for its
# `stopwords` argument.
stop = (
    set(stopwords.words('portuguese'))
    | set(string.punctuation)
    | set("’‘'r")
    | {"sobre", "diz", "ser", "pode"}
)
# Frequency of every remaining lower-cased token across all portals.
fd = FreqDist(token for token in map(str.lower, text) if token not in stop)

In [None]:
# Display the frequency distribution computed over all portals.
fd

In [None]:
# Frequency of the lower-cased, non-stopword tokens in the Jota news.
fd_jota = FreqDist(
    token for token in (w.lower() for w in text_jota) if token not in stop
)
fd_jota

In [None]:
# Frequency of the lower-cased, non-stopword tokens in the Valor Econômico news.
fd_valor = FreqDist(
    token for token in (w.lower() for w in text_valor) if token not in stop
)
fd_valor

## word2vec

Training word embeddings with gensim, following the notebooks by Marlesson (https://www.kaggle.com/marlesson/vocabulary-analysis-word2vec)
and Pierremegret (https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial).

In [None]:
# One token list per news item (title + description), lower-cased and with the
# stop tokens removed; this is the sentence corpus fed to word2vec.
# - The two columns are zipped directly instead of going through iterrows,
#   which builds a Series per row.
# - The Portuguese Punkt model is requested explicitly; word_tokenize would
#   otherwise split sentences with the English one.
g_tokens = [
    [
        token
        for token in map(
            str.lower,
            word_tokenize(f"{titulo}. {descricao}.", language="portuguese"),
        )
        if token not in stop
    ]
    for titulo, descricao in zip(df["titulo"], df["descricao"])
]

In [None]:
# Train 200-dimensional word vectors on the tokenized news, ignoring words
# seen fewer than 5 times and using a context window of 5 tokens.
# gensim 4.0 renamed the dimensionality argument from `size` to `vector_size`
# (passing `size` there raises TypeError), so the keyword is chosen from the
# installed version to keep the cell working on both 3.x and 4.x.
dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
model = gensim.models.Word2Vec(g_tokens, min_count=5, workers=4, window=5, **{dim_kwarg: 200})

In [None]:
# Words whose vectors are closest to "tributo" in the trained model.
# NOTE(review): presumably raises KeyError if the word fell below min_count — confirm.
model.wv.most_similar("tributo")

In [None]:
# Words whose vectors are closest to "economia" in the trained model.
model.wv.most_similar("economia")

In [None]:
# Words whose vectors are closest to "cofins" in the trained model.
model.wv.most_similar("cofins")

## Wordclouds

In [None]:
def gen_wordcloud(cloud_text):
    """Build a word cloud from ``cloud_text``.

    The cloud is 1280x720 on a white background, shows at most 30 words,
    draws every word horizontally with the "plasma" colormap, and leaves out
    the words in the notebook-level ``stop`` collection.

    Returns the WordCloud object produced by ``generate``.
    """
    options = dict(
        width=1280,
        height=720,
        stopwords=stop,
        background_color="white",
        max_words=30,
        prefer_horizontal=1,
        colormap="plasma",
    )
    cloud_builder = WordCloud(**options)
    return cloud_builder.generate(cloud_text)

In [None]:
# One word cloud per corpus: all portals, Jota only, Valor Econômico only.
cloud, cloud_jota, cloud_valor = (
    gen_wordcloud(corpus) for corpus in (noticias, noticias_jota, noticias_valor)
)

### Grouped

In [None]:
# Render the all-portals word cloud on a 16x8 figure with the axes hidden.
_, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(cloud, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()

### Jota

In [None]:
# Render the Jota word cloud on a 16x8 figure with the axes hidden.
_, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(cloud_jota, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()

### Valor

In [None]:
# Render the Valor Econômico word cloud on a 16x8 figure with the axes hidden.
_, cloud_ax = plt.subplots(figsize=(16, 8))
cloud_ax.imshow(cloud_valor, interpolation="bilinear")
cloud_ax.axis("off")
plt.show()