### Começamos realizando o load de algumas bibliotecas auxiliares que iremos utilizar

In [None]:
from json import load as jsload
from tweepy import OAuthHandler, API
from warnings import catch_warnings, filterwarnings
from random import shuffle
from matplotlib import pyplot
from wordcloud import WordCloud

### Agora importamos as funções do spaCy que iremos utilizar e carregamos o modelo pré-treinado

In [None]:
from spacy import load
from spacy.util import minibatch, compounding

# Load the "pt_core_news_md" pipeline through spaCy's `load`; the resulting
# `nlp` object is what the later cells call to process text.
nlp = load("pt_core_news_md")
# yes, that is all it takes

### Coloquei as chaves de acesso à minha conta no Twitter em um arquivo separado :D

In [None]:
# Read the Twitter credentials from keys.json into the `keys` dict.
# The encoding is stated explicitly so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open("keys.json", encoding = "utf-8") as file:
    keys = jsload(file)

In [None]:
# Build the OAuth handler from the consumer credentials stored in `keys`.
# NOTE(review): "cunsumer_key" looks like a misspelling of "consumer_key"; it is
# kept as-is because it has to match the key used in keys.json — confirm there.
auth = OAuthHandler(keys["cunsumer_key"], keys["consumer_secret"])

# Attach the account's access token, then create the API client used by the
# search cells further down.
auth.set_access_token(keys["access_token"], keys["access_token_secret"])
api = API(auth)

### Vamos verificar as stop words carregadas neste modelo

In [None]:
# Print, one result per line, whether each probe word is in the model's
# default stop-word set.
for candidate in ("de", "então", "assim", "e", "a", "RT"):
    print(candidate in nlp.Defaults.stop_words)

In [None]:
# Add the stop words "a", "e" and "o" to the language defaults
extra_stop_words = {"a", "e", "o"}
nlp.Defaults.stop_words |= extra_stop_words

# word_filter relies on Token.is_stop, which is read from the vocabulary entry
# of each word. Entries that already exist may keep the flag they were created
# with, so the flag is also set explicitly on each lexeme.
for stop_word in extra_stop_words:
    nlp.vocab[stop_word].is_stop = True

# To add them permanently the library's own files have to be changed

### Vamos baixar os dados do Twitter

In [None]:
# Fetch tweets for the hashtag and map each author's display name to the tweet text.
# Because the dict is keyed by name, a later tweet from the same author replaces
# the earlier one.
tweets = {}
for status in api.search(q = "#pybr2020", lang = "pt", count = 200):
    payload = status._json
    tweets[payload["user"]["name"]] = payload["text"]
tweets

### Vamos criar uma função que realize o filtro das palavras com que desejamos trabalhar

In [None]:
def word_filter(word, cut_stop = True):
    """Decide whether a token should be kept for further processing.

    Args:
        word: token-like object exposing ``is_stop``, ``is_punct``, ``suffix_``,
            ``like_url``, ``like_email``, ``like_num``, ``prefix_`` and ``text``.
        cut_stop: when true, tokens flagged as stop words are rejected.

    Returns:
        ``False`` for stop words (if ``cut_stop``), punctuation, a "…" suffix,
        URL-, e-mail- or number-like tokens, tokens whose prefix is "@",
        whitespace/ellipsis/"RT" tokens and any text that is not purely
        alphanumeric; ``True`` otherwise.
    """
    # Stop words are only discarded when the caller asks for it.
    if cut_stop and word.is_stop:
        return False

    # Any single one of these properties is enough to discard the token.
    rejected = (
        word.is_punct
        or word.suffix_ == "…"
        or word.like_url
        or word.like_email
        or word.like_num
        or word.prefix_ == "@"
        or word.text in (" ", "\n", "\n\n", "...", "RT")
    )
    if rejected:
        return False

    # Whatever is left must consist solely of letters and digits.
    return word.text.isalnum()

In [None]:
def PlotCloud(wc):
    """Draw a word cloud for the text ``wc`` on a 10x10 figure with the axes hidden.

    Args:
        wc: the text handed to ``WordCloud.generate``.
    """
    pyplot.figure(figsize = (10, 10))
    # Fixed random_state so the same text is generated with the same seed.
    cloud = WordCloud(width = 500, background_color = "purple", random_state = 10)
    pyplot.imshow(cloud.generate(wc))
    pyplot.axis("off")

### Realizando o pré-processamento das palavras

In [None]:
# Pre-processing: stop-word removal and lemmatization
processeded = []
# Iterate over every tweet; the author name is only used for the print-out
for user, tweet in tweets.items():
    # nlp(tweet) runs the pipeline. For every token accepted by the filter we
    # keep its lemma: Token.lemma_ is the lemmatized form, whereas looking the
    # text up in nlp.vocab and reading .text just gives back the surface text.
    row = [word.lemma_ for word in nlp(tweet) if word_filter(word)]
    print(f"{user} : {row}")
    processeded.append(row)

In [None]:
# Compact variant: one pipeline run per tweet yields both the lemmatized words
# and the named entities
processeded = []
ner = []
adj = []
for each in tweets.values():
    doc = nlp(each)
    # Token.lemma_ is the lemmatized form; nlp.vocab[word.text].text would only
    # repeat the surface text.
    processeded.append([word.lemma_ for word in doc if word_filter(word)])
    # One list of (entity text, entity label) pairs per tweet.
    ner.append([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Show the current value of `ner` as the cell output (assuming the cells run
# in order, the per-tweet lists of (text, label) pairs built in the previous cell).
ner

In [None]:
# Collect the text of every token that passes the filter and whose tag starts
# with "ADJ", then draw a word cloud from them.
adj = []
for tweet in tweets.values():
    # nlp(tweet) runs the pipeline on the tweet
    adj.extend(
        word.text
        for word in nlp(tweet)
        if word_filter(word) and word.tag_.startswith("ADJ")
    )
PlotCloud(" ".join(adj) + " ")

In [None]:
trainning_data = []

# Surface form that will be annotated as the new named entity
key = "python"
# Label given to the new entity type
label = "devlang"

# Register the label on the NER component once, before collecting examples, so
# it is added even when the search returns no tweets.
ner = nlp.get_pipe("ner")
ner.add_label(label)

# Build one training example per tweet that mentions the key as a separate word
for tweet in api.search(q = "#pythonfofoqueiro", lang = "pt", count = 200):
    lowered = tweet.text.lower()
    # Cheap pre-check on the raw text before running the pipeline
    if " " + key + " " not in lowered:
        continue
    words = [word.text for word in nlp(lowered) if word_filter(word, cut_stop = False)]
    text = " ".join(words)
    # Walk the kept words tracking the character offset of each one inside
    # `text` (words are joined by a single space). Only whole words equal to
    # the key are annotated, so the span can never land inside a longer word
    # such as the hashtag text itself.
    entities = []
    offset = 0
    for token_text in words:
        if token_text == key:
            entities.append((offset, offset + len(token_text), label))
        offset += len(token_text) + 1
    if not entities:
        # The filter removed every standalone occurrence; nothing to annotate
        continue
    trainning_data.append((text, {"entities": entities}))

# The pipeline is already trained: resume_training() returns an optimizer
# without re-initialising the existing weights, which begin_training() would do.
optimizer = nlp.resume_training()
# Every component except the NER (and transformer helpers) is disabled while updating
other_pipes = [p for p in nlp.pipe_names if p not in ["ner", "trf_wordpiecer", "trf_tok2vec"]]
with nlp.disable_pipes(*other_pipes), catch_warnings():
    # Show each spaCy UserWarning only once
    filterwarnings("once", category = UserWarning, module = "spacy")

    # Batch size schedule passed to minibatch: compounding(1.0, 4.0, 1.001)
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(30):
        shuffle(trainning_data)
        batches = minibatch(trainning_data, size = sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd = optimizer, drop = 0.35, losses = losses)
        print("Losses", losses)

In [None]:
# Try the updated NER on a sample sentence and print every entity it finds.
# NOTE(review): the training texts were lower-cased while this sentence has
# "Python" capitalised — confirm that is the intended check.
test_text = "Nesta palestra vimos como Python pode ser usado em NLP"
doc = nlp(test_text)
print(f"Entidades encontradas em: {test_text}")
for entity in doc.ents:
    print(entity.text, entity.label_)