### Inicialmente é realizando o load das bibliotecas necessárias para o processo

In [1]:
# core
from json import load as jsload
from warnings import catch_warnings, filterwarnings
from random import shuffle
from time import strftime, mktime
from datetime import datetime
# Captação dos dados
from tweepy import OAuthHandler, API
# NLP
from spacy import load
# Auxiliar
from emoji import get_emoji_regexp
# Deep Learning
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection, preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup

### Realizando o load modelo pré treinado spacy

In [6]:
nlp = load("en_core_web_md")

NameError: name 'load' is not defined

### Coloquei as chaves de acesso a minha conta no Twitter em um arquivo separado :D

In [3]:
with open("key.json") as file:
    keys = jsload(file)

In [4]:
auth = OAuthHandler(
    consumer_key = keys["API key"],
    consumer_secret = keys["API secret key"]
)

auth.set_access_token(
    key = keys["Access token"],
    secret = keys["Access token secret"]
)
api = API(auth)

### Carregando o modelo treinado para identificação das emoções em textos

In [None]:
class config:
    def __init__(self):
        self.val_strategy = "batch"

class EmotionClassifier(tez.Model):
    def __init__(self, num_train_steps, num_classes):
        super().__init__()
        self.bert = transformers.SqueezeBertModel.from_pretrained("squeezebert/squeezebert-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, num_classes)
        self.num_train_steps = num_train_steps
        self.config = config()
        
    def fetch_optimizer(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            }
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt
    
    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch
    
    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.float())
    
    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        
        outputs = torch.sigmoid(outputs)
        outputs = outputs.cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()
        
        fpr_micro, tpr_micro, _ = metrics.roc_curve(targets.ravel(), outputs.ravel())
        auc_micro = metrics.auc(fpr_micro, tpr_micro)
        return {"auc": auc_micro}
    
    def forward(self, ids, mask, targets= None):
        o_2 = self.bert(ids, attention_mask=mask)["pooler_output"]
        b_o = self.bert_drop(o_2)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc

### Criando função para estruturação das predições e tratamento dos textos

In [68]:
def caracters_to_remove(w):
    return any([
                  w.is_bracket
                , w.is_punct
                , w.is_quote
                , w.is_stop
                , w.is_space
                , w.text in ["\n"]
                , not w.i and w.text == "RT"
                ])

def format_date(d):
    return int(mktime(datetime.strptime(d, "%a %b %d %H:%M:%S +0000 %Y").timetuple()))

def buildStructure(raw_tweet):
    tweet = dict(raw_tweet._json)
    text_w_emojis = " ".join([w.text for w in nlp(tweet["full_text"]) if not caracters_to_remove(w)])
    
    clean_text = get_emoji_regexp().sub("", text_w_emojis)
    
    structure = {
        "user_id" : tweet["user"]["id_str"],
        "screen_name" : tweet["user"]["screen_name"],
        "followers" : tweet["user"]["followers_count"],
        "retweet_count" : tweet["retweet_count"],
        "favorited" : tweet["favorited"],
        "created_at": format_date(tweet["created_at"]),
        "id": tweet["id_str"],
        "text": clean_text,
        "hashtags": [h["text"] for h in tweet["entities"]["hashtags"]],
        "user_mentions": [m["screen_name"] for m in tweet["entities"]["user_mentions"]],
        "urls": tweet["entities"]["urls"],
        "type": tweet["metadata"]["result_type"],
        "retweet": "retweeted_status" in tweet
    }
    
    return structure

In [5]:
search = api.search(q = "#bitcoin", lang = "en", count = 200, tweet_mode = "extended")

In [69]:
tweets = [buildStructure(t) for t in search]

In [71]:
tweets[1]

{'user_id': '1305183711168692225',
 'screen_name': 'Monobody4',
 'followers': 355,
 'retweet_count': 2429,
 'favorited': False,
 'created_at': 1645073698,
 'id': '1494128204306690050',
 'text': '@BTC_Archive  BREAKING Colorado accept Bitcoin tax payments year',
 'hashtags': ['Bitcoin'],
 'user_mentions': ['BTC_Archive'],
 'urls': [],
 'type': 'recent',
 'retweet': True}

### Vamos baixar os dados do Twitter

In [None]:
# Capturando dados do Twitter
tweets = {each._json["user"]["name"] : each._json["text"]  for each in api.search(q = "#bitcoin", lang = "pt", count = 200)}
tweets

### Vamos criar uma função que realize o filtro de palavras que desejamos trabalhar

In [None]:
def word_filter(word, cut_stop = True):
    if word.is_stop and cut_stop:
        return False
    elif word.is_punct:
        return False
    elif word.suffix_ == "…":
        return False
    elif word.like_url:
        return False
    elif word.like_email:
        return False
    elif word.like_num:
        return False
    elif word.prefix_ == "@":
        return False
    elif word.text in [" ", "\n", "\n\n", "...", 'RT']:
        return False
    elif not word.text.isalnum():
        return False
    return True

### Realizando o pré processamento das palavras

In [None]:
# Pré-processamento: Stop Words e Lemmatazing
processeded = []
# interando sobre cada tweet
for user, tweet in tweets.items():
    row = []
    for word in nlp(tweet): # este é o pipeline
        # filtrando as palavras
        if word_filter(word):
            # após selecionar as palavras, é adicionado o seu formato lematizado
            lemm = nlp.vocab[word.text]
            row.append(lemm.text)
    print(f"{user} : {row}")
    processeded.append(row)

In [None]:
processeded = []
ner = []
adj = []
for each in tweets.values():
    doc = nlp(each)
    processeded.append([nlp.vocab[word.text].text for word in doc if word_filter(word)])
    ner.append([(ent.text, ent.label_) for ent in doc.ents])

In [1]:
from tensorflow import load

2022-02-16 19:53:25.219143: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-16 19:53:25.219175: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


ImportError: cannot import name 'load' from 'tensorflow' (/home/calixto/Documentos/botcryptowatcher/lib/python3.9/site-packages/tensorflow/__init__.py)

In [2]:
import tensorflow as tf

In [4]:
import tensorflow_datasets as tfds

In [6]:
import pathlib

In [None]:
df = tfds.as_dataframe(ds)