# Código do Trabalho Final do Curso BI Master - PUC-Rio

### Utilizando HugginFace - Pytorch

In [None]:
# biblioteca do hugginface
!pip install --upgrade pip
!pip install transformers
!pip install unidecode
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

In [None]:
## Importação das bibliotecas
import pandas as pd
import numpy as np
import string
import unidecode
import os, glob, re, sys, random, unicodedata, collections

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('vader_lexicon')
nltk.download('stopwords')
## criando lista de stopwords
STOP_WORDS = stopwords.words('portuguese') + list(string.punctuation)
STOP_WORDS.append('\n')

## Importação do BERT - Hugginface
from transformers import AutoModel, AutoTokenizer, BertTokenizer, BertForSequenceClassification
import torch

## Bibliotecas para busca em PT

## Bibliotecas para buscas em EN (apenas para exibição mais didática)
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader, DensePassageRetriever, PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.pipelines import ExtractiveQAPipeline


In [3]:
## Verificação da GPU utilizada
!nvidia-smi

Sun May 15 20:37:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

INFO - haystack.telemetry -  Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


In [5]:
## Montagem do Drive e Ponteiramento para o diretório
from google.colab import drive
drive.mount('/content/drive')

work_path = '/content/drive/MyDrive/Classroom/PosGraduacao/ProjetoFinal/'
os.chdir(work_path)

Mounted at /content/drive


In [6]:
random.seed(50)

files = []

for dirname, _, filenames in os.walk('data/machado-de-assis/raw/txt'):
  for filename in filenames:
    files.append(os.path.join(dirname, filename))

print('There are a total of {} files'.format(len(files)), '\n')

There are a total of 116 files 



In [7]:
# Seleção de 10 arquivos aleatórios
sample_books = random.sample(files,10)

docs = []
for fname in sample_books:
    with open(fname , "r") as file:
        text = file.read()
    docs.append(text)

# count term frequency using CountVectorizer from scikit-learn
## limiting number of words just for illustrating the concept
vec = CountVectorizer(max_features=10, stop_words=STOP_WORDS) 
X = vec.fit_transform(docs)
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out()) #get_feature_names())
books_names = [book.split('/')[-1] for book in sample_books]
df['book'] = books_names
df = df.set_index('book')

print(df)

                           ainda  casa  dia  disse  gilliatt  homem  ser  \
book                                                                       
gazeta.txt                    14    23   36     16         0     17   28   
cartaImprensa.txt              1     0    0      0         0      0    1   
inspiracoesClaustro.txt        1     0    0      1         0      4    8   
historiasMeiaNoite.txt        84    68   59    167         0     58   67   
vicondeCastilho.txt            1     0    0      0         0      0    0   
trabalhadoresMar.txt         121   210  173    146       776    312  179   
instintoNacionalidade.txt     17     2    2      3         0      3    4   
aoAcaso.txt                  140    67  123     34         0     45  167   
ressurreicao.txt              82    80   52    157         0     49   63   
henriqueChaves.txt             2     0    4      0         0      1    1   

                           tempo  todos  tudo  
book                                   

In [8]:
# Let's first get some files that we want to use
doc_dir = "data/machado-de-assis/raw/txt"
# Convert files to dicts
docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Now, let's write the dicts containing documents to our DB.
#document_store.write_documents(docs)

INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/ea.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/peregrinacao.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/contituinteSombraLuz.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/miragens.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/cenasVida.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/novaGeracao.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/pareceresConservatorioDramatico.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/cantosFantasias.txt
INFO - haystack.utils.preprocessing -  Converting data/machado-de-assis/raw/txt/cr+¡tica/floresFrutos.txt
INFO - haystack.utils.preprocessin

In [9]:
preprocessor = PreProcessor(
    language="pt",
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,)

# Preprocessamento do DB
documentos = preprocessor.process(docs)

100%|██████████| 116/116 [00:04<00:00, 27.76docs/s]


In [10]:
# Now, let's write the dicts containing documents to our DB.
#document_store.write_documents(docs)
document_store.write_documents(documentos)

Writing Documents:   0%|          | 0/19288 [00:00<?, ?it/s]

In [11]:
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
# Important:
# Now that after we have the DPR initialized, we need to call update_embeddings() to iterate over all
# previously indexed documents and update their embedding representation.
# While this can be a time consuming operation (depending on corpus size), it only needs to be done once.
# At query time, we only need to embed the query and compare it the existing doc embeddings which is very fast.
document_store.update_embeddings(retriever)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-ctx_encoder-single-nq-base
INFO - haystack.document_stores.faiss -  Updating embeddings for 19178 docs...


Updating Embedding:   0%|          | 0/19178 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/9184 [00:00<?, ? Docs/s]

In [12]:
#reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
reader = FARMReader(model_name_or_path="pierreguillou/bert-base-cased-squad-v1.1-portuguese", use_gpu=True)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find pierreguillou/bert-base-cased-squad-v1.1-portuguese locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...


Downloading:   0%|          | 0.00/862 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

INFO - haystack.modeling.model.language_model -  Loaded pierreguillou/bert-base-cased-squad-v1.1-portuguese


Downloading:   0%|          | 0.00/494 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.infer -  Got ya 2 parallel workers to do inference ...
INFO - haystack.modeling.infer -   0     0  
INFO - haystack.modeling.infer -  /w\   /w\ 
INFO - haystack.modeling.infer -  /'\   / \ 


In [13]:
pipe = ExtractiveQAPipeline(reader, retriever)

In [24]:
# You can configure how many candidates the reader and retriever shall return
# The higher top_k for retriever, the better (but also the slower) your answers.
prediction = pipe.run(
    query="Qual foi a causa da morte de Brás Cubas?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}
)

  start_indices = flat_sorted_indices // max_seq_len
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 13.41 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 14.29 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 16.36 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 17.83 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 11.24 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.58 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.68 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.71 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.40 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 23.40 Batches/s]


In [25]:
print_answers(prediction, details="medium")


Query: Qual foi a causa da morte de Brás Cubas?
Answers:
[   {   'answer': 'pecados',
        'context': 'oje um casal de virtudes no mesmo espaço de chão que '
                   'sofreu um\n'
                   'casal de pecados. Amanhã pode lá dormir um eclesiástico, '
                   'depois um\n'
                   'assassino, depois um',
        'score': 0.8123836815357208},
    {   'answer': 'pneumonia',
        'context': 'e física e moral, e o corpo fazia-se-me planta, e pedra e\n'
                   'Morri de uma pneumonia; mas se lhe disser que foi menos a\n'
                   'pneumonia, do que uma idéia grandi',
        'score': 0.7883062660694122},
    {   'answer': 'corrupção',
        'context': 'via;\n'
                   'havia a candura dele, que era botão de flor, ainda '
                   'entrecerrado à corrupção da\n'
                   'Tal era o contraste desses dois caracteres, que a estrela '
                   'da viúva',
        'score': 0.38080117106

Os códigos abaixo estão em vias de melhorias para tratamento e processamento dos dados.
Já possível verificar que há um retorno dos dados porém sem um resultado plausível uma vez que o modelo utilizado está baseado no idioma inglês.

Serão realizados testes com o modelo de DPR fornecido pela [Haystack](https://haystack.deepset.ai/overview/intro) e com o modelo de Q&A em PT afim de verificar qual atenderá melhor o projeto

# Até o momento os código abaixo não foram utilizados

In [None]:
# BERT Base
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

#tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
#model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased')

# BERT Large
#tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased')
#model = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased')

#tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased')
#model = BertForSequenceClassification.from_pretrained('neuralmind/bert-large-portuguese-cased')