In [1]:
!pip install --upgrade pip
!pip install transformers
!pip install unidecode
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]

Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.1.1
    Uninstalling pip-22.1.1:
      Successfully uninstalled pip-22.1.1
Successfully installed pip-22.1.2
[0mCollecting farm-haystack[colab,faiss]
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-install-3z2zrk3u/farm-haystack_1c09acaca742414987e9dedd28acf762
  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-install-3z2zrk3u/farm-haystack_1c09acaca742414987e9dedd28acf762
  Resolved https://github.com/deepset-ai/haystack.git to commit e6d8bcdf9b67db1a711d0d73ecced83d47d10e28
  Installing build dependencies ... [?25l- \ | / - \ done
[?25h  Getting requirements to build wheel ... [?25l- done
[

In [2]:
import pandas as pd
import numpy as np
import string
import unidecode
import os, glob, re, sys, random, unicodedata, collections

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('vader_lexicon')
nltk.download('stopwords')
## criando lista de stopwords
STOP_WORDS = stopwords.words('portuguese') + list(string.punctuation)
STOP_WORDS.append('\n')

## Importação do BERT - Hugginface
from transformers import AutoModel, AutoTokenizer, AutoModelForPreTraining, BertTokenizer, BertForSequenceClassification, pipeline
import torch

import pathlib
from pathlib import Path

from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader, DensePassageRetriever, PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.pipelines import ExtractiveQAPipeline



[nltk_data] Downloading package words to /usr/share/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

In [4]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/livros/memorial-de-aires.txt
/kaggle/input/livros/esau.txt
/kaggle/input/livros/ressurreicao.txt
/kaggle/input/livros/memoriasBras.txt
/kaggle/input/livros/maoLuva.txt
/kaggle/input/livros/quincas.txt
/kaggle/input/livros/casaVelha.txt
/kaggle/input/livros/helena.txt
/kaggle/input/livros/domCasmurro.txt
/kaggle/input/livros/iaia.txt


In [5]:
# Let's first get some files that we want to use
doc_dir = dirname
# Convert files to dicts
all_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

#Preprocessamento em Português
preprocessor = PreProcessor(
    language="pt",
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="passage",
    split_length=100,
    split_respect_sentence_boundary=False,
)
docs = preprocessor.process(all_docs)

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(docs)

100%|██████████| 10/10 [00:00<00:00, 399.64docs/s]


Writing Documents:   0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# Inicializando o Retriever, Reader e Pipeline
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)
document_store.update_embeddings(retriever)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

pipe = ExtractiveQAPipeline(reader, retriever)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Updating Embedding:   0%|          | 0/10 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/16 [00:00<?, ? Docs/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [7]:
perguntas = ['Quem é Capitu?', 'Quem foi Mascarenhas?', 'Quem é a filha de Dona Eusébia e do Vilaça?', 'Rubião e Cristiano viram sócios em que?', 'Por que Estevão desejava morrer?', 'Salvador era pai de quem?', 'Com quem Jorge se casa ao voltar do Paraguai?', 'Por quem Pedro e Paulo estavam apaixonados?', 'Quem escreve sobre Tristão e Fidélia?', 'Como Escobar morreu?']

resultados = pd.DataFrame(columns=['Pergunta','Documento','Score'])
pred = []
for pergunta in tqdm(perguntas, desc='Perguntas'):
    prediction = pipe.run(
        query=pergunta, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 3}}
    )
    j = 0
    for i in range(len(resultados), len(resultados)+3):
        resultados.loc[i] = [pergunta, prediction['answers'][j].meta['name'], prediction['answers'][j].score]
        j += 1

Perguntas:   0%|          | 0/10 [00:00<?, ?it/s]
Inferencing Samples:   0%|          | 0/20 [00:00<?, ? Batches/s][A
Inferencing Samples:   5%|▌         | 1/20 [00:00<00:16,  1.13 Batches/s][A
Inferencing Samples:  10%|█         | 2/20 [00:01<00:11,  1.63 Batches/s][A
Inferencing Samples:  15%|█▌        | 3/20 [00:01<00:09,  1.87 Batches/s][A
Inferencing Samples:  20%|██        | 4/20 [00:02<00:07,  2.01 Batches/s][A
Inferencing Samples:  25%|██▌       | 5/20 [00:03<00:09,  1.56 Batches/s][A
Inferencing Samples:  30%|███       | 6/20 [00:03<00:07,  1.77 Batches/s][A
Inferencing Samples:  35%|███▌      | 7/20 [00:03<00:06,  1.93 Batches/s][A
Inferencing Samples:  40%|████      | 8/20 [00:04<00:05,  2.04 Batches/s][A
Inferencing Samples:  45%|████▌     | 9/20 [00:04<00:05,  2.10 Batches/s][A
Inferencing Samples:  50%|█████     | 10/20 [00:05<00:04,  2.08 Batches/s][A
Inferencing Samples:  55%|█████▌    | 11/20 [00:05<00:04,  1.93 Batches/s][A
Inferencing Samples:  60%|██████

In [8]:
resultados

Unnamed: 0,Pergunta,Documento,Score
0,Quem é Capitu?,quincas.txt,0.797495
1,Quem é Capitu?,domCasmurro.txt,0.697046
2,Quem é Capitu?,iaia.txt,0.690444
3,Quem foi Mascarenhas?,quincas.txt,0.518228
4,Quem foi Mascarenhas?,memoriasBras.txt,0.427477
5,Quem foi Mascarenhas?,quincas.txt,0.425616
6,Quem é a filha de Dona Eusébia e do Vilaça?,quincas.txt,0.915381
7,Quem é a filha de Dona Eusébia e do Vilaça?,memoriasBras.txt,0.912015
8,Quem é a filha de Dona Eusébia e do Vilaça?,memoriasBras.txt,0.867204
9,Rubião e Cristiano viram sócios em que?,helena.txt,0.861414
