# Answering questions using RoBERTa

## Main solution using pre-made model

In [60]:
!pwd

/home/george/Documents/LeWagon/Transformers_Hugging_Face


In [None]:
"""Install requirements"""
# Install the transformers library from HuggingFace
!pip install transformers torch pytesseract
# You'll also need some extra tools that some of these models use under the hood
! pip install sentencepiece sacremoses

In [2]:
"""Import packages"""
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd

# Build the Hugging Face pipeline that the rest of the notebook uses to answer questions
question_answerer = pipeline(model='deepset/roberta-base-squad2')

"""For web scraping"""
import requests
from bs4 import BeautifulSoup
import re

2023-11-30 16:06:10.871398: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-11-30 16:06:10.871464: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-11-30 16:06:11.015659: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-30 16:06:13.054510: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-11-30 16:06:13.054670: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [None]:
"""Scrape BBC as a possible source of context"""
story = "https://www.bbc.co.uk/news/uk-england-beds-bucks-herts-67407334"  # Example article
response = requests.get(story, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
response.raise_for_status()
# Name the parser explicitly so the result does not depend on which parsers are installed
soup = BeautifulSoup(response.content, "html.parser")

# The article body is taken from the divs marked data-component="text-block"
article = " ".join(
    para.text for para in soup.find_all("div", {"data-component": "text-block"})
)

# Example questions for example article.
# One string per line: the original was missing a comma before the last entry,
# which silently merged the final two questions into a single string.
questions = [
    'Where will profit go?',
    'Who produced the song?',
    'What is the song called?',
    'Who gave the song its first play?',
    'When will the song be released?',
    'Who wrote the song?',
    'Where was the video filmed?',
    'How has nala been delighting commuters?',
    "Who's pictures went viral?",
]

In [None]:
"""Open a file as a possible source of context"""
# The context manager closes the file even if reading raises
with open("quizachu/example_article.txt", "r") as file:  # Example file
    content = file.read()
print(content)

question = ['Where is Spain?'] # Example question

In [44]:
def answer_questions(context = "You did not specify any content", questions = ["Did you mean to specify a question?"]):
    """Answer each question from the given context using the question_answerer pipeline.

    Args:
        context: Text used as the source for answering the questions.
        questions: A list of question strings. A single string is treated as
            one question. The default list is only read, never mutated.

    Returns:
        A DataFrame with one row per question and the columns 'confidence'
        (the model's 'score'), 'question' and 'answer'. If `questions` is
        empty, the DataFrame still has these columns but no rows.

    Side effects:
        Sets the pandas display option max_colwidth to 20000 so that long
        answers are not truncated when the DataFrame is displayed.

    If context or questions are omitted, the placeholder defaults are sent to
    the model instead.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(questions, str):
        questions = [questions]

    # One record per question: confidence, question, and the model's answer
    records = []
    for q in questions:
        q_a = question_answerer(question=q, context=context)
        records.append({
            'confidence': q_a['score'],
            'question': q,
            'answer': q_a['answer'],
        })

    # Passing the columns explicitly keeps the schema stable for an empty
    # question list, so callers can still filter on 'confidence'
    questions_answers = pd.DataFrame(records, columns=['confidence', 'question', 'answer'])

    # Set a large maxcolwidth to allow for potentially long answers
    pd.options.display.max_colwidth = 20000
    return questions_answers

In [51]:
def select_questions(context, questions, c = 0.5, n = 5):
    """Return up to n answered questions whose confidence exceeds c, best first.

    Args:
        context: Text passed to answer_questions as the source for answers.
        questions: The questions passed to answer_questions.
        c: Minimum confidence; only rows with confidence strictly greater
            than c are kept.
        n: Maximum number of questions to return.

    Returns:
        A DataFrame sorted by descending confidence with at most n rows. The
        'original_question_number' column holds each row's index in the
        DataFrame returned by answer_questions.

    Prints a short message saying whether n questions could be returned and,
    if not, why.
    """
    # Call answer_questions to get a df of questions and answers
    questions_answers = answer_questions(context, questions)

    # Filter for confidence
    conf_questions = questions_answers[questions_answers['confidence'] > c]

    # Return n questions ordered by confidence
    selected_questions = (
        conf_questions
        .sort_values(by='confidence', ascending=False)
        .head(n)
        .reset_index()
        .rename(columns={'index': 'original_question_number'})
    )

    # Were enough questions generated?
    if len(questions_answers) < n:
        print(f"Only {len(questions_answers)} questions were generated")

    # Did enough questions meet the confidence requirement?
    # (When fewer than n were generated this is always the "not enough" case,
    # so a single check covers both situations.)
    if len(selected_questions) < n:
        # Adjacent literals instead of a backslash continuation, which put the
        # next line's indentation into the middle of the printed message
        print("Not enough questions met your required confidence level, "
              f"but here are the {len(selected_questions)} that did:")
    else:
        print(f"Here are your {n} questions")

    return selected_questions

In [52]:
"""Testing function with scraped article"""
# `article` and `questions` are created by the BBC scraping cell, so that cell must be run first
select_questions(article, questions, c=0.3, n=7)

NameError: name 'article' is not defined

## Importing audio as input for questions or answers

In [None]:
"""Installs to analyse audio"""
!sudo apt install ffmpeg
!pip3 install datasets
!pip install SoundFile
!pip install librosa

In [None]:
"""Example audio to analyse"""
!mkdir data
!curl https://wagon-public-datasets.s3.amazonaws.com/deep_learning_datasets/harvard.wav > data/harvard.wav

In [None]:
"""Packages for audio"""
from scipy.io import wavfile
from IPython.display import Audio

In [None]:
"""Read the audio file and play it to verify"""
# wavfile.read gives back the sample rate first, then the sample data
rate, audio = wavfile.read("data/harvard.wav")
# NOTE(review): the transpose presumably turns (samples, channels) into the (channels, samples) layout Audio expects — confirm for this file
Audio(audio.T, rate=rate)

In [None]:
"""Transcription of a downloaded wav file"""

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# Whisper requires a sampling rate of 16000 so must convert this with librosa
# Note: this rebinds `audio` and `rate`, which the playback cell assigned in the opposite order
audio, rate = librosa.load('data/harvard.wav', sr=16000)
input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features)

# decode token ids to text without special tokens
# (the earlier decode with skip_special_tokens=False was overwritten straight away, so it is dropped)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [None]:
"""Transcription of a flac file from hugging face"""

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
# The sample supplies both the waveform and its own sampling rate
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features)

# decode token ids to text without special tokens
# (the earlier decode with skip_special_tokens=False was overwritten straight away, so it is dropped)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [None]:
# Show the result of whichever transcription cell was run last (both assign `transcription`)
transcription

## Processing visual input for questions or answers

### Final OCR extraction code

In [None]:
"""For """
!sudo apt install tesseract-ocr  
!sudo apt install libtesseract-dev
!pip install Pillow pytesseract

In [49]:
"""This is not answering questions. It simply performs OCR on images.
This would enable output from images to be put into the question answerer
This should work with images obtained from the snipping tool.
It does not recognise handwriting."""

import pytesseract
from PIL import Image

def ocr_document(image_path):
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path of the image to read.

    Returns:
        The text returned by pytesseract.image_to_string for that image.
    """
    # Open the image with Pillow inside a context manager so the file handle
    # is released as soon as OCR finishes rather than being left open
    with Image.open(image_path) as image:
        # Perform OCR using Tesseract
        text = pytesseract.image_to_string(image)

    return text

# Run OCR on an example image and show the extracted text
# NOTE(review): machine-specific absolute path
image_path = '/home/george/Downloads/magnification.jpg'
result_text = ocr_document(image_path)

print("OCR Result:", result_text, sep="\n")


OCR Result:
Calculating the size of an object
You will want to calculate the size of /
objects under the microscope. There /
is a simple formula for this, based
‘on the magnification triangle.

As long as you know or can measure
two of the factors, you can find the
third.

magnification = size ohimage

size of real object

For example, if you know you are working at magnification x40, and

the image of the cell you are looking at measures 1mm, you can

work out the actual diameter of the cell:

size of real object = eee Oramnage)
magnification

so

=| mm=
=49mm 0.025 mm or 25 um

Your cell has a diameter of 25 um.

 

Magnifying and resolving power

Microscopes are useful because they magnify things, making them
look bigger. The height of an average person magnified by one of the
best light microscopes would look about 3.5km, and by an electron
microscope about 3500 km. There is, however, a minimum distance
between two objects when you can see them clearly as two separate
things. If th

In [53]:
"""Here we use the question answering model to answer questions about the OCR text"""
# `result_text` comes from the OCR example cell
select_questions(
    result_text,
    ['Why are microscopes useful?', 'What is resolution?'],
    c=0.1,
    n=2,
)

Here are your 2 questions


Unnamed: 0,original_question_number,confidence,question,answer
0,1,0.526828,What is resolution?,the ability to distinguish between two separate points
1,0,0.236706,Why are microscopes useful?,"they magnify things, making them\nlook bigger"


### Previous attempts at image processing

In [4]:
"""Example images for processing"""

# --- Printed text ---
invoice = 'https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png'  # invoice
simple = 'https://www.11thhourracingteam.org/wp-content/uploads/11th-hour-racing-team-how-to-create-a-sustainability-policy-horizontal-3-1-1536x1056.png'  # simple poster
complicated = 'https://cdn.greenmatch.co.uk/cdn-cgi/image/format=auto/2/2023/07/MAY23_4_02-Plastic-Waste_Global-Waste_2-1-663x1024.png'  # complex poster
microscope = 'https://m.media-amazon.com/images/I/71Ts-QXYIhL._SL1500_.jpg'  # microscopes text book page via web link
# Magnification text book page downloaded to an absolute file path
# NOTE(review): machine-specific absolute path
magnification = '/home/george/Downloads/magnification.jpg'

# --- Handwriting ---
clear = 'https://steemitimages.com/DQmcdbSGrnA9zeqWrYHD8EkNjvF9uxQCAeB7qnucUShpNDe/IMG_7345.PNG'  # nice clear handwriting and cursive handwriting
tricky = 'https://www.researchgate.net/profile/Neeta-Nain/publication/299666231/figure/fig1/AS:491693964304386@1494240384780/Example-image-of-a-general-handwritten-text-paragraph-from-IAM-dataset-4.png'  # tricky handwriting
# NOTE(review): the variable is named y5 but the file name says Y6 — confirm which is intended
y5 = 'https://thelinksprimary.org.uk/wp-content/uploads/2023/10/Handwriting-Y6.png'

#### This is for reading images that contain text, e.g. invoices or posters

In [3]:
"""First model - this answers questions about documents.
It works for very simple documents but struggles with anything that implies
relationships (e.g. two text boxes that relate to one another)."""
ocr = pipeline(model = 'impira/layoutlm-invoices') # Despite the name `ocr`, this pipeline is called with an image plus a question; it struggles to find relationships between objects

Some weights of the model checkpoint at impira/layoutlm-invoices were not used when initializing LayoutLMForQuestionAnswering: ['token_classifier_head.weight', 'token_classifier_head.bias']
- This IS expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LayoutLMForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
"""Question-answer format"""
# NOTE(review): machine-specific absolute path; the same file is also stored in `magnification` in the example-images cell
ocr(image='/home/george/Downloads/magnification.jpg',question="What does this page say?")

[{'score': 0.7278719544410706,
  'answer': 'Calculating the size of an object',
  'start': 0,
  'end': 5}]

#### This is for reading handwriting

In [None]:
"""This works well for single lines of handwriting but does not support multiple lines.
Images with several lines need to be split into single lines first."""

# NOTE(review): `hw` is created here but not called anywhere in the visible notebook
hw = pipeline(model = 'microsoft/trocr-base-handwritten')

In [59]:
"""This attempts to split images. It is the first time I gave up and got chatgpt to write code for me.
It does not work very well - it identifies words but does not link them correctly as lines."""

import cv2
import os
import pytesseract

def split_and_save_handwritten_lines(image_path, output_directory,
                                     min_width_threshold=300, min_height_threshold=20):
    """Crop large contours out of an image, save each crop as a PNG and return the crops.

    Args:
        image_path: Path of the image to split.
        output_directory: Directory the crops are written to; created if missing.
        min_width_threshold: A contour's bounding box must be wider than this
            (in pixels) to be kept.
        min_height_threshold: A contour's bounding box must be taller than this
            (in pixels) to be kept.

    Returns:
        A list of the cropped images, in the order the contours were found.
        The k-th item of the list (counting from 1) is saved as 'line_k.png'
        in output_directory.

    Raises:
        OSError: If cv2.imread cannot read an image from image_path.
    """
    # Read the image using OpenCV
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports an unreadable or missing file by returning None
        raise OSError(f"cv2.imread could not read an image from {image_path!r}")

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarise with an Otsu threshold, inverted (THRESH_BINARY_INV + THRESH_OTSU)
    _, binary_image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find the outer contours in the binary image
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # List to store individual line images
    line_images = []

    for contour in contours:
        # Get bounding box for each contour
        x, y, w, h = cv2.boundingRect(contour)

        # Skip contours that are too small to be a line
        if w <= min_width_threshold or h <= min_height_threshold:
            continue

        # Crop the original image to extract the line
        line_image = image[y:y+h, x:x+w]
        line_images.append(line_image)

        # Number files by position in the returned list (1-based), not by raw
        # contour index, so callers can rebuild each path from the list alone
        output_path = os.path.join(output_directory, f'line_{len(line_images)}.png')
        cv2.imwrite(output_path, line_image)

    return line_images

# Example usage
# NOTE(review): machine-specific absolute paths; this also rebinds the `image_path` used by the OCR example cell
image_path = '/home/george/Downloads/Handwriting-Y4.png'
output_directory = '/home/george/Downloads/split_text'
lines = split_and_save_handwritten_lines(image_path, output_directory)

# Print the paths of saved line images
# NOTE(review): these paths assume the files are numbered 1..len(lines) in list order — confirm that this matches the file names written by split_and_save_handwritten_lines
for i, line_image in enumerate(lines, start=1):
    print(f"Saved line {i} to {os.path.join(output_directory, f'line_{i}.png')}")


Saved line 1 to /home/george/Downloads/split_text/line_1.png


## Other things

In [None]:
"""These are possible ways to better process images"""
# The links are kept as comments so that running this cell does not raise a SyntaxError.

# VisualBERT needs more configuring
# https://huggingface.co/daki97/visualbert_finetuned_easy_vqa
# https://huggingface.co/docs/transformers/model_doc/visual_bert#overview  (the "#overview" is part of the URL)
# https://github.com/huggingface/transformers/blob/main/examples/research_projects/visual_bert/demo.ipynb

# LayoutLM needs more configuring
# https://huggingface.co/docs/transformers/model_doc/layoutlmv3

# Should work for extracting printed text, but only works for single lines
# https://huggingface.co/microsoft/trocr-base-printed

# Suggestions on how to split into multiple lines
# https://github.com/microsoft/unilm/issues/628
# https://discuss.huggingface.co/t/trocr-fine-tuning/13293/3

# Vision encoder requires more configuration
# https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder

# Generate LaTeX from images
# https://huggingface.co/Norm/nougat-latex-base