# Answering questions using Roberta

## Main solution using roberta

In [60]:
!pwd

/home/george/Documents/LeWagon/Transformers_Hugging_Face


In [2]:
"""Install requirements"""
# Install the transformers library from HuggingFace
!pip install transformers torch pytesseract
# You'll also need some extra tools that some of these models use under the hood
! pip install sentencepiece sacremoses
# to read pdf
!pip install pdfquery

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.5/123.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch
  Downloading torch-2.1.1-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting pytesseract
  Using cached pytesseract-0.3.10-py3-none-any.whl (14 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloadin

In [234]:
"""Import packages"""
from transformers import pipeline
from transformers.pipelines.pt_utils import KeyDataset
import pandas as pd

"""Import our question answering model""" """tiny is twice as fast but performs less well"""
question_answerer = pipeline(model = 'deepset/roberta-base-squad2')
# question_answerer = pipeline(model = 'deepset/tinyroberta-squad2')

In [235]:
def answer_questions_with_confidence(context = "You did not specify any content", questions = ["Did you mean to specify a question?"]):
    """Takes a list called 'questions' that contains the questions to answer
    Takes some text called 'content' as a source for answering questions
    Returns a dataframe of the questions with their answers and an assessment of confidence in the answers
    If no context or content is provided, returns a dataframe requesting these"""
    
    # List to fill with questions, answers, and confidence
    questions_answers = []
    
    # For each question create an empty dictionary and call the question_answerer model on the question
    for q in questions: 
        q_a_dict = {}
        q_a = question_answerer(question=q, context=context)
        
        # Assign the question, and outputs of the question_answerer model to the dictionary
        q_a_dict['confidence'] = q_a['score']
        q_a_dict['question'] = q
        q_a_dict['answer'] = q_a['answer'].replace('\n', ' ')
        
        # Add the dictionary to the list and then convert the final list of dicts to a dataframe
        questions_answers.append(q_a_dict) 
    questions_answers = pd.DataFrame(questions_answers)
    
    # Set a large maxcolwidth to allow for potentially long answers
    pd.options.display.max_colwidth = 20000
    
    return questions_answers

In [10]:
def select_top_n_questions(context, questions, c = 0.5, n = 5):
    """Selects the top n questions with the highest confidence level c
    User can define how many questions are required and the minimum confidence level"""
    
    # Call answer_questions to get a df of questions and answers
    questions_answers = answer_questions_with_confidence(context, questions)
    
    # Filter for confidence
    conf_questions = questions_answers[questions_answers['confidence'] > c] 
    
    # Return n questions ordered by confidence
    selected_questions = conf_questions.sort_values(by='confidence', ascending=False).head(n)\
    .reset_index().rename(columns={'index':'original_question_number'}) 
    
    """Check whether enough questions can be returned and explain why if not"""
    
    # Were enough questions generated?
    if len(questions_answers) < n:
        print(f"Only {len(questions_answers)} questions were generated")
        
        # Did enough questions meet the confidence requirement?
        if len(selected_questions) == 0:
            print("No questions met your required confidence level.")
        elif len(selected_questions) < n: 
            print(f"Not enough questions met your required confidence level,\
 but here {'is' if len(selected_questions) == 1 else 'are'} the {len(selected_questions)} that did:")
        else:
            print(f"Here are your {n} questions")
            
    else:
        # Did enough questions meet the confidence requirement?
        if len(selected_questions) == 0:
            print("No questions met your required confidence level.")
        elif len(selected_questions) < n: 
            print(f"Not enough questions met your required confidence level,\
 but here {'is' if len(selected_questions) == 1 else 'are'} the {len(selected_questions)} that did:")
        else:
            print(f"Here are your {n} questions")
            
    return selected_questions

## Training model

### Installs and imports

In [5]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow)
  Downloading flatbuffers-23.5.26-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl (19 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting h5py>=2.9.0 (from tensorflow)
  Downloading h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-16.0.6-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.2.0 (

In [1]:
from transformers import TFAutoModelForSequenceClassification
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer

2023-12-05 17:02:04.308737: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-05 17:02:04.308782: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-05 17:02:04.310262: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Models

In [152]:
from transformers import TFAutoModel
base_model = TFAutoModel.from_pretrained("deepset/roberta-base-squad2", from_pt = True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['qa_outputs.bias', 'qa_outputs.weight', 'roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
tuning_model_bert_small = TFAutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-small", from_pt = True)
tokenizer_bert_small = AutoTokenizer.from_pretrained("prajjwal1/bert-small", padding_side = "right")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [133]:
tuning_model = TFAutoModelForSequenceClassification.from_pretrained("deepset/roberta-base-squad2", from_pt = True)
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2", padding_side = "right", add_prefix_space=True)
# tokenizer.add_token_specials({'pad_token': '<s>'}) # not necessary

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['qa_outputs.bias', 'qa_outputs.weight', 'roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-strea

### Single sentences for testing

In [None]:
import tensorflow as tf

# tokenize question
q_tokens = tokenizer("Why must my tokenizers and model match?", return_tensors="tf")
q_tokens['input_ids']

# Get individual tensors from the batch
input_tensor = tf.expand_dims(q_tokens['input_ids'][0], 0)
# Pass tokenized input through the model
outputs = base_model.predict(input_tensor)

outputs.last_hidden_state.shape

In [172]:
# tokenize answer

a_tokens = tokenizer("Because different models use different input ids's!", return_tensors="tf")
a_tokens

# Get individual tensors from the batch
input_tensor = tf.expand_dims(a_tokens['input_ids'][0], 0)
# Pass tokenized input through the model
outputs = base_model.predict(input_tensor)

outputs.last_hidden_state.shape



(1, 12, 768)

In [None]:
# decode - requires reducing tensor dimension
qs = tf.squeeze(q_tokens['input_ids'])

decoded = tokenizer.decode(qs) 
decoded

In [415]:
# Drop columns with no answers
train_data_squad = train_data_squad.drop(train_data_squad[train_data_squad['only_answer_text'].str.len() == 0].index)\
    .drop(df_test[df_test['only_answer_text'].str.len() == 0].index)

# For any cells containing lists, replace with the list contents
train_data_squad = train_data_squad.explode(['only_answer_text','only_answer_start'])

In [None]:
# Tokenize
squad_question_tokens = tokenizer(train_data_squad['question'].tolist(), max_length=31, padding = "max_length", truncation = True, return_tensors="tf")
squad_question_tokens['input_ids'].shape

squad_answer_tokens = tokenizer(train_data_squad['only_answer_text'].tolist(), max_length=31, padding = "max_length", truncation = True, return_tensors="tf")
squad_answer_tokens['input_ids'].shape

squad_context_tokens = tokenizer(train_data_squad['context'].tolist(), max_length=31, padding = "max_length", truncation = True, return_tensors="tf")
squad_context_tokens['input_ids'].shape

In [402]:
# Split answers into text and start position (of the answer in the context) and add these as new columns
train_data_squad['only_answer_text'] = [value['text'] for value in train_data_squad['answers']]
train_data_squad['only_answer_start'] = [value['answer_start'] for value in train_data_squad['answers']]
train_data_squad

Unnamed: 0,id,title,context,question,answers,only_answer_text,only_answer_start
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",[in the late 1990s],[269]
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",[singing and dancing],[207]
2,56be85543aeaaa14008c9066,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",When did Beyonce leave Destiny's Child and become a solo singer?,"{'text': ['2003'], 'answer_start': [526]}",[2003],[526]
3,56bf6b0f3aeaaa14008c9601,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [166]}","[Houston, Texas]",[166]
4,56bf6b0f3aeaaa14008c9602,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}",[late 1990s],[276]
...,...,...,...,...,...,...,...
130046,5735d259012e2f140011a09d,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to promote international relations has established an International Relations Secretariat (IRC). KMC's first international relationship was established in 1975 with the city of Eugene, Oregon, United States. This activity has been further enhanced by establishing formal relationships with 8 other cities: Motsumoto City of Japan, Rochester of the USA, Yangon (formerly Rangoon) of Myanmar, Xi'an of the People's Republic of China, Minsk of Belarus, and Pyongyang of the Democratic Republic of Korea. KMC's constant endeavor is to enhance its interaction with SAARC countries, other International agencies and many other major cities of the world to achieve better urban management and developmental programs for Kathmandu.",In what US state did Kathmandu first establish an international relationship?,"{'text': ['Oregon'], 'answer_start': [229]}",[Oregon],[229]
130047,5735d259012e2f140011a09e,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to promote international relations has established an International Relations Secretariat (IRC). KMC's first international relationship was established in 1975 with the city of Eugene, Oregon, United States. This activity has been further enhanced by establishing formal relationships with 8 other cities: Motsumoto City of Japan, Rochester of the USA, Yangon (formerly Rangoon) of Myanmar, Xi'an of the People's Republic of China, Minsk of Belarus, and Pyongyang of the Democratic Republic of Korea. KMC's constant endeavor is to enhance its interaction with SAARC countries, other International agencies and many other major cities of the world to achieve better urban management and developmental programs for Kathmandu.",What was Yangon previously known as?,"{'text': ['Rangoon'], 'answer_start': [414]}",[Rangoon],[414]
130048,5735d259012e2f140011a09f,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to promote international relations has established an International Relations Secretariat (IRC). KMC's first international relationship was established in 1975 with the city of Eugene, Oregon, United States. This activity has been further enhanced by establishing formal relationships with 8 other cities: Motsumoto City of Japan, Rochester of the USA, Yangon (formerly Rangoon) of Myanmar, Xi'an of the People's Republic of China, Minsk of Belarus, and Pyongyang of the Democratic Republic of Korea. KMC's constant endeavor is to enhance its interaction with SAARC countries, other International agencies and many other major cities of the world to achieve better urban management and developmental programs for Kathmandu.",With what Belorussian city does Kathmandu have a relationship?,"{'text': ['Minsk'], 'answer_start': [476]}",[Minsk],[476]
130049,5735d259012e2f140011a0a0,Kathmandu,"Kathmandu Metropolitan City (KMC), in order to promote international relations has established an International Relations Secretariat (IRC). KMC's first international relationship was established in 1975 with the city of Eugene, Oregon, United States. This activity has been further enhanced by establishing formal relationships with 8 other cities: Motsumoto City of Japan, Rochester of the USA, Yangon (formerly Rangoon) of Myanmar, Xi'an of the People's Republic of China, Minsk of Belarus, and Pyongyang of the Democratic Republic of Korea. KMC's constant endeavor is to enhance its interaction with SAARC countries, other International agencies and many other major cities of the world to achieve better urban management and developmental programs for Kathmandu.",In what year did Kathmandu create its initial international relationship?,"{'text': ['1975'], 'answer_start': [199]}",[1975],[199]


In [287]:
from datasets import load_dataset

dataset = load_dataset("squad_v2")  # contains test and validation

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

### Load, explore, and clean q_a pairs data

In [4]:
df = pd.read_csv("questions_for_training/S08_question_answer_pairs.txt", sep='\t')

In [None]:
df_clean = df.dropna()

In [237]:
df.head(50)

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of the United States?,yes,easy,easy,S08_set3_a4
1,Abraham_Lincoln,Was Abraham Lincoln the sixteenth President of the United States?,Yes.,easy,easy,S08_set3_a4
2,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1863?,yes,easy,medium,S08_set3_a4
3,Abraham_Lincoln,Did Lincoln sign the National Banking Act of 1863?,Yes.,easy,easy,S08_set3_a4
4,Abraham_Lincoln,Did his mother die of pneumonia?,no,easy,medium,S08_set3_a4
5,Abraham_Lincoln,Did his mother die of pneumonia?,No.,easy,easy,S08_set3_a4
6,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months,medium,easy,S08_set3_a4
7,Abraham_Lincoln,How many long was Lincoln's formal education?,18 months.,medium,medium,S08_set3_a4
8,Abraham_Lincoln,When did Lincoln begin his political career?,1832,medium,easy,S08_set3_a4
9,Abraham_Lincoln,When did Lincoln begin his political career?,1832.,medium,medium,S08_set3_a4


In [139]:
df_clean["tokenized"] = df_clean['Question'].map(lambda x: tokenizer(x)["input_ids"])

df_clean["len_tokenized"] = df_clean["tokenized"].map(lambda x: len(x))

df_clean["len_tokenized"].max()
# Max question length 31
# Max answer length 93


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["tokenized"] = df_clean['Question'].map(lambda x: tokenizer(x)["input_ids"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean["len_tokenized"] = df_clean["tokenized"].map(lambda x: len(x))


31

In [38]:
lengths = []
for n in range(len(df)):
    print(df['Question'][n])
    print(type(df['Question'][n]))
#     lengths.append(len(str(df['Question'][n])))
# print(max(lengths))

# Answers 423
# Questions 586

Was Abraham Lincoln the sixteenth President of the United States?
<class 'str'>
Was Abraham Lincoln the sixteenth President of the United States?
<class 'str'>
Did Lincoln sign the National Banking Act of 1863?
<class 'str'>
Did Lincoln sign the National Banking Act of 1863?
<class 'str'>
Did his mother die of pneumonia?
<class 'str'>
Did his mother die of pneumonia?
<class 'str'>
How many long was Lincoln's formal education?
<class 'str'>
How many long was Lincoln's formal education?
<class 'str'>
When did Lincoln begin his political career?
<class 'str'>
When did Lincoln begin his political career?
<class 'str'>
What did The Legal Tender Act of 1862 establish?
<class 'str'>
What did The Legal Tender Act of 1862 establish?
<class 'str'>
Who suggested Lincoln grow a beard?
<class 'str'>
Who suggested Lincoln grow a beard?
<class 'str'>
When did the Gettysburg address argue that America was born?
<class 'str'>
When did the Gettysburg address argue that America was born?
<class 'str'>
Di

### SQUAD import

In [286]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=8.0.0 (from datasets)
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Using cached fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)
  Using cached multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manyl

### Squad explore

In [299]:
train_data_squad = pd.DataFrame.from_dict(dataset['train'])
valid_data_squad = pd.DataFrame.from_dict(dataset['validation'])

In [300]:
train_data_squad.head()

Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}"
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}"
2,56be85543aeaaa14008c9066,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",When did Beyonce leave Destiny's Child and become a solo singer?,"{'text': ['2003'], 'answer_start': [526]}"
3,56bf6b0f3aeaaa14008c9601,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [166]}"
4,56bf6b0f3aeaaa14008c9602,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".",In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}"


### Work in progress

#### Actual questions and answers from df_clean

In [None]:
# Tokenize questions and answers
question_tokens = tokenizer(df_clean['Question'].tolist(), max_length=31, padding = "max_length", truncation = True, return_tensors="tf")
question_tokens['input_ids'].shape

answer_tokens = tokenizer(df_clean['Answer'].tolist(), max_length=93, padding = "max_length", truncation = True, return_tensors="tf")
answer_tokens['input_ids'].shape

In [None]:
# Embed questions and answers using base model
q_embeddings = base_model.predict(question_tokens["input_ids"])
a_embeddings = base_model.predict(answer_tokens["input_ids"])

In [280]:
a_embeddings.last_hidden_state[:,0,:][0]

array([ 4.08690959e-01,  4.93379831e-01, -2.05413222e+00, -6.40193582e-01,
        1.37747538e+00, -1.70050681e-01, -1.32165045e-01, -6.47218004e-02,
        1.33166802e+00,  1.83643833e-01,  3.32193673e-01, -1.45272553e-01,
        6.53128922e-01,  2.14906597e+00, -2.24932313e-01,  6.98875368e-01,
        1.02188952e-01,  8.63780141e-01, -1.07471454e+00,  2.10869238e-01,
        1.16799045e+00,  6.70837224e-01,  7.05815673e-01,  9.06614304e-01,
       -3.67928110e-02,  2.44543746e-01,  1.31401026e+00, -5.10728247e-02,
       -5.77018380e-01,  1.90765488e+00,  3.06510806e-01,  6.64363801e-01,
        2.40759980e-02, -3.57501954e-01,  8.35342109e-01,  8.49480927e-03,
       -7.03092635e-01,  1.50315642e-01, -5.06952778e-02,  1.90156832e-01,
       -7.11647093e-01, -4.56458986e-01, -4.01505232e-01,  5.64848781e-01,
       -8.01333249e-01,  1.09610140e+00, -2.18562365e-01,  3.34662616e-01,
        4.03636284e-02, -7.48753905e-01, -3.55858713e-01, -8.17893371e-02,
       -1.13955975e+00,  

In [164]:
# Find required shape of input layer
q_embeddings.last_hidden_state.shape

(1148, 31, 768)

In [None]:
# Create X and y
X = q_embeddings.last_hidden_state[:,0,:]
y = a_embeddings.last_hidden_state[:,0,:]

In [277]:
X

array([[ 0.22716677,  0.30360517, -2.4623933 , ..., -0.35451695,
        -0.22256944, -0.03869351],
       [ 0.22716698,  0.30360538, -2.462393  , ..., -0.35451669,
        -0.22256906, -0.03869365],
       [ 0.26436937,  0.40877575, -2.0705335 , ..., -0.11665837,
        -0.5100673 ,  0.09670612],
       ...,
       [ 0.31911156,  0.37626874, -2.1082852 , ..., -0.20078221,
        -0.60245574, -0.11896776],
       [ 0.31911156,  0.37626874, -2.1082852 , ..., -0.20078221,
        -0.60245574, -0.11896776],
       [ 0.1454277 ,  0.2322737 , -2.4382212 , ..., -0.13180733,
        -0.35522392,  0.01232196]], dtype=float32)

In [274]:
y_test[1]
y = tf.squeeze(q_tokens['input_ids'])

decoded = tokenizer.decode(qs) 
decoded

array([ 2.08725005e-01,  5.01426697e-01, -2.10185099e+00, -6.84273303e-01,
        1.43889809e+00, -2.25048304e-01, -1.23420522e-01, -1.01065524e-02,
        1.43256664e+00,  1.32539213e-01,  2.24693879e-01, -1.92586064e-01,
        6.38542950e-01,  2.11760545e+00, -2.36335918e-01,  8.12754035e-01,
       -1.79602522e-02,  8.05653036e-01, -1.13572955e+00,  1.80632636e-01,
        1.13128090e+00,  7.28021443e-01,  7.23214209e-01,  9.11190510e-01,
       -5.04088849e-02,  4.17113066e-01,  1.32801938e+00, -9.73763168e-02,
       -5.82256198e-01,  2.01138043e+00,  2.78219730e-01,  5.20721376e-01,
       -8.78851265e-02, -2.86367178e-01,  9.65058208e-01, -9.37496722e-02,
       -6.56549394e-01,  1.92702621e-01,  3.81495208e-02,  1.05042040e-01,
       -9.01373267e-01, -4.74188268e-01, -4.33959693e-01,  8.24443042e-01,
       -8.43610823e-01,  1.08136749e+00, -2.81595856e-01,  3.52865905e-01,
       -4.92013730e-02, -7.37659752e-01, -4.69671130e-01, -1.24587774e-01,
       -1.09619522e+00,  

In [184]:
# Train model and evaluate performance

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Flatten

input_shape = (768,) 

dense_model = Sequential([
    Dense(256, activation='relu', input_shape=input_shape),
    Flatten(),
    Dense(1, activation='relu') # linear, softmax, gelu, geglu (see here for geglu https://github.com/pouyaardehkhani/ActTensor )
])

# linear: loss: 0.6473 - mae: 0.5263
# gelu: loss: 0.6464 - mae: 0.5257
# relu: loss: 0.6472 - mae: 0.5250

dense_model.compile(loss='mse', optimizer='adam', metrics=['mae'])
# dense_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

dense_model.fit(X_train, y_train, validation_split=0.2, epochs=20)

dense_model.evaluate(X_test, y_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


[0.6472066640853882, 0.5249581336975098]

In [284]:
dense_model.predict(X_test)



array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],

### Legacy / testing

In [None]:
to_train = pipeline(model = 'deepset/roberta-base-squad2')

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

tokens = tokenizer.encode("This is easy!", return_tensors = "tf")

print(tokens)

In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("deepset/roberta-base-squad2", from_pt = True)

output_tokens = model.generate(tokens)

print(output_tokens)

print(tokenizer.decode(output[0]))

## Sources

### PDF

In [None]:
"""Open a pdf file as a possible source of context"""
from pdfquery import PDFQuery

pdf = PDFQuery('/home/george/Downloads/intelligent_M_and_A.pdf')
pdf.load()

# Use CSS-like selectors to locate the elements
text_elements = pdf.pq('LTTextLineHorizontal')

# Extract the text from the elements
text = [t.text for t in text_elements]

print(text)

### Text files

In [None]:
"""Open a text file as a possible source of context"""
file = open("/home/george/Documents/example_article.txt", "r") # Example file
content = file.read()
print(content)
file.close()

question = ['Where is Spain?'] # Example question

### OCR

In [148]:
"""Extract text using OCR
This works with images obtained from the snipping tool.
This works with photographs of text but images need to be rotated to ensure they are the correct way up.
It does not recognise handwriting."""

# """For """
# !sudo apt install tesseract-ocr  
# !sudo apt install libtesseract-dev
# !pip install Pillow pytesseract

import pytesseract
from PIL import Image

def ocr_document(image_path):
    # Open the image using the Pillow library
    image = Image.open(image_path)

    # Perform OCR using Tesseract
    text = pytesseract.image_to_string(image)

    return text

In [151]:
# Example usage
image_path = '/home/george/Downloads/articletoocrquizachu/funding_rotated.jpg'
result_text = ocr_document(image_path)

print("OCR Result:")
print(result_text)

OCR Result:
BE PREPARED TO BE FRUSTRATED WHEN IT
COMES TO FUNDING

There’s nothing like fundraising to make you realise how far female-
founded startups still have to go in order to be treated as equals in
business.

Building a startup is hard. Building a startup based on a product that
is aimed specifically at women, led by a female founder, and built bya
predominantly female team, is harder, much harder.

During the 18 months that we have been building the Propelle
platform, we have invariably met two different (male-dominated)
investment camps. The first don’t understand why there’s a need

for this type of product and have even gone as far as questioning
whether women want to manage their own money(!). The second
“WomenWash” the issue - they champion the idea, they take the
meetings (they check the D&I tickbox off'as they do) whilst making
a big show about it, but ultimately ask questions like “Who's looking
after your kids when you work?”, “Are you planning on having any
more chil

In [149]:
# Example usage
image_path = '/home/george/Downloads/articletoocrquizachu/funding.jpg'
result_text = ocr_document(image_path)

print("OCR Result:")
print(result_text)

OCR Result:
‘pUTUI UI SassouIsng paj-ustI0OM

CIM yh 7 Ware saynol JUBUNSaAUT [eUODIpeD sup Ayjeyusurepuny
*Ppayse jo9 JaAIU pynom

WIOOIPIeOg & UI SOD pur siapunoy apeur yey) ssump - ,,<UaappyD a1001
Aue Savy uo Surauryd no ary, ‘,,2410m noA uaym spPy inoA 1233ye
SUDO] S,0yM,, 24x] suonsanb yse AyayeurNN yng 441 NOGe Moys 31q e
SUDICUL ISpIyM (Op Avy) se Jo xoqyID PEC 24) yey Ast) ssupe:0eur
dU} dye} Ast ‘vapt ou} uoIdueypD Ast) - ansst stp , YseMUcUIOM,,
puodeas au], *(j)AdUOU UMO Tet) sSeURUT 0} JULM UITIOM Jay}94M
Surmoypsenb sev ivy sv 9u08 Usd ary pur Npord Jo adAQ sity 10;
paeu b sare) Ay pur}sispun },uop 3s1y ay], ‘sdured JUSUNseAul
(payeuTMOp-ayeul) JUarayIp OM) Jour Ajqerreaut sary am “WLoje|d
ayjedorg au SUIpying Ueeq aary aM Jey} SyUOUT gy et) suLING
Japzey Yonul “aprey st ‘wres} apeutay ApUeuTMOpeid

v Aq IN pure ‘apunoy sfeutay & Aq poy ‘uaui0m ye Ayes yDeds poutre si
yeu Jonpoad v uo paseq dnzrejs v SUIpyINg “prey st dngzeys e Supping
‘ssouIsng

ur spenba se payva} aq 0}

### Scraping

In [213]:
"""Import packages"""
import requests
from bs4 import BeautifulSoup
import re

In [210]:
"""Scrape Wikipedia as a possible source of context"""

import requests
from bs4 import BeautifulSoup

def scrape_wikipedia_article(url):
    # Send an HTTP request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the main article text (adjust the selector based on the structure of the webpage)
        article_text = soup.find('div', {'id': 'mw-content-text'}).get_text(separator='\n')
        
        return article_text
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

# Example usage
url = 'https://en.wikipedia.org/wiki/Ai_(chimpanzee)'
wikipedia_article_text = scrape_wikipedia_article(url)

if wikipedia_article_text:
    print(wikipedia_article_text)



Subject of the Ai project


Ai
Species
chimpanzee
Born
c.
 1976
 (age 
46–47)
Guinean Forests of West Africa
Offspring
Ayumu (chimpanzee)


Ai
 (born in 1976, estimated) is a female 
western chimpanzee
 (
Pan troglodytes verus
),
[1]
 currently living at the 
Primate Research Institute
 of 
Kyoto University
 (acronym KUPRI). She is the first subject of the 
Ai project
, a research program started in 1978 by Kiyoko Murofushi and 
Tetsuro Matsuzawa
 which is aimed at understanding chimpanzee 
cognition
 through computer interface experiments.
[2]






Biography
[
edit
]


Ai was born in 1976 (estimated), in the 
Guinean Forests of West Africa
.
[1]
 Born wild, Ai was soon taken into captivity and sold to KUPRI in 1977 by an animal trader (this type of sale became illegal in 1980 with 
Japan
's ratification of 
CITES
).
[1]
  She was the first subject of KUPRI’s chimpanzee project, which was intended to become Japan’s first ape-language study in the vein of earlier ape-language studies.


In [214]:
"""Scrape BBC as a possible source of context"""

import requests
from bs4 import BeautifulSoup

def scrape_bbc_article(url):
    # Send an HTTP request to the URL
    response = requests.get(url)
    
    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the main article text (adjust the selector based on the structure of the webpage)
        article = []
        for para in soup.find_all("div", {"data-component": "text-block"}):
            article.append(para.text)
        article_text = " ".join(article)
        
        return article_text
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

# Example usage - cat article
url = 'https://www.bbc.co.uk/news/uk-england-beds-bucks-herts-67407334'
bbc_article_text = scrape_bbc_article(url).replace('\n', ' ')

if bbc_article_text:
    print(bbc_article_text)


A cat whose pictures went viral for regularly visiting a railway station is releasing a Christmas single. Four-year-old Nala has been delighting commuters who have been taking photos of her at Stevenage station. Owner Natasha Ambler revealed the cat was releasing a single called Meow and has been approached for a book deal. The ginger tabby has also recorded a video for the song due to be released this week, under the name Nala the Station Cat. It has been produced by Danny Kirsch, who wrote it with Joe Killington, while Nala is also co-credited as a songwriter, as well as a vocalist. Ms Ambler said "we want to spread the happiness that Stevenage has had, and she's had on socials to the world". The single is officially released on Wednesday and BBC Three Counties Radio's Justin Dealey gave the single an exclusive first play on Sunday. "I'm slightly lost for words," said the presenter after the song finished. Nala's owner replied: "So am I to be fair." The musical cat does not yet have 

### Importing audio

In [None]:
"""Installs to analyse audio"""
!sudo apt install ffmpeg
!pip3 install datasets
!pip install SoundFile
!pip install librosa

In [None]:
"""Example audio to analyse"""
!mkdir data
!curl https://wagon-public-datasets.s3.amazonaws.com/deep_learning_datasets/harvard.wav > data/harvard.wav

In [None]:
"""Packages for audio"""
from scipy.io import wavfile
from IPython.display import Audio

In [None]:
"""Read the audio file and play it to verify"""
rate, audio = wavfile.read("data/harvard.wav")
Audio(audio.T, rate=rate)

In [None]:
"""Transcription of a downloaded wav file"""

from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa  

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# Whisper requires a sampling rate of 16000 so must convert this with librosa
audio, rate = librosa.load('data/harvard.wav', sr=16000)
input_features = processor(audio, sampling_rate=rate, return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [None]:
"""Transcription of a flac file from hugging face"""

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
predicted_ids = model.generate(input_features)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)

transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)


In [None]:
transcription

### Reading handwriting

In [None]:
"""This works well for single lines of handwriting but does not support multiple lines.
I need to split multiple line files into single lines."""

hw = pipeline(model = 'microsoft/trocr-base-handwritten')

In [59]:
"""This attempts to split images. It is the first time I gave up and got chatgpt to write code for me.
It does not work very well - it identifies words but does not link them correctly as lines."""

import cv2
import os
import pytesseract

def split_and_save_handwritten_lines(image_path, output_directory):
    # Read the image using OpenCV
    image = cv2.imread(image_path)

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Use adaptive thresholding to preprocess the image
    _, binary_image = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find contours in the binary image
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # List to store individual line images
    line_images = []

    # Minimum width and height threshold for a contour to be considered a line
    min_width_threshold = 300
    min_height_threshold = 20

    # Iterate through contours
    for i, contour in enumerate(contours):
        # Get bounding box for each contour
        x, y, w, h = cv2.boundingRect(contour)

        # Filter out contours based on width and height
        if w > min_width_threshold and h > min_height_threshold:
            # Crop the original image to extract the line
            line_image = image[y:y+h, x:x+w]

            # Save the line image to the output directory
            output_path = os.path.join(output_directory, f'line_{i+1}.png')
            cv2.imwrite(output_path, line_image)

            # Append the line image to the list
            line_images.append(line_image)

    return line_images

# Example usage
image_path = '/home/george/Downloads/Handwriting-Y4.png'
output_directory = '/home/george/Downloads/split_text'
lines = split_and_save_handwritten_lines(image_path, output_directory)

# Print the paths of saved line images
for i, line_image in enumerate(lines, start=1):
    print(f"Saved line {i} to {os.path.join(output_directory, f'line_{i}.png')}")


Saved line 1 to /home/george/Downloads/split_text/line_1.png


In [13]:
"""Question-answer format"""
ocr(image='/home/george/Downloads/magnification.jpg',question="What does this page say?")

[{'score': 0.7278719544410706,
  'answer': 'Calculating the size of an object',
  'start': 0,
  'end': 5}]

### Text in more complex layouts, eg invoices or posters

In [None]:
"""First model - this answers questions about documents
- this works for very simple documents 
but struggles for anything which implies relationships (e.g. two text boxes that relate to one another)"""
ocr = pipeline(model = 'impira/layoutlm-invoices') #This struggles to find relationships between objects

## Tests

In [None]:
"""Tested distilbert to see if quicker. Some gains but less accurate"""
"""In tests, 2 means distilbert passed, 1 means roberta passed"""
question_answerer2 = pipeline(model = 'distilbert-base-uncased-distilled-squad') 

#### Underground questions

In [None]:
underground_context = """The history of the London Underground began in the 19th century with the construction of the Metropolitan Railway,
the world's first underground railway. The Metropolitan Railway, which opened in 1863 using gas-lit wooden carriages hauled by steam 
locomotives, worked with the District Railway to complete London's Circle line in 1884. Both railways expanded, the Metropolitan eventually 
extending as far as Verney Junction in Buckinghamshire, more than 50 miles (80 km) from Baker Street and the centre of London. 
The first deep-level tube line, the City and South London Railway, opened in 1890 with electric trains. 
This was followed by the Waterloo & City Railway in 1898, the Central London Railway in 1900, and the Great Northern and City Railway 
in 1904. The Underground Electric Railways Company of London (UERL) was established in 1902 to fund the electrification of the 
District Railway and to complete and operate three tube lines, the Baker Street and Waterloo Railway, the Charing Cross, Euston and 
Hampstead Railway and the Great Northern, Piccadilly and Brompton Railway, which opened in 1906–07. By 1907 the District and 
Metropolitan Railways had electrified the underground sections of their lines.

Under a joint marketing agreement between most of the companies in the early years of the 20th century, UNDERGROUND signs appeared 
outside stations in central London. World War I delayed extensions of the Bakerloo and Central London Railways, and people used the 
tube stations as shelters during Zeppelin air raids by June 1915. After the war, government-backed financial guarantees were used to 
expand the network, and the tunnels of the City and South London and Charing Cross, Euston and Hampstead Railways were linked at Euston and 
Kennington, although the combined service was not named the Northern line until later. The Piccadilly line was extended north to 
Cockfosters and took over District line branches to Harrow (later Uxbridge) and Hounslow. In 1933, the underground railways and all London 
area tram and bus operators were merged into the London Passenger Transport Board (LPTB). The outlying branches of the Metropolitan were 
closed; various upgrades were planned. The Bakerloo line's extension to take over the Metropolitan's Stanmore branch, and extensions of 
the Central and Northern lines, formed part of the 1930s New Works Programme. The outbreak of World War II in 1939 halted or interrupted 
some of this work, and many tube stations were used as air-raid shelters. """

In [None]:
underground_questions = ["In what year did the London Underground begin?", # 1963
"In what year did the UERL begin to be built?", # 1902 - 2, 1
"In what year did the London Underground begin?",
"What railway line was built in the 18th century?",
"In what year did the Thames & City Railway open?",
"How many kilometers long did it take to travel through London's Station?",
"When did the London Underground first operate?",
"In what year was the London Underground Station opened?",
"How many kilometers long was the London Underground Station?",
"In what year did the Thames & City Railway open?",
"What railway line was built in London in the 18th century?",
"What railway line was built in the London Underground?",
"When did the UERL begin to run through the London Underground?",
"When did the London Underground begin?",
"What was the name of the first railway station in London?",
"In what year did the London Underground begin?",
"In what year was the London Underground Station closed?",
"What railway line opened in 1880?",
"What railway line was built in London in 1880?",
"What railway line was built in the 18th century?"]

In [17]:
answer_questions_with_confidence2(underground_context, underground_questions) # underground qs using distilbert approx 15 seconds

Unnamed: 0,confidence,question,answer
0,0.930403,In what year did the London Underground begin?,1933
1,0.99683,In what year did the UERL begin to be built?,1902
2,0.930403,In what year did the London Underground begin?,1933
3,0.549939,What railway line was built in the 18th century?,Metropolitan Railway
4,0.881689,In what year did the Thames & City Railway open?,1898
5,0.771743,How many kilometers long did it take to travel through London's Station?,80
6,0.830885,When did the London Underground first operate?,1933
7,0.914037,In what year was the London Underground Station opened?,1933
8,0.794559,How many kilometers long was the London Underground Station?,80
9,0.881689,In what year did the Thames & City Railway open?,1898


In [18]:
answer_questions_with_confidence(underground_context, underground_questions) #using roberta approx 35 seconds

Unnamed: 0,confidence,question,answer
0,0.2551449,In what year did the London Underground begin?,19th century
1,0.958134,In what year did the UERL begin to be built?,1902
2,0.2551449,In what year did the London Underground begin?,19th century
3,4.702023e-07,What railway line was built in the 18th century?,Piccadilly
4,2.806922e-06,In what year did the Thames & City Railway open?,1898
5,0.1153477,How many kilometers long did it take to travel through London's Station?,80
6,0.2953299,When did the London Underground first operate?,19th century
7,0.2517956,In what year was the London Underground Station opened?,1863
8,0.5730436,How many kilometers long was the London Underground Station?,80
9,2.806922e-06,In what year did the Thames & City Railway open?,1898


#### Cat questions

In [215]:
cat_url = 'https://www.bbc.co.uk/news/uk-england-beds-bucks-herts-67407334'
bbc_cat_text = scrape_bbc_article(cat_url)

In [209]:
# Example questions - cat article
bbc_cat_questions = ['Where will profit go?','Who produced the song?','What is the song called?',\
             'Who gave the song its first play?','When will the song be released?','Who wrote the song?',\
             'Where was the video filmed?','How has nala been delighting commuters?',\
             "Who's pictures went viral?", 'All proceeds from the single will be what?', 'What links Danny and Joe?']

In [22]:
bbc_cat_questions # 2 if passed by 2, 1 if passed by 1
# RSPCA and Stevenage homelessness charity Feed Up Warm Up 2, 1
# Danny Kirsch 2, 1
# Meow 2, 1
# Justin Dealey 2, 1
# Stevenage railway station 2, 1
# regularly visiting a railway station 2, 1
# Nala 1
# donated to the RSPCA and Stevenage homelessness charity Feed Up Warm Up 1
# producer / writer 1

['Where will profit go?',
 'Who produced the song?',
 'What is the song called?',
 'Who gave the song its first play?',
 'When will the song be released?',
 'Who wrote the song?',
 'Where was the video filmed?',
 'How has nala been delighting commuters?',
 "Who's pictures went viral?",
 'All proceeds from the single will be what?',
 'What links Danny and Joe?']

In [26]:
answer_questions_with_confidence2(bbc_article_text, bbc_cat_questions) # 12 seconds

Unnamed: 0,confidence,question,answer
0,0.535661,Where will profit go?,RSPCA and Stevenage homelessness charity Feed Up Warm Up
1,0.981917,Who produced the song?,Danny Kirsch
2,0.883597,What is the song called?,Meow
3,0.992343,Who gave the song its first play?,Justin Dealey
4,0.852285,When will the song be released?,before Christmas
5,0.482969,Who wrote the song?,Danny Kirsch
6,0.920912,Where was the video filmed?,Stevenage railway station
7,0.444611,How has nala been delighting commuters?,taking photos of her at Stevenage station
8,0.617798,Who's pictures went viral?,A cat
9,0.024473,All proceeds from the single will be what?,Feed Up Warm Up


In [29]:
answer_questions_with_confidence(bbc_article_text, bbc_cat_questions) # 13 seconds

Unnamed: 0,confidence,question,answer
0,0.101562,Where will profit go?,RSPCA and Stevenage homelessness charity Feed Up Warm Up
1,0.963533,Who produced the song?,Danny Kirsch
2,0.81919,What is the song called?,Meow
3,0.781142,Who gave the song its first play?,Justin Dealey
4,0.682033,When will the song be released?,Wednesday
5,0.440879,Who wrote the song?,"Danny Kirsch, who wrote it with Joe Killington"
6,0.842697,Where was the video filmed?,Stevenage railway station
7,0.508643,How has nala been delighting commuters?,taking photos of her at Stevenage station
8,0.308242,Who's pictures went viral?,Nala
9,0.2376,All proceeds from the single will be what?,donated to the RSPCA and Stevenage homelessness charity Feed Up Warm Up


#### Chimp questions

In [32]:
chimp_questions = ['What is a typical group size in the wild?', 'Who is the son of Ai?', 'What can Ai do with the keyboard?',\
                   'What was Ai the first chimpanzee to do?']
chimp_answers = ['20', # 2
                 'Ayumu', # 2
                 'interact with a computer', 
                 'use arabic numerals to represent numbers'] # 2

In [33]:
answer_questions_with_confidence2(wikipedia_article_text, chimp_questions) # 13 seconds

Unnamed: 0,confidence,question,answer
0,0.491928,What is a typical group size in the wild?,20 chimpanzees
1,0.982641,Who is the son of Ai?,Ayumu
2,0.654109,What can Ai do with the keyboard?,paint and draw
3,0.64358,What was Ai the first chimpanzee to do?,learned to use Arabic numerals to represent numbers


In [35]:
answer_questions_with_confidence2(wikipedia_article_text, chimp_questions) # 15 seconds

Unnamed: 0,confidence,question,answer
0,0.491928,What is a typical group size in the wild?,20 chimpanzees
1,0.982641,Who is the son of Ai?,Ayumu
2,0.654109,What can Ai do with the keyboard?,paint and draw
3,0.64358,What was Ai the first chimpanzee to do?,learned to use Arabic numerals to represent numbers


## Example images

In [4]:
"""Example images for processing"""
"""Text"""
# Invoice
invoice = 'https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png'
# Simple poster
simple = 'https://www.11thhourracingteam.org/wp-content/uploads/11th-hour-racing-team-how-to-create-a-sustainability-policy-horizontal-3-1-1536x1056.png'
# Complex poster
complicated = 'https://cdn.greenmatch.co.uk/cdn-cgi/image/format=auto/2/2023/07/MAY23_4_02-Plastic-Waste_Global-Waste_2-1-663x1024.png'
# Microscopes text book page via web link
microscope = 'https://m.media-amazon.com/images/I/71Ts-QXYIhL._SL1500_.jpg'
# Magnification text book page downloaded to absolute file path
magnification = '/home/george/Downloads/magnification.jpg'

"""Handwriting"""
# Nice clear handwriting and cursive handwriting
clear = 'https://steemitimages.com/DQmcdbSGrnA9zeqWrYHD8EkNjvF9uxQCAeB7qnucUShpNDe/IMG_7345.PNG'
# Tricky handwriting
tricky = 'https://www.researchgate.net/profile/Neeta-Nain/publication/299666231/figure/fig1/AS:491693964304386@1494240384780/Example-image-of-a-general-handwritten-text-paragraph-from-IAM-dataset-4.png'
y5 = 'https://thelinksprimary.org.uk/wp-content/uploads/2023/10/Handwriting-Y6.png'

## Other ideas to explore

In [None]:
"""These are possible ways to better process images"""
"""visual bert needs more configuring"""
https://huggingface.co/daki97/visualbert_finetuned_easy_vqa
https://huggingface.co/docs/transformers/model_doc/visual_bert#overview # overview is part of the url, not a comment
https://github.com/huggingface/transformers/blob/main/examples/research_projects/visual_bert/demo.ipynb
"""layout needs more configuring"""
https://huggingface.co/docs/transformers/model_doc/layoutlmv3
"""should work for extracting printed text, but only works for single lines"""
https://huggingface.co/microsoft/trocr-base-printed
"""suggestions on how to split into multiple lines for extracted text and handwriting"""
https://github.com/microsoft/unilm/issues/628
https://discuss.huggingface.co/t/trocr-fine-tuning/13293/3
"""vision encoder requires more configuration"""
https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder
"""Generate LaTEX from images"""
https://huggingface.co/Norm/nougat-latex-base
"""Automate snipping"""
https://support.techsmith.com/hc/en-us/articles/115002022732?ipc_item_name=snagit&ipc_platform=windows&utm_campaign=sw23&utm_medium=snagit&utm_source=product
https://www.movavi.com/learning-portal/snipping-tool-for-linux.html
https://screencloud.net/
https://www.codeproject.com/Articles/485883/Create-your-own-Snipping-Tool
https://www.codeinwp.com/blog/best-website-screenshot-tools/#gref
# Note it may not be possible to snip elements outside the browser from within the browser, for security reasons

## Exploration of retraining model

In [133]:
from transformers import TFAutoModelForSequenceClassification

In [134]:
question_answerer_train = TFAutoModelForSequenceClassification.from_pretrained('deepset/roberta-base-squad2', from_pt=True)

pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

2023-12-01 16:37:16.271633: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-12-01 16:37:16.292324: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-12-01 16:37:16.292539: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (george-ThinkPad-X220-Tablet): /proc/driver/nvidia/version does not exist
2023-12-01 16:37:17.115447: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154414080 exceeds 10% of free system memory.
2023-12-01 16:37:17.564863: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154414080 exceeds 10% of free system memory.
2023-12-01 16:37:17.660538: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154414080 exceeds 10% of free system memory.
2023-12-01 16:

In [None]:
question_answerer_train.