In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [2]:
#CoQA is a Conversational Question Answering dataset released by Stanford NLP in 2019. It is a large-scale dataset for building Conversational Question Answering Systems. 
coqa = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
coqa.head()

Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


Data cleaning. Get rid of version 

In [3]:
del coqa["version"]

In [4]:
coqa.head()

Unnamed: 0,data
0,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


For every question-answer pair, we will be attaching the linked story to it

In [5]:
#required columns in our dataframe
cols = ["text","question","answer"]
#list of lists to create our dataframe
comp_list = []
for index, row in coqa.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        temp_list.append(row["data"]["story"])
        temp_list.append(row["data"]["questions"][i]["input_text"])
        temp_list.append(row["data"]["answers"][i]["input_text"])
        comp_list.append(temp_list)
new_df = pd.DataFrame(comp_list, columns=cols) 
#saving the dataframe to csv file for further loading
new_df.to_csv("CoQA_data.csv", index=False)

Data Loading from Local CSV file which is the cleaned version

In [6]:
data = pd.read_csv("CoQA_data.csv")
data.tail()

Unnamed: 0,text,question,answer
108642,(CNN) -- Cristiano Ronaldo provided the perfec...,Who was a sub?,Xabi Alonso
108643,(CNN) -- Cristiano Ronaldo provided the perfec...,Was it his first game this year?,Yes
108644,(CNN) -- Cristiano Ronaldo provided the perfec...,What position did the team reach?,third
108645,(CNN) -- Cristiano Ronaldo provided the perfec...,Who was ahead of them?,Barca.
108646,(CNN) -- Cristiano Ronaldo provided the perfec...,By how much?,six points


Some profile info

In [7]:
print("Number of question and answers: ", len(data))

Number of question and answers:  108647


Taking pretrained models for BertForQuestionAnswering class from the transformers library.

In [8]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', return_dict=True)
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Random testing of question

In [9]:
# Random Qn and test pair
random_num = np.random.randint(0,len(data))
question = data["question"][random_num]
text = data["text"][random_num]

# qiestopm amd text pair testing. Let’s tokenize the question and text as a pair.
input_ids = tokenizer.encode(question, text)
#print("here is how the input_ids look like", input_ids)
print("The input has a total of {} tokens.".format(len(input_ids)))

# to see what the tokernizer is doing.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
for token, id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token,id))

# for [CLS] vs [SEP] see https://towardsdatascience.com/question-answering-with-a-fine-tuned-bert-bc4dafd45626#:~:text=we%20can%20see%20two%20special%20tokens%20%5Bcls%5D%20and%20%5Bsep%5D.%20

# [CLS] token stands for classification and is there to represent sentence-level classification and is used when we are classifying. Another token used by BERT is [SEP]. It is used to separate the two pieces of text. You can see two [SEP] tokens in the above screenshots, one after the question and another after the text.

The input has a total of 278 tokens.
[CLS]        101
what       2,054
was        2,001
it         2,009
considered   2,641
?          1,029
[SEP]        102
germany    2,762
is         2,003
a          1,037
federal    2,976
republic   3,072
consisting   5,398
of         1,997
sixteen    7,032
federal    2,976
states     2,163
(          1,006
german     2,446
:          1,024
bun       21,122
##des      6,155
##land     3,122
,          1,010
or         2,030
land       2,455
)          1,007
.          1,012
[          1,031
a          1,037
]          1,033
since      2,144
today      2,651
'          1,005
s          1,055
germany    2,762
was        2,001
formed     2,719
from       2,013
an         2,019
earlier    3,041
collection   3,074
of         1,997
several    2,195
states     2,163
,          1,010
it         2,009
has        2,038
a          1,037
federal    2,976
constitution   4,552
,          1,010
and        1,998
the        1,996
constituent  13,794
states     2,16

Transformers library can create segment embeddings on its own using PretrainedTokenizer.encode_plus(). But, we can even create our own. For that, we just need to specify a 0 or 1 for each token.  Segment embeddings help BERT in differentiating a question from the text. 

In [10]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
print("segment_IDs", segment_ids)
print("Number of segment_ids", len(segment_ids))
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  6
Number of tokens in segment A:  7
Number of tokens in segment B:  271
segment_IDs [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Number of segment_ids 278


Feeding to model

In [11]:
#token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
output = model(torch.tensor([input_ids]), 
token_type_ids=torch.tensor([segment_ids]))

Looking at the most probable start and end words and providing answers only if the end token is after the start token.

In [12]:
#tokens with highest start and end scores
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
else:
    print("I am unable to find the answer to this question. Can you please ask another question?")
    
print("\nQuestion:\n{}".format(question.capitalize()))
print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
What was it considered?

Answer:
De facto state.


<!-- Question:
Who is the acas director?

Answer:
Agnes karin ##gu. -->

<!-- what’s this “##” in the reply? Keep on reading! 📙
BERT uses wordpiece tokenization. In BERT, rare words get broken down into subwords/pieces. Wordpiece tokenization uses ## to delimit tokens that have been split. An example of this: “Karin” is a common word so wordpiece does not split it. However, “Karingu” is a rare word so wordpiece split it into the words, “Karin” and “##gu”. Notice that it has added ## before gu to indicate that it is the second piece of the split word.
The idea behind using wordpiece tokenization is to reduce the size of the vocabulary which improves training performance. Consider the words, run, running, runner. Without wordpiece tokenization, the model has to store and learn the meaning of all three words independently. However, with wordpiece tokenization, each of the three words would be split into ‘run’ and the related ‘##SUFFIX’ (if any suffix at all — for example, “run”, “##ning”, “##ner”). Now, the model will learn the context of the word “run” and the rest of the meaning would be encoded in the suffix, which would be learned from other words with similar suffixes. -->

In [13]:
answer = tokens[answer_start]
for i in range(answer_start+1, answer_end+1):
    if tokens[i][0:2] == "##":
        answer += tokens[i][2:]
    else:
        answer += " " + tokens[i]
# The above answer will now become: Agnes karingu

QA into a function easy for use

In [14]:
def question_answer(question, text):
    
    #tokenize question and text as a pair
    input_ids = tokenizer.encode(question, text)
    
    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    
    #segment IDs
    #first occurence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    #number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a
    
    #list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)
    
    #model output using input_ids and segment_ids
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
    
    #reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]
                
    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question."
    
    print("\nPredicted answer:\n{}".format(answer.capitalize()))

Test QA with NY text

In [15]:
text = """New York (CNN) -- More than 80 Michael Jackson collectibles -- including the late pop star's famous rhinestone-studded glove from a 1983 performance -- were auctioned off Saturday, reaping a total $2 million. Profits from the auction at the Hard Rock Cafe in New York's Times Square crushed pre-sale expectations of only $120,000 in sales. The highly prized memorabilia, which included items spanning the many stages of Jackson's career, came from more than 30 fans, associates and family members, who contacted Julien's Auctions to sell their gifts and mementos of the singer. Jackson's flashy glove was the big-ticket item of the night, fetching $420,000 from a buyer in Hong Kong, China. Jackson wore the glove at a 1983 performance during \"Motown 25,\" an NBC special where he debuted his revolutionary moonwalk. Fellow Motown star Walter \"Clyde\" Orange of the Commodores, who also performed in the special 26 years ago, said he asked for Jackson's autograph at the time, but Jackson gave him the glove instead. "The legacy that [Jackson] left behind is bigger than life for me,\" Orange said. \"I hope that through that glove people can see what he was trying to say in his music and what he said in his music.\" Orange said he plans to give a portion of the proceeds to charity. Hoffman Ma, who bought the glove on behalf of Ponte 16 Resort in Macau, paid a 25 percent buyer's premium, which was tacked onto all final sales over $50,000. Winners of items less than $50,000 paid a 20 percent premium."""
question = "Where was the Auction held?"
question_answer(question, text)
#original answer from the dataset
print("Original answer:\n", data.loc[data["question"] == question]["answer"].values[0])



Predicted answer:
Hard rock cafe in new york ' s times square
Original answer:
 Hard Rock Cafe


CHATBOT STYLE QA

In [16]:
#Please enter your text: 
#The Vatican Apostolic Library (), more commonly called the Vatican Library or simply the Vat, is the library of the Holy See, located in Vatican City. Formally established in 1475, although it is much older, it is one of the oldest libraries in the world and contains one of the most significant collections of historical texts. It has 75,000 codices from throughout history, as well as 1.1 million printed books, which include some 8,500 incunabula.   The Vatican Library is a research library for history, law, philosophy, science and theology. The Vatican Library is open to anyone who can document their qualifications and research needs. Photocopies for private study of pages from books published between 1801 and 1990 can be requested in person or by mail.   In March 2014, the Vatican Library began an initial four-year project of digitising its collection of manuscripts, to be made available online.   The Vatican Secret Archives were separated from the library at the beginning of the 17th century; they contain another 150,000 items.   Scholars have traditionally divided the history of the library into five periods, Pre-Lateran, Lateran, Avignon, Pre-Vatican and Vatican.   The Pre-Lateran period, comprising the initial days of the library, dated from the earliest days of the Church. Only a handful of volumes survive from this period, though some are very significant.

# Q1 When was the Vat formally opened?
# Q2 How many books does it have?
# Q3 What is the library for?
# Q4 What are the periods of the library history
# Q5 How many books survive the earliest days of the church
# Q6 Is the library online?

# wrong answers
#Q7 How many giraffes are there

# no answers
#Q8 when will the collection be available online?

In [17]:
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
while True:
    question_answer(question, text)
    
    flag = True
    flag_N = False
    
    while flag:
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        if response[0] == "Y":
            question = input("\nPlease enter your question: \n")
            flag = False
        elif response[0] == "N":
            print("\nBye!")
            flag = False
            flag_N = True
            
    if flag_N == True:
        break


Predicted answer:
1475

Predicted answer:
1 . 1 million

Predicted answer:
Research library for history , law , philosophy , science and theology

Predicted answer:
Pre - lateran , lateran , avignon , pre - vatican and vatican

Predicted answer:
Only a handful of volumes

Predicted answer:
The vatican library began an initial four - year project of digitising its collection of manuscripts , to be made available online

Predicted answer:
8 , 500

Bye!


Same Short Text Comparison between Bert and Longformer

Personal data refers to data about an individual who can be identified from that data, or from that data and other information to which the organisation has or is likely to have access. 

The Personal Data Protection Act (PDPA) provides a baseline standard of protection for personal data in Singapore. It complements sector-specific legislative and regulatory frameworks such as the Banking Act and Insurance Act.

It comprises various requirements governing the collection, use, disclosure and care of personal data in Singapore. 

It also provides for the establishment of a national Do Not Call (DNC) Registry. Individuals may register their Singapore telephone numbers with the DNC Registry to opt out of receiving unwanted telemarketing messages from organisations.

The PDPA recognises both the need to protect individuals’ personal data and the need of organisations to collect, use or disclose personal data for legitimate and reasonable purposes.

A data protection regime is necessary to safeguard personal data from misuse and to maintain individuals’ trust in organisations that manage their data.

By regulating the flow of personal data among organisations, the PDPA also aims to strengthen Singapore’s position as a trusted hub for businesses.

The PDPA covers personal data stored in electronic and non-electronic formats. 

It generally does not apply to any individual acting on a personal or domestic basis, any individual acting in his/her capacity as an employee with an organisation , any public agency in relation to the collection, use or disclosure of personal data, any business contact information such as an individual’s name, position or title, business telephone number, business address, business email, business fax number and similar information and organisations are required to comply with the various data protection obligations if they undertake activities relating to the collection, use or disclosure of personal data.

### Questions for PDPA

In [None]:
# Q1. What is Personal Data?
# Q2. Does it overrides the other pieces of legistrations?
# Q3. What is the purpose of PDPA?
# Q4. What do the organisations need to do to ensure they comply with PDPA?/ How do organisations comply with the PDPA? **
# Q5. What are not within the scope of PDPA?
# Q6. How many giraffes are there?

In [None]:
# text = pd.read_csv(r'shorttext.txt',header=None,sep='\t')
# word count = 294
text = "Personal data refers to data about an individual who can be identified from that data, or from that data and other information to which the organisation has or is likely to have access. The Personal Data Protection Act (PDPA) provides a baseline standard of protection for personal data in Singapore. It complements sector-specific legislative and regulatory frameworks such as the Banking Act and Insurance Act. It comprises various requirements governing the collection, use, disclosure and care of personal data in Singapore. It also provides for the establishment of a national Do Not Call (DNC) Registry. Individuals may register their Singapore telephone numbers with the DNC Registry to opt out of receiving unwanted telemarketing messages from organisations. The PDPA recognises both the need to protect individuals’ personal data and the need of organisations to collect, use or disclose personal data for legitimate and reasonable purposes. A data protection regime is necessary to safeguard personal data from misuse and to maintain individuals’ trust in organisations that manage their data. By regulating the flow of personal data among organisations, the PDPA also aims to strengthen Singapore’s position as a trusted hub for businesses. The PDPA covers personal data stored in electronic and non-electronic formats. It generally does not apply to any individual acting on a personal or domestic basis, any individual acting in his/her capacity as an employee with an organisation , any public agency in relation to the collection, use or disclosure of personal data, any business contact information such as an individual’s name, position or title, business telephone number, business address, business email, business fax number and similar information and organisations are required to comply with the various data protection obligations if they undertake activities relating to the collection, use or disclosure of personal data."

In [None]:
print (text)

Personal data refers to data about an individual who can be identified from that data, or from that data and other information to which the organisation has or is likely to have access. The Personal Data Protection Act (PDPA) provides a baseline standard of protection for personal data in Singapore. It complements sector-specific legislative and regulatory frameworks such as the Banking Act and Insurance Act. It comprises various requirements governing the collection, use, disclosure and care of personal data in Singapore. It also provides for the establishment of a national Do Not Call (DNC) Registry. Individuals may register their Singapore telephone numbers with the DNC Registry to opt out of receiving unwanted telemarketing messages from organisations. The PDPA recognises both the need to protect individuals’ personal data and the need of organisations to collect, use or disclose personal data for legitimate and reasonable purposes. A data protection regime is necessary to safeguar

In [21]:
# 2 options to input text. Either 1. assign to a variable text or 2. input into the chatbot at start
# text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
while True:
    question_answer(question, text)
    
    flag = True
    flag_N = False
    
    while flag:
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        if response[0] == "Y":
            question = input("\nPlease enter your question: \n")
            flag = False
        elif response[0] == "N":
            print("\nBye!")
            flag = False
            flag_N = True
            
    if flag_N == True:
        break


Predicted answer:
Data about an individual

Predicted answer:
It complements sector - specific legislative and regulatory frameworks such as the banking act and insurance act .

Predicted answer:
Provides a baseline standard of protection for personal data in singapore

Predicted answer:
If they undertake activities relating to the collection , use or disclosure of personal data

Predicted answer:
Any individual acting on a personal or domestic basis , any individual acting in his / her capacity as an employee with an organisation , any public agency in relation to the collection , use or disclosure of personal data , any business contact information

Predicted answer:
Unable to find the answer to your question.

Bye!


## BERT is not for long text 
limited at 512 tokens. Performance on long text by BERT

In [22]:
# word count = 1489
text = "Chow Yun-fat SBS (born 18 May 1955), previously known as Donald Chow, is a Hong Kong actor known for his collaborations with filmmaker John Woo in the action heroic bloodshed films A Better Tomorrow, The Killer, and Hard Boiled, and in the West for his roles as Li Mu-bai in Crouching Tiger, Hidden Dragon and Sao Feng in Pirates of the Caribbean: At World's End. He mainly plays in drama films and has won three Hong Kong Film Awards for Best Actor and two Golden Horse Awards for Best Actor in Taiwan. Chow started his career in movies in 1976 with Goldig Films. Chow was born in Lamma Island, Hong Kong, to Chow Yung-Wan (周容允), who worked on a Shell Oil Company tanker, and Chan Lai-fong (陳麗芳), who was a cleaning lady and vegetable farmer. Chow grew up in a farming community on Lamma Island, in a house with no electricity. He woke up at dawn each morning to help his mother sell herbal jelly and Hakka tea-pudding (客家茶粿) on the streets; in the afternoons, he went to work in the fields. His family moved to Kowloon when he was ten. At 17, Chow left school to help support the family by doing odd jobs including a bellboy, postman, camera salesman, and taxi driver. Chow's life started to change after college when he responded to a newspaper advertisement, and his actor-trainee application was accepted by TVB, the local television station. He signed a three-year contract with the studio and made his acting debut. Chow became a heartthrob and familiar face in soap operas that were exported internationally. According to Chow Yun-fat's filmography, Chow made his debut in 1976 in various movies produced by Goldig Films, including Hot Blood (入冊).  Goldig Films was founded by Gouw Hiap Kian and produced or distributed over 100 movies from 1972 to 1982. Chow's first movie contract was an exclusive acting contract with Goldig Films (note page 3). Chow appeared in the 1980 TV series The Bund on TVB. The series, about the rise and fall of a gangster in 1930s Shanghai, was a hit throughout Asia and made Chow a star. Although Chow continued his TV success, his goal was to become a film actor. However, his occasional ventures into low-budget films in the 1980s after ones by Goldig were disastrous. Most of Chow's movies produced by Goldig Films under exclusive contract in the 1970s achieved high gross revenues of over HK$ 1m per movie. These figures are higher than ones Chow acted in the early 1980s, including Modern Heroes (江湖檔案), Soul Ash (灰靈), The Bund(上海灘), The Bund Part 2(上海灘續集) . Note gross revenues under list of movies. Success finally came when he teamed up with director John Woo in the 1986 gangster action-melodrama A Better Tomorrow, which swept the box offices in Asia and established Chow and Woo as megastars. A Better Tomorrow won him his first Best Actor award at the Hong Kong Film Awards. It was the highest-grossing film in Hong Kong history at the time, and set a new standard for Hong Kong gangster films. Taking the opportunity, Chow quit TV entirely. With his new image from A Better Tomorrow, he made many more 'gun fu' or 'heroic bloodshed' films, such as A Better Tomorrow 2 (1987), Prison on Fire (1987), Prison on Fire II (1991), The Killer (1989), A Better Tomorrow 3 (1990), Hard Boiled (1992) and City on Fire (1987), an inspiration for Quentin Tarantino's Reservoir Dogs. Chow may be best known for playing honorable tough guys, whether cops or criminals, but he has also starred in comedies like Diary of a Big Man (1988) and Now You See Love, Now You Don't (1992) and romantic blockbusters such as Love in a Fallen City (1984) and An Autumn's Tale (1987), for which he was named Best Actor at the Golden Horse Awards. He brought together his disparate personae in the 1989 film God of Gamblers, directed by the prolific Wong Jing, in which he was by turns a suave charmer, a broad comedian, and an action hero. The film surprised many, became immensely popular, broke Hong Kong's all-time box office record, and spawned a series of gambling films as well as several comic sequels starring Andy Lau and Stephen Chow. The often tough demeanour and youthful appearance of Chow's characters has earned him the nickname 'Babyface Killer'. Chow Yun-fat at the premiere of Pirates of the Caribbean: At World's End in 2007. The Los Angeles Times proclaimed Chow Yun-Fat 'the coolest actor in the world'. In the mid '90s, Chow moved to Hollywood in an ultimately unsuccessful attempt to duplicate his success in Asia. His first two films, The Replacement Killers (1998) and The Corruptor (1999), were box office failures. In his next film Anna and the King (1999), Chow teamed up with Jodie Foster, but the film underperformed at the box office. Chow accepted the role of Li Mu-Bai in the (2000) film Crouching Tiger, Hidden Dragon. It became a winner at both the international box office and the Oscars. In 2003, Chow came back to Hollywood and starred in Bulletproof Monk. In 2004, Chow made a surprise cameo in director Dayyan Eng's Chinese rom-com favourite Waiting Alone, it was the first time he was in a mainland Chinese film. In 2006, he teamed up with Gong Li in the film Curse of the Golden Flower, directed by Zhang Yimou. In 2007, Chow played the pirate captain Sao Feng in Pirates of the Caribbean: At World's End. However, his part was omitted when the movie was shown in mainland China, where government censors felt that Chow's character 'vilified and humiliated' Chinese people. In the poorly received film Dragonball Evolution, Chow Yun-fat played Master Roshi. In 2014, Chow returned to Hong Kong cinema in From Vegas to Macau. For the part, he lost 13 kg within 10 months. In 2015 and 2016, Chow reprised his role as Ken in the sequels From Vegas to Macau II and From Vegas to Macau III. In 2018, he co-starred with Aaron Kwok in Project Gutenberg which earned him another Best Actor nomination at the 38th Hong Kong Film Awards. On 26 June 2008, Chow released his first photo collection, which includes pictures taken on the sets of his films. Proceeds from the book's sales were donated to Sichuan earthquake victims. It is published by Louis Vuitton. Chow has been married twice; first was in 1983 to Candice Yu, an actress from Asia Television; the marriage lasted nine months. In 1986, Chow married Singaporean Jasmine Tan. They had a stillborn daughter in 1991. Chow has a goddaughter, Celine Ng, a former child model for Chickeeduck, McDonald's, Toys'R'Us and other companies. Despite his wealth, Chow lives modestly. He is frequently seen at food stalls and on public transportation. In interviews, he has said he plans to leave his fortune to charity. In October 2014, Chow voiced support for students in the Umbrella Movement, a civil rights movement for universal suffrage in Hong Kong. Chow has appeared in over 95 films and over 25 television series. Hong Kong Film Awards he won are : Best Actor Nomination for Hong Kong 1941. Best Actor Nomination for Women, Best Supporting Actor Nomination for Love Unto Waste, Best Actor for A Better Tomorrow, Best Actor Nomination for Prison on Fire, Best Actor Nomination for An Autumn's Tale, Best Actor for City on Fire, Best Original Film Song Nomination for The Diary of a Big Man, Best Original Film Song Nomination for Triads: The Inside Story, Best Actor Nomination for God of Gamblers, Best Actor for All About Ah-Long, Best Actor Nomination for Once a Thief, Best Actor Nomination for Treasure Hunt, Best Actor Nomination for Peace Hotel, Best Actor Nomination for Crouching Tiger, Hidden Dragon, Best Actor Nomination for Curse of the Golden Flower, Best Supporting Actor Nomination for The Postmodern Life of My Aunt, Best Actor Nomination for Project Gutenberg, (14 Best Actor nominations, two Best Supporting Actor nominations, two Best Original Film Song nominations). In Chinese American Film Festival he won Golden Angel for Best Actor in a Leading Role for Project Gutenberg (2019). In 2014, Chow was the second-highest earning actor in Hong Kong, earning HK$170 million (US$21.9 million). His reported net worth is HK$5.6 billion (US$714 million). In 2018, Chow's wife Jasmine Tan informed various Hong Kong media the figure HK$5.6b of Chow's net worth, which was not verified by any third party. Chow also said he would donate 99% of his wealth to charity via setting up a foundation to help those in need. There have been no other reports on who controls the foundation and its ultimate beneficiaries. His University Honoary Awards are Hong Kong Academy for Performing Arts - Honorary Fellow (1999) , City University of Hong Kong - Honorary Doctor of Letters (2001) , Hong Kong Baptist University - Doctor of Humanities, honoris causa (2021)"

In [23]:
print (text)

Chow Yun-fat SBS (born 18 May 1955), previously known as Donald Chow, is a Hong Kong actor known for his collaborations with filmmaker John Woo in the action heroic bloodshed films A Better Tomorrow, The Killer, and Hard Boiled, and in the West for his roles as Li Mu-bai in Crouching Tiger, Hidden Dragon and Sao Feng in Pirates of the Caribbean: At World's End. He mainly plays in drama films and has won three Hong Kong Film Awards for Best Actor and two Golden Horse Awards for Best Actor in Taiwan. Chow started his career in movies in 1976 with Goldig Films. Chow was born in Lamma Island, Hong Kong, to Chow Yung-Wan (周容允), who worked on a Shell Oil Company tanker, and Chan Lai-fong (陳麗芳), who was a cleaning lady and vegetable farmer. Chow grew up in a farming community on Lamma Island, in a house with no electricity. He woke up at dawn each morning to help his mother sell herbal jelly and Hakka tea-pudding (客家茶粿) on the streets; in the afternoons, he went to work in the fields. His fam

### Questions on Chow Yun Fatt

In [None]:
# Q1. Who is Chow Yun-fat's wife?
# Q2. When was Chow Yun-fat born?
# Q3. Which show did he acted in 2007 and what was his role? ** 
# Q4. What are his nominations?
# Q5. What are Chow Yun-fat's University Honoary Awards?
# Q6. What did Chow Yun-fat do in 2003?
# Q7. What is his nickname?
# Q8. How many giraffes are there?

In [24]:
# 2 options to input text. Either 1. assign to a variable text or 2. input into the chatbot at start
# text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
while True:
    question_answer(question, text)
    
    flag = True
    flag_N = False
    
    while flag:
        response = input("\nDo you want to ask another question based on this text (Y/N)? ")
        if response[0] == "Y":
            question = input("\nPlease enter your question: \n")
            flag = False
        elif response[0] == "N":
            print("\nBye!")
            flag = False
            flag_N = True
            
    if flag_N == True:
        break

Token indices sequence length is longer than the specified maximum sequence length for this model (10 > 512). Running this sequence through the model will result in indexing errors


RuntimeError: The size of tensor a (1966) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# References https://towardsdatascience.com/question-answering-with-a-fine-tuned-bert-bc4dafd45626; http://www.conradweb.org/~jackg/pubs/ICAIL21_Vold_Conrad.pdf; 