In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def read_by_paragraph(output, threshold):
    with open("wikipedia_water_corpus.txt") as file:
        text = file.read()

        text_split = text.split('. ')
        print(f"{len(text_split)} paragraphs and sentences")

        for chunk in text_split:
            if output and len(chunk) + len(output[-1]) < threshold:
                output[-1] += ' ' + chunk + '.'
            else:
                output.append(chunk + '.')

    return output

In [4]:
outputs = []
threshold = 2000
output = read_by_paragraph(outputs, threshold)

74364 paragraphs and sentences


In [5]:
print(f"{len(outputs)} paragraphs")

7875 paragraphs


In [6]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [7]:
#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', return_dict=False)
#model = BertForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2', return_dict=False)

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', return_dict=False)
#tokenizer = BertTokenizer.from_pretrained('deepset/roberta-base-squad2d', return_dict=False)

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [8]:
def tokenize_data(tokenizer, question, paragraph):
    encoding = tokenizer.encode_plus(text=question, text_pair=paragraph)

    inputs = encoding['input_ids']  #Token embeddings
    sentence_embedding = encoding['token_type_ids']  #Segment embeddings
    tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

    return inputs, sentence_embedding, tokens

In [9]:
# Recover any words that were broken down into subwords
def get_answer(answer):
    corrected_answer = ''

    for word in answer.split():
        
        #If it's a subword token
        if word[0:2] == '##':
            corrected_answer += word[2:]
        else:
            corrected_answer += ' ' + word

    return corrected_answer

In [10]:
def get_answer_from_question(model, inputs, sentence_embedding, tokens):
    start_scores, end_scores = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    answer = ' '.join(tokens[start_index:end_index+1])

    return get_answer(answer)


## Test model

In [11]:
def get_the_answer(model, tokenizer, question, context):
    inputs, sentence_embedding, tokens = tokenize_data(tokenizer, question, context)

    answer = get_answer_from_question(model, inputs, sentence_embedding, tokens)

    return f"{question} {answer}"

### Test with the first paragraph

#### First question (White foam in rivers)

In [11]:
print(outputs[0])

Why Does Water Appear White While Going Over A Waterfall
Water and dissolved oxygen
Aeration
How light reflects off water in a waterfall
Suggested Reading
The water appears white while going over a waterfall because the water is moving at a high pace and the trapped air in the water creates bubbles. The bubbles are what make the waterfall look white.
If water is really stirred up, or moving at a high pace a pace that you can expect from waterfalls, then the trapped air in the water creates bubbles. It is these bubbles tiny air pockets that make a waterfall look white.
Water is colorless; we all know that, right? Still, the color of snow – which is essentially frozen water – is white. Similarly, water that goes over a waterfall also appears to be white, despite actually being colorless.
Waterfall whitish water
Why is that water white? 
What’s going on here?
The answer of this lies in how light interacts with matter, along with aeration.
Thus, in a still water body, you don’t see the dis

In [12]:
question = '''Why is water white when moving?'''
question_spanish = 'Porque el agua se ve blanca cuando se mueve?'

context = outputs[0]

In [13]:
get_the_answer(model, tokenizer, question, context)

'Why is water white when moving?  water is moving at a high pace and the trapped air in the water creates bubbles'

In [14]:
get_the_answer(model, tokenizer, question_spanish, context)

'Porque el agua se ve blanca cuando se mueve?  because the water is moving at a high pace and the trapped air in the water creates bubbles'

### Test with other paragraphs

In [12]:
def find_words_in_paragraph(words_list, output_list):
    for index, paragraph in enumerate(outputs):
        paragraph_lower = paragraph.lower()
        flag = True
        for word in words_list:
            if word not in paragraph_lower:
                flag = False
                break
        
        if flag:
            output_list.append(index)

    return output_list

In [13]:
def get_list_answers(context_pos_list, answers_list, model, tokenizer, question):
    for index, i in enumerate(context_pos_list):
        context = outputs[i]

        try:
            answer = get_the_answer(model, tokenizer, question, context)
        except Exception as e:
            print(e)

        if len(answer.split('?')[-1]) > 3:
            answers_list.append((index, answer))
    
    return answers_list

#### Second question (river movement)

In [43]:
river_pos = []

river_pos = find_words_in_paragraph(["river"], river_pos)

In [44]:
len(river_pos)

2272

In [45]:
movement_pos = []

movement_pos = find_words_in_paragraph(['move', 'river'], movement_pos)

In [46]:
len(movement_pos)

446

In [19]:
print(outputs[movement_pos[6]])

In general, velocity increases with the depth or hydraulic radius and slope of the river channel, while the cross-sectional area scales with the depth and the width: the double-counting of depth shows the importance of this variable in determining the discharge through the channel.
 Effects 
 Fluvial erosion 
In its youthful stage, a river causes erosion in the watercourse, deepening the valley. Hydraulic action loosens and dislodges aggregate which further erodes the banks and the river bed. Over time, this deepens the river bed and creates steeper sides which are then weathered. The steepened nature of the banks causes the sides of the valley to move downslope causing the valley to become V-shaped.
Waterfalls also form in the youthful river valley where a band of hard rock overlays a layer of soft rock. Differential erosion occurs as the river erodes the soft rock more readily than the hard rock, this leaves the hard rock more elevated and stands out from the river below. A plunge po

In [20]:
question = 'Why do rivers move?'
question_spanish = 'Porque los rios se mueven?'

In [21]:
answers_move = []

In [22]:
answers_move = get_list_answers(movement_pos, answers_move, model, tokenizer, question)

Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors


The size of tensor a (525) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1001) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1758) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1811) must match the size of tensor b (512) at non-singleton dimension 1


In [23]:
#%store answers_move

Stored 'answers_move' (list)


In [24]:
%store -r answers_move

In [None]:
answers_move

In [29]:
print(outputs[movement_pos[3]])

For instance, the Amazon River receives water from more than 1,000 tributaries. Together, a river and its tributaries make up a river system. A river system is also called a drainage basin or watershed. A river’s watershed includes the river, all its tributaries, and any groundwater resources in the area.
The end of a river is its mouth. Here, the river empties into another body of water—a larger river, a lake, or the ocean. Many of the largest rivers empty into the ocean.
The flowing water of a river has great power to carve and shape the landscape. Many landforms, like the Grand Canyon in the U.S. state of Arizona, were sculpted by rivers over time. This process is called weathering or erosion.
The energy of flowing river water comes from the force of gravity, which pulls the water downward. The steeper the slope of a river, the faster the river moves and the more energy it has.
The movement of water in a river is called a current. The current is usually strongest near the river’s so

In [37]:
question = "Where does the flow of rivers come from?"
question_spanish = "De donde viene el movimiento de el rio?"

In [34]:
answer = get_the_answer(model, tokenizer, question, outputs[movement_pos[3]])
answer

'Why do rivers move?  force of gravity'

In [38]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[movement_pos[3]])
answer

'De donde viene el movimiento de el rio?  [CLS]'

#### Third question (river erosion)

In [49]:
erosion_pos = []

erosion_pos = find_words_in_paragraph(['erosion', 'river'], erosion_pos)

In [50]:
len(erosion_pos)

390

In [73]:
question = "Why do rivers cause erosion?"
question_spanish = "Porque los rios causan erosion?"

In [69]:
answers_erosion = []

answers_erosion = get_list_answers(erosion_pos, answers_erosion, model, tokenizer, question)

The size of tensor a (514) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (585) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (581) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1759) must match the size of tensor b (512) at non-singleton dimension 1


In [88]:
#%store answers_erosion

Stored 'answers_erosion' (list)


In [None]:
%store -r answers_erosion

In [None]:
answers_erosion

In [71]:
print(outputs[erosion_pos[14]])

It is possible for all three processes to contribute to the development of a valley over geological time. The flat or relatively flat portion of a valley between its sides is referred to as the valley floor and is typically formed by river sediments and may be terraced.
 River valleys 
The development of a river valley is affected by the character of the bedrock over which the river or stream flows, the elevational difference between its top and bottom, and indeed the climate. Typically the flow will increase downstream and the gradient will decrease. In the upper valley, the stream will most effectively erode its bed through corrasion to produce a steep-sided V-shaped valley. The presence of more resistant rock bands, of geological faults, fractures, and folds may determine the course of the stream and result in a twisting course with interlocking spurs. In the middle valley, as numerous streams have coalesced, the valley is typically wider, the flow slower and both erosion and deposi

In [74]:
answer = get_the_answer(model, tokenizer, question, outputs[erosion_pos[14]])
answer

'Why do rivers cause erosion?  strong currents on the outside of its curve eats at the bank'

In [76]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[erosion_pos[14]])
answer

'Porque los rios causan erosion?  [CLS] porque los rios causan erosion ? [SEP]'

In [72]:
print(outputs[erosion_pos[33]])

For example, rainfall on roofs, pavements, and roads will be collected by rivers with almost no absorption into the groundwater.
A drainage basin is an area of land where all flowing surface water converges to a single point, such as a river mouth, or flows into another body of water, such as a lake or ocean.
 See also 
Continental Divide of the Americas – Principal hydrological divide of North and South America
Integrated catchment management
Interbasin transfer
International Journal of River Basin Management JRBM
International Network of Basin Organizations
Main stem – Final large channel of a riverine system
River basin management plans
River bifurcation – The forking of a river into its distributaries
Tenaja
Time of concentration
Catchment hydrology
 References 
 Citations 
 Sources 
 External links 
Instructional video: Manual watershed delineation is a five-step process
Instructional video: To delineate a watershed you must identify land surface features from topographic contours

In [77]:
answer = get_the_answer(model, tokenizer, question, outputs[erosion_pos[33]])
answer

"Why do rivers cause erosion?  the action of surface processes such as water flow or wind that removes soil , rock , or dissolved material from one location on the earth ' s crust"

In [78]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[erosion_pos[33]])
answer

'Porque los rios causan erosion?  [SEP]'

In [80]:
print(outputs[erosion_pos[150]])

Example: Seine.
Tropical pluvialThe tropical pluvial regime is characterized by:
very low discharge in the cold season and abundant rainfall in the warm season
minimum can reach very low values
great variability of discharge during the year
Relatively regular from one year to another
 Mixed régimesdouble regime 
Nivo-glacialonly one true maximum, which occurs in the late spring or the early summer from May to July in the case of the Northern hemisphere
relatively high diurnal variations during the hot season
significant yearly variation, but less than in the snow regime
significant flowNivo-pluvialtwo maximums, the first occurring in the spring and the other in autumn
a main low-water in October and a secondary low-water in January
significant inter-annual variationsExample: Issole 
Pluvio-nivala period of rainfall in late autumn, followed by a light increase due to snow melt in early spring
the single minimum occurs in autumn
low amplitudeExample: Mississippi.
 Complex regimes 
The co

In [81]:
answer = get_the_answer(model, tokenizer, question, outputs[erosion_pos[150]])
answer

'Why do rivers cause erosion?  the movement of water across the stream bed exerts a shear stress directly onto the bed'

In [82]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[erosion_pos[33]])
answer

'Porque los rios causan erosion?  [SEP]'

##### Context obtained from pdf [Processes of a river](https://www.reigate-school.surrey.sch.uk/MainFolder/academic/Revision-2-/Water-on-land-paper-1-physical-rivers.pdf)

In [85]:
erosion_context = """
Erosion
There are four ways that a river
erodes; hydraulic action, corrosion,
corrosion and attrition. Hydraulic action: the force of the water wearing away the bed and bank of the river, Corrosion: the chemical reaction between the water and the bed and bank of
the river, wearing it away, Corrasion/abrasion: where bedload in the river wears away its bed and bank and Attrition: where rocks in the water become smaller and rounder by hitting
each other"""

In [86]:
answer = get_the_answer(model, tokenizer, question, erosion_context)
answer

'Why do rivers cause erosion?  the force of the water wearing away the bed and bank of the river'

In [87]:
answer = get_the_answer(model, tokenizer, question_spanish, erosion_context)
answer

'Porque los rios causan erosion?  hydraulic action , corrosion , corrosion and attrition'

#### Fourth question (river shape)

In [98]:
shape_pos = []

shape_pos = find_words_in_paragraph(['shape', 'river'], shape_pos)

In [99]:
len(shape_pos)

249

In [100]:
question = "How do rivers obtain their shape?"
question_spanish = "Como es que los rios obtienen su forma?"

In [101]:
answers_shape = []

answers_shape = get_list_answers(shape_pos, answers_shape, model, tokenizer, question)

The size of tensor a (646) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (622) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1003) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1760) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1813) must match the size of tensor b (512) at non-singleton dimension 1


In [102]:
#%store answers_shape

Stored 'answers_shape' (list)


In [None]:
%store -r answers_shape

In [None]:
answers_shape

In [104]:
print(outputs[shape_pos[0]])

For instance, the Amazon River receives water from more than 1,000 tributaries. Together, a river and its tributaries make up a river system. A river system is also called a drainage basin or watershed. A river’s watershed includes the river, all its tributaries, and any groundwater resources in the area.
The end of a river is its mouth. Here, the river empties into another body of water—a larger river, a lake, or the ocean. Many of the largest rivers empty into the ocean.
The flowing water of a river has great power to carve and shape the landscape. Many landforms, like the Grand Canyon in the U.S. state of Arizona, were sculpted by rivers over time. This process is called weathering or erosion.
The energy of flowing river water comes from the force of gravity, which pulls the water downward. The steeper the slope of a river, the faster the river moves and the more energy it has.
The movement of water in a river is called a current. The current is usually strongest near the river’s so

In [105]:
answer = get_the_answer(model, tokenizer, question, outputs[shape_pos[0]])
answer

'How do rivers obtain their shape?  flowing water of a river has great power to carve and shape the landscape'

In [106]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[shape_pos[0]])
answer

'Como es que los rios obtienen su forma?  [CLS] como es que los rios obtienen su forma ? [SEP] for instance , the amazon river receives water from more than 1 , 000 tributaries . together , a river and its tributaries make up a river system . a river system is also called a drainage basin or watershed'

##### Context obtained from [Deep bends: Just how does a river get its shape?](https://www.union-bulletin.com/local_columnists/dry_creek/deep-bends-just-how-does-a-river-get-its-shape/article_da1fc944-02cf-11e8-b2b7-9bca8fbe5f6d.html)

In [109]:
shape_context = """In rocky terrain, flowing water carves canyon walls that hem it in, but in soft valley soil, rivers and streams can shift their courses with ease.
The looping curves that form as a result are called meanders. Almost any disturbance along the riverbank — a burrowing animal, a fallen tree — can cause a meander to start to form, as long as it weakens the bank a little, dislodging soil and triggering erosion.
As the water rushes into this newly formed hollow in the bank, sweeping away more material, the flow on the opposite side of the stream actually weakens as a result.
At the site of the original disturbance, more and more soil and rock gets carried away, causing the shape of the stream to bow outward; on the other side, the slower-moving water can not carry as much sediment, and material is deposited onto the bottom, building up the bank.
Soon, a curve has formed, with deeper, faster water on the outside edge and shallow, slower water on the inside edge.
But you have probably never seen a single U-shaped bend in the course of an otherwise straight stream.
The formation of one meander triggers the formation of a second one immediately downstream, as the fast-moving water flowing out of the newly formed curve slams into the streams far side, causing more erosion and creating another bend in the opposite direction.
On and on this process goes, forming the never-ending loops and squiggles we are familiar with."""

In [110]:
answer = get_the_answer(model, tokenizer, question, shape_context)
answer

'How do rivers obtain their shape?  in soft valley soil'

In [111]:
answer = get_the_answer(model, tokenizer, question_spanish, shape_context)
answer

'Como es que los rios obtienen su forma?  [SEP]'

##### Fifth question (river color)

In [112]:
color_pos = []

color_pos = find_words_in_paragraph(['color', 'river'], color_pos)

In [113]:
len(color_pos)

144

In [114]:
question = "How do rivers obtain their color?"
question_spanish = "Como es que los rios obtienen su color?"

In [115]:
answers_colors = []

answers_colors = get_list_answers(color_pos, answers_colors, model, tokenizer, question)

The size of tensor a (919) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1034) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1813) must match the size of tensor b (512) at non-singleton dimension 1


In [116]:
%store answers_colors

Stored 'answers_colors' (list)


In [None]:
%store -r answers_colors

In [None]:
answers_colors

In [124]:
print(outputs[color_pos[31]])

In addition, if the river carries significant quantities of sediment, this material can act as tools to enhance wear of the bed abrasion. At the same time the fragments themselves are ground down, becoming smaller and more rounded attrition.
Sediment in rivers is transported as either bedload the coarser fragments which move close to the bed or suspended load finer fragments carried in the water. There is also a component carried as dissolved material.
For each grain size there is a specific flow velocity at which the grains start to move, called entrainment velocity. However the grains will continue to be transported even if the velocity falls below the entrainment velocity due to the reduced or removed friction between the grains and the river bed. Eventually the velocity will fall low enough for the grains to be deposited. This is shown by the Hjulström curve.
A river is continually picking up and dropping solid particles of rock and soil from its bed throughout its length. Where th

In [125]:
answer = get_the_answer(model, tokenizer, question, outputs[color_pos[31]])
answer

'How do rivers obtain their color?  transported matter'

In [126]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[color_pos[31]])
answer

'Como es que los rios obtienen su color? '

##### Context obtained from [American rivers](https://www.americanrivers.org/rivers/discover-your-river/river-colors/)

In [127]:
color_context = """Factors such as minerals, soil runoff and sediment, and even algae can cause water to vary from its natural color of blue.
The most common cause for water to change color is minerals. When a rock is weathered down over time, the minerals from the rock are dissolved and small pieces are released into the water causing different colors.
Iron, manganese, and calcium carbonate from limestone all common minerals that can cause water to range in color from red and orange to green and blue.
Sediment and soil runoff can also change waters color, sometimes as a temporary color change after storms and sometimes permanently if the river constantly carries lots of sediment. Erosion from river banks brings soil into the river, changing the color. After heavy storms, many rivers run brown from all the runoff flowing into the river. Clay can cause rivers to be murky whiteuddy brown, or yellow.
Algal blooms are natural occurring overgrowths of algae caused by sunlight, slow water, or nutrients. Pollution runoff from humans can also increase nutrients in the water and cause an algal bloom. Algae affect not only the health of a river but also the color. The color caused by algae can vary from a dark green to almost a reddish color. Algae consume nutrients from the water along with dissolved oxygen causing negative effects on the ecosystem of the river. Once the algae begin decaying it releases methane gas causing foul odors."""

In [129]:
answer = get_the_answer(model, tokenizer, question, color_context)
answer

'How do rivers obtain their color?  erosion from river banks'

In [132]:
answer = get_the_answer(model, tokenizer, question_spanish, color_context)
answer

'Como es que los rios obtienen su color?  [CLS] como es que los rios obtienen su color ? [SEP] factors such as minerals , soil runoff and sediment , and even algae can cause water to vary from its natural color of blue'

#### Sixth question (flood in rivers)

In [130]:
flood_pos = []

flood_pos = find_words_in_paragraph(['flood', 'river'], flood_pos)

In [131]:
len(flood_pos)

607

In [133]:
question = "How are floods caused in rivers?"
question_spanish = "Como es que las inundaciones se causan en los rios?"

In [134]:
answers_flood = []

answers_flood = get_list_answers(flood_pos, answers_flood, model, tokenizer, question)

The size of tensor a (695) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (515) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (789) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (646) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (586) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (582) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1760) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1813) must match the size of tensor b (512) at non-singleton dimension 1


In [144]:
#%store answers_flood

Stored 'answers_flood' (list)


In [None]:
%store -r answers_flood

In [None]:
answers_flood

In [136]:
print(outputs[flood_pos[34]])

Rain that falls in steep mountainous areas will reach the primary river in the drainage basin faster than flat or lightly sloping areas e.g., > 1 gradient.
 Shape 
Shape will contribute to the speed with which the runoff reaches a river. A long thin catchment will take longer to drain than a circular catchment.
 Size 
Size will help determine the amount of water reaching the river, as the larger the catchment the greater the potential for flooding. It is also determined on the basis of length and width of the drainage basin.
 Soil type 
Soil type will help determine how much water reaches the river. The runoff from the drainage area is dependent on the soil type. Certain soil types such as sandy soils are very free-draining, and rainfall on sandy soil is likely to be absorbed by the ground. However, soils containing clay can be almost impermeable and therefore rainfall on clay soils will run off and contribute to flood volumes. After prolonged rainfall even free-draining soils can beco

In [137]:
answer = get_the_answer(model, tokenizer, question, outputs[flood_pos[34]])
answer

'How are floods caused in rivers?  if the surface is impermeable the precipitation will create surface run - off'

In [138]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[flood_pos[34]])
answer

'Como es que las inundaciones se causan en los rios?  [SEP]'

##### Context obtained from [American rivers](https://www.americanrivers.org/rivers/discover-your-river/why-do-rivers-flood/)

In [140]:
flood_context = """Though ancient floods of this magnitude are hard to conceptualize, we are repeatedly exposed to tales of their contemporary brethren and the massive destruction they leave in their wake. Rivers and creeks flood when pulses of rainfall and/or snowmelt move downstream. This causes water to overtop the channels banks and spill onto the neighboring floodplain.
A natural river channel is shaped by the amount of water and sediment that travels through it. Even though rivers can vary greatly in their form, a natural river’s channel is almost always sized to carry the largest amount of water that flows through the system once every two years."""

In [141]:
answer = get_the_answer(model, tokenizer, question, flood_context)
answer

'How are floods caused in rivers?  when pulses of rainfall and / or snowmelt move downstream'

In [142]:
answer = get_the_answer(model, tokenizer, question_spanish, color_context)
answer

'Como es que las inundaciones se causan en los rios?  [CLS]'

#### Seventh question (reflection in water)

In [151]:
reflection_pos = []

reflection_pos = find_words_in_paragraph(['reflection'], reflection_pos)

In [152]:
len(reflection_pos)

53

In [163]:
question = "How is reflection caused in water?"
question_spanish = "Como se causa el reflejo en el agua?"

In [154]:
answers_reflection = []

answers_reflection = get_list_answers(reflection_pos, answers_reflection, model, tokenizer, question)

In [156]:
#%store answers_reflection

Stored 'answers_reflection' (list)


In [None]:
%store -r answers_reflection

In [None]:
answers_reflection

El corpus no parece tener la respuesta

In [157]:
print(outputs[reflection_pos[8]])

When the probe arrived in the Saturnian system in 2004, it was hoped that hydrocarbon lakes or oceans might be detectable by reflected sunlight from the surface of any liquid bodies, but no specular reflections were initially observed.The possibility remained that liquid ethane and methane might be found on Titan's polar regions, where they were expected to be abundant and stable. In Titan's south polar region, an enigmatic dark feature named Ontario Lacus was the first suspected lake identified, possibly created by clouds that are observed to cluster in the area. A possible shoreline was also identified near the pole via radar imagery. Following a flyby on July 22, 2006, in which the Cassini spacecraft's radar imaged the northern latitudes, which were at the time in winter. A number of large, smooth and thus dark to radar patches were seen dotting the surface near the pole. Based on the observations, scientists announced definitive evidence of lakes filled with methane on Saturn's moo

In [164]:
answer = get_the_answer(model, tokenizer, question, outputs[reflection_pos[8]])
answer

'How is reflection caused in water?  sunlight'

In [165]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[reflection_pos[8]])
answer

'Como se causa el reflejo en el agua?  [CLS] como se causa el reflejo en el agua ? [SEP]'

##### Context obtained from [Adorama](https://www.adorama.com/alc/how-to-photograph-amazing-water-reflections/)

In [158]:
reflection_context = """Reflection in water is caused when light hits the surface of still water and travels to our eye so we can see the complete image and light it’s reflecting.
All materials reflect light in some way. However, if the surface is rough, the light scatters in all directions and the reflected image is blurry.
Calm water is much flatter than most surfaces, so all the reflected light rays that arrange to complete a perfect image stay the arranged and don't scatter, creating the perfect mirrored image of the reflecting subject."""

In [166]:
answer = get_the_answer(model, tokenizer, question, reflection_context)
answer

'How is reflection caused in water?  when light hits the surface of still water'

In [167]:
answer = get_the_answer(model, tokenizer, question_spanish, reflection_context)
answer

'Como se causa el reflejo en el agua?  light hits the surface of still water and travels to our eye so we can see the complete image and light it ’ s reflecting'

#### Eight question (weir)

In [168]:
weir_pos = []

weir_pos = find_words_in_paragraph(['weir'], weir_pos)

In [169]:
len(weir_pos)

54

In [188]:
question = "What is a weir?"
question_spanish = "Que es una presa?"

In [171]:
answers_weir = []

answers_weir = get_list_answers(weir_pos, answers_weir, model, tokenizer, question)

In [172]:
#%store answers_weir

Stored 'answers_weir' (list)


In [None]:
%store -r answers_weir

In [None]:
answers_weir

There isn't enough information on the corpus

In [174]:
print(outputs[weir_pos[0]])

The Orange and Limpopo Rivers in southern Africa form the boundaries between provinces and countries along their routes.
 Sacred rivers 
Sacred rivers and their reverence is a phenomenon found in several religions, especially religions in which nature is revered. For example, the Indian-origin religions of Buddhism, Hinduism, Jainism, and Sikhism revere and preserve groves, forests, trees, mountains and rivers as sacred. Among the most sacred rivers in Hinduism are the Ganges, Yamuna, and Sarasvati rivers. Other sacred rivers for Indian religions include the Rigvedic rivers, the Narmada, the Godavari, and the Kaveri rivers. The Vedas and Gita, the most sacred of Hindu texts, were written on the banks of the Sarasvati river.
 Management 
Rivers are often managed or controlled to make them more useful or less disruptive to human activity.
Dams or weirs may be built to control the flow, store water, or extract energy.
Levees, known as dikes in Europe, may be built to prevent river water f

In [175]:
answer = get_the_answer(model, tokenizer, question, outputs[weir_pos[0]])
answer

'What is a weir?  dams or weirs may be built to control the flow , store water , or extract energy'

In [176]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[weir_pos[0]])
answer

'Que es una presa?  rivers have been confined within channels to free up flat flood - plain land for development'

##### Context obtained from [Wikipedia](https://en.wikipedia.org/wiki/Weir)

In [187]:
weir_context = """A weir /wɪər/ or low head dam is a barrier across the width of a river that alters the flow characteristics of water and usually results in a change in the height of the river level. Weirs are also used to control the flow of water for outlets of lakes, ponds, and reservoirs. There are many weir designs, but commonly water flows freely over the top of the weir crest before cascading down to a lower level. """

In [189]:
answer = get_the_answer(model, tokenizer, question, weir_context)
answer

'What is a weir?  a barrier across the width of a river'

In [190]:
answer = get_the_answer(model, tokenizer, question_spanish, weir_context)
answer

'Que es una presa? '

#### Nineth question (Speed of a river)

In [192]:
speed_pos = []

speed_pos = find_words_in_paragraph(['speed', 'river'], speed_pos)

In [193]:
len(speed_pos)

70

In [194]:
question = "What are the factors for speed in a river?"
question_spanish = "Cuales son los factores para la velocidad de un rio?"

In [196]:
answers_speed = []

answers_speed = get_list_answers(speed_pos, answers_speed, model, tokenizer, question)

The size of tensor a (1763) must match the size of tensor b (512) at non-singleton dimension 1


In [197]:
#%store answers_speed

Stored 'answers_speed' (list)


In [198]:
%store -r answers_speed

In [None]:
answers_speed

In [205]:
print(outputs[speed_pos[6]])

This flow of the boundary layer is significantly different from the speed and direction of the primary flow of the river, and is part of the river's secondary flow.
River flood plains that contain rivers with a highly sinuous platform are populated by longer oxbow lakes than those with low sinuosity.  This is because rivers with high sinuosity have larger meanders, and greater opportunity for longer lakes to form. Rivers with lower sinuosity are characterized by fewer cutoffs and shorter oxbow lakes due to the shorter distance of their meanders.
 Oxbow lake ecology 
Oxbow lakes form favorable habitats for wildlife communities. These often have unique characteristics. For example, the numerous oxbow lakes of the Amazon River are a favorable habitat for the giant river otter. Oxbow lakes may also be suitable locations for aquaculture.Oxbow lakes contribute to the health of a river ecosystem by trapping sediments and agricultural runoff, thereby removing them from the main river flow. How

There isn't enough information in the corpus

In [206]:
answer = get_the_answer(model, tokenizer, question, outputs[speed_pos[6]])
answer

'What are the factors for speed in a river?  speed and direction'

In [207]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[speed_pos[6]])
answer

'Cuales son los factores para la velocidad de un rio?  [SEP]'

##### Context obtained from [Globo surf](https://www.globosurfer.com/river-velocity/)

In [211]:
speed_context = """The River’s Velocity Factors
The river’s speed is directly related to some of the factors like the channel shape, the gradient of the slope, the water volume, the riverbed, and the friction, the erosion… And, as you can guess, it could vary from place to place, somewhere it can be fairly quickly, while in the other areas the same river could look more like a lake than an actual river.
The Shape Of The Channel
The first thing that affects the speed of the river in the shape of the channel. As the water flows it creates friction against the edges. The deeper and wider the channel is, the faster the water will be because the less amount of the water will be in touch and slowed down by the edges.
Water Volume
This is simple physics, the bigger the volume, the faster the river will be. Also, as the volume increases the level of erosion rises with it, creating a deeper and wider river bed that will allow the water to move easily.
Smoothness And Roughness Of The Channel
If there are lots of stones, rocks, or sand in the channel, the water will be slowed down by the friction and the resistance. The channels with fewer rocks and sand don’t have to deal with those, so the water saves the energy and therefore moves faster.
Gradient And The Elevation Of The Riverbed
The steeper the slope, the faster the water will be, thanks to gravity. For the experiment, you could take a bucket and fill it with water. Now start to pour the water. You’ll notice that the speed of the water running out of the bucket will increase as you turn it. The same principle works for the water in the river. When it meets the steep slope, it will gain speed. Also, it goes the other way, if the water runs into an uphill area, it is more likely that it will go slower."""

In [212]:
answer = get_the_answer(model, tokenizer, question, speed_context)
answer

'What are the factors for speed in a river?  channel shape , the gradient of the slope , the water volume , the riverbed , and the friction , the erosion'

In [213]:
answer = get_the_answer(model, tokenizer, question_spanish, speed_context)
answer

'Cuales son los factores para la velocidad de un rio? '

#### Tenth question (Discharge)

In [16]:
discharge_pos = []

discharge_pos = find_words_in_paragraph(['discharge'], discharge_pos)

In [17]:
len(discharge_pos)

442

In [25]:
question = "What is discharge in water?"
question_spanish = "Que es la descarga en el agua?"

In [19]:
answers_discharge = []

answers_discharge = get_list_answers(discharge_pos, answers_discharge, model, tokenizer, question)

Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors


The size of tensor a (613) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (613) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (1759) must match the size of tensor b (512) at non-singleton dimension 1
The size of tensor a (613) must match the size of tensor b (512) at non-singleton dimension 1


In [20]:
%store answers_discharge

Stored 'answers_discharge' (list)


In [None]:
%store -r answers_discharge

In [None]:
answers_discharge

In [22]:
print(outputs[discharge_pos[3]])

It has relatively cool temperatures, high oxygen levels, and fast, turbulent, swift flow.
The potamon is the remaining downstream stretch of river. It has warmer temperatures, lower oxygen levels, slow flow and sandier bottoms.
 Navigability 
The international scale of river difficulty is used to rate the challenges of navigation—particularly those with rapids. Class I is the easiest and Class VI is the hardest.
 Streamflow 
Studying the flows of rivers is one aspect of hydrology.
 Characteristics 
 Direction 
Rivers flow downhill with their power derived from gravity. A common misconception holds that all or most most rivers flow from north to south, but this is not so: rivers flow in all directions of the compass and often have complex meandering paths.Rivers flowing downhill, from river source to river mouth, do not necessarily take the shortest path. For alluvial streams, straight and braided rivers have very low sinuosity and flow directly down hill, while meandering rivers flow f

In [23]:
answer = get_the_answer(model, tokenizer, question, outputs[discharge_pos[3]])
answer

'What is discharge in water?  volumetric flow rate'

In [26]:
answer = get_the_answer(model, tokenizer, question_spanish, outputs[discharge_pos[6]])
answer

'Que es la descarga en el agua?  [CLS]'

### Question with another type of corpus (history)

In [31]:
history_context = '''World War I or the First World War, often abbreviated as WWI or WW1, and referred to by some Anglophone authors as the "Great War" or the "War to End All Wars", was a global conflict which lasted from 1914 to 1918, and is considered one of the deadliest conflicts in history. Belligerents included much of Europe, the Russian Empire, the United States, and the Ottoman Empire, with fighting occurring throughout Europe, the Middle East, Africa, the Pacific, and parts of Asia. An estimated 9 million soldiers were killed in combat, plus another 23 million wounded, while 5 million civilians died as a result of military action, hunger, and disease.[2] Millions more died in genocides within the Ottoman Empire and in the 1918 influenza pandemic, which was exacerbated by the movement of combatants during the war.
Prior to 1914, the European great powers were divided between the Triple Entente, comprising France, Russia, and Britain, and the Triple Alliance, containing Germany, Austria-Hungary, and Italy. Tensions in the Balkans came to a head on 28 June 1914 following the assassination of Archduke Franz Ferdinand, the Austro-Hungarian heir, by Gavrilo Princip, a Bosnian Serb. Austria-Hungary blamed Serbia, which led to the July Crisis, an unsuccessful attempt to avoid conflict through diplomacy. On 28 July 1914, Austria-Hungary declared war on Serbia, and Russia came to the latter's defence. By 4 August, the system of entangling alliances drew in Germany, France, and Britain, along with their respective colonies, although Italy initially remained neutral. In November 1914, the Ottoman Empire, Germany, and Austria-Hungary formed the Central Powers, and on 26 April 1915, Italy joined Britain, France, Russia, and Serbia as the Allies of World War I.'''


In [32]:
question = "What started the first world war?"
question_spanish = "Que comenzo la primera guerra mundial?"

In [33]:
answer = get_the_answer(model, tokenizer, question, history_context)
answer

'What started the first world war?  the assassination of archduke franz ferdinand , the austro - hungarian heir , by gavrilo princip , a bosnian serb'

## Conclusion

- Hubo alguna diferencia de las preguntas en español y en ingles?
  - Si la mayoria de las veces el modelo no pudo responder la pregunta en espanol, pero hubo unas cuatro veces en que la respondio y aparte daba hasta mas texto que la respuesta en ingles o daba un texto diferente al del ingles que era incorrecto en si para la pregunta.
- Que lenguaje conviene mas y porque?
  - Ingles, ya que el modleo en realidad fue entrenado solo con texto en ingles, a lo que dice la descripcion
- Cual era el tamaño del corpus?
  - 74364 parrafos y oracions (texto que terminaba con un . y espacio)
- Cuantas respuestas tienen coherencia?
  - La mayoria de las respuestas tienen coherencia, solo que la mayoria de las veces la respuesta era solo agarrar una parte del texto del contexto que estaba cerca de las palabras de la pregunta, cuando habia mas informacion que explicaba la pregunta en el texto. Muchas veces no fue tanto de entender el texto si no encontrar el texto que estaba cerca de la pregunta.
- Si cambias el corpus y preguntas lo mismo recibira una respuesta? Demuestre
  - Si sirve, al menos con historia lo comprobamos con la pregunta que sale antes del tema sobre la conclusion, donde preguntamos sobre quien comenzo la primera guerra mundial y el modelo pudo encontrar en el contexto que fue por el asesinato del archiduque.
- Cuantos lenguajes puede manejar BERT para resolver preguntas?
  - Nuestro modelo para responder preguntas en realidad solo sirve para un lenguaje, de la descripcion que tiene solo fue entrenado con texto en ingles.
