In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory
# (in os.walk order), then print one path per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Uttarakhand CSV into a pandas DataFrame.
# NOTE(review): `d` is not referenced again in the visible cells; a later cell
# re-reads this same file with the csv module. Confirm this load is still needed.
d = pd.read_csv("/kaggle/input/uttarakhand/Uttarakhand.csv")

In [None]:
import torch

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer

# The QA model and its tokenizer are both loaded from the same pretrained
# checkpoint, so the checkpoint name is spelled out once.
SQUAD_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(SQUAD_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(SQUAD_CHECKPOINT)

In [None]:
def answer_question(question, answer_text):
    '''
    Find the span of `answer_text` that best answers `question`.

    Uses the notebook-level `tokenizer` and `model`. The answer is printed as
    `Answer: "<text>"` and is also returned so callers can collect it.

    Args:
        question: The question, as a plain string.
        answer_text: The passage that is searched for the answer.

    Returns:
        The answer rebuilt from the selected tokens (word pieces re-joined).
        It can be empty when the model selects only the `[CLS]` token.
    '''
    # ======== Tokenizing ========
    # Encode question and passage as a text pair. If the pair is longer than
    # the model's position-embedding limit, truncate the passage only; an
    # over-long input would otherwise fail inside the model.
    max_length = model.config.max_position_embeddings
    input_ids = tokenizer.encode(question, answer_text,
                                 truncation='only_second',
                                 max_length=max_length)

    # ======== Set Segment IDs ========
    # Segment A (all 0s) is everything up to and including the first [SEP]
    # token; segment B (all 1s) is the remainder.
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # ======== Evaluate ========
    # Inference only: no_grad() avoids building an autograd graph for every
    # chunk that is scored.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids]),  # Question vs. passage.
                        return_dict=True)

    # ======== Reconstruct Answer ========
    # Positions with the highest `start` and `end` scores, as plain ints so
    # they can be used directly for list indexing and slicing.
    answer_start = int(torch.argmax(outputs.start_logits))
    answer_end = int(torch.argmax(outputs.end_logits))

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token, then append the rest of the span. If the
    # predicted end comes before the start, the slice is empty and only the
    # start token is kept.
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token.startswith('##'):
            # Word-piece continuation: glue it onto the previous token.
            answer += token[2:]
        else:
            answer += ' ' + token

    # The [CLS] token comes back when no proper answer is found in this part
    # of the text; remove the marker so only real text (possibly none) is shown.
    answer = answer.replace('[CLS]', '')
    print('Answer: "' + answer + '"')
    return answer

In [None]:
# Location of the Uttarakhand CSV; the csv-based loading cell opens this path.
path = '/kaggle/input/uttarakhand/Uttarakhand.csv'
# import pandas library
import pandas as pd
import re

# Characters treated as emojis/icons. Compiled once here rather than on every
# call. Joiner characters such as U+200D are deliberately NOT listed because
# they can also occur inside ordinary (e.g. Indic-script) text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # emoji variation selector left behind by icons
    "]+",
    flags=re.UNICODE,
)


# function to remove emojis and other possible icons
def deEmojify(text):
    """Return `text` with emoji and icon characters removed.

    Every run of characters matched by `_EMOJI_PATTERN` is deleted; all other
    characters, including surrounding whitespace, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Importing the CSV file of the Uttarakhand dataset
import csv
# Build one long passage from the first column of every CSV row: each value is
# emoji-stripped and followed by '. '.
row = []     # keeps the `print(row)` below valid even if the file has no rows
pieces = []  # collected per-row strings, joined once after the loop

# newline='' is how the csv module expects its files to be opened, and the
# encoding is made explicit so the text does not depend on the platform default.
with open(path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; row[0] would
            # raise IndexError there, so skip it.
            continue
        pieces.append(deEmojify(str(row[0])) + '. ')

data = ''.join(pieces)

# Quick sanity check: last row read, total length, and the start of the text.
print(row)
print(len(data))
print(data[:100])

In [None]:
# wrapping all the text.
import textwrap

# Preview a slice of the combined text, wrapped at 80 columns so it is
# readable in the cell output.
wrapper = textwrap.TextWrapper(width=80)
preview_lines = wrapper.wrap(data[1701:3300])
print('\n'.join(preview_lines))

In [None]:
# Trying some questions to answer them.
question = "What is the helpline number?"
# Ask the question against the first ten consecutive 1500-character slices of
# `data`; one answer line is printed per slice.
chunk_chars = 1500
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])

In [None]:
question = "what happened in uttarakhand?"
# Ask the question against the first ten consecutive 1500-character slices of
# `data`; one answer line is printed per slice.
chunk_chars = 1500
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])

In [None]:
question = "how much water level rose in rishikesh?"
# Ask the question against the first ten consecutive 1500-character slices of
# `data`; one answer line is printed per slice.
chunk_chars = 1500
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])

In [None]:
question = "what is the emergency helpline number?"
# Ask the question against the first ten consecutive 1500-character slices of
# `data`; one answer line is printed per slice.
chunk_chars = 1500
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])

In [None]:
question = "How many army units were deployed?"
# Ask the question against the first ten consecutive 1500-character slices of
# `data`; one answer line is printed per slice.
chunk_chars = 1500
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])

In [None]:
question = "How many people are affected?"
# Ask the question against the first ten consecutive 1600-character slices of
# `data`; one answer line is printed per slice.
# NOTE(review): this cell slices by 1600 characters while the other question
# cells use 1500 - confirm the difference is intentional.
chunk_chars = 1600
for chunk_start in range(0, 10 * chunk_chars, chunk_chars):
    answer_question(question, data[chunk_start:chunk_start + chunk_chars])