# Starting Code

In [1]:
!wget -nc https://raw.githubusercontent.com/poleval/2021-question-answering/main/dev-0/in.tsv -O train_questions.tsv
!wget -nc https://raw.githubusercontent.com/poleval/2021-question-answering/main/dev-0/expected.tsv -O train_answers.tsv
!python -m spacy download pl_core_news_sm
!pip install icecream
!pip install levenshtein

--2023-05-31 21:03:09--  https://raw.githubusercontent.com/poleval/2021-question-answering/main/dev-0/in.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61630 (60K) [text/plain]
Saving to: ‘train_questions.tsv’


2023-05-31 21:03:10 (299 KB/s) - ‘train_questions.tsv’ saved [61630/61630]

--2023-05-31 21:03:10--  https://raw.githubusercontent.com/poleval/2021-question-answering/main/dev-0/expected.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11643 (11K) [text/plain]
Saving to: ‘train_answers.tsv’


2023-05-31 21:03:10 (1

In [None]:
!pip install levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting levenshtein
  Downloading Levenshtein-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/174.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from levenshtein)
  Downloading rapidfuzz-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, levenshtein
Successfully installed levenshtein-0.21.0 rapidfuzz-3.0.0


In [None]:
import spacy
from spacy.lang.pl.examples import sentences

import requests
from icecream import ic
import re
import Levenshtein


In [None]:
def load_text_data(mode = 'train'):
  """Load the question and answer lines of one data split.

  Reads `<mode>_questions.tsv` and `<mode>_answers.tsv` from the current
  working directory and returns their lines with leading/trailing whitespace
  stripped.

  Args:
    mode: Filename prefix of the split to load (default 'train').

  Returns:
    A `(question_sentences, answer_sentences)` tuple of lists of strings, one
    entry per line of the respective file.

  Raises:
    OSError: If either file cannot be opened.
  """
  # The files contain Polish text; decode explicitly as UTF-8 instead of
  # relying on the platform default encoding, which is not UTF-8 everywhere.
  with open(mode + '_questions.tsv', encoding='utf-8') as questions:
    question_sentences = [question.strip() for question in questions]
  with open(mode + '_answers.tsv', encoding='utf-8') as answers:
    answer_sentences = [answer.strip() for answer in answers]

  return question_sentences, answer_sentences

def remove_html_tags(string):
    """Return `string` with every `<...>` span deleted.

    Each match runs non-greedily from a `<` to the nearest following `>` on
    the same line (`.` does not match a newline here), so the text between
    tags is kept and only the tags themselves are dropped.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", string)

def get_wikipedia_titles(tokens, timeout=10):
    """Search Polish Wikipedia for the given tokens.

    Args:
        tokens: Iterable of strings; joined with single spaces to form the
            search query.
        timeout: Seconds to wait for the HTTP response before giving up
            (default 10).

    Returns:
        A `(titles, snippets)` tuple of parallel lists: the title of each
        search hit and its snippet with HTML tags removed. Both lists are
        empty when the query is blank or the response holds no search results.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    search_query = ' '.join(tokens)
    if not search_query.strip():
        # A blank query cannot produce hits; skip the network round-trip.
        return [], []

    # Hand the query to requests via `params` so it is percent-encoded;
    # splicing it into the URL breaks on characters such as '&', '#' or '+'.
    response = requests.get(
        "https://pl.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "list": "search",
            "format": "json",
            "srsearch": search_query,
        },
        # Without a timeout a stalled connection blocks the notebook forever.
        timeout=timeout,
    )
    data = response.json()

    titles = []
    snippets = []
    if 'query' in data and 'search' in data['query']:
        for result in data['query']['search']:
            titles.append(result['title'])
            snippets.append(remove_html_tags(result['snippet']))

    return titles, snippets

def qualified_tokens(tokenizer, sentence):
  """Tokenize `sentence` and return the text of every token longer than one.

  `tokenizer` is called on the sentence; each resulting token must support
  `len()` and expose a `.text` attribute. Tokens of length 0 or 1 are dropped.
  """
  kept_texts = []
  for token in tokenizer(sentence):
    if len(token) <= 1:
      continue
    kept_texts.append(token.text)
  return kept_texts

def get_dists(tokenizer, tokens, titles, threshold=0.5):
    """Return the titles that share no similar token with `tokens`.

    Each title is tokenized with `qualified_tokens`. A title is rejected as
    soon as any (token, title token) pair has a `Levenshtein.seqratio` above
    `threshold`; titles with no such pair are kept, in their original order.
    Presumably this filters out search hits that merely echo the words of the
    question -- confirm against the intended heuristic.

    Args:
        tokenizer: Callable passed through to `qualified_tokens`.
        tokens: Token strings to compare against (the question tokens at the
            visible call sites).
        titles: Iterable of title strings to filter.
        threshold: Similarity above which two tokens count as a match
            (default 0.5, the value that was previously hard-coded).

    Returns:
        List of the titles that were not rejected.
    """
    # NOTE(review): seqratio is documented for sequences of strings; it is
    # kept as-is here so the scores do not change. Confirm whether
    # Levenshtein.ratio is the intended string-level call.
    candidates = []

    for title in titles:
        title_tokens = qualified_tokens(tokenizer, title)
        # any() stops at the first matching pair instead of scanning them all.
        echoes_tokens = any(
            Levenshtein.seqratio(token, title_token) > threshold
            for token in tokens
            for title_token in title_tokens
        )
        if not echoes_tokens:
            candidates.append(title)

    return candidates

def search_and_get_first_candidate(tokenizer, question):
  """Return the first Wikipedia title accepted as an answer candidate.

  The question is tokenized with `qualified_tokens` and searched via
  `get_wikipedia_titles`; the hits are filtered with `get_dists`. While no
  candidate survives, the leading token is dropped and the search is retried
  with the shorter query.

  Args:
    tokenizer: Callable passed through to `qualified_tokens` and `get_dists`.
    question: The question text.

  Returns:
    The first surviving title, or the string 'Have no idea' when every query
    down to the last single token yields no candidate.
  """
  remaining_tokens = qualified_tokens(tokenizer, question)

  # Stop once the tokens run out: slicing an empty list yields an empty list
  # again, so looping only on "no candidates" would never terminate and would
  # keep sending empty queries to the API.
  while remaining_tokens:
    titles, _ = get_wikipedia_titles(remaining_tokens)
    candidates = get_dists(tokenizer, remaining_tokens, titles)
    if candidates:
      return candidates[0]
    remaining_tokens = remaining_tokens[1:]

  print('Couldn\'t find any candidate')
  return 'Have no idea'

In [None]:
questions, answers = load_text_data('train')
# The evaluation loop pairs questions[i] with answers[i], so both files must
# contain the same number of lines.
assert len(questions) == len(answers), 'questions and answers have different line counts'
tokenizer = spacy.load("pl_core_news_sm")

In [None]:
# For every question: look up the first candidate title, then print the
# question, the expected answer, and the candidate with its similarity score.
for i, question in enumerate(questions):
  candidate = search_and_get_first_candidate(tokenizer, question)
  answer = answers[i]
  ratio = Levenshtein.seqratio(candidate, answer)
  print(f'Question {i}: {question}')
  print(f'Answer {i}: {answer}')
  print(f'First candidate: {candidate} | Ratio: {ratio}')

Question 0: Jak nazywa się pierwsza litera alfabetu greckiego?
Answer 0: alfa
First candidate: Alfa | Ratio: 0.75
Question 1: Jak nazywa się dowolny odcinek łączący dwa punkty okręgu?
Answer 1: cięciwa
First candidate: Cięciwa | Ratio: 0.8571428571428571
Question 2: W którym państwie rozpoczyna się akcja powieści „W pustyni i w puszczy”?
Answer 2: w Egipcie
First candidate: Port Said | Ratio: 0.2222222222222222
Question 3: Czy w państwach starożytnych powoływani byli posłowie i poselstwa?
Answer 3: tak
First candidate: Filip II Macedoński | Ratio: 0.18181818181818182
Question 4: W jakim zespole występowała Hanka w filmie „Żona dla Australijczyka”?
Answer 4: Mazowsze
First candidate: Elżbieta Czyżewska | Ratio: 0.3076923076923077
Question 5: W którym państwie leży Bombaj?
Answer 5: w Indiach
First candidate: Honolulu | Ratio: 0.11764705882352941
Question 6: Który numer boczny nosi czołg Rudy z „Czterech pancernych”?
Answer 6: 102
First candidate: Gustaw Jeleń | Ratio: 0.0
Question 7: Co

KeyboardInterrupt: ignored