# Read data

In [1]:
import pandas as pd
data = pd.read_csv('curated-full.tsv', sep='\t', encoding='utf8', header=None)
data.columns=['id','type','question','answer']
data[:5] 

Unnamed: 0,id,type,question,answer
0,1756,factoid,When is Fashion week in NYC?,Sept?(ember)?|Feb(ruary)?
1,1434,factoid,What site did Lindbergh begin his flight from ...,Long Island|New\s?York|Roosevelt Field
2,1790,factoid,What country is the holy city of Mecca located...,Saudi Arabia
3,1992,factoid,How hot is the sun?,"(5,?778|6,?[05]00)\s*K|(5,?505|6,?[50]00).*C|(..."
4,2128,factoid,What flavor filling did the original Twinkies ...,banana


# CHANGE QUESTION TO YES NO

In [2]:
#some rundom results can be

In [131]:
import rstr
import re

def generate_answer_from_pattern(pattern):
    value = rstr.xeger(pattern)
    value = re.sub(r"\s+", ' ', value)
    #return value
    if len(value)>len(pattern):
        return  re.sub(r"[^\w\s]", ' ', pattern)
    else:
        return value
data['answer_value'] =  data['answer'].apply(generate_answer_from_pattern)

In [132]:
data[['question','answer_value']][:100]

Unnamed: 0,question,answer_value
0,When is Fashion week in NYC?,February
1,What site did Lindbergh begin his flight from ...,Roosevelt Field
2,What country is the holy city of Mecca located...,Saudi Arabia
3,How hot is the sun?,5778.0
4,What flavor filling did the original Twinkies ...,banana
5,What is the state song of Kansas?,Home on the Range
6,"What war is connected with the book ""Charge of...",Crimean
7,Where was the first atomic bomb detonated?,New Mexico
8,What continent is Scotland in?,Europe
9,What island did the U.S. gain after the Spanis...,Puerto Rico


Обставинні питання в англійській мові складаються за двома паттернами:
1. підмет є в питанні
```(питальне слово) + (допоміжне дієслово) + (підмет) + (основне дієслово) ? + (додаток/обставина) ?```
_(Where) (was) (the first atomic bomb) (detonated)? => (the first atomic bomb) (was) (detonated) + відповідь (in New Mexico)_
_(Where) (was) (the first atomic bomb)? => (the first atomic bomb) (was) + відповідь (in New Mexico)_

2. питальне слово заміняє підмет
```(питальне слово) + (основне дієслово) + (додаток/обставина) ?```
_(Who) (discovered) (America)? => відповідь (Columbus) + (discovered) (America)_
_(What) (is) (the state song of Kansas)? => відповідь (Home on the Range) + (is) (the state song of Kansas)_ (edited)
Оці частинки, що я виділила дужками, називаються в NLP чанками (chunks). Ти їх можеш виділити через дерево складників або через простіший інструмент - chunker. Він є в nltk, наприклад.

А далі перестановка з урахуванням кількох моментів:
1. якщо допоміжне дієслово do/did/does, то основне треба перетворити: `did ... happen` => `happened`
2. є розширені питальні слова з which/what/how (what music, which style, how deep), де відповідь буде об’єднуватись з питальним словом (pop music, disco style, 593.0 deep)
3. до питань про час та місце треба добирати прийменники (in 1999, on May 7)
4. у типі 2 можуть бути лише питальні слова who і what

In [133]:
def get_chunks(sent):

    grammar = """Q1: {^<WRB><JJ>*(<VBZ>|<VBD>)*}
                    {^<VB><PRP><WRB>}
                Q2: {<WP><VBZ>*}
                {^<NNP>}
                {^<JJ>}
                {<WDT>}
                Q_symbol: {<.>*}
               S2: {<.*>+}
               }<Q1|Q2|Q_symbol>+{
               """
    chunkParser = nltk.RegexpParser(grammar)


    tagged = nltk.pos_tag(nltk.word_tokenize(sent))
    tree = chunkParser.parse(tagged)
    return tree

def print_tree(tree):    
    for subtree in tree.subtrees():
        if subtree.label() in ["Q1","Q2",'Q_symbol','S2']: 
            print(subtree.label()+str(subtree.leaves()))

In [134]:
import nltk
from nltk import chunk, tag, tokenize

for sent in data['question'].values[[3,15,18,21,23,109,242,673]]:
    tokens = tokenize.word_tokenize(sent)
    tagged_sent = tag.pos_tag(tokens)
    print(tagged_sent)
    print_tree(get_chunks(sent))
    print('-----------------')
#tree.draw()

[('How', 'WRB'), ('hot', 'JJ'), ('is', 'VBZ'), ('the', 'DT'), ('sun', 'NN'), ('?', '.')]
Q1[('How', 'WRB'), ('hot', 'JJ'), ('is', 'VBZ')]
S2[('the', 'DT'), ('sun', 'NN')]
Q_symbol[('?', '.')]
-----------------
[('How', 'WRB'), ('hot', 'JJ'), ('does', 'VBZ'), ('it', 'PRP'), ('get', 'VB'), ('in', 'IN'), ('Death', 'NNP'), ('Valley', 'NNP'), ('?', '.')]
Q1[('How', 'WRB'), ('hot', 'JJ'), ('does', 'VBZ')]
S2[('it', 'PRP'), ('get', 'VB'), ('in', 'IN'), ('Death', 'NNP'), ('Valley', 'NNP')]
Q_symbol[('?', '.')]
-----------------
[('How', 'WRB'), ('many', 'JJ'), ('Great', 'NNP'), ('Lakes', 'NNP'), ('are', 'VBP'), ('there', 'RB'), ('?', '.')]
Q1[('How', 'WRB'), ('many', 'JJ')]
S2[('Great', 'NNP'), ('Lakes', 'NNP'), ('are', 'VBP'), ('there', 'RB')]
Q_symbol[('?', '.')]
-----------------
[('How', 'WRB'), ('many', 'JJ'), ('planets', 'NNS'), ('are', 'VBP'), ('in', 'IN'), ('our', 'PRP$'), ('solar', 'NN'), ('system', 'NN'), ('?', '.')]
Q1[('How', 'WRB'), ('many', 'JJ')]
S2[('planets', 'NNS'), ('are', '

In [135]:
def translate_Q1(data, answer):
    question_chunks = data['Q1']
    vb=''
    for item in  question_chunks:        
        if item[1]=='VBZ':
            vb=item[0]
        
    cc = ''
    if question_chunks[0][0].lower() in ['where', 'when']:
        cc = 'in'
        
    end = ''
    for item in  question_chunks:        
        if item[1]=='JJ':
            end=item[0]
    
    sent = ' '.join(pair[0] for pair in data['S2'])
    
    return  ' '.join([sent,vb,cc,answer,end])

def translate_Q2(data, answer):
    question_chunks = data['Q2']
    vb=''
    if len(question_chunks)==2:
        vb=question_chunks[1][0]
  
    sent = ' '.join(pair[0] for pair in data['S2'])
    return  ' '.join([answer, vb,sent])

def get_statement_sent(question, answer):
    tree = get_chunks(question)
    data ={}
    for subtree in tree.subtrees():
        data[subtree.label()] = subtree.leaves()
    if 'Q1' in data:
        return translate_Q1(data, answer)
    else:
        return translate_Q2(data, answer)
    
get_statement_sent('When is Fashion week in NYC?','February')

'Fashion week in NYC is in February '

In [136]:
data[:5]

Unnamed: 0,id,type,question,answer,answer_value,statements
0,1756,factoid,When is Fashion week in NYC?,Sept?(ember)?|Feb(ruary)?,February,Fashion week in NYC is in February
1,1434,factoid,What site did Lindbergh begin his flight from ...,Long Island|New\s?York|Roosevelt Field,Roosevelt Field,Roosevelt Field site did Lindbergh begin his ...
2,1790,factoid,What country is the holy city of Mecca located...,Saudi Arabia,Saudi Arabia,Saudi Arabia country is the holy city of Mecc...
3,1992,factoid,How hot is the sun?,"(5,?778|6,?[05]00)\s*K|(5,?505|6,?[50]00).*C|(...",5778.0,the sun is 5505!/K Sm4x= C hot
4,2128,factoid,What flavor filling did the original Twinkies ...,banana,banana,banana flavor filling did the original Twinki...


In [137]:
data['statements'] = data[['question','answer_value']].apply(lambda row: get_statement_sent(row['question'],row['answer_value']),axis=1)

In [138]:
data[['question','statements'] ]

Unnamed: 0,question,statements
0,When is Fashion week in NYC?,Fashion week in NYC is in February
1,What site did Lindbergh begin his flight from ...,Roosevelt Field site did Lindbergh begin his ...
2,What country is the holy city of Mecca located...,Saudi Arabia country is the holy city of Mecc...
3,How hot is the sun?,the sun is 5778.0 hot
4,What flavor filling did the original Twinkies ...,banana flavor filling did the original Twinki...
5,What is the state song of Kansas?,Home on the Range is the state song of Kansas
6,"What war is connected with the book ""Charge of...",Crimean war is connected with the book `` Cha...
7,Where was the first atomic bomb detonated?,the first atomic bomb detonated in New Mexico
8,What continent is Scotland in?,Europe continent is Scotland in
9,What island did the U.S. gain after the Spanis...,Puerto Rico island did the U.S. gain after th...


In [139]:
data.to_csv('statements.csv', index=False, encoding='utf8')