# Data augmentation

## 1 Current data statistics

### We read in the query, logical-form, and schema files and group the examples by the token length of their logical form; examples of the same length may still fall into several subcategories

In [14]:
import numpy as np

# Three parallel buckets keyed by the token length of the logical form:
# raw logical-form lines, raw query lines and raw schema lines.
logic_category_len = {}
query_len = {}
schema_len = {}

# The three files are read in lockstep, one line from each per iteration.
with open('./rand.lo') as f_lo, \
        open('./rand.qu') as f_qu, \
        open('./rand.fi') as f_fi:
    while True:
        logic_line = f_lo.readline()
        query_line = f_qu.readline()
        schema_line = f_fi.readline()
        # readline() returns '' at end of file; stop as soon as any of the
        # three files runs out.
        if not (logic_line and query_line and schema_line):
            break

        logic = logic_line.split()
        if len(logic) == 13:
            # 13-token forms are rewritten as argmax/argmin and gain one
            # token, so they are counted in the length-14 bucket.  The
            # rewritten token list is only used to compute the length; the
            # raw line is what gets stored below.
            logic[0] = 'argmax' if logic[4] == 'less' else 'argmin'
            logic.insert(2, logic[3])
        length = len(logic)

        # All three dicts always receive a key together, so they share the
        # same key set.
        logic_category_len.setdefault(length, []).append(logic_line)
        query_len.setdefault(length, []).append(query_line)
        schema_len.setdefault(length, []).append(schema_line)

# Report how many examples fell into each length bucket.
for key, value in logic_category_len.items():
    print('length = %d, total examples: %d' % (key, len(value)))

length = 2, total examples: 2
length = 3, total examples: 156
length = 5, total examples: 128
length = 6, total examples: 1047
length = 7, total examples: 624
length = 10, total examples: 687
length = 11, total examples: 487
length = 12, total examples: 4
length = 14, total examples: 157
length = 18, total examples: 618


Have a look at the data:

In [41]:
# Print every query in the length-7 bucket.  query_len[7] holds exactly one
# entry per entry of logic_category_len[7] (both lists are appended to
# together when the data is read), so the queries can be iterated directly
# instead of being indexed through the length of the other list.
for query in query_len[7]:
    print(query)

who has the least apps and greater than 22 goals

which year was the latest year that havana was the 2nd_venue

who has the most field_goals with an average of 50

which year was the most recent year cambridge was the 2nd_venue

who has the least apps and greater than 114 goals

which year was the latest year that boston was the 2nd_venue

when was the first time nassau was 2nd_venue

what is the amount of the most goals before 2000

who has the least runs with an average above 20

who has the most goals but less than 268 apps

when was the first time detroit was 2nd_venue

who has the least matches with an average above 70

who has the most free_throws with an average above 30

who has the least goals and less than 253 apps

what is the amount of the least goals before 2016

what is the amount of the least goals before 2014

which state had the least number_of_candidates in 2015

what is the amount of the least goals after 2001

who has the most innings with an average of 80

what is 

### Now we collect all distinct schemas in a list for later use

In [39]:
# Collect every distinct schema line from ./rand.fi, keeping the order in
# which each one first appears.  Lines are compared verbatim (including the
# trailing newline).
schema_collect = []
_seen_schema_lines = set()  # O(1) membership test instead of scanning the list
with open('./rand.fi') as f_fi:
    for line in f_fi:
        if line not in _seen_schema_lines:
            _seen_schema_lines.add(line)
            schema_collect.append(line)

for schema in schema_collect:
    print(schema)

Nation Rank Gold Silver Bronze Total

Name Year_inducted Position Apps Goals

Year 1st_Venue 2nd_Venue 3rd_Venue 4th_Venue 5th_Venue 6th_Venue

Player Matches Innings Runs Average 100s 50s Games_Played Field_Goals Free_Throws Points

Team County Wins Years_won Areas Prices

Country Masters U.S._Open The_Open PGA Total

Swara Position Short_name Notation Mnemonic

State No._of_candidates No._of_elected Total_no._of_seats_in_Assembly Year_of_Election

Discipline Amanda Bernie Javine_H Julia Michelle

Nation Name Position League_Apps League_Goals FA_Cup_Apps FA_Cup_Goals Total_Apps Total_Goals

Menteri_Besar Took_office Left_office Party



### Next we do some data generation, the small goal is to double our current data size (8k~10k) 

As we previously did some work in the file ./data_prep/categorization.txt, we have several different sentences for each length category. For each sentence structure, we first check whether it can be applied to all schemas, to several of them, or to just a single one; then we tag each sentence, and for 'field' and 'value' we do data recombination for both the queries and the logical forms; finally we add noise and replace synonyms in the queries to further complicate the sentence structure.

Let's start with the easiest length = 3:

In [62]:
import inspect
import os
import sys

# Directory of the file that the current frame was loaded from.
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
# Put its parent directory at the front of the import search path.
# NOTE(review): presumably `tagger` lives in that parent directory - confirm.
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

import tagger as tg


In [73]:
# Overwrite slots 2-4 and 6-9 of schema_collect with a fixed ordering of the
# schemas; slots 0, 1 and 5 (and anything after 9) are left as collected.
# Each replacement line ends with " \n".
# The earlier duplicate write of the "Year ..._Venue" schema to slot 7 was
# dropped: it was immediately overwritten by the "Nation Name Position ..."
# schema, so the resulting list is unchanged.
schema_collect[2] = "State Year_of_Election No._of_candidates No._of_elected Total_no._of_seats_in_Assembly \n"
schema_collect[3] = "Team Years_won County Wins Areas Prices \n"
schema_collect[4] = "Player Matches Innings Runs Average 100s 50s Games_Played Field_Goals Free_Throws Points \n"

schema_collect[6] = "Discipline Amanda Bernie Javine_H Julia Michelle \n"
schema_collect[7] = "Nation Name Position League_Apps League_Goals FA_Cup_Apps FA_Cup_Goals Total_Apps Total_Goals \n"
schema_collect[8] = "Swara Position Short_name Notation Mnemonic \n"
schema_collect[9] = "Year 1st_Venue 2nd_Venue 3rd_Venue 4th_Venue 5th_Venue 6th_Venue \n"

# Show the list after the re-ordering.
for schema in schema_collect:
    print(schema)

Nation Rank Gold Silver Bronze Total

Name Year_inducted Position Apps Goals

State Year_of_Election No._of_candidates No._of_elected Total_no._of_seats_in_Assembly 

Team Years_won County Wins Areas Prices 

Player Matches Innings Runs Average 100s 50s Games_Played Field_Goals Free_Throws Points 

Country Masters U.S._Open The_Open PGA Total

Discipline Amanda Bernie Javine_H Julia Michelle 

Nation Name Position League_Apps League_Goals FA_Cup_Apps FA_Cup_Goals Total_Apps Total_Goals 

Swara Position Short_name Notation Mnemonic 

Year 1st_Venue 2nd_Venue 3rd_Venue 4th_Venue 5th_Venue 6th_Venue 

Menteri_Besar Took_office Left_office Party



==== Conventions ====
1. "o" stands for "ordinal" values, referring to schema_collect[0:4]
2. "n" stands for "numerical" values, referring to schema_collect[4:8]
3. "s" stands for "string" values, referring to schema_collect[7:]

In [70]:
# Hand-picked example queries for the length-3 category, one string per query.
len3read = [
    'which country has the most pga championships',
    'which country had the most number of wins',
    'who was the first nation',
    'what is the name of the first nation on this chart',
    'which country won the largest haul of bronze medals',
    'what is the name of the swara that holds the first position',
    'which country had the least bronze medals',
    'who was the last de player',
    'who scored the least on whitewater_kayak',
    'which nation received the largest amount of gold medals',
    'which state has the top no._of_elected amount',
    'the team with the most gold medals',
    'which nation was ranked last',
    'the country that won the most medals was',
    'who was the top scorer in innings',
    'what is the top listed player',
    'who is the top ranked nation',
    'what is the largest matches amount',
]

# idx = 0
# for i in range(len(len3read)):
#     idx += 1
#     print "====== example %d ======" %idx
#     tagged2, field_corr, value_corr, newquery, newlogical = \
#             tg.sentTagging_value(query_len[7][i], schema_len[7][i], logic_category_len[7][i])
#     print logic_category_len[7][i]
#     print schema_len[7][i]
#     print query_len[7][i]
#     print tagged2
#     print field_corr
#     print value_corr 
#     print newquery
#     print newlogical
#     print '\n'

# Logical-form templates for the length-3 category: an argmax/argmin
# operator followed by two <field> placeholders.
lo_template3_max = 'argmax <field>:0 <field>:1'
lo_template3_min = 'argmin <field>:0 <field>:1'

# Tag the first length-3 query against schema 5.  sentTagging_value returns
# five values here; the last one is not used and is bound to `_`.
(tagged2, field_corr, value_corr, newquery, _) = tg.sentTagging_value(
    len3read[0], schema_collect[5])

# Show the inputs followed by the tagging results, one per print.
for _shown in (schema_collect[5], len3read[0],
               tagged2, field_corr, value_corr, newquery):
    print(_shown)

['<field>', 'Country']
['<field>', 'PGA']
[(5, 1)]
[]
Country Masters U.S._Open The_Open PGA Total

which country has the most pga championships
<nan> <field>:0 <nan> <nan> <nan> <field>:1 <nan>
Country PGA
<nan> <nan>
which <field>:0 has the most <field>:1 championships


In [69]:
# Dump the whole list of length-3 example queries.
print(len3read)

['which country has the most pga championships', 'which country had the most number of wins', 'who was the first nation', 'what is the name of the first nation on this chart', 'which country won the largest haul of bronze medals', 'what is the name of the swara that holds the first position', 'which country had the least bronze medals', 'who was the last de player', 'who scored the least on whitewater_kayak', 'which nation received the largest amount of gold medals', 'which state has the top no._of_elected amount', 'the team with the most gold medals', 'which nation was ranked last', 'the country that won the most medals was', 'who was the top scorer in innings', 'what is the top listed player', 'who is the top ranked nation', 'what is the largest matches amount']
