# Exploratory Data Analysis
[for ConditionalQA]

In [1]:
import json
import numpy as np

In [2]:
trainFP = '../ConditionalQA/v1_0/train.json'
devFP = '../ConditionalQA/v1_0/dev.json'
docFP = '../ConditionalQA/v1_0/documents.json'

## Overall Documents

In [3]:
docs = json.load(open(docFP, 'r'))
# build a dictionary given the list, using url as key
docdict = {d['url']: d for d in docs}
print(f"There are a total of {len(docs)} documents.")

There are a total of 652 documents.


In [4]:
print(f"For each document, it encompasses the following keys:\n"
      f"\t{list(docs[0].keys())}")

For each document, it encompasses the following keys:
	['title', 'url', 'contents']


### average stats

In [5]:
avg_title_len = np.mean([len(d['title']) for d in docs])
print(f"The average title length (token counts) is: {avg_title_len:.4f} tokens.")

The average title length (token counts) is: 33.5414 tokens.


In [6]:
content_cnts = [len(d['contents']) for d in docs]
avg_content_cnt = np.mean(content_cnts)

content_lens = list()
for d in docs:
    content_lens.append(sum([len(c.split(' ')) for c in d['contents']]))
avg_content_len = np.mean(content_lens)
    
print(f"The average content count is: {avg_content_cnt:.4f} segments.")
print(f"The average content length is: {avg_content_len:.4f} segments.")

The average content count is: 106.1334 segments.
The average content length is: 1311.1258 segments.


### polar stats

In [7]:
print("For content counts:")
print(f"\tmax={max(content_cnts)}, min={min(content_cnts)}")

print("For content lengths:")
print(f"\tmax={max(content_lens)}, min={min(content_lens)}")

For content counts:
	max=559, min=20
For content lengths:
	max=8868, min=207


## Training Set

In [8]:
trains = json.load(open(trainFP, 'r'))
print(f"There are a total of {len(trains)}" + " instances in training set.")

There are a total of 2338 instances in training set.


In [9]:
print(f"For each training instance, it encompasses the following keys:\n"
      f"\t{list(trains[0].keys())}")

For each training instance, it encompasses the following keys:
	['url', 'scenario', 'question', 'not_answerable', 'answers', 'evidences', 'id']


### Typical Examples

In [10]:
def print_case(unit):
    for k, v in unit.items():
        print(f"{k:<10}: {v}")
    # add the document retrieved from docs
    print(f"document  :")
    for k, v in docdict[unit['url']].items():
        print(f"{' ' * 10}{k:<10}: {v}")
    return True

#### a) Not Answerable Question

In [11]:
for t in trains:
    if t['not_answerable']:
        print_case(t)
        break

url       : https://www.gov.uk/change-cancel-presumption-death-certificate
scenario  : My brother died on January 2nd. His death certificate is inaccurate in this respect. I do not know the doctor who signed the original certificate or why the mistake was made
question  : Is it possible to have the death certificate changed given that this happened over six months ago?
not_answerable: True
answers   : []
evidences : []
id        : train-24
document  :
          title     : Change or cancel a presumption of death certificate
          url       : https://www.gov.uk/change-cancel-presumption-death-certificate
          contents  : ['<h1>Overview</h1>', '<p>You can make a claim to cancel (‘revoke’) or change (‘vary’) the details of a declaration of presumed death from the High Court if you can prove the missing person:</p>', '<li>is still alive</li>', '<li>died at a time earlier or later than the time of death in the original declaration</li>', '<p>You can also make a claim if you can pro

#### b) yes/no question w/o conditions

In [12]:
for t in trains:
    if {a[0].lower() for a in t['answers']}.intersection({'yes', 'no'}):
        print_case(t)
        break

url       : https://www.gov.uk/applying-for-probate
scenario  : My father, who was a widower and the owner of several large properties in Wales, died recently and apparently intestate. My paternal uncle is applying for probate, but I believe that I have a stronger claim.
question  : Do I have a greater right to probate in respect of my late father's estate?
not_answerable: False
answers   : [['yes', []]]
evidences : ['<p>You can apply to become the estate’s administrator if you are 18 or over and you are the most ‘entitled’ inheritor of the deceased’s estate. This is usually the deceased’s closest living relative.</p>', '<p>Relatives are the most entitled inheritors in the following order:</p>', '<li>children (including legally adopted children but not step-children)</li>', '<li>brothers and sisters</li>']
id        : train-0
document  :
          title     : Applying for probate
          url       : https://www.gov.uk/applying-for-probate
          contents  : ['<h1>Overview</h1>', '

#### c) yes/no question w/ conditions

In [13]:
found = False
for t in trains:
    ans = t['answers']
    for a in ans:
        if a[0] in {'yes', 'no'} and len(a[1]):
            print_case(t)
            found = True
    if found:
        break

url       : https://www.gov.uk/apply-gender-recognition-certificate
scenario  : I was born and raised in Australia. I have changed my gender and got a certificate in Australia. I have moved to UK three years back
question  : I would like to know whether I am eligible to apply for Gender Recognition Certificate in UK ?
not_answerable: False
answers   : [['yes', ['<p>You must be 18 or over.</p>']]]
evidences : ['<p>Apply by the overseas route if your acquired gender has been legally accepted in an ‘approved country or territory’ and you have documents to prove it.</p>', '<p>You must be 18 or over.</p>', '<tr>Overseas route | Form T453 | Leaflet T454</tr>', '<p>If you’re applying using the overseas route, you must prove that your gender has been legally recognised in an ‘approved country or territory’. Send original or certified copies of the following (if you have them):</p>']
id        : train-1
document  :
          title     : Apply for a Gender Recognition Certificate
          url  

#### d) extractive question w/o conditions

In [14]:
found = False
for t in trains:
    ans, evi = t['answers'], t['evidences']
    for e in evi:
        for a in ans:
            if a[0] not in {'yes', 'no'} and len(a[1]) == 0 and a[0].lower() in e.lower():
                print_case(t)
                found = True
    if found:
        break

url       : https://www.gov.uk/paternity-pay-leave
scenario  : I'm 28, and have worked full-time for my current employer for just over 3 years. My wife is expecting our first child in a few months, and I intend to claim paid Paternity Leave when the baby is born.
question  : How much notice am I required to give my employer with regards to the starting date of my leave period?
not_answerable: False
answers   : [['at least 15 weeks before the baby is due', []]]
evidences : ['<p>At least 15 weeks before the baby is due, tell your employer:</p>']
id        : train-2
document  :
          title     : Paternity pay and leave
          url       : https://www.gov.uk/paternity-pay-leave
          contents  : ['<h1>Overview</h1>', '<p>When you take time off because your partner’s having a baby, adopting a child or having a baby through a surrogacy arrangement you might be eligible for:</p>', '<li>1 or 2 weeks’ paid Paternity Leave</li>', '<li>Paternity Pay</li>', '<li>Shared Parental Leave and

#### e) extractive question w/ conditions

In [15]:
found = False
for t in trains:
    ans, evi = t['answers'], t['evidences']
    for e in evi:
        for a in ans:
            if a[0] not in {'yes', 'no'} and len(a[1]) > 0 and a[0].lower() in e.lower():
                print_case(t)
                found = True
    if found:
        break

url       : https://www.gov.uk/make-will
scenario  : I have written a will with a help of a solicitor before six years and I have lot of changes in my assets and I am planning to make some changes to my will
question  : Can I make changes to my will once I have signed ? what is the process to amend a will ?
not_answerable: False
answers   : [['make a new will', ['<p>For major changes you should make a new will.</p>']], ['making an official alteration called a codicil', ['<p>You cannot amend your will after it’s been signed and witnessed. The only way you can change a will is by making an official alteration called a codicil.</p>']], ['follow the same signing and witnessing process', []]]
evidences : ['<p>If you want to update your will, you need to make an official alteration (called a ‘codicil’) or make a new will.</p>', '<p>If you make any changes to your will you must follow the same signing and witnessing process.</p>', '<p>You cannot amend your will after it’s been signed and witn

#### f) mixture of deterministic and conditionals 

In [16]:
found = False
for t in trains:
    found_det, found_con = False, False
    ans = t['answers']
    for a in ans:
        if len(a[1]) > 0:
            found_con = True
        if len(a[1]) == 0:
            found_det = True
        if found_con and found_det:
            found = True
            break
    if found:
        print_case(t)
        break

url       : https://www.gov.uk/make-will
scenario  : I have written a will with a help of a solicitor before six years and I have lot of changes in my assets and I am planning to make some changes to my will
question  : Can I make changes to my will once I have signed ? what is the process to amend a will ?
not_answerable: False
answers   : [['make a new will', ['<p>For major changes you should make a new will.</p>']], ['making an official alteration called a codicil', ['<p>You cannot amend your will after it’s been signed and witnessed. The only way you can change a will is by making an official alteration called a codicil.</p>']], ['follow the same signing and witnessing process', []]]
evidences : ['<p>If you want to update your will, you need to make an official alteration (called a ‘codicil’) or make a new will.</p>', '<p>If you make any changes to your will you must follow the same signing and witnessing process.</p>', '<p>You cannot amend your will after it’s been signed and witn

## Dev Set

In [17]:
devs = json.load(open(devFP, 'r'))
print(f"There are a total of {len(devs)}" + " instances in development set.")

There are a total of 285 instances in development set.


In [18]:
print(f"For each development instance, it encompasses the following keys:\n"
      f"\t{list(devs[0].keys())}")

For each development instance, it encompasses the following keys:
	['url', 'scenario', 'question', 'not_answerable', 'answers', 'evidences', 'id']


#### g) other generated answers

In [19]:
found = False
hit = False
for t in trains:
    ans, evi = t['answers'], t['evidences']
    for a in ans:
        if a[0] not in {'yes', 'no'}:
            for e in evi:
                if a[0].lower() in e.lower():
                    hit = True
            if not hit:
                print_case(t)
                break
    if found:
        break

### Typical Examples

#### a) Not Answerable Question

In [20]:
for d in devs:
    if d['not_answerable']:
        print_case(d)
        break

url       : https://www.gov.uk/financial-help-disabled
scenario  : I was injured in an accident at work 6 months ago and now I am unable to work for the rest of my working life due to my injuries.
question  : What is the maximum amount of Disability Living Allowance I can claim?
not_answerable: True
answers   : []
evidences : []
id        : dev-7
document  :
          title     : Financial help if you're disabled
          url       : https://www.gov.uk/financial-help-disabled
          contents  : ['<h1>Overview</h1>', '<p>There is a wide range of disability-related financial support, including benefits, tax credits, payments, grants and concessions.</p>', '<p>Some benefits you might get are:</p>', '<li>Universal Credit</li>', '<li>Personal Independence Payment (PIP) or Disability Living Allowance (DLA)</li>', '<li>Attendance Allowance</li>', '<li>‘new style’ Employment and Support Allowance (ESA)</li>', '<p>Depending on your circumstances, you might also be able to get:</p>', '<li>In

#### b) yes/no question w/o conditions

In [21]:
for d in devs:
    if {a[0].lower() for a in d['answers']}.intersection({'yes', 'no'}):
        print_case(d)
        break

url       : https://www.gov.uk/apply-special-guardian
scenario  : I am a guardian to a 16year old teenage boy from syria whom i cared for when he was young and homeless. I want to change his surname to to mine.
question  : Can I apply to change his surname to mine?
not_answerable: False
answers   : [['yes', ['<p>You’ll need to get the consent of everyone who has parental responsibility for the child before you make some important decisions, for example:</p>', '<p>If you cannot get consent, you can ask the court to decide. Use the form ‘Make an application in existing court proceedings related to children’ (form C2).</p>']]]
evidences : ['<p>You’ll need to get the consent of everyone who has parental responsibility for the child before you make some important decisions, for example:</p>', '<li>changing the child’s surname</li>', '<p>If you cannot get consent, you can ask the court to decide. Use the form ‘Make an application in existing court proceedings related to children’ (form C2).<

#### c) yes/no question w/ conditions

In [22]:
found = False
for d in devs:
    ans = d['answers']
    for a in ans:
        if a[0] in {'yes', 'no'} and len(a[1]):
            print_case(d)
            found = True
    if found:
        break

url       : https://www.gov.uk/apply-special-guardian
scenario  : I am a guardian to a 16year old teenage boy from syria whom i cared for when he was young and homeless. I want to change his surname to to mine.
question  : Can I apply to change his surname to mine?
not_answerable: False
answers   : [['yes', ['<p>You’ll need to get the consent of everyone who has parental responsibility for the child before you make some important decisions, for example:</p>', '<p>If you cannot get consent, you can ask the court to decide. Use the form ‘Make an application in existing court proceedings related to children’ (form C2).</p>']]]
evidences : ['<p>You’ll need to get the consent of everyone who has parental responsibility for the child before you make some important decisions, for example:</p>', '<li>changing the child’s surname</li>', '<p>If you cannot get consent, you can ask the court to decide. Use the form ‘Make an application in existing court proceedings related to children’ (form C2).<

#### d) extractive question w/o conditions

In [23]:
found = False
for d in devs:
    ans, evi = d['answers'], d['evidences']
    for e in evi:
        for a in ans:
            if a[0] not in {'yes', 'no'} and len(a[1]) == 0 and a[0].lower() in e.lower():
                print_case(d)
                found = True
    if found:
        break

url       : https://www.gov.uk/apply-special-guardian
scenario  : My brother and his wife are in prison for carrying out a large fraud scheme. Their 7 and 8 year old children have been living with me for the last 4 years. I want to become their Special Guardian to look after them permanently
question  : How long will it be before I hear back from the court?
not_answerable: False
answers   : [['within 10 days', []]]
evidences : ['<p>Within 10 days of receiving your application the court will send you a case number and a date for a meeting to set out:</p>']
id        : dev-0
document  :
          title     : Become a special guardian
          url       : https://www.gov.uk/apply-special-guardian
          contents  : ['<h1>What is a special guardian</h1>', '<p>You can apply to be a child’s special guardian when they cannot live with their birth parents and adoption is not right for them.</p>', '<p>You’ll be responsible for looking after the child until they’re 18 (unless the court takes

#### e) extractive question w/ conditions

In [24]:
found = False
for d in devs:
    ans, evi = d['answers'], d['evidences']
    for e in evi:
        for a in ans:
            if a[0] not in {'yes', 'no'} and len(a[1]) > 0 and a[0].lower() in e.lower():
                print_case(d)
                found = True
    if found:
        break

url       : https://www.gov.uk/support-for-foster-parents
scenario  : I have completed a training to be an approved foster. I am getting a foster care allowance to help the cost of caring for a child. I get approximatively £12000 per annum
question  : What are my tax liability of the money I receive from foster care allowance and are there any tax relaxation ?
not_answerable: False
answers   : [['in a tax year, households do not pay tax on the first £10,000 they earn from fostering.', []], ['you also get tax relief for every week (or part week) that a child is in your care. this means you do not have to pay tax on some of your earnings over £10,000.', ['<tr>Under 11 | £200 per child</tr>', '<tr>11 or over | £250 per child</tr>']]]
evidences : ['<p>In your tax return, you’ll be able to claim:</p>', '<li>a tax exemption of up to £10,000 per household</li>', '<li>tax relief for every week you foster a child</li>', '<p>In a tax year, households do not pay tax on the first £10,000 they earn

#### f) mixture of deterministic and conditionals 

In [25]:
found = False
for d in devs:
    found_det, found_con = False, False
    ans = d['answers']
    for a in ans:
        if len(a[1]) > 0:
            found_con = True
        if len(a[1]) == 0:
            found_det = True
        if found_con and found_det:
            found = True
            break
    if found:
        print_case(d)
        break

url       : https://www.gov.uk/tax-property-money-shares-you-inherit
scenario  : I inherited forty thousand pounds from my late father. I also have savings of around the same amount. I am currently out of work.
question  : Do I have to pay inheritance tax on my windfall?
not_answerable: False
answers   : [['yes', ['<p>The estate of the person who died usually pays Inheritance Tax. You may need to pay Inheritance Tax if the estate can’t or doesn’t pay it.</p>', '<p>You may have to pay Inheritance Tax on money and shares you inherit if the deceased person’s estate can’t or doesn’t pay.</p>']], ['no', []]]
evidences : ['<p>You don’t usually pay tax on anything you inherit at the time you inherit it.</p>', '<p>The estate of the person who died usually pays Inheritance Tax. You may need to pay Inheritance Tax if the estate can’t or doesn’t pay it.</p>', '<p>In most cases you don’t pay any tax on money and shares when you inherit them.</p>', '<p>You may have to pay Inheritance Tax on money a

#### g) other generated answers

In [26]:
found = False
hit = False
for d in devs:
    ans, evi = d['answers'], d['evidences']
    for a in ans:
        if a[0] not in {'yes', 'no'}:
            for e in evi:
                if a[0].lower() in e.lower():
                    hit = True
            if not hit:
                print_case(d)
                break
    if found:
        break