In [0]:
import os

import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [0]:

# Called without arguments, this opens NLTK's interactive downloader.
# The notebook later calls nltk.word_tokenize and nltk.pos_tag, so the tokenizer and
# POS-tagger data must be installed (presumably `punkt` and
# `averaged_perceptron_tagger` — TODO confirm the package ids for your NLTK version).
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [0]:
# Load the tab-separated bAbI task file into a DataFrame.
# Cells that are missing in the file (answer/factid on statement lines) become ''.
filename = 'resources/qa1_single-supporting-fact_train.txt'
data_qa1 = pd.read_csv(
    filename,
    sep='\t',
    names=['sentence', 'answer', 'factid'],
).fillna('')

In [0]:
data_qa1[:6]

Unnamed: 0,sentence,answer,factid
0,1 Mary moved to the bathroom.,,
1,2 John went to the hallway.,,
2,3 Where is Mary?,bathroom,1.0
3,4 Daniel went back to the hallway.,,
4,5 Sandra moved to the garden.,,
5,6 Where is Daniel?,hallway,4.0


In [0]:
# Label every row: a row with an empty answer is a statement (S), otherwise a question (Q)
def tag_sentence(row):
    return 'Q' if row.answer != '' else 'S'

data_qa1['type'] = data_qa1.apply(tag_sentence, axis=1)

# Split each sentence into word tokens with NLTK and discard the first token
# (the line number that prefixes every sentence in the file).
# If you get an error here, make sure you have downloaded the NLTK packages above
def tokenize(row):
    tokens = nltk.word_tokenize(row.sentence)
    return tokens[1:]

data_qa1['sentence'] = data_qa1.apply(tokenize, axis=1)

# The factid column is not needed from here on
data_qa1 = data_qa1.drop(columns='factid')

In [0]:
data_qa1[:6]

Unnamed: 0,sentence,answer,type
0,"[Mary, moved, to, the, bathroom, .]",,S
1,"[John, went, to, the, hallway, .]",,S
2,"[Where, is, Mary, ?]",bathroom,Q
3,"[Daniel, went, back, to, the, hallway, .]",,S
4,"[Sandra, moved, to, the, garden, .]",,S
5,"[Where, is, Daniel, ?]",hallway,Q


In [0]:
# Create a DataFrame with just the statements
def statements(df):
    '''Return the statement rows (type == 'S') of df.

    The result is re-indexed from 0 and no longer carries the
    'answer' and 'type' columns; any other columns are kept.
    '''
    is_statement = df['type'] == 'S'
    only_statements = df.loc[is_statement].drop(columns=['answer', 'type'])
    return only_statements.reset_index(drop=True)

# Create a DataFrame with just the questions
def questions(df):
    '''Return the question rows (type == 'Q') of df.

    The result is re-indexed from 0 and no longer carries the
    'type' column; the 'answer' column is kept.
    '''
    is_question = df['type'] == 'Q'
    only_questions = df.loc[is_question].drop(columns='type')
    return only_questions.reset_index(drop=True)

In [0]:
statements(data_qa1)[:4]

Unnamed: 0,sentence
0,"[Mary, moved, to, the, bathroom, .]"
1,"[John, went, to, the, hallway, .]"
2,"[Daniel, went, back, to, the, hallway, .]"
3,"[Sandra, moved, to, the, garden, .]"


In [0]:
questions(data_qa1)[:2]

Unnamed: 0,sentence,answer
0,"[Where, is, Mary, ?]",bathroom
1,"[Where, is, Daniel, ?]",hallway


In [0]:
# Attach a part-of-speech tag to every token of every sentence
def pos_tag(row):
    return nltk.pos_tag(row.sentence)

data_qa1['tag'] = data_qa1.apply(pos_tag, axis=1)

In [0]:
data_qa1[['sentence', 'tag']][:5]

Unnamed: 0,sentence,tag
0,"[Mary, moved, to, the, bathroom, .]","[(Mary, NNP), (moved, VBD), (to, TO), (the, DT..."
1,"[John, went, to, the, hallway, .]","[(John, NNP), (went, VBD), (to, TO), (the, DT)..."
2,"[Where, is, Mary, ?]","[(Where, WRB), (is, VBZ), (Mary, NNP), (?, .)]"
3,"[Daniel, went, back, to, the, hallway, .]","[(Daniel, NNP), (went, VBD), (back, RB), (to, ..."
4,"[Sandra, moved, to, the, garden, .]","[(Sandra, NNP), (moved, VBD), (to, TO), (the, ..."


In [0]:
def extract_statement(tags):
    '''Extracts a (subject, relation, object) triple from a statement based on its POS tags.

    tags: iterable of (word, tag) pairs, e.g. the output of nltk.pos_tag.

    Returns a 3-tuple of strings; a slot is '' when nothing matched:
      * subject  - the FIRST proper noun (NNP) in the sentence
      * relation - the last past-tense verb (VBD), or the word 'journeyed'
      * object   - the last remaining noun (NN or NNP)

    The subject is fixed once found, so a later proper noun (for example a
    capitalised place name) lands in the object slot instead of replacing the
    subject, and neither the subject nor the relation word can leak into the
    object slot.
    '''
    subject, relation, obj = '', '', ''
    for word, tag in tags:
        if tag == 'NNP' and not subject:
            subject = word
        elif tag == 'VBD' or word == 'journeyed':  # TODO: 'journeyed' is sometimes tagged NN instead of VBD
            relation = word
        elif tag in ('NNP', 'NN'):
            obj = word
    return (subject, relation, obj)

In [0]:
def extract_question(tags):
    '''Return the entity a question is about, chosen from its POS tags.

    tags: iterable of (word, tag) pairs.

    The entity is the last word tagged as a noun (NNP or NN);
    '' is returned when the question contains no noun.
    '''
    nouns = [word for word, tag in tags if tag in ('NNP', 'NN')]
    return nouns[-1] if nouns else ''

In [0]:
def extract(row):
    '''Run the extractor that matches the row's type on its POS tags.

    Statement rows (type 'S') go through extract_statement and yield a
    (subject, relation, object) tuple; every other row goes through
    extract_question and yields a single entity string.
    '''
    extractor = extract_statement if row.type == 'S' else extract_question
    return extractor(row.tag)

In [0]:
# Row-wise extraction: statement rows get a (subject, relation, object) tuple,
# question rows get the entity under discussion as a plain string.
data_qa1['extracted'] = data_qa1.apply(extract, axis=1)

In [0]:
data_qa1[['sentence', 'extracted']][:5]

Unnamed: 0,sentence,extracted
0,"[Mary, moved, to, the, bathroom, .]","(Mary, moved, bathroom)"
1,"[John, went, to, the, hallway, .]","(John, went, hallway)"
2,"[Where, is, Mary, ?]",Mary
3,"[Daniel, went, back, to, the, hallway, .]","(Daniel, went, hallway)"
4,"[Sandra, moved, to, the, garden, .]","(Sandra, moved, garden)"


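Voilà, extraction is complete: every statement now has a (subject, relation, object) triple and every question has the entity it asks about.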

In [0]:
def person_statements(person):
    '''Return the statement rows of data_qa1 whose extracted subject is `person`.

    Rows keep the index assigned by statements(), i.e. their position
    among all statements.
    '''
    def is_about_person(triple):
        # triple is (subject, relation, object); compare on the subject
        return triple[0] == person

    stat = statements(data_qa1)
    mask = stat.extracted.map(is_about_person)
    return stat[mask]

For instance, we can find all statements that refer to Sandra.

In [0]:
person_statements('Sandra')[:3]

Unnamed: 0,sentence,tag,extracted
3,"[Sandra, moved, to, the, garden, .]","[(Sandra, NNP), (moved, VBD), (to, TO), (the, ...","(Sandra, moved, garden)"
5,"[Sandra, journeyed, to, the, bathroom, .]","[(Sandra, NNP), (journeyed, VBD), (to, TO), (t...","(Sandra, journeyed, bathroom)"
10,"[Sandra, travelled, to, the, office, .]","[(Sandra, NNP), (travelled, VBD), (to, TO), (t...","(Sandra, travelled, office)"


In [0]:
def person_statements_recent(person, n=5):
    '''Get the n most recent statements that refer to the specified person in reverse chronological order

    person: subject name to look up (matched by person_statements).
    n: number of statements to return. n == 0 returns an empty frame.

    Returns a DataFrame with the newest statement first.
    '''
    # .tail(n) rather than [-n:]: with n == 0 the slice [-0:] is [0:] and would
    # return every statement instead of none. For all other n the two are equivalent.
    return person_statements(person).tail(n).iloc[::-1]

In [0]:
person_statements_recent('Daniel', n=3)

Unnamed: 0,sentence,tag,extracted
1999,"[Daniel, went, to, the, garden, .]","[(Daniel, NNP), (went, VBD), (to, TO), (the, D...","(Daniel, went, garden)"
1996,"[Daniel, travelled, to, the, kitchen, .]","[(Daniel, NNP), (travelled, VBD), (to, TO), (t...","(Daniel, travelled, kitchen)"
1992,"[Daniel, moved, to, the, office, .]","[(Daniel, NNP), (moved, VBD), (to, TO), (the, ...","(Daniel, moved, office)"


In [0]:
from neo4j.v1 import GraphDatabase, basic_auth

In [0]:
# Create the Neo4j driver; sessions are opened from it with driver.session().
# Connection settings can be overridden with the NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD environment variables so real credentials need not be stored in
# the notebook. The fallbacks are the values this notebook has always used.
# NOTE: Make sure that URL/credentials are correct and that Neo4j is running
driver = GraphDatabase.driver(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    auth=basic_auth(
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', 'neo4j'),
    ),
)

In [0]:
# WARNING: This function will clear the database when run!
# Make sure all important data is backed up before continuing
def reset_db():
    '''Remove all nodes and relationships from the database

    Opens a short-lived session on the module-level `driver` and closes it
    again when done, so repeated resets do not accumulate open sessions.
    '''
    with driver.session() as session:
        # consume() waits for the delete to finish before the session is closed
        session.run('MATCH (n) DETACH DELETE n').consume()

In [0]:
def create(query, n=0):
    '''Given a query, create a graph based on each triple in the extracted statements

    query: Cypher text that uses the $subject, $relation and $obj parameters.
    n: number of statements (from the start of data_qa1) to insert;
       n <= 0 inserts all of them.

    The query is run once per statement, in dataset order, inside a single
    session that is closed when the function returns.
    '''
    stat = statements(data_qa1)
    n = len(stat) if n <= 0 else n # Run the first n statements if specified
    with driver.session() as session:
        for subject, relation, obj in stat[:n].extracted:
            session.run(query, subject=subject, relation=relation, obj=obj)

In [0]:
reset_db() # This will clear the database!

In [0]:
# Create a direct relationship between subject and object
# v1: one SUBJECT node per person, one OBJECT node per room, and a RELATION edge
# (named after the verb) between them. All three patterns use MERGE, so — per Cypher
# MERGE semantics — an identical subject/verb/room combination is presumably stored only
# once, and nothing in this model records the order of the moves.
v1_query = '''
    MERGE (s:SUBJECT {name: $subject}) 
    MERGE (o:OBJECT  {name: $obj}) 
    MERGE (s)-[r:RELATION {name: $relation}]->(o)
'''

# Insert every extracted statement with the v1 model
create(v1_query)

<img src="https://github.com/rishabhdhenkawat/graph-nlu/blob/master/notebooks/screenshots/simple-relation.png?raw=1" style="width:700px">

In [0]:
reset_db()

In [0]:
# Represent each relation as a node
# v2: SUBJECT and OBJECT nodes are still MERGEd, but every statement CREATEs its own
# RELATION node, linked subject -[:R0]-> relation -[:R1]-> object.
v2_query = '''
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)
'''

# Insert every extracted statement with the v2 model
create(v2_query)

In [0]:
reset_db()

In [0]:
# v3: same subject -[:R0]-> relation -[:R1]-> object shape as v2, plus a per-subject
# linked list of relations: the subject's HEAD edge points at the newest RELATION node,
# and each older RELATION node gets a NEXT edge to the relation that followed it.
# Parameters: $subject, $relation, $obj.
v3_query = '''
    MERGE (s:SUBJECT {name: $subject})
    MERGE (o:OBJECT  {name: $obj})
    
    WITH s,o
    
    // Create an new relation between the subject and object
    CREATE (s)-[:R0]->(r:RELATION {name: $relation})-[:R1]->(o)
    CREATE (s)-[h:HEAD]->(r) // Make the newly created relation the head of the list
    
    WITH s,r,o,h
    
    // Find the previous head of the list (if none exist, this query will terminate here)
    MATCH (s)-[h_prev:HEAD]->(r_prev:RELATION)
    WHERE h_prev <> h
    
    // Complete the link, remove the previous head pointer
    CREATE (r_prev)-[:NEXT]->(r)
    DELETE h_prev
'''

In [0]:
# Module-level session: find_person() and get_answers() read the global name `session`
session = driver.session()
# Create an index for faster access
# NOTE(review): `CREATE INDEX ON :Label(prop)` is the older Cypher index syntax —
# confirm it is accepted by the Neo4j version in use.
session.run('CREATE INDEX ON :SUBJECT(name)')
session.run('CREATE INDEX ON :RELATION(name)')
session.run('CREATE INDEX ON :OBJECT(name)')
# Insert every extracted statement with the v3 (linked-list) model
create(v3_query)

<img src="https://github.com/rishabhdhenkawat/graph-nlu/blob/master/notebooks/screenshots/local-list.png?raw=1" style="width:800px">

In [0]:
def find_person(person):
    '''Look up the room a person is currently in.

    Follows the person's HEAD edge to the newest RELATION node and on to its
    OBJECT node. The query is run on the module-level `session`, and the
    un-consumed query result is returned; its records expose the
    `subject`, `relation` and `obj` nodes.
    '''
    head_lookup = '''
        MATCH (s:SUBJECT {name:$name})-[:HEAD]->(r:RELATION)-->(o:OBJECT)
        RETURN s AS subject, r AS relation, o AS obj
    '''
    result = session.run(head_lookup, name=person)
    return result

In [0]:


# Open a new session; find_person() reads this module-level `session`
session = driver.session()
# .single() fetches the one record returned for Mary's HEAD relation
record = find_person('Mary').single()
# Show the `name` property of the matched OBJECT node, i.e. the room
print(record['obj'].get('name'))

kitchen


In [0]:
person_statements_recent('Mary', n=1)

Unnamed: 0,sentence,tag,extracted
1994,"[Mary, journeyed, to, the, kitchen, .]","[(Mary, NNP), (journeyed, VBD), (to, TO), (the...","(Mary, journeyed, kitchen)"


In [0]:
def find_person_history(person, n=100):
    '''Find the list of rooms a person was in, ordered by recency

    person: name of the SUBJECT node to look up.
    n: maximum number of entries to return; n < 1 means no limit.

    Returns a list of (relation, room) tuples, most recent first. An empty
    list is returned when the query yields no record (the person is unknown,
    or there is no earlier relation linked to the head).
    '''
    # Coerce to int so that only a number can ever be spliced into the query text
    n = int(n)
    length = str(n) if n >= 1 else ''

    # The upper bound of a variable-length pattern is spliced in as text;
    # with an empty bound ('*1..') the walk back from the head is unlimited.
    query = '''
        MATCH (s:SUBJECT {name:$name})-[:HEAD]->(r:RELATION)-->(o:OBJECT)
        MATCH (s)-->(r_prev:RELATION)-[k*1..%s]->(r), (r_prev)-->(o_prev:OBJECT)
        
        WITH size(k) AS dist, r, o, r_prev, o_prev
        ORDER BY size(k)
        
        WITH r, o, r_prev, o_prev
        RETURN [r.name] + collect(r_prev.name) AS relation, [o.name] + collect(o_prev.name) AS obj
    '''
    query = query % length

    # Use a short-lived session and always close it
    with driver.session() as session:
        record = session.run(query, name=person).single()

    if record is None:
        return []

    history = list(zip(record['relation'], record['obj']))

    # The head plus up to n predecessors can give n + 1 entries, so trim to n.
    # Slicing with [:n] (instead of always dropping the last entry) keeps the
    # oldest entry when fewer than n + 1 exist or when the lookup is unlimited.
    return history[:n] if n >= 1 else history

In [0]:
find_person_history('John', n=5)

[('went', 'bedroom'),
 ('went', 'garden'),
 ('went', 'office'),
 ('journeyed', 'bedroom'),
 ('travelled', 'hallway')]

In [0]:
person_statements_recent('John', n=5)

Unnamed: 0,sentence,tag,extracted
1995,"[John, went, back, to, the, bedroom, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, bedroom)"
1989,"[John, went, back, to, the, garden, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, garden)"
1986,"[John, went, back, to, the, office, .]","[(John, NNP), (went, VBD), (back, RB), (to, TO...","(John, went, office)"
1982,"[John, journeyed, to, the, bedroom, .]","[(John, NNP), (journeyed, NN), (to, TO), (the,...","(John, journeyed, bedroom)"
1979,"[John, travelled, to, the, hallway, .]","[(John, NNP), (travelled, VBD), (to, TO), (the...","(John, travelled, hallway)"


In [0]:
def find_room_visitors(room):
    '''Count the visits recorded for a room

    room: name of the OBJECT node to look up.

    Returns the number of RELATION nodes that point at the room, as an int.
    This is a count of recorded visits, not a list of visitors.
    '''
    query = '''
        MATCH (r:RELATION)-->(o:OBJECT {name:$name})
        RETURN count(r) AS count
    '''

    # Use a short-lived session and always close it
    with driver.session() as session:
        record = session.run(query, name=room).single()

    return record['count']

In [0]:
find_room_visitors('office')

334

## Calculate an Accuracy Score

In [0]:
def get_answers(row):
    '''Process one dataset row against the graph.

    Statement rows ('S') are written to the graph with v3_query and yield ''.
    Question rows ('Q') yield the un-consumed result of find_person() for the
    entity the question asks about. Any other row type yields None.
    Statements are written through the module-level `session`.
    '''
    if row.type == 'S':
        who, verb, where = row.extracted
        session.run(v3_query, subject=who, relation=verb, obj=where)
        return ''

    if row.type == 'Q':
        # WARNING: do not consume the result (e.g., call .consume() or .single())
        # until the entire iteration is done.
        # Failure to do so may cause the queries to be VERY slow!
        return find_person(row.extracted)

In [0]:
reset_db()

In [0]:
# Open the module-level session that get_answers() and find_person() use
session = driver.session()
# Replay the dataset in order: statement rows are written to the graph ('' is returned),
# question rows return a pending query result
results = data_qa1.apply(get_answers, axis=1)
# Drop the '' placeholders so only the question results remain
results = [x for x in results if x != '']
# Only now read each result (get_answers warns that consuming earlier may be very slow)
# and take the `name` property of the matched OBJECT node as the predicted room
predicted = [result.single()['obj'].get('name') for result in results]

The `predicted` list contains the predicted answer to each question, in the order the questions appear in the dataset.

In [0]:
predicted[:5]

['bathroom', 'hallway', 'hallway', 'office', 'bathroom']

In [0]:
# Ground-truth answers of the question rows, in dataset order
is_question = data_qa1['type'] == 'Q'
actual = data_qa1.loc[is_question, 'answer'].tolist()

In [0]:
actual[:5]

['bathroom', 'hallway', 'hallway', 'office', 'bathroom']

In [0]:
accuracy_score(actual, predicted)

1.0