In [1]:
import spacy
import nltk
import pandas as pd
from IPython.display import display
nlp = spacy.load("en_core_web_sm")

## Read focus words from File

In [2]:
focus = []
with open("resources/focus_words.txt",'r') as fh:
    for line in fh:
        line = line.strip("\n")
        line = line.strip(' ')
        focus.append(line)
print(focus)

['kill', 'death', 'shoot', 'take', 'remove', 'kidnap', 'transport', 'train', 'fled', 'deport', 'expel', 'transfer', 'resettle', 'escape', 'run', 'murder', 'burn', 'hang', 'execute', 'throw', 'beat', 'stab', 'tuberculosis', 'epidemic', 'exterminate', 'typhus', 'dysentry', 'typhoid', 'kidnap', 'emigrate']


In [23]:
with open("corpus/Blazowa.txt",'r',encoding = "utf-8") as fh:
    text = fh.readlines()

## Find sentences that have these focus words,

Reverted back to wordnet as the memory requirements for word_vec is huge, but will work on good computers.

In [24]:
ls_index = []
ls_word = []
ls_match = []
for i,sent in enumerate(text):
    doc = nlp(sent)
    for token in doc:
        if token.lemma_ in focus:
            ls_index.append(i)
            ls_word.append(token.text)
            ls_match.append(token.lemma_)
d = {"SentIndex": ls_index,
       "FocusWord": ls_word,
       "MatchingWord": ls_match}
df = pd.DataFrame(d)
display(df)

Unnamed: 0,SentIndex,FocusWord,MatchingWord
0,1,expel,expel
1,5,transferred,transfer
2,8,took,take
3,9,running,run
4,10,taken,take
5,10,running,run
6,11,taken,take
7,16,ran,run
8,16,executed,execute
9,17,resettled,resettle


## Extracting Chunks from each sentence.
### Chunks will later be used to enhance the triples.

In [25]:
def get_phrase(token,sent):
    """
    Given a token that is noun or PROPN,
    get the Noun Phrase, This is a form of manual chunking
    """
    visited = set()
    visited.add(token.i)

    def visit_children(token,visited):
        if len(list(token.children)) != 0:
            for c in token.children:
                visited.add(c.i)
                visit_children(c,visited)

    visit_children(token,visited)
    visited = list(visited)
    visited.sort()
    phrase = ""
    for i in visited:
        phrase = phrase + " "+sent[i].text
    return phrase

ls_nouns = ["nsubj","dobj","pobj","nsubpass"]
ls_sentindices = []
ls_word_chunks = []
for index in df["SentIndex"]:
    sent = nlp(text[index])
    for token in sent:
        if token.dep_ in ls_nouns:
            phrase = get_phrase(token,sent)
            ls_sentindices.append(index)
            ls_word_chunks.append(phrase)
chunk_dict = {"sentIdx": ls_sentindices,
             "Chunks": ls_word_chunks}
chunk_df = pd.DataFrame(chunk_dict)
display(chunk_df)

Unnamed: 0,sentIdx,Chunks
0,1,the Soviets
1,1,Poland
2,1,the east
3,1,"September 17 , 1939"
4,1,German troops stationed in Błażowa
5,1,Błażowa
6,1,as many Jews as possible
7,1,them
8,1,the San River –
9,1,less than 16 kilometers ( 10 miles ) which ha...


## Triple extraction logic.

### Reverted back to original simple logic as the complex one has too many bugs.

Since we went back to this simple logic of triple extraction. We are no longer able to generate all potential triples.
The set of triple obtained is a small subset of all triples that actually exist in the text.

In [26]:
def get_triple(sent):
    nouns = list(sent.noun_chunks)
    for token in sent:
        if token.dep_ in ["nsubj","nsubjpass"] and token.head.pos_ == "VERB":
            vphrase = token.head
            sphrase = token
            for possible_object in vphrase.children:
                if possible_object.dep_ in["dobj","iobj","pobj"]:
                    return(sphrase,vphrase,possible_object)

### Chunks from the previous step is used here to obtain triples that are descriptive.

The triples obtained here are not reflective of the intended results.

In [27]:
count = 0
total = 0
ls_sent_indices = []
ls_subjects = []
ls_verbs = []
ls_objects = []

def get_chunk(sentidx,word):
    match = []
    for i,chunk in zip(ls_sentindices,ls_word_chunks):
        if i == sentidx:
            match.append(chunk)
        for w in match:
            if word.text in w:
                return w
        return word.text

for i,line in enumerate(text):
    sent = nlp(line)
    tup= get_triple(sent)
    if not(tup is None):
        s,v,o = tup
        subj = get_chunk(i,s)
        obj = get_chunk(i,o)
        ls_sent_indices.append(i)
        ls_subjects.append(subj)
        ls_verbs.append(v)
        ls_objects.append(obj)
d = {"SentIndx": ls_sent_indices,
     "Subjects": ls_subjects,
    "Verbs: ": ls_verbs,
    "Objects: ": ls_objects}
df = pd.DataFrame(d)
display(df)

Unnamed: 0,SentIndx,Subjects,Verbs:,Objects:
0,0,﻿Błażowa,located,kilometers
1,1,the Soviets,invaded,Poland
2,2,authorities,set,council
3,3,Poles,worked,estate
4,4,Germans,invaded,Union
5,5,they,had,relatives
6,7,Judenrat,opened,kitchen
7,8,Natansohn,chaired,committee
8,9,it,served,children
9,10,craftsmen,running,businesses


## Extracting Meta-data.

Verbs that are part of our extracted triples may contain some addition information. Based on observations this additional information is usually attached to the preposition of the verb. 

This Code can determine such prepositions and the verbs they are attached to to extract meta-data for the verb.

This meta data can be also encoded as triple as shown below in results.

In [28]:
ls_sindex = []
ls_verb = []
ls_prep = []
ls_data = []

for index,line in enumerate(text):
    sent = nlp(line)
    for token in sent:
        if token.dep_ == "prep" and token.head.pos_ == "VERB" and token.head.lemma_ in focus:
            ls_sindex.append(index)
            ls_verb.append(token.head.text)
            ls_prep.append(token.text)
            word = get_phrase(list(token.children)[0],sent)
            ls_data.append(word)
meta_dict = {"sentIndex": ls_sindex,
             "Verb": ls_verb,
            "preposition": ls_prep,
            "Meta": ls_data}
meta_df = pd.DataFrame(meta_dict)
display(meta_df)

Unnamed: 0,sentIndex,Verb,preposition,Meta
0,1,expel,by,ordering them to cross the San River –
1,5,transferred,By,July 1940
2,5,transferred,to,Błażowa
3,5,transferred,via,Rzeszów
4,10,running,In,July 1941
5,11,taken,In,mid - June 1941
6,11,taken,to,Rzeszów
7,16,ran,in,panic
8,17,resettled,In,May or June 1942
9,17,resettled,into,the Błażowa ghetto
