In [1]:
import pandas as pd
import numpy as np
import spacy as spacy
from spacy import displacy

In [2]:
df = pd.read_pickle('../dataframes/df_eq_label.pkl')

In [3]:
def eq_only(df):
    df = df[df['label'] == 1]
    df.dropna(subset=['body'], inplace=True)
    return df

In [4]:
df = eq_only(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
nlp = spacy.load('en_core_web_lg')
docs = [nlp(doc) for doc in df['body']]

## Feature Engineering

In [6]:
# Let's work with one example for now. 

doc = docs[4]
displacy.render(doc, style='ent', jupyter=True)

In [7]:
# Feature 1: The average of the word vectors corresponding to the sentence in which the candidate location is present.
loc_ents = np.array([ent for ent in doc.ents if ent.label_=='GPE' or ent.label_=='LOC'])
names = np.array([ent.text for ent in doc.ents if ent.label_ == 'GPE' or ent.label_=='LOC'])
f1 = np.array([ent.sent.vector for ent in doc.ents if ent.label_ == 'GPE' or ent.label_=='LOC'])

In [8]:
print(loc_ents.shape, names.shape, f1.shape)

(15,) (15,) (15, 300)


In [9]:
# Feature 2: A binary vector whose value at each dim is set to 1 only if the entity type
# appears in the setence of the candidate location. 

ENT_MAP = {'PERSON': 0,
          'NORP': 1,
          'FAC': 2,
          'ORG': 3,
          'GPE': 4,
          'LOC': 5,
          'PRODUCT': 6,
          'EVENT': 7,
          'WORK_OF_ART': 8,
          'LAW': 9,
          'LANGUAGE': 10,
          'DATE': 11,
          'TIME': 12,
          'PERCENT': 13,
          'MONEY': 14,
          'QUANTITY': 15,
          'ORDINAL': 16,
          'CARDINAL': 17}

num_ents = len(ENT_MAP)
num_examples = len(names)

f2 = np.zeros((num_examples, num_ents))


for i, e1 in enumerate(loc_ents):
    for e2 in e1.sent.ents:
        f2[i, ENT_MAP[e2.label_]] = 1      

In [10]:
# [(i, e2.label_) for i, e1 in enumerate(loc_ents) for e2 in e1.sent.ents]

f2.shape

(15, 18)

In [11]:
# Feature 3: A binary vector whos whose dimensions correspond to the possible POS tags. The value at each dim is set
# similarly as above.

POS_MAP = {'ADJ' : 0,
          'ADP' : 1,
          'ADV': 2,
          'AUX': 3,
          'CONJ': 4,
          'CCONJ': 5,
          'DET': 6,
          'INTJ': 7,
          'NOUN': 8,
          'NUM': 9,
          'PART': 10,
          'PRON': 11,
          'PROPN': 12,
          'PUNCT': 13,
          'SCONJ': 14,
          'SYM': 15,
          'VERB': 16,
          'X': 17,
          'SPACE': 18}

num_pos = len(POS_MAP)

f3 = np.zeros((num_examples, num_pos))

for i, e1 in enumerate(loc_ents):
    for e2 in e1.sent:
        f3[i, POS_MAP[e2.pos_]] = 1

In [12]:
# Feature 4: Token offset of the candidate location (from the beginning of sentence)

f4 = np.array([e.start for e in loc_ents])
f4.shape

(15,)

In [13]:
# Feature 5: Total count of times that GPE appeared in the document.

from collections import Counter

loc_counts = Counter(names)
loc_counts

f5 = np.array([loc_counts[e.text] for e in loc_ents])
f5

array([4, 2, 4, 1, 3, 4, 2, 3, 4, 1, 1, 2, 3, 2, 1])

In [14]:
# Hstack all of the features

names = names.reshape(-1, 1)
f4 = f4.reshape(-1, 1)
f5 = f5.reshape(-1, 1)

feature = np.hstack((names, f1, f2, f3, f4, f5))

In [15]:
feature.shape

(15, 340)

In [16]:
feature

array([['Los Angeles', '-0.1111575', '0.23521973', ..., '0.0', '7', '4'],
       ['California', '-0.1111575', '0.23521973', ..., '0.0', '10', '2'],
       ['Los Angeles', '0.12489756', '0.10034302', ..., '0.0', '36', '4'],
       ...,
       ['US', '-0.0028829884', '0.12967458', ..., '0.0', '351', '3'],
       ['Northridge', '-0.043124933', '0.2711033', ..., '0.0', '362',
        '2'],
       ['San Francisco', '-0.012085507', '0.122322366', ..., '0.0',
        '380', '1']], dtype='<U32')

In [17]:
names

array([['Los Angeles'],
       ['California'],
       ['Los Angeles'],
       ['Westwood'],
       ['US'],
       ['Los Angeles'],
       ['Northridge'],
       ['US'],
       ['Los Angeles'],
       ['San Andreas'],
       ['Pacific'],
       ['California'],
       ['US'],
       ['Northridge'],
       ['San Francisco']], dtype='<U13')

In [18]:
feature.shape

(15, 340)