In [1]:
import csv
import numpy as np
import pandas as pd
import re

ner_dictionary = {"0": "person",    
                    "1": "location",
                    "2": "property",
                    "3": "facility",
                    "4": "organization",
                    "5": "Misc"}

In [2]:
def read_csv(filename = 'data/ner_data.csv'):
    phrase = []
    ner = []

    with open (filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)

        for row in csvReader:
            phrase.append(row[0])
            ner.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(ner, dtype=int)

    return X, Y

In [3]:
X_train, Y_train = read_csv('data/train_ner.csv')
X_test, Y_test = read_csv('data/test_ner.csv')

In [4]:
def label_to_ner(label):
    return ner_dictionary[str(label)]

In [5]:
def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y

In [6]:
Y_oh_train = convert_to_one_hot(Y_train, C = 6)
Y_oh_test = convert_to_one_hot(Y_test, C = 6)

In [7]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [8]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [9]:
def sentence_to_avg(sentence, word_to_vec_map):
    """
    Converts a sentence (string) into a list of words (strings). Extracts the GloVe representation of each word
    and averages its value into a single vector encoding the meaning of the sentence.
    """
    # Split sentence into list of lower case words (≈ 1 line)
    words = (sentence.lower()).split()
#     print(words)

    # Initialize the average word vector, should have the same shape as your word vectors.
    avg = np.zeros((50,))
    
    # average the word vectors. You can loop over the words in the list "words".
    for w in words:
        try:
            avg += word_to_vec_map[w]
#             print(word_to_vec_map[w])
        except:
            continue
#             print(w)
    avg = avg / len(words)
        
    return avg

In [10]:
avg = sentence_to_avg("In Paris for", word_to_vec_map)
print("avg = ", avg)

avg =  [ 0.41767667  0.59758667 -0.65344    -0.19065633 -0.14341267 -0.17668333
 -0.59169667 -0.08378967 -0.286369   -0.09159     0.14554167 -0.48012
 -0.61809667 -0.35851333  0.52082667 -0.147288   -0.17276533 -0.15958333
 -0.84262333 -0.07995     0.83005333  0.38693    -0.04865     0.16038267
 -0.54974333 -1.53546667 -0.269039    0.01381667 -0.22590333  0.11782133
  3.3204      0.30733    -0.41791467 -0.39879667  0.06613    -0.25322833
  0.19150967  0.50869667  0.240988    0.09266333  0.12594633 -0.01752573
  0.545627   -0.44854333 -0.17870133  0.19103633 -0.30942    -0.31141667
  0.28803333 -0.25586667]


In [11]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
    
def print_predictions(X, pred):
    print()
    for i in range(X.shape[0]):
        print(X[i], label_to_ner(int(pred[i])))

In [12]:
def padding_sequence(idx, X):
    if idx == 0:
        return str(X[idx]) + str(X[idx + 1]) + str(X[idx + 2])
    elif idx == int(X.shape[0]) - 1:
        return str(X[idx - 2]) + str(X[idx - 1]) + str(X[idx])
    else:
        return str(X[idx-1]) +" "+ str(X[idx]) +" "+ str(X[idx+1])

In [13]:
def predict(X, Y, W, b, word_to_vec_map):
    """    
    Arguments:
    X -- input data containing sentences, numpy array of shape (m, None)
    Y -- labels, numpy array of shape (m, 1)
    
    Returns:
    pred -- numpy array of shape (m, 1) with your predictions
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    
    for i in range(0, m):                       # Loop over training examples
        sentence = padding_sequence(i, X)
        sentence = re.sub(r'[^\w\s]', '', sentence)
        words = sentence.lower().split()        
        avg = np.zeros((50,))
        for w in words:
            try:
                avg += word_to_vec_map[w]
            except:
#                 print(w)
                continue
                
        avg = avg/len(words)

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[i] = np.argmax(A)
#         print(pred[i])
        
    print("Accuracy: "  + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
    return pred

In [14]:
def model(X, Y, word_to_vec_map, learning_rate = 0.03, num_iterations = 100):
    """
    Model to train word vector representations in numpy.
    
    Arguments:
    X -- input data, numpy array of sentences as strings, of shape (m, 1)
    Y -- labels, numpy array of integers between 0 and 5, numpy-array of shape (m, 1)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    learning_rate -- learning_rate for the stochastic gradient descent algorithm
    num_iterations -- number of iterations
    
    Returns:
    pred -- vector of predictions, numpy-array of shape (m, 1)
    W -- weight matrix of the softmax layer, of shape (n_y, n_h)
    b -- bias of the softmax layer, of shape (n_y,)
    """
    
    np.random.seed(1)

    # Define number of training examples
    m = Y.shape[0]                          # number of training examples
    n_y = 6                                 # number of classes  
    n_h = 50                                # dimensions of the GloVe vectors 
    
    # Initialize parameters using Xavier initialization
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))
    
    # Convert Y to Y_onehot with n_y classes
    Y_oh = convert_to_one_hot(Y, C = n_y) 
    
    v_w, v_b, eps = 0, 0, 1e-8
    # Optimization loop
    for t in range(num_iterations):                       # Loop over the number of iterations
        for i in range(0,m):                                # Loop over the training examples
            
            # Padding words to the i'th training example            
            sentence = padding_sequence(i, X)
            sentence = re.sub(r'[^\w\s]', '', sentence)
            
            avg = sentence_to_avg(sentence, word_to_vec_map)
            # Forward propagate through the softmax layer
            z = np.dot(W, avg) + b
            a = softmax(z)

            # Compute cost using the i'th training label's one hot representation and "A" (the output of the softmax)
            cost = -np.sum(np.multiply(Y_oh[i], np.log(a)))
            ### END CODE HERE ###
            
            # Compute gradients 
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y,1), avg.reshape(1, n_h))
            db = dz
            
            v_w = v_w + dW**2
            v_b = v_b + db**2
            
            # Update parameters with Adam Gradient Descent
            W = W - (learning_rate/np.sqrt(v_w + eps)) * dW
            b = b - (learning_rate/np.sqrt(v_b + eps)) * db
        
        if t % 50 == 0:
            print("Epoch: " + str(t) + " --- cost = " + str(cost))
            pred = predict(X, Y, W, b, word_to_vec_map)

    return pred, W, b

In [15]:
pred, W, b = model(X_train, Y_train, word_to_vec_map)
print(pred)

Epoch: 0 --- cost = 2.07270237827
Accuracy: 0.948538713195
Epoch: 50 --- cost = 3.07315772547
Accuracy: 0.948560523446
[[ 5.]
 [ 5.]
 [ 5.]
 ..., 
 [ 5.]
 [ 5.]
 [ 5.]]


In [16]:
print("Training set:")
pred_train = predict(X_train, Y_train, W, b, word_to_vec_map)
print('Test set:')
pred_test = predict(X_test, Y_test, W, b, word_to_vec_map)

Training set:
Accuracy: 0.948516902944
Test set:
Accuracy: 0.947702749739


In [27]:
X_my_sentences = np.array(["Queen", "Anne", "Hill", "is", "just", "few", "minutes", "from", "the", "Seattle", "center"])
Y_my_labels = np.array([[5],[0],[5],[5],[5],[5],[5],[5],[5],[1],[5]])

pred = predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)
print_predictions(X_my_sentences, pred)

Accuracy: 0.818181818182

Queen Misc
Anne person
Hill person
is Misc
just Misc
few Misc
minutes Misc
from Misc
the Misc
Seattle Misc
center Misc
