In [1]:
import re, string, unicodedata
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import gensim
import numpy as np
import pandas as pd
# Load the input table; the CSV path is relative to the notebook's working directory.
input_df = pd.read_csv('companies_data_neoway_subsample/input.csv')

In [2]:
# Load the pretrained word2vec vectors from the binary GoogleNews file.
# This takes some time to load, so keep it in its own cell and avoid re-running it.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list of tokenized words.

    Each token is NFKD-normalized and then encoded to ASCII with unencodable
    characters dropped, so a token may come back shortened or even empty.
    Returns a new list with one entry per input token.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from every token in a list of tokenized words.

    Any character that is neither a word character nor whitespace is deleted.
    Tokens that end up empty (i.e. were pure punctuation) are dropped from the
    returned list.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation.

    # Arguments
        words: list of string tokens
    # Returns
        new list in which every all-digit token (``str.isdigit``) is replaced by
        the words produced by ``inflect``'s ``number_to_words``; all other
        tokens are passed through unchanged.
    """
    engine = None
    new_words = []
    for word in words:
        if word.isdigit():
            if engine is None:
                # `inflect` is not imported at the top of the notebook, so import it
                # here; deferring until the first numeric token also means inputs
                # without numbers never need the package or an engine instance.
                import inflect
                engine = inflect.engine()
            new_words.append(engine.number_to_words(word))
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words, stop_words=None):
    """Remove stop words from list of tokenized words.

    # Arguments
        words: list of string tokens
        stop_words: optional iterable of tokens to drop; when omitted, the NLTK
            English stop word list is used (the original behaviour)
    # Returns
        new list containing only the tokens that are not stop words, in their
        original order
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once: the list used to be re-fetched for every token and
    # scanned linearly on each membership test.
    stop_set = set(stop_words)
    return [word for word in words if word not in stop_set]

def stem_words(words):
    """Return the Lancaster stem of every token in a list of tokenized words."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token in a list of tokenized words, treating each as a verb."""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the active cleaning steps on a list of tokens.

    Currently the tokens are lowercased and then stripped of punctuation; the
    other steps below are disabled experiments.
    """
    # Disabled step that would run before lowercasing:
    #words = remove_non_ascii(words) #TODO: try?
    # Disabled steps that would run after punctuation removal:
    #words = replace_numbers(words) #TODO: try
    #words = remove_stopwords(words) #TODO: try
    return remove_punctuation(to_lowercase(words))

In [7]:
def sentence_to_embedding(sentence,model):
    """Turn a sentence into a single vector by averaging its word embeddings
    # Arguments
        sentence: text to embed; type: string
        model: word embedding model ; type: gensim.models.keyedvectors.Word2VecKeyedVectors
    # Returns
        np.array of shape (dim of word embedding,); a zero vector when none of
        the normalized tokens is found in the model
    """
    tokens = normalize(nltk.word_tokenize(sentence))
    dim = model.vector_size
    # Plain unweighted average for now (TODO: tf-idf version)
    vectors = [model[token] for token in tokens if token in model]
    if not vectors:
        # No token is known to the model: average over a single zero vector.
        vectors = [np.zeros(dim)]
    return np.mean(vectors, axis=0)

In [35]:
def dataframe_to_embedding(df,attribute_list,model):
    """Extract word embeddings from original dataset
    # Arguments
        df: pd dataframe of the dataset
        attribute_list: list of attribute names (in string type) relevant for word embeddings
        model: word embedding model forwarded to sentence_to_embedding
    # Returns
        np.array of shape (# of attributes, # of entities, dim of word embedding)
    # Raises
        ValueError: if any name in attribute_list is not a column of df
    """
    existing = set(df.columns.values)
    missing = [attribute for attribute in attribute_list if attribute not in existing]
    if missing:
        # Name the offending attributes so the caller can fix the call directly.
        raise ValueError('Attributes provided do not exist. Missing: {}'.format(missing))

    def _embed_column(attribute):
        # Every cell is cast with str() before embedding.
        # NOTE(review): missing values therefore become the literal text 'nan' --
        # confirm that is intended rather than an empty string.
        as_text = df[attribute].apply(str)
        return np.vstack(as_text.apply(sentence_to_embedding, model=model))

    # One (# of entities, dim) matrix per attribute, stacked along a new leading axis.
    return np.stack([_embed_column(attribute) for attribute in attribute_list])

In [41]:
# Smoke test: embed four text columns (name and address fields) of the input table.
test = dataframe_to_embedding(input_df,['name','addressStreet','addressCity','addressState'],model)

In [42]:
# Per dataframe_to_embedding's docstring: (# of attributes, # of entities, dim of word embedding).
test.shape

(4, 189, 300)

In [None]:
def compute_similarity():
    