In [2]:
import numpy as np
import pickle
import gensim
import pandas as pd
import pymorphy2
from nltk.tokenize import RegexpTokenizer

Using TensorFlow backend.


In [4]:
# Directory that holds the word2vec model files. The trailing slash matters:
# the full path is built below by plain string concatenation (DIR + MODEL_NAME).
DIR = 'data/w2v_models/'
# File name of the binary word2vec model that gets loaded from DIR.
MODEL_NAME = 'all.norm-sz100-w10-cb0-it1-min100.w2v'

In [42]:
# Stand-alone copy of the embeddings for ad-hoc exploration in the notebook.
# Undecodable bytes in the vocabulary are dropped rather than raising.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    DIR + MODEL_NAME,
    binary=True,
    unicode_errors='ignore',
)
# Replace the stored vectors with their L2-normalised versions.
w2v.init_sims(replace=True)

In [26]:
class SentenseProcessor(object):
    """Sentence preprocessing pipeline backed by a word2vec model.

    A sentence is lower-cased, tokenized with a regular expression,
    lemmatized with pymorphy2 and stripped of stop words. The resulting
    words can then be converted into a matrix of word2vec vectors with a
    fixed number of rows.

    Attributes:
        w2v: gensim ``KeyedVectors`` loaded from ``w2v_model_path``.
        morph: ``pymorphy2.MorphAnalyzer`` used for lemmatization.
        tokenizer: nltk ``RegexpTokenizer`` built from ``tokenizer_regexp``.
        stop_list: words dropped by :meth:`delete_stop_words`; it may be
            reassigned after construction.
    """

    def __init__(self, w2v_model_path, stop_list=None, tokenizer_regexp=u'[а-яА-Яa-zA-Z]+'):
        """Load the model and build the tokenizer and the morph analyzer.

        Args:
            w2v_model_path: path to a word2vec model in binary format.
            stop_list: optional iterable of words to drop in
                :meth:`delete_stop_words`. ``None`` or empty means no stop words.
            tokenizer_regexp: pattern describing one token. Note that the
                default character ranges do not include the letters 'ё'/'Ё'.
        """
        self.w2v = self._load_w2v(w2v_model_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = RegexpTokenizer(tokenizer_regexp)
        # Copy the argument: the list is owned by this instance, so neither a
        # shared default nor the caller's list can be mutated through it.
        self.stop_list = list(stop_list) if stop_list else []

    def _load_w2v(self, w2v_model_path):
        """Load binary word2vec vectors and normalise them in place.

        Undecodable bytes in the vocabulary are ignored instead of raising.
        """
        w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True, unicode_errors='ignore')
        w2v.init_sims(replace=True)
        return w2v

    def _make_bag_of_words(self, sample):
        """Return ``sample`` as a list of words.

        A list is returned as is (the same object, not a copy); a string is
        split on whitespace.

        Raises:
            TypeError: if ``sample`` is neither a string nor a list.
        """
        if isinstance(sample, list):
            return sample
        if isinstance(sample, str):
            return sample.split()
        raise TypeError('Sample should be string or list of words')

    def tokenize(self, sample):
        """Split a string into tokens with the configured regexp tokenizer."""
        return self.tokenizer.tokenize(sample)

    def normalize(self, sample):
        """Replace every word by the normal form of its first pymorphy2 parse.

        Args:
            sample: a string (split on whitespace) or a list of words.

        Returns:
            A new list with one normal form per input word.
        """
        bag_of_words = self._make_bag_of_words(sample)
        return [self.morph.parse(word)[0].normal_form for word in bag_of_words]

    def delete_stop_words(self, sample, stop_list=None):
        """Return the words of ``sample`` that are not stop words.

        Args:
            sample: a string (split on whitespace) or a list of words.
            stop_list: words to drop; when omitted or empty, ``self.stop_list``
                is used. Each word of the sample is lower-cased before the
                lookup; the entries of the stop list are used as given.

        Returns:
            A new list of the kept words, in their original order. The input
            is never modified.
        """
        if not stop_list:
            stop_list = self.stop_list
        stop_words = set(stop_list)

        # Build a new list instead of calling remove() while iterating:
        # removing in place shifts the remaining items and skips the word that
        # follows every deleted one.
        return [word for word in self._make_bag_of_words(sample)
                if word.lower() not in stop_words]

    def process(self, sample, tokenize=True, normalize=True, delete_stop_words=True):
        """Run the preprocessing steps on a sentence.

        The sentence is always lower-cased first; each of the remaining steps
        can be switched off with its flag.

        Args:
            sample: the sentence as a string.
            tokenize: apply :meth:`tokenize`.
            normalize: apply :meth:`normalize`.
            delete_stop_words: apply :meth:`delete_stop_words`.

        Returns:
            A list of words, or the lower-cased string when every step is
            disabled.
        """
        sample = sample.lower()

        if tokenize:
            sample = self.tokenize(sample)

        if normalize:
            sample = self.normalize(sample)

        if delete_stop_words:
            sample = self.delete_stop_words(sample)

        return sample

    def cut_or_add(self, sample, vec_len, vec_dim=None):
        """Return exactly ``vec_len`` vectors: truncate or pad with zeros.

        Args:
            sample: sequence of vectors.
            vec_len: required number of vectors in the result.
            vec_dim: size of a padding vector, needed only when ``sample`` is
                empty; defaults to ``self.w2v.vector_size``.

        Returns:
            A new list of ``vec_len`` vectors; padding vectors are float32
            zeros shaped like the first vector. The input is never modified.
        """
        vectors = list(sample[:vec_len])
        missing = vec_len - len(vectors)

        if missing > 0:
            if vectors:
                padding = np.zeros_like(vectors[0], dtype=np.float32)
            else:
                # Nothing to copy the shape from, so ask the model.
                if vec_dim is None:
                    vec_dim = self.w2v.vector_size
                padding = np.zeros(vec_dim, dtype=np.float32)
            # Independent copies, so that rows never alias each other.
            vectors.extend(padding.copy() for _ in range(missing))

        return vectors

    def convert2matrix(self, sample, vec_len=500, skip_unknown=False):
        """Convert words into a matrix with one word2vec vector per row.

        Args:
            sample: a string (split on whitespace) or a list of words.
            vec_len: number of rows in the result; longer samples are cut and
                shorter ones are padded with zero rows. A falsy value keeps
                one row per word.
            skip_unknown: when True, words missing from the model are dropped
                silently; when False (default) such a word raises ``KeyError``.

        Returns:
            ``numpy.ndarray`` built from the list of vectors.
        """
        bag_of_words = self._make_bag_of_words(sample)
        if skip_unknown:
            bag_of_words = [word for word in bag_of_words if word in self.w2v]
        bag_of_vectors = [self.w2v.word_vec(word) for word in bag_of_words]
        if vec_len:
            bag_of_vectors = self.cut_or_add(bag_of_vectors, vec_len)
        return np.array(bag_of_vectors)

In [27]:
# Full path of the model file; DIR already ends with a path separator.
w2v_path = DIR + MODEL_NAME
# Building the processor loads the word2vec model from that path.
sentence_processor = SentenseProcessor(w2v_model_path=w2v_path)

In [22]:
# Full pipeline: lower-case, tokenize, normalize, drop stop words.
print(sentence_processor.process('Привет, меня зовут Павел'))
# Without normalization the original word forms are kept.
print(sentence_processor.process('Привет, меня зовут Павел', normalize=False))
# Without tokenization the text is only split on whitespace.
print(sentence_processor.process('Привет, меня зовут Павел', tokenize=False))
# Register stop words on the shared processor, then run the pipeline again.
sentence_processor.stop_list = ['и', 'а']
print(sentence_processor.process('Привет, меня зовут Павел и пока'))

['привет', 'я', 'звать', 'павел']
['привет', 'меня', 'зовут', 'павел']
['привет,', 'я', 'звать', 'павел']
['привет', 'я', 'звать', 'павел', 'пока']


In [40]:
%%time
# Preprocess one sentence into a list of words.
sent = sentence_processor.process('Привет, меня зовут Павел и пока')
print(sent)
# Embed the words; the result is padded or cut to exactly 100 rows.
matrix = sentence_processor.convert2matrix(sent, vec_len=100)
print(matrix.shape)
print(matrix)

['привет', 'я', 'звать', 'павел', 'и', 'пока']
(100, 100)
[[-0.15173334 -0.00833917 -0.04943448 ...,  0.02435549 -0.07983094
  -0.11920947]
 [ 0.00650954  0.02521983 -0.08965836 ...,  0.09672094  0.0080568
  -0.09304222]
 [-0.15055948 -0.03354909 -0.05089633 ..., -0.04077737 -0.04305392
  -0.13613304]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
Wall time: 10 ms
