In [4]:
from warnings import warn
import pymorphy2
import re
import nltk
import numpy as np

In [32]:
class BetterMorphology:
    """
    A class for fast morphological analysis and word vectorization of texts.
    """

    def __init__(self):
        self.morpho = pymorphy2.MorphAnalyzer()
        self.cache = {}  # word form: initial word form
        self.dictionary = {}  # initial word form: vector position
        self.stop_words = set()

    def analyze_text(self, text):
        """
        This method performs morphological analysis of input text.
        It isolates words written only in cyrillic letters.


        Args:
            text: A string of text to analyze.

        Returns:
           A list of words in their initial form excluding stop words.

        """
        words = [word[0].lower() for word in re.findall(r"([А-ЯЁа-яё]+(-[А-ЯЁа-яё]+)*)", text) if
                 self.check_stop_words(word[0].lower())]
        return self.analyze_words(words)

    def analyze_words(self, words):
        """
        This method is used for morphological analysis.

        Args:
            words: A list of words to analyse.

        Returns:
           A list of normalized words.
        """
        res = []
        for word in words:
            if word in self.cache:
                res.append(self.cache[word])
            else:
                norm = self.morpho.parse(word)[0].normal_form
                res.append(norm)
                self.cache[word] = norm
                if norm not in self.dictionary:
                    self.dictionary[norm] = len(self.dictionary) + 1
        return res

    def clear_dict(self):
        """
        Clears the class dictionary.
        """
        self.dictionary.clear()

    def check_stop_words(self, word):
        """
        Checks whether a word is a stop word or not.

        Args:
            word: A word to check.

        Returns:
           A boolean value. True if given word was not found in the set of stop words, False otherwise.
        """
        return False if word.lower() in self.stop_words else True
        
    def clear_stop_words(self):
        """
        A method to clear an existing set of stop words.
        """
        self.stop_words.clear()

    @staticmethod
    def cosine_similarity(a, b):
        """
        This method evaluates cosine distance between two word vectors.
        
        Args:
            a: The first vector.
            b: The second vector.
            
        Return:
            A floating point value between 0 and 1 where 0 is least similar and 1 is most similar. 

        """
        #if not a or not b:
        if not len(a) or not len(b): #the trick above doesn't work numpy arrays
            raise ValueError("Vectors can't be empty.")

        if isinstance(a, (list, np.ndarray)) and isinstance(b, (list, np.ndarray)):
            if len(a) != len(b):
                warn("The lengths of input vectors are not matching. " +
                "The result is evaluated using the shortest vector's length. ")
                return np.sum([x * y for x, y in zip(a, b)]) / np.sqrt(np.dot(a, a) * np.dot(b, b))
            else:
                return np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))

        elif isinstance(a, dict) and isinstance(b, dict):
            numerator = np.sum([a[k] * b[k] for k in set(a.keys()).intersection(b.keys())])
            denominator = np.sqrt(sum(map(lambda x: x ** 2, a.values())) * sum(map(lambda x: x ** 2, b.values())))
            return numerator / denominator

        else:
            raise TypeError("Vector type or vector type combination is not supported.")
        
    def fill_stop_words(self, stop_words):
        """
        This method is used to fill or update the set of stop words.
        It is assumed that the input list contains words in lowercase.

        Args:
            stop_words: An iterable containing words considered unimportant.

        Returns:
            None.
        """
        self.stop_words.update(stop_words)

    def filter_stop_words(self, words):
        """
        A method that removes stop words from a given list of words.

        Args:
            words: A list of words to filter.

        Returns:
            A list of words with stop words removed.
        """
        return list(filter(self.check_stop_words, words))
    
    def form_dict(self, texts):
        """
        Fill the class dictionary without generating word vectors.

        Args:
            texts: An iterable containing texts.

        Returns:
            None.
        """
        for text in texts:
            for word in text:
                if word not in self.cache:
                    norm = self.morpho.parse(word)[0].normal_form
                    self.cache[word] = norm
                    if norm not in self.dictionary:
                        self.dictionary[norm] = len(self.dictionary)

    @staticmethod
    def jaccard_coefficient(a, b):
        """
        This method evaluates Jaccard Coefficient (AKA Intersection over Union) of two vectors.
        
        Args:
            a: The first vector.
            b: The second vector.
            
        Returns:
            A floating point value between 0 and 1.
        """
        #if not a or not b:
        if not len(a) or not len(b):
            raise ValueError("Vectors can't be empty.")

        if isinstance(a, (list, np.ndarray)) and isinstance(b, (list, np.ndarray)):
            if len(a) != len(b):
                warn("The lengths of input vectors are not matching. " +
                     "The result is evaluated using the shortest vector's length. ")
            return len([x * y for x, y in zip(a, b) if x * y]) / len([x + y for x, y in zip(a, b) if x + y])
        elif isinstance(a, dict) and isinstance(b, dict):
            return len(set(a).intersection(b)) / len(set(a).union(b))
        else:
            raise TypeError("Vector type not supported.")
        
    def vectorize_as_array(self, words):
        """
        This method transforms an iterable of tokens into dense vector form.
        The position of each of word in the vector is stored in the class dictionary
        with the word itself as a key.
        
        Args:
            words: An iterables of tokens.
            
        Returns:
            A numpy array where each element is absolute frequency of a word and
            the position corresponds to the value of a given word in the dictionary.
            Contains all the words from the class dictionary even if the frequency
            is zero.
        """
        for word in words:
            if word not in self.cache:
                norm = self.morpho.parse(word)[0].normal_form
                self.cache[word] = norm
                if norm not in self.dictionary:
                    self.dictionary[norm] = len(self.dictionary.keys())
        word_vector = np.zeros((len(self.dictionary)))
        for word in words:
            word_vector[self.dictionary[self.cache[word]]] += 1
        return word_vector

    def vectorize_as_dict(self, words):
        """
        This method transforms an iterable of tokens into sparse vector form.

        Args:
            words: An iterable consisting of tokens.

        Returns:
            A dictionary containing initial forms of words as keys and their
            corresponding absolute frequencies as values.
        """
        word_vector = {}
        for word in words:
            if word in self.cache:
                word_vector[self.cache[word]] = word_vector.get(self.cache[word], 0) + 1  # absolute word frequency
            else:
                norm = self.morpho.parse(word)[0].normal_form
                self.cache[word] = norm
                word_vector[norm] = word_vector.get(norm, 0) + 1  # default value 0 if no value found by key
                if norm not in self.dictionary:
                    self.dictionary[norm] = len(self.dictionary)
        return word_vector

    def vectorize_as_list(self, words):
        """
        Transforms an iterable of tokens into a dense vector represenation.
        The position of each word is stored in the class dictionary.

        Args:
            words: A list of tokens to transform.

        Returns:
            A list where each element is absolute frequency of a word and
            the position corresponds to the value of a given word in the dictionary.
            Contains all the words from the class dictionary even if the frequency
            is zero.
        """
        for word in words:
            if word not in self.cache:
                norm = self.morpho.parse(word)[0].normal_form
                self.cache[word] = norm
                if norm not in self.dictionary:
                    self.dictionary[norm] = len(self.dictionary.keys())
        word_vector = [0 for _ in self.dictionary]
        for word in words:
            word_vector[self.dictionary[self.cache[word]]] += 1
        return word_vector

    def vectorize_as_list2(self, words):
        """
        Transforms an iterable of tokens into a dense vector represenation.
        The position of each word is stored in the class dictionary.
        Only words found in the class dictionary are accounted for, i.e.
        no new words are added to the class dictionary.

        Args:
            words: A list of tokens to transform.

        Returns:
            A list where each element is absolute frequency of a word and
            the position corresponds to the value of a given word in the dictionary.
            Contains all the words from the class dictionary even if the frequency
            is zero.
        """
        word_vector = [0 for _ in self.dictionary]
        for word in words:
            if word in self.cache:
                word_vector[self.dictionary[self.cache[word]]] += 1
        return word_vector


In [33]:
m = BetterMorphology()

In [34]:
stop_words = nltk.corpus.stopwords.words('russian')
m.fill_stop_words(stop_words)

In [8]:
m.analyze_text('Коэффициент Жаккара - отношение количества слов, встречающихся в обоих текстах к объединению лексики.')

['коэффициент',
 'жаккара',
 'отношение',
 'количество',
 'слово',
 'встречаться',
 'оба',
 'текст',
 'объединение',
 'лексика']

In [35]:
with open("data/war_and_peace.txt", encoding='utf-8') as f:
    war_and_peace = f.read()

In [36]:
with open('data/sebastopol.txt', encoding='utf-8') as f:
    seba = f.read()

In [42]:
words1 = m.analyze_text(war_and_peace)
words2 = m.analyze_text(seba)

In [43]:
words1 = m.vectorize_as_dict(words1)
words2 = m.vectorize_as_dict(words2)

In [44]:
m.cosine_similarity(words1, words2)

0.7192171320999665

In [45]:
with open('data/master.txt', encoding='cp1251') as f: # Мастер и Маргарита
    mast = f.read()

In [46]:
words3 = m.analyze_text(mast)
words3 = m.vectorize_as_dict(words3)

In [47]:
m.cosine_similarity(words1, words3)

0.5716111265611228

In [48]:
m.jaccard_coefficient(words1, words3)

0.1861354825545807

In [21]:
with open("data/veyr/index_split_017.xhtml", encoding='utf-8') as f: # Грузим главу 15, она побольше.
    veyr = f.read()

In [22]:
words4 = m.analyze_text(veyr)
words4 = m.vectorize_as_dict(words4)

In [23]:
m.cosine_similarity(words1, words4)

0.5010908763905298

In [24]:
m.cosine_similarity(words3, words4)

0.4283925404605853

In [39]:
war_arr = m.vectorize_as_array(m.analyze_text(war_and_peace))
seb_arr = m.vectorize_as_array(m.analyze_text(seba))

In [40]:
m.cosine_similarity(war_arr, seb_arr)

0.7192064466651534

In [41]:
m.jaccard_coefficient(war_arr, seb_arr)

0.15594405594405594