In [1]:
import re, math, urllib.request as req, numpy as np
from functools import reduce
import nltk.stem.snowball, bs4, collections
from scipy.spatial.distance import cosine

In [2]:
# Display names of the books; entry i names the i-th entry of bookUrls.
bookNames = ["Война и Мир 1", "Война и Мир 2", "Про войну", "Про Мышу"]
# Source URL -> character encoding of the page behind it.
# An OrderedDict is used because bookNames is matched to the downloaded texts
# purely by position: a plain dict only guarantees insertion order from
# Python 3.7 on, and a different order would silently mislabel the books.
bookUrls = collections.OrderedDict([
    ("http://vojnaimir.ru/files/book1.txt", "windows-1251"),
    ("http://vojnaimir.ru/files/book2.txt", "windows-1251"),
    ("http://www.rastamantales.com/skazki/content/pro-voynu", "utf-8"),
    ("http://www.rastamantales.com/skazki/content/skazka-pro-myshu", "utf-8"),
])

def getBook(url, enc):
    """Download `url` and return its text content.

    The response body is decoded with the encoding `enc`, parsed by
    BeautifulSoup with the "lxml" parser, and the parsed document's `.text`
    is returned.
    """
    # Close the HTTP response deterministically instead of leaving the
    # connection open until the garbage collector gets to it.
    with req.urlopen(url) as response:
        raw = response.read()
    return bs4.BeautifulSoup(raw.decode(enc), "lxml").text

# Download every book; the list follows the iteration order of bookUrls.
books = [getBook(url, enc) for url, enc in bookUrls.items()]

In [3]:
# From each text keep only the words longer than 2 characters,
# and reduce every kept word to its stem.
def textToWords(text):
    """Return the stems of all words in `text` that are longer than two characters.

    The text is split on runs of non-word characters; each remaining word is
    stemmed with NLTK's Snowball RussianStemmer. Order and duplicates are kept.
    """
    stemmer = nltk.stem.snowball.RussianStemmer()
    # Raw string: in a plain literal "\W" is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return [stemmer.stem(word) for word in re.split(r'\W+', text) if len(word) > 2]
    
# Every book as a list of word stems.
bookWords = list(map(textToWords, books))
# All stems we know about: the union of the stems of every book.
lexiconSet = set().union(*bookWords)

In [4]:
# Fixed ordering of the known stems: position j in every vector refers to lexicon[j].
# sorted() instead of list(): iteration order of a set of strings changes between
# kernel restarts (hash randomisation), which made the vector layout irreproducible.
lexicon = sorted(lexiconSet)
# inBooksMet[j] counts in how many books lexicon[j] occurs; filled in by vectorize().
inBooksMet = [0] * len(lexicon)

def vectorize(bookW, lexicon, inBooksMet=None):
    """Turn a sequence of words into a vector of occurrence counts.

    Component j of the result is the number of times lexicon[j] occurs in
    `bookW`; words that are not in `lexicon` are ignored.

    If `inBooksMet` is given it is updated in place: inBooksMet[j] is
    incremented for every lexicon word that occurs in `bookW` at least once.

    Returns a numpy array with one component per lexicon entry.
    """
    occurrences = collections.Counter(word for word in bookW)
    counts = [occurrences[term] for term in lexicon]

    if inBooksMet is not None:
        for position, count in enumerate(counts):
            if count:
                inBooksMet[position] += 1
    return np.array(counts)

# Raw count vector of every book. Each call also bumps inBooksMet for the
# lexicon words the book contains, so this must run once per fresh inBooksMet.
vectorBooks = [vectorize(words, lexicon, inBooksMet) for words in bookWords]

In [5]:
# Number of books in the corpus; passed to norm() as the total document count.
N = len(books)

def norm(wordCount, inBooksMet, Nbooks):
    """Weight a raw word count: log2(1 + wordCount) * log2(Nbooks / inBooksMet).

    wordCount  -- number of occurrences of the word in one text
    inBooksMet -- number of texts that contain the word; zero makes the
                  division fail (ZeroDivisionError for plain Python numbers)
    Nbooks     -- total number of texts
    """
    # The first factor dampens the influence of the raw number of occurrences.
    damped_count = math.log(1 + wordCount, 2)
    # The second factor reduces the influence of unimportant words that occur
    # in every text (it is zero when inBooksMet == Nbooks).
    rarity = math.log(Nbooks / inBooksMet, 2)
    return damped_count * rarity

def normVector(vector):
    """Apply norm() to every component of a raw count vector.

    Component j is weighted with the module-level inBooksMet[j] and the
    module-level book count N. Returns a plain list of weights.
    """
    return [norm(count, met, N) for count, met in zip(vector, inBooksMet)]

# Weighted vector of every book, in the same order as vectorBooks.
vectors = [np.array(normVector(rawCounts)) for rawCounts in vectorBooks]

In [6]:
# N x N table of pairwise scores between the books, initialised to zero.
# A plain ndarray replaces np.matrix, which NumPy no longer recommends;
# element assignment and printing work the same way.
cosDistances = np.zeros((N, N))

def cosine(a, b):
    """Return the cosine similarity of vectors `a` and `b`: dot(a, b) / (|a| * |b|).

    Returns 0.0 when either vector has zero length. Without this guard the
    division is 0/0 and yields nan (for example for a query made only of
    unknown words), and nan scores cannot be sorted meaningfully.

    Note: this is a similarity (1.0 for identical directions), and the name
    shadows the `cosine` imported from scipy.spatial.distance.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Fill the table: entry [i, j] is the cosine score of book i against book j.
for i, rowVector in enumerate(vectors):
    for j, columnVector in enumerate(vectors):
        cosDistances[i, j] = cosine(rowVector, columnVector)

In [7]:
# Entry [i, j] holds cosine(vectors[i], vectors[j]) for books i and j.
print(cosDistances)

[[ 1.          0.550479    0.01601007  0.01725074]
 [ 0.550479    1.          0.01856861  0.01660717]
 [ 0.01601007  0.01856861  1.          0.35835123]
 [ 0.01725074  0.01660717  0.35835123  1.        ]]


In [8]:
def search(query):
    """Rank the books against a free-text query.

    Prints the query, converts it to a count vector over `lexicon`, and
    returns a list of (score, book name) tuples sorted in descending order,
    where score is cosine() of the book's weighted vector and the query vector.
    """
    print("Ищем:", query)
    # NOTE(review): the query vector holds raw counts while the book vectors
    # are weighted by norm() — confirm this asymmetry is intended.
    queryVector = vectorize(textToWords(query), lexicon)
    ranking = [(cosine(bookVector, queryVector), title)
               for bookVector, title in zip(vectors, bookNames)]
    ranking.sort(reverse=True)
    return ranking

# Example query: print one (score, book name) tuple per line, highest score first.
print(*search("Пьер Безухов и Наташа Ростова"), sep="\n")

Ищем: Пьер Безухов и Наташа Ростова
(0.048983574415158146, 'Война и Мир 1')
(0.042743471630742193, 'Война и Мир 2')
(0.0, 'Про войну')
(0.0, 'Про Мышу')
