In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import pickle
import re
from itertools import combinations
import numpy as np


In [None]:
class NewsRecommender:
    """
    Recommender trained on a text corpus with the topic model and metric
    chosen as a result of the research.

    Attributes:
        model: best-scoring fitted topic model (LDA or NMF); ``None`` until
            train() has run.
        vectorizer: TfidfVectorizer that train() fits on the corpus.
        w2v: Word2Vec model fitted by train(); ``None`` until then.
        texts: initialised to ``None``; train() does not assign it.
    """

    def __init__(self):
        self.model = None
        self.vectorizer = TfidfVectorizer()
        self.w2v = None
        self.texts = None

    def train(self, texts):
        """
        Fit LDA and NMF for every topic count in ``range(4, 20)`` and keep the
        model whose topics have the highest Word2Vec coherence (see tcw2c).

        Side effects: fits ``self.vectorizer``, stores the Word2Vec model in
        ``self.w2v`` and the winning topic model in ``self.model``, writes
        ``w2v-model.bin``, ``lda_<k>.pkl`` and ``nmf_<k>.pkl`` using relative
        paths, and prints a short summary.

        Args:
            texts: re-iterable collection of document strings (it is walked by
                the vectorizer and again by Word2Vec).

        NOTE(review): the TF-IDF vocabulary is built with the vectorizer's
        defaults, while Word2Vec is trained with ``min_count=20`` on tokens
        that are not lower-cased by TokenGenerator. A top topic term that is
        missing from the Word2Vec vocabulary would presumably make the
        similarity lookup raise ``KeyError`` - confirm on the real corpus.
        """
        topic_range = range(4, 20)
        tfidfs = self.vectorizer.fit_transform(texts)
        fnames = self._feature_names()

        self.w2v = self._train_word2vec(texts)
        # KeyedVectors expose similarity() in every gensim release, whereas the
        # model-level shortcut was deprecated and later removed.
        vectors = self.w2v.wv

        # texts = fetch_20newsgroups()
        best_model = None
        # Coherence is an average of cosine similarities and may be negative,
        # so 0 is not a safe lower bound for the running maximum.
        best_score = float("-inf")

        for i in topic_range:
            candidates = (
                ("lda", LatentDirichletAllocation(n_components=i, learning_method="batch")),
                # "nndsvda" is an initialisation scheme, not a solver.
                ("nmf", NMF(n_components=i, init="nndsvda")),
            )
            for prefix, model in candidates:
                model.fit(tfidfs)
                # pickle emits bytes, so the file has to be opened in binary mode.
                with open("%s_%d.pkl" % (prefix, i), "wb") as fh:
                    pickle.dump(model, fh)

                # One descriptor (top-10 terms) per topic row of components_.
                term_rankings = [
                    get_descriptor(fnames, model.components_, topic, 10)
                    for topic in range(i)
                ]
                score = tcw2c(vectors, term_rankings)
                if score > best_score:
                    best_score = score
                    best_model = model

        self.model = best_model
        print(type(best_model))
        if best_model is not None:
            print(best_model.n_components)
        print(best_score)

    def _feature_names(self):
        """Return the fitted TF-IDF vocabulary ordered by column index."""
        # Newer scikit-learn only offers get_feature_names_out(); older
        # releases only offer get_feature_names().
        if hasattr(self.vectorizer, "get_feature_names_out"):
            return self.vectorizer.get_feature_names_out()
        return self.vectorizer.get_feature_names()

    def _train_word2vec(self, texts):
        """Train a 500-dimensional skip-gram Word2Vec model and save it to disk."""
        import inspect

        docgen = TokenGenerator(texts, [])
        # gensim 4 renamed the dimensionality keyword from `size` to `vector_size`.
        if "vector_size" in inspect.signature(Word2Vec).parameters:
            w2v_model = Word2Vec(docgen, vector_size=500, min_count=20, sg=1)
        else:
            w2v_model = Word2Vec(docgen, size=500, min_count=20, sg=1)

        keyed = w2v_model.wv
        # gensim 4 replaced `wv.vocab` with `wv.key_to_index`.
        vocab = keyed.key_to_index if hasattr(keyed, "key_to_index") else keyed.vocab
        print("Model has %d terms" % len(vocab))
        w2v_model.save("w2v-model.bin")
        return w2v_model

    def recommend(self, text_sample, k):
        """
        Return the k most similar news items for the given headline, using the
        distance function chosen as a result of the research.

        Note that text_sample may contain words that are not in the training
        corpus.

        The current body is a placeholder: it ignores both arguments and
        returns a fixed list.
        """
        return ["news_1", "news_2", ..., "news_k"]
    
def tcw2c( w2v_model, term_rankings ):
    """
    Compute the mean Word2Vec coherence of a set of topics.

    For each topic the pairwise ``similarity`` of all its terms is averaged;
    the result is the mean of those per-topic averages.

    Args:
        w2v_model: either a model exposing word vectors through a ``wv``
            attribute, or an object that itself has ``similarity(a, b)``.
        term_rankings: sequence of topics, each a sequence of terms.

    Returns:
        The mean coherence, or ``0.0`` when ``term_rankings`` is empty. A
        topic with fewer than two terms has no pairs and contributes ``0.0``.

    Raises:
        Whatever ``similarity`` raises for a term it does not know.
    """
    topic_count = len(term_rankings)
    if topic_count == 0:
        return 0.0

    # Prefer the keyed vectors when present: the model-level similarity()
    # shortcut is deprecated in gensim 3 and absent from gensim 4.
    vectors = getattr(w2v_model, "wv", w2v_model)

    overall_coherence = 0.0
    for topic_terms in term_rankings:
        pair_scores = [
            vectors.similarity(first, second)
            for first, second in combinations(topic_terms, 2)
        ]
        # Guard against topics that yield no pairs (avoids division by zero).
        if pair_scores:
            overall_coherence += sum(pair_scores) / len(pair_scores)
    return overall_coherence / topic_count

def get_descriptor( terms, H, topic_index, top ):
    """
    Return the ``top`` highest-weighted terms of one topic.

    Args:
        terms: sequence mapping a column index of ``H`` to its term.
        H: 2-D weight array, indexed as ``H[topic_index, :]``.
        topic_index: row of ``H`` to describe.
        top: number of leading terms to keep.

    Returns:
        List of terms ordered from the largest weight downwards.
    """
    weights = H[topic_index, :]
    # argsort is ascending, so reverse it to start from the heaviest term.
    ranked = np.argsort( weights )[::-1]
    return [ terms[column] for column in ranked[0:top] ]
        
    
class TokenGenerator:
    """
    Re-iterable view over a document collection that yields one token list
    per document.

    Tokens are runs of two or more word characters. A token found in
    ``stopwords`` is replaced by the literal ``"<stopword>"``; tokens are
    compared as-is, without case folding.
    """

    def __init__( self, documents, stopwords ):
        self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )
        self.stopwords = stopwords
        self.documents = documents

    def __iter__( self ):
        """Yield the token list of each document, in document order."""
        for text in self.documents:
            yield [
                "<stopword>" if word in self.stopwords else word
                for word in self.tokenizer.findall( text )
                # Same filter as the original branches: keep stopwords
                # (masked) and any other token of length >= 2.
                if word in self.stopwords or len(word) >= 2
            ]