In [373]:
import pandas as pd

from gensim.corpora.dictionary import Dictionary
import re
import numpy as np
from razdel import tokenize
import pymorphy2

from gensim.models import LdaModel
from gensim.test.utils import datapath
from nltk.corpus import stopwords

import nltk

In [374]:
news = pd.read_csv("articles.csv")
users = pd.read_csv('users_articles.csv')
display(news.head(5),news.shape, users.head(5), users.shape)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...
3,4898,Главный тренер «Кубани» Юрий Красножан прокомм...
4,4899,Решением попечительского совета владивостокско...


(27000, 2)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"
3,u101138,"[5933, 6186, 5055, 6977, 5206, 488389]"
4,u108248,"[707, 1144, 2532, 2928, 3133, 324592]"


(8000, 2)

In [375]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/circle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [376]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [377]:
with open('stopwords.txt') as f:    
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [378]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [379]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 49.7 s, sys: 1.37 s, total: 51 s
Wall time: 58.1 s


In [380]:
news['title'].head(3)

0    заместитель председателяnправительства рфnсерг...
1    матч  финала кубка россии по футболу был приос...
2    форвард авангарда томаш заборский прокомментир...
Name: title, dtype: object

In [381]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 6min 28s, sys: 5.36 s, total: 6min 33s
Wall time: 7min 19s


In [382]:
news['title'].head(3)

0    [заместитель, председатель, правительство, рф,...
1    [матч, финал, кубок, россия, футбол, приостано...
2    [форвард, авангард, томаш, заборский, прокомме...
Name: title, dtype: object

In [383]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

In [384]:
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [385]:
%%time
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

CPU times: user 1min 31s, sys: 3.45 s, total: 1min 34s
Wall time: 1min 47s


In [386]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [387]:
# Create a new corpus, made of previously unseen documents.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[0]

print(other_texts[1])
lda[unseen_doc]

['матч', 'финал', 'кубок', 'россия', 'футбол', 'приостановить', 'судья', 'изз', 'взрыв', 'пиротехнический', 'снаряд', 'передавать', 'корреспондент', 'газета', 'ru', 'болельщик', 'выбросить', 'поле', 'петарда', 'судья', 'увести', 'команда', 'поле', 'подтрибунный', 'помещение', 'динамовец', 'уйти', 'торпедовец', 'остаться', 'кромка', 'поле', 'матч', 'остановить', 'пять', 'минута', 'газета', 'ru', 'вести', 'онлайнтрансляция', 'матч']


[(0, 0.115769096), (1, 0.013913814), (13, 0.8619121)]

In [388]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print(f"topic_{topic}: "+"_".join(words))

topic_0: взрыв_смерть_восток_млн_стол_место_умереть
topic_1: форум_открытие_памятник_планета_мышь_nn_ремонт
topic_2: военный_россия_фестиваль_наука_активность_nn_доклад
topic_3: писать_газ_научный_пенсия_температура_конкурс_городской
topic_4: век_мэй_диск_образовать_лесной_пограничный_экспериментальный
topic_5: млрд_рост_уровень_газ_россия_журнал_сша
topic_6: украина_украинский_ракета_территория_новый_китай_киев
topic_7: российский_россия_сша_исследование_система_американский_помощь
topic_8: путин_млн_владимир_пресссекретарить_поверхность_сократиться_песок
topic_9: авария_налог_управлять_писать_ск_офицер_свидетель
topic_10: гражданин_погибнуть_ребёнок_миссия_фронт_народный_семья
topic_11: новый_всё_рынок_проект_первый_большой_связанный
topic_12: рубль_россия_статья_закон_район_газета_санкция
topic_13: nn_правительство_россия_глава_банк_развитие_население
topic_14: цена_продукция_турция_спрос_турецкий_доллар_евро
topic_15: фонд_тыс_станция_обнаружить_дом_участок_препарат
topic_16: опера

In [389]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(
        zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [390]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.115783,0.013885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.842735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4897,0.262203,0.034345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.565853,0.0,0.0,0.0,0.0,0.115455,0.0,0.0
3,4898,0.19119,0.0,0.0,0.046192,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.081594,0.0,0.0,0.0,0.0
4,4899,0.148324,0.0,0.0,0.0,0.0,0.0,0.0,0.631554,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [391]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [392]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [393]:
doc_dict[0]

array([0.03268312, 0.        , 0.        , 0.        , 0.        ,
       0.27336738, 0.        , 0.19907463, 0.        , 0.        ,
       0.        , 0.        , 0.37754872, 0.        , 0.05551696,
       0.        , 0.        , 0.0273231 , 0.        , 0.02139095,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [394]:
user_articles_list = users['articles'].iloc[33]

def get_user_embedding(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.mean(user_vector, 0)
    return user_vector

In [395]:
pd.DataFrame(get_user_embedding(user_articles_list), index=(f'topic_{i}' for i in range(1,26)))

Unnamed: 0,0
topic_1,0.0
topic_2,0.0
topic_3,0.046011
topic_4,0.012519
topic_5,0.0
topic_6,0.036679
topic_7,0.061313
topic_8,0.123521
topic_9,0.04021
topic_10,0.002225


### Task 1

In [396]:
def get_user_embedding_median(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.median(user_vector, 0)
    return user_vector

In [397]:
def get_user_embedding_max(user_articles_list):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = np.max(user_vector, 0)
    return user_vector

In [398]:


def total_score(func):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: func(x), 1)])
    user_embeddings.columns = [f'topic_{i}' for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid'] + [f'topic_{i}' for i in range(25)]]

    X = pd.merge(user_embeddings, target, 'left')

    #разделим данные на train/test
    X_train, X_test, y_train, y_test = train_test_split(
    X[[f'topic_{i}' for i in range(25)]], X['churn'], random_state=13)
    
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    pred = lr.predict_proba(X_test)[:, 1]

    # Рассчитаем Precision, Recall, F_score, roc_auc_score
    precision, recall, thresholds = precision_recall_curve(y_test, pred)
    fscore = (2 * precision * recall) / (precision + recall)
    
    # locate the index of the largest f score
    idx = np.argmax(fscore)
    roc_auc_score = roc_auc_score(y_test, pred)

    return round(thresholds[idx], 2), round(fscore[idx], 2), round(precision[idx], 2), round(recall[idx], 2), round(roc_auc_score, 2)

In [399]:
target = pd.read_csv("users_churn.csv")
target.head(5)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0
3,u103439,0
4,u104300,0


In [400]:
metrics = pd.DataFrame(np.array([
    total_score(f) for f in [get_user_embedding,get_user_embedding_median,get_user_embedding_max]
]), columns=['Best_Threshold', 'F_Score', 'Precision', 'Recall', 'ROC_AUC'], index=['mean', 'median', 'max'])
metrics

Unnamed: 0,Best_Threshold,F_Score,Precision,Recall,ROC_AUC
mean,0.23,0.61,0.54,0.69,0.91
median,0.27,0.72,0.67,0.77,0.96
max,0.32,0.76,0.75,0.78,0.97
