In [1]:
import pandas as pd
import numpy as np
import re
import itertools

import nltk
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize
import pymorphy2
nltk.download('stopwords')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score 
from sklearn.metrics import precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
news = pd.read_csv("./Desktop/Artyom_DS/ML in Business/articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv("./Desktop/Artyom_DS/ML in Business/users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [4]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [5]:
with open("./Desktop/Artyom_DS/ML in Business/stopwords.txt") as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [6]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [7]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

  from ipykernel import kernelapp as app


Wall time: 29.3 s


In [8]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

Wall time: 3min 18s


In [9]:
texts = [t for t in news['title'].values]

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [10]:
%%time
from gensim.models import LdaModel

lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

Wall time: 37.1 s


In [11]:
from gensim.test.utils import datapath

temp_file = datapath("model.lda")
lda.save(temp_file)

lda = LdaModel.load(temp_file)

In [12]:
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc]

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(3, 0.31034908),
 (4, 0.049976952),
 (10, 0.12918384),
 (11, 0.03454674),
 (22, 0.044156797),
 (24, 0.414802)]

In [13]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]


for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: годовой свидетель сенатор терминал звание беспрецедентный киргизия
topic_1: фронт народный способность артист су звезда орден
topic_2: газ лечение умереть кость боль маршрут одежда
topic_3: напомнить километр праздник год топливо nn это
topic_4: рубль гражданин статья доллар тыс поверхность год
topic_5: год это который мочь исследование новый человек
topic_6: год млн рост составить тыс это который
topic_7: мышь виза студия привлекательность армянский васильев армения
topic_8: год российский эксперт nn который технология россия
topic_9: снижение район город фестиваль область мероприятие конкурс
topic_10: продукция nn ступень физика тема ремонт кризис
topic_11: взрыв мозг произойти выяснить проверка городской спустя
topic_12: военный рейс боевой район операция глубина космонавт
topic_13: космос турецкий турция планета кожа лекарство еда
topic_14: мальчик лётчик тепло странный египетский пилот широко
topic_15: женщина писать мужчина nn вицепремьер девочка прогнозировать
topic_16:

In [14]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [15]:
#mean
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.08942,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.893022,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.246353,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.170215,0.0,0.0,0.0,0.04806,0.0
2,4897,0.0,0.0,0.0,0.31038,0.049977,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044155,0.0,0.414827
3,4898,0.0,0.0,0.0,0.123398,0.0,0.290966,0.0,0.0,0.0,...,0.0,0.0,0.060173,0.0,0.0,0.0,0.0,0.0,0.0,0.515532
4,4899,0.0,0.0,0.0,0.200898,0.0,0.0,0.0,0.0,0.0,...,0.557811,0.0,0.0,0.0,0.0,0.0,0.178089,0.0,0.0,0.0


In [16]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [17]:
def get_user_embedding(user_articles_list, agg_func='mean'):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if agg_func == 'median':
        user_vector = np.median(user_vector, 0)
    elif agg_func == 'max':
        user_vector = np.max(user_vector, 0)
    else:
        user_vector = np.mean(user_vector, 0)
    return user_vector

In [18]:
def add_string_to_df(df, string):
    data = df.copy()
    data.loc[len(data)] = string
    return data

In [19]:
results = pd.DataFrame(columns=['type','roc_auc','precision','recall','f_score','threshold'])

In [20]:
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x,'mean'), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.009814,0.0,0.0,0.072232,0.0,0.208356,0.041125,0.0,0.049251,...,0.044489,0.003475,0.15217,0.089025,0.195034,0.009124,0.002467,0.013179,0.011054,0.013243
1,u108690,0.0,0.003945,0.0,0.004014,0.015362,0.146465,0.03751,0.0,0.011472,...,0.043785,0.003665,0.143796,0.10389,0.151149,0.003404,0.096422,0.012407,0.049333,0.116796
2,u108339,0.004296,0.0,0.008916,0.022407,0.017647,0.096949,0.00804,0.0,0.018282,...,0.002605,0.004457,0.145341,0.090753,0.178363,0.042086,0.083631,0.0,0.085344,0.047073


In [21]:
target = pd.read_csv("./Desktop/Artyom_DS/ML in Business/users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [22]:
X = pd.merge(user_embeddings, target, 'left')
X.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24,churn
0,u105138,0.009814,0.0,0.0,0.072232,0.0,0.208356,0.041125,0.0,0.049251,...,0.003475,0.15217,0.089025,0.195034,0.009124,0.002467,0.013179,0.011054,0.013243,0
1,u108690,0.0,0.003945,0.0,0.004014,0.015362,0.146465,0.03751,0.0,0.011472,...,0.003665,0.143796,0.10389,0.151149,0.003404,0.096422,0.012407,0.049333,0.116796,1
2,u108339,0.004296,0.0,0.008916,0.022407,0.017647,0.096949,0.00804,0.0,0.018282,...,0.004457,0.145341,0.090753,0.178363,0.042086,0.083631,0.0,0.085344,0.047073,1


In [23]:
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [24]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
preds = logreg.predict_proba(X_test)[:, 1]
preds[:10]

array([0.12101561, 0.0569261 , 0.46537916, 0.18697775, 0.00909749,
       0.05769021, 0.12701452, 0.04417622, 0.186513  , 0.05249499])

In [26]:

precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.251256, F-Score=0.710, Precision=0.641, Recall=0.796


In [27]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [28]:
r_a = roc_auc_score(y_test, preds)
r_a

0.9549718006860864

In [29]:
mean_results = ['mean',r_a, precision[ix],recall[ix],fscore[ix],thresholds[ix]]
results = add_string_to_df(results, mean_results)
results

Unnamed: 0,type,roc_auc,precision,recall,f_score,threshold
0,mean,0.954972,0.641447,0.795918,0.710383,0.251256


In [30]:
#median
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x,'median'), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.0,0.0,0.0,0.042034,0.0,0.160691,0.013213,0.0,0.0,...,0.014772,0.0,0.163247,0.022548,0.164687,0.0,0.0,0.0,0.0,0.0
1,u108690,0.0,0.0,0.0,0.0,0.008722,0.127672,0.016406,0.0,0.0,...,0.029681,0.0,0.085594,0.09224,0.140494,0.0,0.035045,0.0,0.013138,0.094672
2,u108339,0.0,0.0,0.0,0.016055,0.005042,0.073495,0.0,0.0,0.019561,...,0.0,0.0,0.103629,0.042667,0.168885,0.023128,0.0,0.0,0.079163,0.038233


In [31]:
target = pd.read_csv("./Desktop/Artyom_DS/ML in Business/users_churn.csv")
X = pd.merge(user_embeddings, target, 'left')
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [32]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
preds = logreg.predict_proba(X_test)[:, 1]

In [34]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.310008, F-Score=0.798, Precision=0.776, Recall=0.820


In [35]:
r_a = roc_auc_score(y_test, preds)
r_a

0.9754264782836213

In [36]:
median_results = ['median',r_a, precision[ix],recall[ix],fscore[ix],thresholds[ix]]
results = add_string_to_df(results, median_results)
results

Unnamed: 0,type,roc_auc,precision,recall,f_score,threshold
0,mean,0.954972,0.641447,0.795918,0.710383,0.251256
1,median,0.975426,0.776062,0.820408,0.797619,0.310008


In [37]:
#max
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x,'max'), 1)])
user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
user_embeddings.head(3)

Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,u105138,0.048687,0.0,0.0,0.206647,0.0,0.544086,0.182438,0.0,0.245429,...,0.132388,0.02085,0.260167,0.425228,0.396364,0.054744,0.014803,0.079077,0.066325,0.079457
1,u108690,0.0,0.023673,0.0,0.024082,0.05557,0.33793,0.115355,0.0,0.06883,...,0.121076,0.021988,0.344343,0.231286,0.264629,0.020426,0.34559,0.058957,0.143566,0.344915
2,u108339,0.025777,0.0,0.027447,0.05565,0.054967,0.214381,0.026389,0.0,0.044798,...,0.015631,0.015275,0.301722,0.344781,0.240991,0.173803,0.312707,0.0,0.191796,0.093398


In [38]:
target = pd.read_csv("./Desktop/Artyom_DS/ML in Business/users_churn.csv")
X = pd.merge(user_embeddings, target, 'left')
X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)

In [39]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
preds = logreg.predict_proba(X_test)[:, 1]

In [41]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)

ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.341331, F-Score=0.790, Precision=0.776, Recall=0.804


In [42]:
r_a = roc_auc_score(y_test, preds)
r_a

0.9689074946217804

In [43]:
max_results = ['max',r_a, precision[ix],recall[ix],fscore[ix],thresholds[ix]]
results = add_string_to_df(results, max_results)
results

Unnamed: 0,type,roc_auc,precision,recall,f_score,threshold
0,mean,0.954972,0.641447,0.795918,0.710383,0.251256
1,median,0.975426,0.776062,0.820408,0.797619,0.310008
2,max,0.968907,0.775591,0.804082,0.789579,0.341331
