In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

<h1>Считывание данных</h1>

In [2]:
# Load the three competition tables. Every listed column is forced to `str`.
users = pd.read_csv(
    'train/users.csv',
    sep=';',
    index_col=None,
    dtype=dict.fromkeys(['age', 'chb', 'chit_type', 'gender'], str),
)
items = pd.read_csv(
    'train/items.csv',
    sep=';',
    index_col=None,
    dtype=dict.fromkeys(['author', 'bbk', 'izd', 'sys_numb', 'title', 'year_izd'], str),
)
train_transactions = pd.read_csv(
    'train/train_transactions_extended.csv',
    sep=';',
    index_col=None,
    dtype=dict.fromkeys(
        ['chb', 'date_1', 'is_printed', 'is_real', 'source', 'sys_numb', 'type'], str
    ),
)

In [3]:
# Distinct users and documents in the transaction log, and distinct documents in the catalogue.
n_users = train_transactions['chb'].unique().size
n_docs_in_history = train_transactions['sys_numb'].unique().size
n_docs_total = items['sys_numb'].unique().size

print(f"Кол-во пользователей: {n_users}")
print(f"Кол-во документов в истории пользователей: {n_docs_in_history}")
print(f"Общее кол-во документов: {n_docs_total}")

Кол-во пользователей: 16753
Кол-во документов в истории пользователей: 194666
Общее кол-во документов: 354355


<h1>Предобработка данных</h1>

In [4]:
# Per-column placeholder strings that stand for "no value"; each is replaced with NaN.
missing_markers = {
    'age': ['0', 'отсутствует'],
    'gender': ['не указан', 'отсутствует'],
    'chit_type': ['нет данных', 'отсутствует'],
}
for column, markers in missing_markers.items():
    users[column] = users[column].replace(markers, np.nan)

In [5]:
# Keep only the columns used by the rest of the notebook.
# Rows with a missing chb, age or gender are dropped from `users`.
users = users[['chb', 'age', 'gender']].dropna()
# .copy() makes `items` an independent frame: a new column is assigned to it
# later (title tokens), which on a plain column slice raises pandas'
# SettingWithCopyWarning and may write to a temporary object.
items = items[['sys_numb', 'title']].copy()
train_transactions = train_transactions[['chb', 'sys_numb']]

<h1>Нормализация текста</h1>

In [6]:
from nltk.corpus import stopwords as sw
import pymorphy2
import re
# NLTK's Russian stop-word list, held as a set for fast membership checks.
stopwords = {word for word in sw.words("russian")}

In [7]:
morph = pymorphy2.MorphAnalyzer()
# A token is a run of word characters, dots and slashes between word boundaries.
token_pattern = re.compile(r'\b[\w\.\/]+\b')


def tokenize(doc):
    """Return all substrings of ``doc`` that match ``token_pattern``."""
    return token_pattern.findall(doc)


# Normal forms that are removed from every title in addition to the stop words.
badlist = {'отсутствует', 'отсутствовать'}


def get_tokens(line):
    """Tokenize a title, lemmatize each token and drop unwanted lemmas.

    Parameters
    ----------
    line : str
        Raw title text. Any non-string value (for example the float NaN that
        pandas stores for an empty CSV cell) is treated as an empty title.

    Returns
    -------
    list of str
        The first pymorphy2 parse's normal form of every token, in order,
        excluding lemmas found in ``badlist`` or ``stopwords``.
    """
    # The regex would raise TypeError on a non-string; return an empty token list instead.
    if not isinstance(line, str):
        return []
    lemmas = (morph.parse(token)[0].normal_form for token in tokenize(line))
    return [
        lemma for lemma in lemmas
        if lemma not in badlist and lemma not in stopwords
    ]
                

In [8]:
from multiprocessing import Pool

In [9]:
# Tokenize all titles in parallel worker processes.
# The context manager calls pool.terminate() on exit, also when map() raises,
# so a failed run does not leave worker processes behind.
with Pool() as pool:
    items['title_tokens'] = pool.map(get_tokens, items['title'].values)

In [10]:
# Spot-check three random rows of the tokenized titles.
items.sample(n=3)

Unnamed: 0,sys_numb,title,title_tokens
236516,RSL01004324139,Марк Шагал об искусстве и культуре,"[марк, шагать, искусство, культура]"
54505,RSL01003628213,Волшебный камень : Vice Versa by Anstey,"[волшебный, камень, vice, versa, by, anstey]"
105767,RSL07000416141,отсутствует,[]


## Извлечение биграмм и исключение редких слов

In [11]:
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

In [12]:
# A Phrases model only merges pairs of adjacent tokens, so longer phrases are
# built by stacking models: each stage must be trained on, and applied to, the
# output of all previous stages. Applying a later stage straight to the raw
# tokens skips the merges it was trained to extend.
bigram_transformer = Phrases(items['title_tokens'].values)
bigram_titles = [bigram_transformer[tokens] for tokens in items['title_tokens']]

trigram_transformer = Phrases(bigram_titles)
trigram_titles = [trigram_transformer[tokens] for tokens in bigram_titles]

quadrogram_transformer = Phrases(trigram_titles)
quadrogram_titles = [quadrogram_transformer[tokens] for tokens in trigram_titles]

pentagram_transformer = Phrases(quadrogram_titles)
items['norm_title'] = [pentagram_transformer[tokens] for tokens in quadrogram_titles]

In [13]:
# Build a vocabulary over the phrase-merged titles and prune it with
# no_below=5; the other filter_extremes thresholds stay at gensim's defaults.
id2word = Dictionary(items['norm_title'].values)
id2word.filter_extremes(no_below=5)

# Keep only the tokens that survived the pruning.
kept_tokens = id2word.token2id
items['norm_title'] = [
    [token for token in title if token in kept_tokens]
    for title in items['norm_title']
]

In [14]:
# Only the document id and its normalized title tokens are kept as item features.
item_feature_columns = ['sys_numb', 'norm_title']
items = items[item_feature_columns]

In [15]:
# Spot-check three random rows of the final item features.
items.sample(n=3)

Unnamed: 0,sys_numb,norm_title
313352,RSL01002152040,"[связь_общественность, сфера, исполнительский_..."
264283,RSL01007765957,"[актуальный_проблема, разоружение, реф_сб]"
333890,RSL01005680978,"[тепло_массоперенос, материал_совещание, ин_те..."


### Фильтрация транзакций

In [16]:
import turicreate as gl

In [17]:
# Start from an empty Turi Create graph.
g = gl.SGraph()

In [18]:
# One edge per transaction row, pointing from the user (chb) to the document (sys_numb).
g = g.add_edges(
    train_transactions,
    src_field='chb',
    dst_field='sys_numb',
)

In [19]:
# k for the k-core filter applied to the user-document graph.
k_value = 1

In [20]:
# k-core decomposition restricted to the core ids k_value - 1 and k_value.
# NOTE(review): per the Turi Create k-core docs, core ids are clamped to
# [kmin, kmax], so vertices in the k_value-core or deeper should all come out
# with core_id == k_value - confirm.
kc = gl.kcore.create(g, kmin = k_value - 1, kmax = k_value)

In [21]:
# Ids of the vertices whose core_id equals k_value.
core_vertex_ids = kc.core_id.filter_by(k_value, 'core_id')['__id']
# radius=0 requests the neighborhood of exactly those vertices, with no expansion.
training_filtered = g.get_neighborhood(core_vertex_ids, radius=0)

In [23]:
# Turn the subgraph's edge list back into an interaction table with the original column names.
edge_column_names = {'__src_id': 'chb', '__dst_id': 'sys_numb'}
training_filtered = gl.SFrame(training_filtered.edges).rename(edge_column_names)

<h1>Обучение модели</h1>

In [25]:
#(training_data, validation_data) = gl.recommender.util.random_split_by_user(gl.SFrame(training_filtered))

In [26]:
# Keyword arguments forwarded to the ranking factorization recommender.
train_setting = {
    'regularization': 1e-12,
    'linear_regularization': 1e-12,
    'side_data_factorization': True,
    'num_factors': 410,
    'num_sampled_negative_examples': 150,
    'max_iterations': 50,
}

In [27]:
# Train the ranking factorization recommender on the filtered interactions,
# with the normalized title tokens supplied as item side data.
model_fr = gl.ranking_factorization_recommender.create(
    gl.SFrame(training_filtered),
    item_data=gl.SFrame(items),
    # user side data is currently disabled:
    # user_data=gl.SFrame(users[['chb','age','gender']]),
    user_id='chb',
    item_id='sys_numb',
    **train_setting,
)

In [28]:
#prec_rec_data = model_fr.evaluate_precision_recall(validation_data)

In [29]:
# Request 20 recommendations per user from the factorization model.
# NOTE(review): with no `users` argument this presumably covers every user
# seen in training - confirm against the Turi Create docs.
results = model_fr.recommend(k=20)

In [30]:
# Write the factorization model's recommendations as a (chb, sys_numb) submission file.
submission_columns = ['chb', 'sys_numb']
solution = results.to_dataframe()[submission_columns]
solution.to_csv("solution_f_x.csv", sep=';', index=False)

In [31]:
# Content-based recommender: item similarity is derived from the item features
# in `items`, and the observed interactions are supplied for the users.
model_ic = gl.item_content_recommender.create(
    item_data=gl.SFrame(items),
    item_id='sys_numb',
    observation_data=gl.SFrame(training_filtered),
    user_id='chb',
    max_item_neighborhood_size=512,
)

Applying transform:
Class             : AutoVectorizer

Model Fields
------------
Features          : ['norm_title']
Excluded Features : ['sys_numb']

Column      Type  Interpretation  Transforms  Output Type
----------  ----  --------------  ----------  -----------
norm_title  list  categorical     Flatten     dict       




In [32]:
# Request 20 recommendations per user from the item-content model.
results_i = model_ic.recommend(k=20)

In [33]:
# Write the item-content model's recommendations as a (chb, sys_numb) submission file.
submission_columns = ['chb', 'sys_numb']
solution_i = results_i.to_dataframe()[submission_columns]
solution_i.to_csv("solution_i_ix.csv", sep=';', index=False)

In [34]:
# Re-extract both recommendation tables in full: the earlier export cells
# kept only the chb and sys_numb columns.
solution, solution_i = results.to_dataframe(), results_i.to_dataframe()

In [35]:
# Keep the identifiers plus the model score that the blend averages.
scored_columns = ['chb', 'sys_numb', 'score']
solution = solution[scored_columns]
solution_i = solution_i[scored_columns]

In [41]:
# Blend the two models: stack their recommendation tables and average the
# score of each (chb, sys_numb) pair. A pair proposed by only one model keeps
# that model's score unchanged.
# NOTE(review): the two score columns are averaged as-is; whether they are on
# comparable scales is not checked here.
stacked_solutions = pd.concat([solution, solution_i])
df3 = stacked_solutions.groupby(by=['chb', 'sys_numb']).mean()

In [42]:
# Top 20 blended recommendations per user, best score first.
# The rows are ordered by user and then by descending score, so the output
# keeps the ranking instead of re-sorting each user's items by sys_numb.
# Multi-column sort_values is stable, so equal scores fall back to df3's
# (chb, sys_numb) order and the selection is deterministic.
solution_final = (
    df3.reset_index()
    .sort_values(['chb', 'score'], ascending=[True, False])
    .groupby('chb', sort=False)
    .head(20)
    .reset_index(drop=True)
)

In [39]:
# Write the blended recommendations as the final (chb, sys_numb) submission file.
final_columns = ['chb', 'sys_numb']
solution_final = solution_final[final_columns]
solution_final.to_csv("solution_final.csv", sep=';', index=False)