In [66]:
import json
from math import log, exp, log1p
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np
import scipy
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.sparse import coo_matrix
from scipy.linalg import logm, expm
from nltk.stem.snowball import RussianStemmer
import Stemmer
from stop_words import get_stop_words
from datetime import datetime
import matplotlib
%matplotlib inline

In [2]:
def load_post(path):
    with open(path) as json_file:
        post = json.load(json_file)
    hubs = {t['title'] : True for t in post['hubs']}   
    return [post['_id'], post['published']['$date'], post['title'], post['author']['url'], \
            post['domain'], hubs, post['content'], post['tags']] 

def load_posts(path):
    for file_name in listdir(path):
        file_path = join(path, file_name)
        if isfile(file_path):
            yield load_post(file_path)
            
def get_image_count(html):
    return len(re.findall('<img.*?>', html))            
            
def prepare_data(path):
    data = pd.DataFrame(load_posts(path), columns = ['_id', 'published', 'title', 'author', 'domain',\
                                                     'hubs', 'content', 'tags'])
    data['published'] = pd.to_datetime(data['published'])
    # Считаем и нормализуем количество изображений
    #data['image_count'] = data['content'].apply(get_image_count)
    #data['image_count'] = data['image_count'] / data['image_count'].max()
    # Считаем и нормализуем длину текста
    data['content_length'] = data['content'].str.len()
    data['sites'] = data['content'].apply(\
        lambda html: { s:True for s in re.findall('<a href="https?://(.+?)(?:/.*"|")>', html)})
    return data

russian_stemmer = Stemmer.Stemmer('ru')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: russian_stemmer.stemWords(analyzer(doc))

In [3]:
%%time
df_train = prepare_data('./train/')

CPU times: user 1min 54s, sys: 18.4 s, total: 2min 13s
Wall time: 2min 54s


In [11]:
df_train.head(2)

Unnamed: 0,_id,published,title,author,domain,hubs,content,tags,content_length,sites
0,https://habrahabr.ru/company/webnames/blog/121...,2011-06-14 15:52:00,В Турции введена цензура на доменные имена,https://habrahabr.ru/company/webnames/blog/121...,habrahabr.ru,{'Блог компании Webnames.ru': True},<p>Правительство Турции </p>запретило доменные...,[],114,{}
1,https://geektimes.ru/post/102539/,2010-08-24 17:29:00,Draganflyer X8 — мечта любого шпиона,https://geektimes.ru/users/marks,geektimes.ru,{'Железо': True},"<img src=""https://habrastorage.org/storage/hab...","[Draganflyer, беспилотники, UAV, шпионство]",2736,"{'habrahabr.ru': True, 'gizmodo.com': True}"


In [5]:
target = pd.read_csv('./train_target.csv')
train = df_train.merge(target, on = '_id')

In [70]:
train.head()

Unnamed: 0,_id,published,title,author,domain,hubs,content,tags,content_length,sites,favs_lognorm,favs_meanlog
108649,https://geektimes.ru/post/1453/,2006-01-16 21:02:00,Mail.ru и «Яндекс» заключили стратегическое со...,https://geektimes.ru/users/gameboyhippo,geektimes.ru,{'Чёрная дыра': True},Два крупнейших портала российского интернета —...,"[поиск, реклама, технологии, контекст, партнер...",1450,"{'www.mail.ru': True, 'www.google.com': True}",0.0,5.793826
150143,https://geektimes.ru/post/1455/,2006-01-19 21:11:00,«Яндекс» поддерживает «правильные» тарифы на и...,https://geektimes.ru/users/gameboyhippo,geektimes.ru,{'Чёрная дыра': True},"Сегодня стартует <a href=""http://tarif.yandex....","[широкополосный доступ, трафик, тариф, Яндекс,...",1253,{'tarif.yandex.ru': True},0.0,5.793826
135431,https://geektimes.ru/post/1454/,2006-03-21 21:07:00,«Яндекс» удвоил доходы и прибыль,https://geektimes.ru/users/gameboyhippo,geektimes.ru,{'Чёрная дыра': True},"Чистая прибыль (net income) компании <a href=""...","[финансы, прибыль, доход, Яндекс, реклама, ста...",1184,{'www.yandex.ru': True},0.0,5.793826
38576,https://geektimes.ru/post/1452/,2006-03-22 21:00:00,Поиск по блогам от «Яндекса» вышел из беты,https://geektimes.ru/users/gameboyhippo,geektimes.ru,{'Чёрная дыра': True},"<a href=""http://www.yandex.ru/"">«Яндекс»</a> з...","[блогосфера, поиск, Яндекс, релевантность, инф...",1173,"{'www.yandex.ru': True, 'blogs.yandex.ru': Tru...",0.693147,5.793826
127317,https://geektimes.ru/post/1457/,2006-03-30 21:58:00,Запущен первый российский поиск по wap-ресурсам,https://geektimes.ru/users/gameboyhippo,geektimes.ru,{'Чёрная дыра': True},"Компания <a href=""http://www.mail.ru/"">Mail.Ru...","[поиск, технологии, WAP, информация, морфологи...",575,{'www.mail.ru': True},0.0,5.793826


In [10]:
train[['_id', 'published', 'favs_lognorm']][train['domain'] == 'geektimes.ru'].to_csv('gt_favs.csv', index = False)
train[['_id', 'published', 'favs_lognorm']][train['domain'] == 'habrahabr.ru'].to_csv('habr_favs.csv', index = False)

In [57]:
habr_mean_fav = pd.read_csv('habr_favs_mean_pred.csv').fillna(0)
gt_mean_fav = pd.read_csv('gt_favs_mean_pred.csv').fillna(0)

In [58]:
gt_mean_fav.columns = habr_mean_fav.columns = ['date', 'favs_mean60', 'favs_mean60_pred']
gt_mean_fav['date'] = pd.to_datetime(gt_mean_fav['date'])
habr_mean_fav['date'] = pd.to_datetime(habr_mean_fav['date'])
gt_mean_fav.set_index('date', inplace = True)
habr_mean_fav.set_index('date', inplace = True)

In [59]:
def get_mean_fav(timestamp, domain):
    return (habr_mean_fav if domain == 'habrahabr.ru' else gt_mean_fav).loc[ts.date(), 'favs_mean60']         

In [67]:
%%time
train['favs_meanlog'] = train.apply(lambda row: log1p(get_mean_fav(row['published'], row['domain'])), axis = 1)

CPU times: user 1min 17s, sys: 24 ms, total: 1min 17s
Wall time: 1min 17s


In [69]:
train = train.sort_values('published')

In [74]:
n_valid = train[train['published'] > '2016-08-31'].count()[0]

In [76]:
n_train = train.count()[0] - n_valid

In [71]:
y = train['favs_lognorm'] - train['favs_meanlog']

In [77]:
data_train, data_valid, y_train, y_valid = train[ : n_train], train[n_train : ], y[ : n_train], y[n_train : ]

In [87]:
def extract_features(data_train, data_valid):
    title_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 2))
    X_train_title = title_tfidf.fit_transform(data_train['title'])
    X_valid_title = title_tfidf.transform(data_valid['title'])
    hub_vect = DictVectorizer()
    X_train_hub = hub_vect.fit_transform(data_train['hubs'])
    X_valid_hub = hub_vect.transform(data_valid['hubs'])
    other_dict = DictVectorizer()
    X_train_other = other_dict.fit_transform(data_train[['author', 'domain']].T.to_dict().values())
    X_valid_other = other_dict.transform(data_valid[['author', 'domain']].T.to_dict().values())
    #publ_hour = DictVectorizer()
    #X_train_hour = publ_hour.fit_transform([{time.hour:True} for time in data_train['published']])
    #X_valid_hour = publ_hour.transform([{time.hour:True} for time in data_valid['published']])
    publ_weekday = DictVectorizer()
    X_train_weekday = publ_weekday.fit_transform([{time.weekday():True} for time in data_train['published']])
    X_valid_weekday = publ_weekday.transform([{time.weekday():True} for time in data_valid['published']])
    tags = DictVectorizer()
    X_train_tags = tags.fit_transform([dict((t, True) for t in tags) for tags in data_train['tags']])
    X_valid_tags = tags.transform([dict((t, True) for t in tags) for tags in data_valid['tags']])
    html_tag_regexp = re.compile('<.*?>')
    #content_tfidf = HashingVectorizer(stop_words=get_stop_words('russian'), ngram_range=(1, 2), n_features = 2**18)
    default_prerpocessor = TfidfVectorizer().build_preprocessor()
    remove_html_tags_preprocessor = lambda s: default_prerpocessor(html_tag_regexp.sub('', s))
    content_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 1),\
                                   preprocessor = remove_html_tags_preprocessor)
    X_train_content = content_tfidf.fit_transform(data_train['content'])
    X_valid_content = content_tfidf.transform(data_valid['content'])
    max_len_log = log(data_train['content_length'].max())
    X_train_textlen = coo_matrix(data_train['content_length'].apply(lambda x: log1p(x) / max_len_log)).T
    X_valid_textlen = coo_matrix(data_valid['content_length'].apply(lambda x: log1p(x) / max_len_log)).T
    sites = DictVectorizer()
    X_train_sites = sites.fit_transform(data_train['sites'])
    X_valid_sites = sites.transform(data_valid['sites'])
    X_train = scipy.sparse.hstack([X_train_title, X_train_hub, X_train_other, X_train_content,\
                               X_train_weekday, X_train_tags, X_train_textlen, X_train_sites]).tocsr(copy = False) 
    X_valid = scipy.sparse.hstack([X_valid_title, X_valid_hub, X_valid_other, X_valid_content,\
                               X_valid_weekday, X_valid_tags, X_valid_textlen, X_valid_sites]).tocsr(copy = False) 
    return X_train, X_valid

In [88]:
%%time
X_train, X_valid = extract_features(data_train, data_valid)

CPU times: user 3min 50s, sys: 6.35 s, total: 3min 56s
Wall time: 4min 11s


Загрузили и обработали данные, попробуем обучить модель.

In [89]:
mse_history = []

In [90]:
%%time
reg = linear_model.SGDRegressor(n_iter = 100,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.000001)
reg.fit(X_train, y_train)
y_valid_pred = reg.predict(X_valid)

CPU times: user 2min 9s, sys: 24 ms, total: 2min 9s
Wall time: 2min 9s


In [91]:
mse = mean_squared_error(y_valid, y_valid_pred)
mse_history.append(mse)
print(mse_history)

[0.62436598687365008]


Получим результаты на тесте.
Приготовим данные для обучения модели:

In [92]:
test = prepare_data('./test/')

In [93]:
X, X_test = extract_features(train, test)

Обучим модель:

In [95]:
%%time
rgs = linear_model.SGDRegressor(n_iter = 1000,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.00001)
rgs.fit(X, y)

CPU times: user 30min 25s, sys: 72 ms, total: 30min 25s
Wall time: 30min 24s


In [97]:
y_test_pred = rgs.predict(X_test)

In [104]:
habr_mean_fav_last = habr_mean_fav.loc['2016-10-31':]['favs_mean60'].mean()
gt_mean_fav_last = gt_mean_fav.loc['2016-10-31':]['favs_mean60'].mean()

In [106]:
gt_mean_fav_last

215.41996336996343

In [112]:
def get_pred_mean_fav(timestamp, domain):
    return (habr_mean_fav if domain == 'habrahabr.ru' else gt_mean_fav).loc[ts.date(), 'favs_mean60_pred'] 

In [113]:
test['favs_meanlog'] = test.apply(lambda row: log1p(get_pred_mean_fav(row['published'], row['domain'])), axis = 1)
test['favs_lognorm'] = y_test_pred + test['favs_meanlog']
test[['_id', 'favs_lognorm']].to_csv("my_submission.csv", index = False)

array([-2.8647031 , -2.26892618, -3.43648231, ..., -2.29441615,
       -2.66372756, -2.59215377])

Получим предсказание на тестовых данных:

In [None]:
test = prepare_data('./test/')

In [None]:
X_test_title = title_tfidf_.transform(test['title'])
X_test_hub = hub_vect_.transform(test['hubs'])
X_test_other = other_dict_.transform(test[['author', 'domain']].T.to_dict().values())
X_test_tags = tags_.transform([dict((t, True) for t in tags) for tags in test['tags']])
X_test_content = content_tfidf_.transform(test['content'])
X_test_weekday = publ_weekday_.transform([{time.weekday():True} for time in test['published']])
X_test_textlen = coo_matrix(test['content_length'] / max_len_).T
X_test = scipy.sparse.hstack([X_test_title, X_test_hub, X_test_other, X_test_content, X_test_weekday,\
                              X_test_tags, X_test_textlen]).tocsr(copy = False)
y_test_pred = rgs.predict(X_test)

In [None]:
test['favs_lognorm'] = y_test_pred
test[['_id', 'favs_lognorm']].to_csv("my_submission.csv", index = False)

In [None]:
!head my_submission.csv

In [None]:
!head sample_submission.csv

In [None]:
html = train['content'].iloc[10000]

In [None]:
{ s:True for s in re.findall('<a href="https?://(.+?)(?:/.*"|")>', html)}

In [None]:
'<a href="https?://(.+)">'

In [None]:
html

In [None]:
log(2.7)

In [None]:
test.head()

In [None]:
dir('a')

In [None]:
'a b'.replace(' ', '_')

In [None]:
' '.join(['a', 'b'])

In [None]:
stemmer = RussianStemmer(False)
stemmer.stem('клонировать')

In [None]:
train.columns = ['_id', 'published', 'title', 'author', 'domain', 'hubs', 'content',\
       'tags', 'text', 'image_count', 'text_length', 'sites', 'favs_lognorm']

In [None]:
import math
math.exp(7.047517)

In [None]:
re.match('[A-Za-z0-9]{3,}', 'Asdsn')

In [None]:
target['favs_lognorm'] = np.expm1(target['favs_lognorm'])

In [None]:
2**16

In [None]:
target

In [None]:
train['published'].min(), train['published'].max(), test['published'].min(), test['published'].max() 

In [None]:
train['month'] = train['published'].apply()

In [None]:
m = DateTime('2016-12-31 22:49:00')

In [None]:
%%time
train['mean_favs'] = [calc_month_mean_favs(d, train) for d in train.index]

In [None]:
train.index

In [None]:
train.set_index('published', inplace = True)

In [None]:
vis = train[['published', 'mean_favs']].sort_values('published')

In [None]:
vis.plot(x='published', y = 'mean_favs')

In [None]:
title_tfidf_ = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 2))
X_title = title_tfidf_.fit_transform(train['title'])
hub_vect_ = DictVectorizer()
X_hub = hub_vect_.fit_transform(train['hubs'])
other_dict_ = DictVectorizer()
X_other = other_dict_.fit_transform(train[['author', 'domain']].T.to_dict().values())
tags_ = DictVectorizer()
X_tags = tags_.fit_transform([dict((t, True) for t in tags) for tags in train['tags']])
content_tfidf_ = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 1),\
                               preprocessor = remove_html_tags_preprocessor)
X_content = content_tfidf_.fit_transform(train['content'])
publ_weekday_ = DictVectorizer()
X_weekday = publ_weekday.fit_transform([{time.weekday():True} for time in train['published']])
max_len_ = train['content_length'].max()
X_textlen = coo_matrix(train['content_length'] / max_len_).T
X = scipy.sparse.hstack([X_title, X_hub, X_other, X_content, X_weekday, X_tags, X_textlen]).tocsr(copy = False) 