In [1]:
import json
from math import log, exp, log1p
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np
import scipy
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.sparse import coo_matrix
from scipy.linalg import logm, expm
from nltk.stem.snowball import RussianStemmer
import Stemmer
from stop_words import get_stop_words
from datetime import datetime
import matplotlib
%matplotlib inline

In [2]:
def load_post(path):
    with open(path) as json_file:
        post = json.load(json_file)
    hubs = {t['title'] : True for t in post['hubs']}   
    return [post['_id'], post['published']['$date'], post['title'], post['author']['url'], \
            post['domain'], hubs, post['content'], post['tags']] 

def load_posts(path):
    for file_name in listdir(path):
        file_path = join(path, file_name)
        if isfile(file_path):
            yield load_post(file_path)
            
def get_image_count(html):
    return len(re.findall('<img.*?>', html))            
            
def prepare_data(path):
    data = pd.DataFrame(load_posts(path), columns = ['_id', 'published', 'title', 'author', 'domain',\
                                                     'hubs', 'content', 'tags'])
    data['published'] = pd.to_datetime(data['published'])
    # Считаем и нормализуем количество изображений
    #data['image_count'] = data['content'].apply(get_image_count)
    #data['image_count'] = data['image_count'] / data['image_count'].max()
    # Считаем и нормализуем длину текста
    data['content_length'] = data['content'].str.len()
    data['sites'] = data['content'].apply(\
        lambda html: { s:True for s in re.findall('<a href="https?://(.+?)(?:/.*"|")>', html)})
    return data

russian_stemmer = Stemmer.Stemmer('ru')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: russian_stemmer.stemWords(analyzer(doc))

In [3]:
%%time
df_train = prepare_data('./train/')

CPU times: user 1min 55s, sys: 19.9 s, total: 2min 15s
Wall time: 2min 56s


In [4]:
df_train.head(2)

Unnamed: 0,_id,published,title,author,domain,hubs,content,tags,content_length,sites
0,https://habrahabr.ru/company/webnames/blog/121...,2011-06-14 15:52:00,В Турции введена цензура на доменные имена,https://habrahabr.ru/company/webnames/blog/121...,habrahabr.ru,{'Блог компании Webnames.ru': True},<p>Правительство Турции </p>запретило доменные...,[],114,{}
1,https://geektimes.ru/post/102539/,2010-08-24 17:29:00,Draganflyer X8 — мечта любого шпиона,https://geektimes.ru/users/marks,geektimes.ru,{'Железо': True},"<img src=""https://habrastorage.org/storage/hab...","[Draganflyer, беспилотники, UAV, шпионство]",2736,"{'habrahabr.ru': True, 'gizmodo.com': True}"


In [5]:
target = pd.read_csv('./train_target.csv')
train = df_train.merge(target, on = '_id')

In [6]:
train.head()

Unnamed: 0,_id,published,title,author,domain,hubs,content,tags,content_length,sites,favs_lognorm
0,https://habrahabr.ru/company/webnames/blog/121...,2011-06-14 15:52:00,В Турции введена цензура на доменные имена,https://habrahabr.ru/company/webnames/blog/121...,habrahabr.ru,{'Блог компании Webnames.ru': True},<p>Правительство Турции </p>запретило доменные...,[],114,{},0.0
1,https://geektimes.ru/post/102539/,2010-08-24 17:29:00,Draganflyer X8 — мечта любого шпиона,https://geektimes.ru/users/marks,geektimes.ru,{'Железо': True},"<img src=""https://habrastorage.org/storage/hab...","[Draganflyer, беспилотники, UAV, шпионство]",2736,"{'habrahabr.ru': True, 'gizmodo.com': True}",3.295837
2,https://habrahabr.ru/company/droider/blog/127362/,2011-08-30 16:34:00,"Droider Chart. Выпуск 67, прикладной",https://habrahabr.ru/company/droider,habrahabr.ru,{'Блог компании Droider.Ru': True},"Всем, привет!<br>\r\n<br>\r\nВ новом выпуске <...","[android, android os, Droider Chart, Droider, ...",1420,"{'droider.ru': True, 'habrahabr.ru': True}",1.609438
3,https://geektimes.ru/post/119923/,2011-05-25 03:21:00,Играем в ZX Spectrum на iPhone,https://geektimes.ru/users/soulburner,geektimes.ru,{'Смартфоны': True},"<img src=""https://habrastorage.org/storage/hab...","[zx spectrum, ixpectrum, ios, iphone, zx, ност...",2509,"{'zx-spectrum.narod.ru': True, 'www.doggysoft....",2.772589
4,https://geektimes.ru/post/29492/,2008-07-21 14:14:00,Интересное поведение браузеров,https://geektimes.ru/users/tkf,geektimes.ru,{'Чёрная дыра': True},Заметил интересную вещь. На странице есть див ...,"[браузеры, невидимый блок, странность]",343,{},0.0


In [7]:
train[['_id', 'published', 'favs_lognorm']][train['domain'] == 'geektimes.ru'].to_csv('gt_favs.csv', index = False)
train[['_id', 'published', 'favs_lognorm']][train['domain'] == 'habrahabr.ru'].to_csv('habr_favs.csv', index = False)

In [8]:
habr_mean_fav = pd.read_csv('habr_favs_mean_pred.csv').fillna(0)
gt_mean_fav = pd.read_csv('gt_favs_mean_pred.csv').fillna(0)

In [9]:
gt_mean_fav.columns = habr_mean_fav.columns = ['date', 'favs_mean60', 'favs_mean60_pred']
gt_mean_fav['date'] = pd.to_datetime(gt_mean_fav['date'])
habr_mean_fav['date'] = pd.to_datetime(habr_mean_fav['date'])
gt_mean_fav.set_index('date', inplace = True)
habr_mean_fav.set_index('date', inplace = True)

In [12]:
def get_mean_fav(timestamp, domain):
    return (habr_mean_fav if domain == 'habrahabr.ru' else gt_mean_fav).loc[timestamp.date(), 'favs_mean60']         

In [13]:
%%time
train['favs_meanlog'] = train.apply(lambda row: log1p(get_mean_fav(row['published'], row['domain'])), axis = 1)

CPU times: user 1min 17s, sys: 0 ns, total: 1min 17s
Wall time: 1min 17s


In [14]:
train = train.sort_values('published')

In [15]:
n_valid = train[train['published'] > '2016-08-31'].count()[0]

In [16]:
n_train = train.count()[0] - n_valid

In [17]:
y = train['favs_lognorm'] - train['favs_meanlog']

In [18]:
data_train, data_valid, y_train, y_valid = train[ : n_train], train[n_train : ], y[ : n_train], y[n_train : ]

In [19]:
def extract_features(data_train, data_valid):
    title_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 2))
    X_train_title = title_tfidf.fit_transform(data_train['title'])
    X_valid_title = title_tfidf.transform(data_valid['title'])
    hub_vect = DictVectorizer()
    X_train_hub = hub_vect.fit_transform(data_train['hubs'])
    X_valid_hub = hub_vect.transform(data_valid['hubs'])
    other_dict = DictVectorizer()
    X_train_other = other_dict.fit_transform(data_train[['author', 'domain']].T.to_dict().values())
    X_valid_other = other_dict.transform(data_valid[['author', 'domain']].T.to_dict().values())
    #publ_hour = DictVectorizer()
    #X_train_hour = publ_hour.fit_transform([{time.hour:True} for time in data_train['published']])
    #X_valid_hour = publ_hour.transform([{time.hour:True} for time in data_valid['published']])
    publ_weekday = DictVectorizer()
    X_train_weekday = publ_weekday.fit_transform([{time.weekday():True} for time in data_train['published']])
    X_valid_weekday = publ_weekday.transform([{time.weekday():True} for time in data_valid['published']])
    tags = DictVectorizer()
    X_train_tags = tags.fit_transform([dict((t, True) for t in tags) for tags in data_train['tags']])
    X_valid_tags = tags.transform([dict((t, True) for t in tags) for tags in data_valid['tags']])
    html_tag_regexp = re.compile('<.*?>')
    #content_tfidf = HashingVectorizer(stop_words=get_stop_words('russian'), ngram_range=(1, 2), n_features = 2**18)
    default_prerpocessor = TfidfVectorizer().build_preprocessor()
    remove_html_tags_preprocessor = lambda s: default_prerpocessor(html_tag_regexp.sub('', s))
    content_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 1),\
                                   preprocessor = remove_html_tags_preprocessor)
    X_train_content = content_tfidf.fit_transform(data_train['content'])
    X_valid_content = content_tfidf.transform(data_valid['content'])
    max_len_log = log(data_train['content_length'].max())
    X_train_textlen = coo_matrix(data_train['content_length'].apply(lambda x: log1p(x) / max_len_log)).T
    X_valid_textlen = coo_matrix(data_valid['content_length'].apply(lambda x: log1p(x) / max_len_log)).T
    sites = DictVectorizer()
    X_train_sites = sites.fit_transform(data_train['sites'])
    X_valid_sites = sites.transform(data_valid['sites'])
    X_train = scipy.sparse.hstack([X_train_title, X_train_hub, X_train_other, X_train_content,\
                               X_train_weekday, X_train_tags, X_train_textlen, X_train_sites]).tocsr(copy = False) 
    X_valid = scipy.sparse.hstack([X_valid_title, X_valid_hub, X_valid_other, X_valid_content,\
                               X_valid_weekday, X_valid_tags, X_valid_textlen, X_valid_sites]).tocsr(copy = False) 
    return X_train, X_valid

In [20]:
%%time
X_train, X_valid = extract_features(data_train, data_valid)

CPU times: user 3min 56s, sys: 5.18 s, total: 4min 1s
Wall time: 4min 7s


Загрузили и обработали данные, попробуем обучить модель.

In [21]:
mse_history = []

In [26]:
%%time
reg = linear_model.SGDRegressor(n_iter = 200,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.000001)
reg.fit(X_train, y_train)
y_valid_pred = reg.predict(X_valid)

CPU times: user 3min 59s, sys: 72 ms, total: 3min 59s
Wall time: 3min 59s


In [25]:
mse = mean_squared_error(y_valid, y_valid_pred)
mse_history.append(mse)
print(mse_history)

[0.50402791238449729, 0.54674477247018693]


Получим результаты на тесте.
Приготовим данные для обучения модели:

In [None]:
test = prepare_data('./test/')

In [None]:
X, X_test = extract_features(train, test)

Обучим модель:

In [None]:
%%time
rgs = linear_model.SGDRegressor(n_iter = 100,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.00001)
rgs.fit(X, y)

In [None]:
y_test_pred = rgs.predict(X_test)

In [None]:
habr_mean_fav_last = habr_mean_fav.loc['2016-10-31':]['favs_mean60'].mean()
gt_mean_fav_last = gt_mean_fav.loc['2016-10-31':]['favs_mean60'].mean()

In [None]:
gt_mean_fav_last

In [None]:
def get_pred_mean_fav(timestamp, domain):
    return (habr_mean_fav if domain == 'habrahabr.ru' else gt_mean_fav).loc[ts.date(), 'favs_mean60_pred'] 

In [None]:
test['favs_meanlog'] = test.apply(lambda row: log1p(get_pred_mean_fav(row['published'], row['domain'])), axis = 1)
test['favs_lognorm'] = y_test_pred + test['favs_meanlog']
test[['_id', 'favs_lognorm']].to_csv("my_submission.csv", index = False)