In [1]:
import json
from math import log, exp
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import numpy as np
import scipy
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from scipy.sparse import coo_matrix
from scipy.linalg import logm, expm
from nltk.stem.snowball import RussianStemmer
import Stemmer
from stop_words import get_stop_words
from datetime import datetime
import matplotlib
%matplotlib inline

In [2]:
def load_post(path):
    with open(path) as json_file:
        post = json.load(json_file)
    hubs = {t['title'] : True for t in post['hubs']}   
    return [post['_id'], post['published']['$date'], post['title'], post['author']['url'], \
            post['domain'], hubs, post['content'], post['tags']] 

def load_posts(path):
    for file_name in listdir(path):
        file_path = join(path, file_name)
        if isfile(file_path):
            yield load_post(file_path)
            
def get_image_count(html):
    return len(re.findall('<img.*?>', html))            
            
def prepare_data(path):
    data = pd.DataFrame(load_posts(path), columns = ['_id', 'published', 'title', 'author', 'domain',\
                                                     'hubs', 'content', 'tags'])
    data['published'] = pd.to_datetime(data['published'])
    # Считаем и нормализуем количество изображений
    #data['image_count'] = data['content'].apply(get_image_count)
    #data['image_count'] = data['image_count'] / data['image_count'].max()
    # Считаем и нормализуем длину текста
    data['content_length'] = data['content'].str.len()
    data['sites'] = data['content'].apply(\
        lambda html: { s:True for s in re.findall('<a href="https?://(.+?)(?:/.*"|")>', html)})
    return data

russian_stemmer = Stemmer.Stemmer('ru')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: russian_stemmer.stemWords(analyzer(doc))

In [3]:
%%time
df_train = prepare_data('./train/')

CPU times: user 1min 55s, sys: 18.2 s, total: 2min 13s
Wall time: 2min 54s


In [5]:
df_train.head(2)

Unnamed: 0,_id,published,title,author,domain,hubs,content,tags,content_length,sites
0,https://habrahabr.ru/company/webnames/blog/121...,2011-06-14 15:52:00,В Турции введена цензура на доменные имена,https://habrahabr.ru/company/webnames/blog/121...,habrahabr.ru,{'Блог компании Webnames.ru': True},<p>Правительство Турции </p>запретило доменные...,[],114,{}
1,https://geektimes.ru/post/102539/,2010-08-24 17:29:00,Draganflyer X8 — мечта любого шпиона,https://geektimes.ru/users/marks,geektimes.ru,{'Железо': True},"<img src=""https://habrastorage.org/storage/hab...","[Draganflyer, беспилотники, UAV, шпионство]",2736,"{'habrahabr.ru': True, 'gizmodo.com': True}"


In [6]:
target = pd.read_csv('./train_target.csv')
train = df_train.merge(target, on = '_id')

In [7]:
def calc_month_mean_favs(date, df, domain, delta = '60d'): 
    return df[(df['published'] > date - pd.Timedelta(delta)) &\
              (df['published'] < date + pd.Timedelta(delta)) &\
              (df['domain'] == domain)]['favs_lognorm'].mean()

In [8]:
train['mean_gt_favs'] = train['published'].apply(lambda d: calc_month_mean_favs(d, train, 'geektimes.ru'))
train['mean_habr_favs'] = train['published'].apply(lambda d: calc_month_mean_favs(d, train, 'habrahabr.ru'))

In [8]:
y = train['favs_lognorm'] - train['mean_favs']

In [9]:
data_train, data_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=42)

In [10]:
title_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 2))
X_train_title = title_tfidf.fit_transform(data_train['title'])
X_valid_title = title_tfidf.transform(data_valid['title'])

In [11]:
hub_vect = DictVectorizer()
X_train_hub = hub_vect.fit_transform(data_train['hubs'])
X_valid_hub = hub_vect.transform(data_valid['hubs'])

In [12]:
other_dict = DictVectorizer()
X_train_other = other_dict.fit_transform(data_train[['author', 'domain']].T.to_dict().values())
X_valid_other = other_dict.transform(data_valid[['author', 'domain']].T.to_dict().values())

In [13]:
publ_hour = DictVectorizer()
X_train_hour = publ_hour.fit_transform([{time.hour:True} for time in data_train['published']])
X_valid_hour = publ_hour.transform([{time.hour:True} for time in data_valid['published']])

In [14]:
publ_weekday = DictVectorizer()
X_train_weekday = publ_weekday.fit_transform([{time.weekday():True} for time in data_train['published']])
X_valid_weekday = publ_weekday.transform([{time.weekday():True} for time in data_valid['published']])

In [15]:
tags = DictVectorizer()
X_train_tags = tags.fit_transform([dict((t, True) for t in tags) for tags in data_train['tags']])
X_valid_tags = tags.transform([dict((t, True) for t in tags) for tags in data_valid['tags']])

In [18]:
%%time
html_tag_regexp = re.compile('<.*?>')
#content_tfidf = HashingVectorizer(stop_words=get_stop_words('russian'), ngram_range=(1, 2), n_features = 2**18)
default_prerpocessor = TfidfVectorizer().build_preprocessor()
remove_html_tags_preprocessor = lambda s: default_prerpocessor(html_tag_regexp.sub('', s))
content_tfidf = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 1),\
                               preprocessor = remove_html_tags_preprocessor)
X_train_content = content_tfidf.fit_transform(data_train['content'])
X_valid_content = content_tfidf.transform(data_valid['content'])

CPU times: user 2min 59s, sys: 1.61 s, total: 3min
Wall time: 3min


In [19]:
max_len = data_train['content_length'].max()
X_train_textlen = coo_matrix(data_train['content_length'] / max_len).T
X_valid_textlen = coo_matrix(data_valid['content_length'] / max_len).T

In [20]:
sites = DictVectorizer()
X_train_sites = sites.fit_transform(data_train['sites'])
X_valid_sites = sites.transform(data_valid['sites'])

In [21]:
X_train = scipy.sparse.hstack([X_train_title, X_train_hub, X_train_other, X_train_content,\
                               X_train_weekday, X_train_tags, X_train_textlen, X_train_sites]).tocsr(copy = False) #X_train_content,\
X_valid = scipy.sparse.hstack([X_valid_title, X_valid_hub, X_valid_other, X_valid_content,\
                               X_valid_weekday, X_valid_tags, X_valid_textlen, X_valid_sites]).tocsr(copy = False) #X_valid_content,\

Загрузили и обработали данные, попробуем обучить модель.

In [None]:
mse_history = []

In [22]:
%%time
reg = linear_model.SGDRegressor(n_iter = 100,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.000001)
reg.fit(X_train, y_train)
y_valid_pred = reg.predict(X_valid)

CPU times: user 2min 4s, sys: 756 ms, total: 2min 5s
Wall time: 2min 4s


In [23]:
mse = mean_squared_error(y_valid, y_valid_pred)
mse_history.append(mse)
print(mse_history)

[0.83687382205867233]


Получим результаты на тесте.
Приготовим данные для обучения модели:

In [24]:
title_tfidf_ = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 2))
X_title = title_tfidf_.fit_transform(train['title'])
hub_vect_ = DictVectorizer()
X_hub = hub_vect_.fit_transform(train['hubs'])
other_dict_ = DictVectorizer()
X_other = other_dict_.fit_transform(train[['author', 'domain']].T.to_dict().values())
tags_ = DictVectorizer()
X_tags = tags_.fit_transform([dict((t, True) for t in tags) for tags in train['tags']])
content_tfidf_ = TfidfVectorizer(stop_words=get_stop_words('russian'), analyzer='word', ngram_range=(1, 1),\
                               preprocessor = remove_html_tags_preprocessor)
X_content = content_tfidf_.fit_transform(train['content'])
publ_weekday_ = DictVectorizer()
X_weekday = publ_weekday.fit_transform([{time.weekday():True} for time in train['published']])
max_len_ = train['content_length'].max()
X_textlen = coo_matrix(train['content_length'] / max_len_).T
X = scipy.sparse.hstack([X_title, X_hub, X_other, X_content, X_weekday, X_tags, X_textlen]).tocsr(copy = False) 

Обучим модель:

In [25]:
rgs = linear_model.SGDRegressor(n_iter = 100,  penalty = 'elasticnet', loss = 'squared_epsilon_insensitive', alpha = 0.000001)
rgs.fit(X, y)
mean_squared_error(y, rgs.predict(X))

0.59028766734424143

Получим предсказание на тестовых данных:

In [None]:
test = prepare_data('./test/')

In [None]:
X_test_title = title_tfidf_.transform(test['title'])
X_test_hub = hub_vect_.transform(test['hubs'])
X_test_other = other_dict_.transform(test[['author', 'domain']].T.to_dict().values())
X_test_tags = tags_.transform([dict((t, True) for t in tags) for tags in test['tags']])
X_test_content = content_tfidf_.transform(test['content'])
X_test_weekday = publ_weekday_.transform([{time.weekday():True} for time in test['published']])
X_test_textlen = coo_matrix(test['content_length'] / max_len_).T
X_test = scipy.sparse.hstack([X_test_title, X_test_hub, X_test_other, X_test_content, X_test_weekday,\
                              X_test_tags, X_test_textlen]).tocsr(copy = False)
y_test_pred = rgs.predict(X_test)

In [None]:
test['favs_lognorm'] = y_test_pred
test[['_id', 'favs_lognorm']].to_csv("my_submission.csv", index = False)

In [None]:
!head my_submission.csv

In [None]:
!head sample_submission.csv

In [None]:
html = train['content'].iloc[10000]

In [None]:
{ s:True for s in re.findall('<a href="https?://(.+?)(?:/.*"|")>', html)}

In [None]:
'<a href="https?://(.+)">'

In [None]:
html

In [None]:
log(2.7)

In [None]:
test.head()

In [None]:
dir('a')

In [None]:
'a b'.replace(' ', '_')

In [None]:
' '.join(['a', 'b'])

In [None]:
stemmer = RussianStemmer(False)
stemmer.stem('клонировать')

In [None]:
train.columns = ['_id', 'published', 'title', 'author', 'domain', 'hubs', 'content',\
       'tags', 'text', 'image_count', 'text_length', 'sites', 'favs_lognorm']

In [None]:
import math
math.exp(7.047517)

In [None]:
re.match('[A-Za-z0-9]{3,}', 'Asdsn')

In [None]:
target['favs_lognorm'] = np.expm1(target['favs_lognorm'])

In [None]:
2**16

In [None]:
target

In [None]:
train['published'].min(), train['published'].max(), test['published'].min(), test['published'].max() 

In [None]:
train['month'] = train['published'].apply()

In [None]:
m = DateTime('2016-12-31 22:49:00')

In [None]:
%%time
train['mean_favs'] = [calc_month_mean_favs(d, train) for d in train.index]

In [None]:
train.index

In [None]:
train.set_index('published', inplace = True)

In [None]:
vis = train[['published', 'mean_favs']].sort_values('published')

In [None]:
vis.plot(x='published', y = 'mean_favs')