In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import  TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge
from gc import collect
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, make_scorer
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.sparse import hstack
import re
from nltk.corpus import stopwords

In [2]:
def write_submission_file(prediction, filename='../out/submit.csv',
                          path_to_sample='../data/sample_submission.csv'):
    """Save predictions in the layout of the sample submission file.

    The sample CSV is read with its ``id`` column as the index, its
    ``log_recommends`` column is set to ``prediction``, and the resulting
    frame is written to ``filename``.

    Parameters
    ----------
    prediction : array-like
        Values stored in the ``log_recommends`` column.
    filename : str
        Destination path of the submission CSV.
    path_to_sample : str
        Path of the sample submission CSV that provides the ``id`` index.
    """
    sample = pd.read_csv(path_to_sample, index_col='id')

    sample['log_recommends'] = prediction
    sample.to_csv(filename)

In [3]:
def add_loglength(df, X, scaler, fit):
    """Append the scaled ``log_length`` column of ``df`` to ``X``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``log_length`` column.
    X : sparse matrix
        Existing feature matrix; the new column is stacked on its right.
    scaler : object
        Scaler exposing ``fit_transform`` and ``transform``.
    fit : bool
        When truthy the scaler is fitted on this data (``fit_transform``);
        otherwise the already fitted scaler is applied (``transform``).

    Returns
    -------
    tuple
        ``(scaler, stacked_matrix)``.
    """
    column = df['log_length'].values.reshape(-1, 1)
    rescale = scaler.fit_transform if fit else scaler.transform
    return scaler, hstack((X, rescale(column)))

In [4]:
def add_day(df, X, scaler, fit):
    """Append the scaled ``delta_time`` column of ``df`` to ``X``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``delta_time`` column convertible to float.
        NOTE(review): the preprocessing cells of this notebook create a
        ``delta_date`` column, not ``delta_time`` -- confirm the column name
        before calling this helper.
    X : sparse matrix
        Existing feature matrix; the new column is stacked on its right.
    scaler : object
        Scaler exposing ``fit_transform`` and ``transform``.
    fit : bool
        When truthy the scaler is fitted on this data (``fit_transform``);
        otherwise the already fitted scaler is applied (``transform``).

    Returns
    -------
    tuple
        ``(scaler, stacked_matrix)``.
    """
    # Builtin ``float`` instead of the ``np.float`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24.
    day = df['delta_time'].astype(float).values.reshape(-1, 1)
    if fit:
        day = scaler.fit_transform(day)
    else:
        day = scaler.transform(day)
    return scaler, hstack((X, day))

In [5]:
def get_lang(lang):
    """Collapse rare language codes into a single ``'other'`` bucket.

    Returns ``lang`` unchanged when it is one of the ten retained codes,
    otherwise the string ``'other'``.
    """
    kept_codes = ('en', 'pt', 'es', 'fr', 'it', 'ru', 'tr', 'ja', 'de', 'id')
    if lang in kept_codes:
        return lang
    return 'other'


In [6]:
def add_hour(df, X_sparse):
    """Append 24 one-hot hour-of-day columns to ``X_sparse``.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``time`` column whose values expose an ``hour``
        attribute (e.g. ``datetime.time`` objects).
    X_sparse : sparse matrix
        Existing feature matrix with one row per row of ``df``.

    Returns
    -------
    sparse matrix
        ``X_sparse`` followed by 24 indicator columns; column ``i`` of the
        new block is 1 where the hour equals ``i`` and 0 elsewhere.
    """
    hour = df['time'].apply(lambda ts: ts.hour)
    # Build all indicator columns first and stack once: stacking inside the
    # loop re-created the whole (large) sparse matrix 24 times.
    indicators = [
        (hour == h).astype('int').values.reshape(-1, 1)
        for h in range(24)
    ]
    return hstack([X_sparse, *indicators])

In [7]:
%%time
# Load the xz-compressed training frame and discard rows whose `length` is 0.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
train = pd.read_pickle('../data/train.pickle', compression='xz')
train.drop(index=train.loc[train['length'] == 0].index, inplace=True)

CPU times: user 424 ms, sys: 13.6 ms, total: 437 ms
Wall time: 633 ms


In [8]:
# (rows, columns) of the training frame after removing zero-length rows.
train.shape

(62253, 13)

In [9]:
# Keep the ten retained language codes and map everything else to 'other'.
train['lang'] = train['lang'].apply(get_lang)

In [10]:
# Parse the raw date column into timestamps.
train['date'] = pd.to_datetime(train['date'])
# Parse the time column and keep only the time-of-day part of each value.
train['time'] = pd.to_datetime(train['time']).apply(lambda stamp: stamp.time())
# year_month packs year and month into one integer, e.g. 2015-03 -> 201503.
train['year_month'] = train['date'].apply(lambda day: 100 * day.year + day.month)
train['year'] = train['date'].apply(lambda day: day.year)


In [11]:
# Order the rows by month of publication.
train.sort_values('year_month', inplace=True)
# Repeats the zero-length filter applied right after loading; the recorded row
# count is unchanged (62253), so it removes nothing at this point.
train.drop(index=train[train.length == 0].index, inplace=True)

# Summary of the target (mean, std, max) together with the frame shape.
(
    train['target'].mean(),
    train['target'].std(),
    train['target'].max(),
    train.shape,
)

(3.051711004770855, 1.9252352577043828, 11.25157, (62253, 15))

In [12]:
# Vectorised NumPy ufunc calls replace the per-row Python lambdas; the values
# are the same, computed in one pass over each column.
# expm1 is the inverse of log1p -- presumably `target` is log1p(recommends);
# TODO confirm against the data description.
train['target_expm'] = np.expm1(train['target'])
# log10(length + 1): the +1 keeps the result finite for a length of 0.
train['log_length'] = np.log10(train['length'] + 1)


In [13]:
# Keep only rows strictly inside the chosen quantile bands. Every quantile is
# computed on the frame as it is *before* any of these filters is applied.
keep = train.target.gt(train.target.quantile(0.1))
keep &= train.target.lt(train.target.quantile(0.98))
keep &= train.length.lt(train.length.quantile(.99))
keep &= train.log_length.gt(train.log_length.quantile(.01))
train = train[keep]
# Discard everything published in 2012 or earlier.
train = train[train.year > 2012]
collect()

7

In [14]:
# Whole days elapsed since the earliest article remaining in the training frame.
mindate = train['date'].min()
train['delta_date'] = (train['date'] - mindate).apply(lambda delta: delta.days)

In [15]:
# Peek at the first few day offsets (rows are ordered by year_month, not by date).
train['delta_date'].head()

45154     2
25360    14
50640    21
58055     6
38292     8
Name: delta_date, dtype: int64

In [16]:
# Article texts for the training set; the `content` column is vectorised below.
train_corps = pd.read_csv('../data/train_corps.csv')

In [17]:
# Scorer wrapping mean absolute error; the cross-validation cells below report
# it as a positive number (lower is better).
scorer = make_scorer(mean_absolute_error)
# Regression target as a plain NumPy array, taken after all row filters.
y = train.target.values
collect()

0

In [18]:
# The target now lives in `y`; remove the column from the feature frame.
train.drop('target', axis=1, inplace=True)

In [19]:
%%time
# TF-IDF over article bodies: unigrams and bigrams, sublinear tf scaling,
# vocabulary capped at 80000 features.
tfidf = TfidfVectorizer(sublinear_tf=True, max_features=80000, ngram_range=(1,2)) #, binary=True)#, c
# `.iloc[train.index]` treats train's index labels as row *positions* in
# train_corps, which also puts the texts in the same order as `train`.
# NOTE(review): this assumes train's labels equal the row positions of the
# matching texts in train_corps -- confirm.
X_corp = tfidf.fit_transform(list(train_corps['content'].iloc[train.index]))
collect()

CPU times: user 4min 29s, sys: 5.12 s, total: 4min 34s
Wall time: 4min 33s


In [20]:
# Baseline: 4-fold cross-validated MAE of a default Ridge on the body TF-IDF alone.
cv = cross_val_score(Ridge(), X_corp, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.1690, std = 0.0242


In [21]:

# Add the bucketed language as binary indicator columns, then re-score.
langvec = CountVectorizer(binary=True, )
lang_train = langvec.fit_transform(train.lang)
# Feature order starts here: [body tf-idf | language]; the test matrix must
# be stacked in the same order.
X_train = hstack((X_corp, lang_train))
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.1676, std = 0.0245


In [22]:
# Add binary bag-of-words features built from the author field, then re-score.
authorvec = CountVectorizer(binary=True)
author_train = authorvec.fit_transform(train.author)
X_train = hstack((X_train, author_train))
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.0630, std = 0.0497


In [23]:
# Add TF-IDF of the title (1- to 3-grams), then re-score.
# The column is spelled `tittle` in the data frame.
titlevec = TfidfVectorizer(ngram_range=(1,3)) #, min_df=2
title_train = titlevec.fit_transform(train.tittle)
X_train = hstack((X_train,  title_train))
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.0350, std = 0.0494


In [24]:
# `tags` holds a JSON-encoded list; join its items with spaces into one string
# per article so it can be vectorised like ordinary text.
train['corp_tags'] = train.tags.apply(lambda x: ' '.join(json.loads(x)))
# TF-IDF over the joined tags (1- to 3-grams), then re-score.
tagvec = TfidfVectorizer(ngram_range=(1,3)) #, binary=True, min_df=2
tag_train = tagvec.fit_transform(train.corp_tags)
X_train = hstack((X_train, tag_train))
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.0297, std = 0.0500


In [25]:
# Append min-max scaled log_length; the fitted scaler is kept in `llscaler`
# so the test set can be transformed with the same scaling.
llscaler, X_train = add_loglength(train, X_train, MinMaxScaler(), True)
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.0257, std = 0.0482


In [26]:
# Append 24 one-hot hour-of-day columns, then re-score.
X_train = add_hour(df=train, X_sparse=X_train)
cv = cross_val_score(Ridge(), X_train, y, cv=4, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 1.0252, std = 0.0478


In [27]:
# Final training matrix size; the test matrix must have the same column count.
X_train.shape

(52043, 962128)

In [28]:
# Load the test frame and its article texts.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
test = pd.read_pickle('../data/test.pickle', compression='xz')
test_corps = pd.read_csv('../data/test_corps.csv')
# Force every text to str (a missing value becomes the string 'nan') so the
# vectoriser always receives strings.
corps = [str(text) for text in test_corps['content']]

In [29]:
# Rebuild, for the test set, the same features that were built for training.
# Every vectoriser/scaler below was fitted on the training data and is only
# applied here (transform, never fit).
test['lang'] = test['lang'].apply(lambda x: get_lang(x))
test['date'] = pd.to_datetime(test.date)
test['time'] = pd.to_datetime(test.time)#.apply(lambda x: x.time()
test['time'] = test.time.apply(lambda x: x.time())
test['year_month'] = test['date'].apply(lambda x: 100 * x.year + x.month)
test['log_length'] = test['length'].apply(lambda x: np.log10(x+1))
X_corp_test = tfidf.transform(corps)
test['corp_tags'] = test.tags.apply(lambda x: ' '.join(json.loads(x)))
lang_test = langvec.transform(test.lang)
author_test = authorvec.transform(test.author)
title_test = titlevec.transform(test.tittle)
tag_test = tagvec.transform(test.corp_tags)
# Column order must match X_train exactly:
# body tf-idf | language | author | title | tags | log_length | hour one-hot.
X_test = hstack((X_corp_test, lang_test, author_test, title_test, tag_test))
# fit=False: reuse the scaling learned on the training log_length values.
scaler, X_test = add_loglength(test, X_test, llscaler, False)
X_test = add_hour(df=test, X_sparse=X_test)

In [30]:
# Sanity check: the column count should equal that of X_train.
X_test.shape

(34645, 962128)

In [31]:
# Re-score the full feature set with 8 folds instead of 4 (each fold then
# trains on a larger share of the data).
cv = cross_val_score(Ridge(), X_train, y, cv=8, n_jobs=-1, scoring=scorer)
print(f'Crossvalidation: mean = {cv.mean():.4f}, std = {cv.std():.4f}')

Crossvalidation: mean = 0.9960, std = 0.0489


In [32]:
# Fit a default Ridge on the whole training matrix and predict the test set.
ridge = Ridge()
ridge.fit(X_train, y)
pred = ridge.predict(X_test)

In [33]:
# Mean of the raw predictions, before the constant shift applied in the next cell.
pred.mean()

3.4225321235960666

In [34]:
# Shift every prediction by a constant so that their mean becomes 4.33328
# (the "dirty hack" named in the submission message).
# NOTE(review): the origin of 4.33328 is not shown in this notebook -- document it.
# This cell is not idempotent in effect only on the first run: re-running it
# leaves the mean at 4.33328.
pred += (4.33328 - pred.mean())

In [35]:
# Save the shifted predictions; `message` is interpolated into the Kaggle CLI
# call on the next line (the inner double quotes keep it a single shell argument).
write_submission_file(prediction=pred, filename='../out/tags_tfidf_ridge.csv')
message = '"tags, titles, authors, months and dirty hack"' # input('Model info\n')
!kaggle competitions submit -c how-good-is-your-medium-article -f ../out/tags_tfidf_ridge.csv -m {message}

100%|████████████████████████████████████████| 843k/843k [00:10<00:00, 84.8kB/s]
Successfully submitted to How good is your Medium article?