In [1]:
import numpy as np
import json
import pandas as pd
from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from scipy.sparse import csr_matrix, hstack, vstack
from bs4 import BeautifulSoup
from langdetect import detect
import re
import pickle
import lightgbm as lgb
import gensim
from gensim.matutils  import Sparse2Corpus
from gensim.corpora import Dictionary
from gensim.models import LdaModel
# Training targets: one row per article id; the 'log_recommends' column is
# pulled out as a plain array for the models further down.
train_target = pd.read_csv('train_log1p_recommends.csv', index_col='id')
y_train = train_target.loc[:, 'log_recommends'].values

In [8]:
# Two independent, empty frames; the preprocessing cell fills them column by column.
df_train, df_test = pd.DataFrame(), pd.DataFrame()

In [9]:
def read_json_line(line=None):
    """Parse one JSON document, blanking out characters the decoder rejects.

    Each time ``json.loads`` fails, the character at the reported error
    position is replaced by a space and parsing is retried.

    Args:
        line: Text of a single JSON document.

    Returns:
        The decoded Python object.

    Raises:
        json.JSONDecodeError: If the error position is past the end of the
            text (e.g. truncated input) or already holds a space, i.e. when
            replacing the character cannot make any progress.
    """
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # Use the structured error position instead of parsing it out of
            # the exception message.
            bad_idx = err.pos
            # Nothing left to repair: give up with the decoder's own error
            # rather than looping forever or failing with an IndexError.
            if bad_idx >= len(line) or line[bad_idx] == ' ':
                raise
            line = line[:bad_idx] + ' ' + line[bad_idx + 1:]

from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that keeps only the text nodes of the markup fed to it."""

    def __init__(self):
        # Let the base class do its own set-up (it stores convert_charrefs
        # and calls reset()) instead of re-creating that state by hand.
        super().__init__(convert_charrefs=True)
        # Set by the original code; nothing in this class reads it.
        self.strict = False
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Collect one text fragment reported by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected fragments joined by single spaces."""
        return ' '.join(self.fed)

def strip_tags(html):
    """Return the text content of ``html`` with all markup removed.

    Args:
        html: Markup string to strip.

    Returns:
        The text fragments found by ``MLStripper``, joined by single spaces.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text (for instance after an unterminated
    # '&'); close() forces the parser to hand over whatever is still buffered.
    s.close()
    return s.get_data()

def preprocess(path_to_inp_json_file):
    """Extract raw per-article features from a file with one JSON document per line.

    Args:
        path_to_inp_json_file: Path of the UTF-8 encoded input file.

    Returns:
        A tuple of eight lists, one entry per input line, in this order:
        stripped text, number of ``<img class=`` occurrences, number of
        ``iframeContainer`` occurrences, ``published['$date']`` value,
        stripped author name, domain, space-joined tag string (``'none'``
        when tag extraction fails) and detected language (``'undefined'``
        when detection fails).
    """
    texts, image_counts, frame_counts, timestamps = [], [], [], []
    authors, domains, tag_strings, languages = [], [], [], []

    # Characters removed from every tag so that each tag becomes one token.
    tag_cleanup = {ord(' '): None, ord('-'): None}

    with open(path_to_inp_json_file, encoding='utf-8') as inp_file:
        for raw_line in tqdm_notebook(inp_file):
            record = read_json_line(raw_line)

            # Line breaks and the two special space characters become plain spaces.
            content = record['content']
            for special in ('\n', '\r', '\xa0', '\u200a'):
                content = content.replace(special, ' ')

            # Tags are the <a> elements inside <ul class="tags">; any failure
            # while extracting them falls back to the placeholder 'none'.
            soup = BeautifulSoup(content, 'lxml')
            try:
                anchors = soup.find('ul', class_='tags').find_all('a')
                tags = ' '.join([anchor.text.translate(tag_cleanup) for anchor in anchors])
            except Exception:
                tags = 'none'
            tag_strings.append(tags)

            plain_text = strip_tags(content)
            texts.append(plain_text)

            # Counts are taken on the markup, not on the stripped text.
            image_counts.append(len(re.findall(r'<img class=', content)))
            frame_counts.append(len(re.findall(r'iframeContainer', content)))

            timestamps.append(record['published']['$date'])
            authors.append(record['meta_tags']['author'].strip())
            domains.append(record['domain'])

            # Language detection is best effort.
            try:
                language = detect(plain_text)
            except Exception:
                language = 'undefined'
            languages.append(language)

    return (texts, image_counts, frame_counts, timestamps,
            authors, domains, tag_strings, languages)

In [None]:
# Column names, in the order preprocess() returns the matching lists after the text.
raw_feature_names = ['images', 'frames', 'published', 'author', 'domain', 'tags', 'lang']

print('Preprocessing train data...')
content_train, *raw_features = preprocess(path_to_inp_json_file='train.json')
for feature_name, feature_values in zip(raw_feature_names, raw_features):
    df_train[feature_name] = feature_values

print('Preprocessing test data...')
content_test, *raw_features = preprocess(path_to_inp_json_file='test.json')
for feature_name, feature_values in zip(raw_feature_names, raw_features):
    df_test[feature_name] = feature_values

# Only the two text lists and the two frames are needed from here on.
del raw_feature_names, raw_features, feature_name, feature_values





In [11]:
# Tags are stored as one whitespace-separated string, so the token count is
# the tag count. Note that the 'none' placeholder counts as one token.
for frame in (df_train, df_test):
    frame['num_tags'] = [len(tag_string.split()) for tag_string in frame['tags']]
del frame

In [14]:
# Cache the preprocessing results on disk: the frames through pandas, the
# text lists through pickle.
df_train.to_pickle('df_train.pkl')
df_test.to_pickle('df_test.pkl')
for pickle_path, texts in (('content_train.pkl', content_train),
                           ('content_test.pkl', content_test)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(texts, f)
del pickle_path, texts

In [2]:
# Reload the cached preprocessing results.
# NOTE: unpickling runs code stored in the file; only load files this
# notebook wrote itself.
df_train, df_test = pd.read_pickle('df_train.pkl'), pd.read_pickle('df_test.pkl')
with open('content_train.pkl', mode='rb') as f:
    content_train = pickle.load(f)
with open('content_test.pkl', mode='rb') as f:
    content_test = pickle.load(f)

In [3]:
# Turn the 'published' strings into datetimes, using the same fixed format
# for both frames.
for frame in (df_train, df_test):
    frame['published'] = pd.to_datetime(frame['published'], format='%Y-%m-%dT%H:%M:%S.%fZ')
del frame

In [4]:
print('Preprocessing LDA features...')
# Bag-of-words counts: the vocabulary is fitted on the train texts and then
# applied to the test texts.
cv = CountVectorizer(max_features=10000, min_df = 0.1, max_df = 0.8)
sparse_train = cv.fit_transform(content_train)
sparse_test  = cv.transform(content_test)
full_sparse_data =  vstack([sparse_train, sparse_test])

# One gensim document per matrix row (documents_columns=False), train rows first.
corpus_data_gensim = gensim.matutils.Sparse2Corpus(full_sparse_data, documents_columns=False)
del sparse_train, sparse_test, full_sparse_data

# CountVectorizer maps token -> column index; gensim wants index -> token.
vocabulary_gensim = {index: token for token, index in cv.vocabulary_.items()}

# The original cell also built a gensim Dictionary bound to the name `dict`.
# That shadowed the builtin for every later cell and the object was never
# passed to the model, so it is gone; the plain index -> token mapping is
# handed to the model instead.
# random_state pins the otherwise random initialisation so that the topic
# features can be reproduced between runs.
lda = LdaModel(corpus_data_gensim, num_topics=30, id2word=vocabulary_gensim, random_state=1)

def document_to_lda_features(lda_model, document):
    """Return the topic weights of one document as a 1-D array.

    Args:
        lda_model: Object exposing ``get_document_topics(document,
            minimum_probability=...)`` that returns ``(topic_id, weight)`` pairs.
        document: The document in the representation ``lda_model`` expects.

    Returns:
        numpy array with the second element (the weight) of every returned
        pair, in the order the model returned them.
    """
    # Query the model that was passed in, not the module-level `lda`.
    topic_importances = lda_model.get_document_topics(document, minimum_probability=0)
    return np.array(topic_importances)[:, 1]

# One topic-weight vector per document, in corpus order (train rows first).
lda_features = [document_to_lda_features(lda, doc) for doc in corpus_data_gensim]
data_pd_lda_features = pd.DataFrame(lda_features)

# The first len(y_train) rows belong to the train texts, the rest to the test texts.
n_train_docs = len(y_train)
df_lda_train = data_pd_lda_features.iloc[:n_train_docs]
df_lda_test = data_pd_lda_features.iloc[n_train_docs:]
del data_pd_lda_features, n_train_docs

Preprocessing LDA features...


In [5]:
print('Preprocessing features...')
# Attach the LDA topic columns; the test slice still carries its offset row
# labels, so they are reset before the column-wise concat.
df_train = pd.concat([df_train, df_lda_train], axis=1)
df_test = pd.concat([df_test, df_lda_test.reset_index(drop=True)], axis=1)
del df_lda_train, df_lda_test

# Texts and targets ride along as columns so that sorting and filtering keep
# them aligned with their rows.
df_train = (
    df_train
    .assign(content=content_train, target=y_train)
    .sort_values(by='published')
    .loc[lambda frame: frame.published >= '2016-01-01']
    .reset_index(drop=True)
)
# Number of remaining train rows published before 2017-04-01.
idx = len(df_train.loc[df_train.published < '2017-04-01'])

# Pull texts and targets back out in the new row order, then drop the helper columns.
content_train = df_train['content'].tolist()
y_train = df_train['target'].values
df_train = df_train.drop(['content', 'target', 'tags'], axis=1)

df_test = df_test.drop(['tags'], axis=1)

Preprocessing features...


In [6]:
def a_d(row):
    """Return the row's domain and author as one '<domain>_<author>' string."""
    parts = (str(row['domain']), str(row['author']))
    return '_'.join(parts)

# Train and test are stacked so that every encoding below is computed once
# over both; idx_split remembers where the train rows end.
idx_split = len(df_train)
df_full = pd.concat([df_train, df_test])


# Time features
df_full['dow'] = df_full['published'].dt.dayofweek
df_full['month'] = df_full['published'].dt.month
df_full['hour'] = df_full['published'].dt.hour
df_full['year'] = df_full['published'].dt.year
df_full['year_month'] = 100 * df_full['published'].dt.year + df_full['published'].dt.month

# Cyclical (sine / cosine) encodings of hour, day of week and month.
df_full['hour_sin'] = np.sin((df_full.hour)*(2.*np.pi/24))
df_full['hour_cos'] = np.cos((df_full.hour)*(2.*np.pi/24))

df_full['dow_sin'] = np.sin((df_full.dow)*(2.*np.pi/7))
df_full['dow_cos'] = np.cos((df_full.dow)*(2.*np.pi/7))

df_full['month_sin'] = np.sin((df_full.month)*(2.*np.pi/12))
df_full['month_cos'] = np.cos((df_full.month)*(2.*np.pi/12))

# Other features: authors seen fewer than 2 times and domains seen fewer
# than 1000 times are merged into a single 'other' category.
# `.values` assigns by position because the stacked frame repeats row labels.
counts = df_full.author.value_counts()
repl = counts[counts < 2].index
df_full['author'] = df_full.author.replace(repl, 'other').values

counts = df_full.domain.value_counts()
repl = counts[counts < 1000].index
df_full['domain'] = df_full.domain.replace(repl, 'other').values

# Combined domain/author key, built after the rare values were merged.
df_full['a_d'] = df_full.apply(a_d, axis=1)

# Transformation: one-hot encode the categorical / count columns.
# The dummy dtype is pinned to uint8: with boolean dummies next to the float
# columns, `.values` on the result would be an object array, which the
# sparse stacking done later cannot consume.
list_to_dums = ['author', 'domain', 'lang', 'year', 'year_month', 'num_tags', 'images', 'a_d', 'frames']
dummies = pd.get_dummies(df_full, columns=list_to_dums,
                         prefix=['col_{}'.format(i) for i in range(len(list_to_dums))],
                         sparse=False, dtype=np.uint8)

# Drop the columns that are no longer needed (the raw timestamp and the
# integer time parts already covered by the sine / cosine encodings).
list_to_drop = ['published', 'hour', 'dow', 'month']
dummies = dummies.drop(list_to_drop, axis=1)

X_train_feats = dummies.iloc[:idx_split, :]
X_test_feats = dummies.iloc[idx_split:, :]

print('TRAIN feats: {}'.format(X_train_feats.shape))
print('TEST feats: {}'.format(X_test_feats.shape))
print('dummies: {}'.format(dummies.shape))
del dummies, df_full

TRAIN feats: (45938, 30681)
TEST feats: (34645, 30681)
dummies: (80583, 30681)


In [7]:
def write_submission_file(prediction, filename,
                          path_to_sample='sample_submission.csv'):
    """Write ``prediction`` to ``filename`` in the layout of the sample submission.

    Args:
        prediction: Values for the 'log_recommends' column, one per row of
            the sample file.
        filename: Path of the CSV file to write.
        path_to_sample: CSV file with an 'id' column that supplies the index.
    """
    template = pd.read_csv(path_to_sample, index_col='id')
    template.assign(log_recommends=prediction).to_csv(filename)

In [8]:
print('Making TF-IDF features...')
tfidf_content_params = {
    'ngram_range': (1, 3),
    'max_features': 150000,
    'stop_words': 'english',
    'sublinear_tf': True,
}
tfidf_content = TfidfVectorizer(**tfidf_content_params)

ridge_params = {'alpha': 1.30, 'random_state': 1}
ridge = Ridge(**ridge_params)

# The vocabulary is fitted on the train texts only.
X_train_content = tfidf_content.fit_transform(content_train)
X_test_content = tfidf_content.transform(content_test)

# Text features and the engineered feature columns side by side, in CSR form.
X_train_csr = hstack([X_train_content, X_train_feats.values]).tocsr()
X_test_csr = hstack([X_test_content, X_test_feats.values]).tocsr()

print('Training Ridge model...')
ridge.fit(X_train_csr, y_train)
ridge_pred = ridge.predict(X_test_csr)

Making TF-IDF features...
Training Ridge model...


In [9]:
print('Training LightGBM model...')
# The same round count is used for training and for prediction.
N_BOOST_ROUNDS = 600

# NOTE(review): bagging_fraction is set but no bagging_freq is given here;
# confirm against the LightGBM parameter docs whether bagging takes effect.
params = {
    'num_leaves': 31,
    'objective': 'mean_absolute_error',
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,
    'metric': 'mean_absolute_error',
    'bagging_seed': 1,
    'num_threads': 8,
}

np.random.seed(1)
lgb_train = lgb.Dataset(X_train_csr, label=y_train)
bst_lgb = lgb.train(params, lgb_train, num_boost_round=N_BOOST_ROUNDS)
lgb_pred = bst_lgb.predict(X_test_csr, num_iteration=N_BOOST_ROUNDS)

Training LightGBM model...


In [10]:
print('Making prediction...')
RIDGE_WEIGHT = 0.733        # share of the Ridge prediction in the blend
BLEND_MEAN_TARGET = 4.33328  # value the mean of the raw blend is shifted towards

mix_pred = RIDGE_WEIGHT * ridge_pred + (1 - RIDGE_WEIGHT) * lgb_pred
# The shift is measured on the raw blend but applied after the rounding step below.
shift = BLEND_MEAN_TARGET - np.mean(mix_pred)
# Round in the original scale (expm1 -> round -> log1p), then apply the shift.
mix_pred = np.log1p(np.round(np.expm1(mix_pred))) + shift
write_submission_file(mix_pred, 'petukhov_submission.csv')

Making prediction...
