In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import vstack, hstack, csc_matrix
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold, TimeSeriesSplit,\
                                    cross_val_score, cross_validate, GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook
from collections import defaultdict
import time
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
import json
from bs4 import BeautifulSoup
import re
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
import gc
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer


In [2]:
# Helper for reading JSON Lines files (one JSON document per line)
def read_json(path_to_file):
    """Lazily yield one parsed JSON object per non-blank line of a file.

    Parameters
    ----------
    path_to_file : str or path-like
        Path to a text file that holds one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Open explicitly as UTF-8: without it the locale's default encoding is
    # used, which is not UTF-8 on every platform.
    with open(path_to_file, encoding='utf-8') as fin:
        for line in fin:
            # A blank line (e.g. an extra newline at the end of the file) is
            # not a JSON document; skip it instead of letting json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)

In [3]:
# File paths and known file sizes (number of records; None = not known)
PATH_TR = 'train.json'
PATH_TE = 'test.json'
LEN_TR = 62313
LEN_TE = None

# Training target: the 'log_recommends' column, indexed by 'id'
y_ser = pd.read_csv('train_log1p_recommends.csv', index_col='id')['log_recommends']
y_index = y_ser.index
y_tr = y_ser.values

# Cross-validation settings and random seed
NFOLDS = 3
SEED = 13

# Row position that separates the first 90% of the training rows from the rest
IDX_SPLIT = np.int32(LEN_TR * .9)

In [4]:
def _prepare_string(string):
    '''
    Normalize raw text into a space-separated sequence of stemmed words.

    Steps, in the order they are applied:
    1) lower-case and strip leading/trailing whitespace
    2) replace HTML tags (anything matching ``<.*?>``) with a space
    3) delete the characters ? ! ' " # |
    4) replace the characters . , ( ) / with a space
    5) delete emoji from the Unicode ranges listed in the pattern below
    6) split on whitespace, drop English stop-words and stem the remaining
       words with the Snowball English stemmer

    Parameters
    ----------
    string : str
        Text to normalize.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' if nothing survives).
    '''
    snow = nltk.stem.SnowballStemmer('english')
    # Fetch the stop-word list once per call and keep it in a set. Calling
    # stopwords.words('english') inside the word loop re-evaluated it for
    # every single word and tested membership with a linear list scan.
    stop_words = frozenset(stopwords.words('english'))
    emoji_pattern = re.compile("["
                u"\U0001F600-\U0001F64F"  
                u"\U0001F300-\U0001F5FF"  
                u"\U0001F680-\U0001F6FF"  
                u"\U0001F1E0-\U0001F1FF"  
                                   "]+", flags=re.UNICODE)

    string = string.lower().strip()  # lower-case, trim outer whitespace
    cleanr = re.compile('<.*?>')
    string = re.sub(cleanr, ' ', string)  # drop HTML tags
    # Inside a character class '|' is a literal, so pipes are deleted here too.
    string = re.sub(r'[?|!|\'|"|#]', r'', string)
    string = re.sub(r'[.|,|)|(|\|/]', r' ', string)  # punctuation -> space
    string = emoji_pattern.sub(r'', string).strip()  # drop emoji

    # Drop stop-words, stem everything else.
    return ' '.join(snow.stem(word)
                    for word in string.split()
                    if word not in stop_words)

In [None]:
# For every training article: extract the plain text from its HTML 'content'
# field, normalize it, and record the target-index id it was paired with.
strings_L = []
string_idxs = []
for idx, article in tqdm_notebook(zip(y_index, read_json(PATH_TR)),
                                  total=LEN_TR):
    soup = BeautifulSoup(article['content'], "lxml")
    string = soup.text
    strings_L.append(_prepare_string(string))
    string_idxs.append(idx)

HBox(children=(FloatProgress(value=0.0, max=62313.0), HTML(value='')))

In [None]:
# Sanity check: the ids collected while reading the articles must match the
# target index exactly (same length, same order). np.array_equal returns
# False when the lengths differ, whereas an elementwise `==` between arrays
# of different lengths is not a valid comparison.
np.array_equal(np.asarray(y_index), np.asarray(string_idxs))

#### целевая переменная 

In [None]:
def inv_boxcox(y, ld):
    '''Inverse Box-Cox transform.

    y  -- transformed value(s); a scalar or a numpy array
    ld -- the Box-Cox lambda

    Returns exp(y) when ld == 0, otherwise exp(log(ld*y + 1) / ld).
    '''
    if ld != 0:
        inner = ld * y + 1
        return np.exp(np.log(inner) / ld)
    return np.exp(y)

In [None]:
# Box-Cox transform of the target (to correct its strong skew).
# `ld` is the lambda returned by scipy alongside the transformed values.
# NOTE(review): scipy's boxcox requires strictly positive input - assumes
# y_tr contains no zeros; confirm against the data.
y_boxcox_tr, ld = stats.boxcox(y_tr)

# Overlay the distributions of the target before and after the transform.
# Labels are attached to each histogram so the legend cannot drift out of
# order; the second label previously read 'boxcol+log' (typo).
plt.hist(y_tr, density=True, alpha=.7, label='log')
plt.hist(y_boxcox_tr, density=True, alpha=.7, label='boxcox+log')
plt.legend()
plt.title('target')
plt.ylabel('share')
plt.xlabel('value')
plt.show()

In [None]:
# Candidate regressors to compare; every estimator that accepts a
# random_state gets the shared SEED.
lin_reg, ridge_reg, lasso_reg, lgb_reg = (
    LinearRegression(),
    Ridge(random_state=SEED),
    Lasso(random_state=SEED),
    LGBMRegressor(random_state=SEED),
)

In [None]:
# Compare the four regressors on TF-IDF features built with n-gram ranges
# (1, 1), (1, 2) and (1, 3). Each entry of cv_results_L is
# (max n-gram length, [mean CV score per estimator, in loop order]).
cv_results_L = []
# Only the first IDX_SPLIT rows take part in cross-validation. The target
# slice does not depend on either loop, so it is taken once.
y_cv = y_boxcox_tr[:IDX_SPLIT]
for i in tqdm_notebook([1, 2, 3]):
    vec = TfidfVectorizer(ngram_range=(1, i), max_features=1000000)
    # NOTE(review): the vectorizer is fitted on every row of strings_L,
    # including rows past IDX_SPLIT and the rows of each validation fold.
    tfidf_tr = csc_matrix(vec.fit_transform(strings_L))
    # The row slice is the same for every estimator: compute it once per
    # n-gram setting instead of once per estimator.
    X_cv = tfidf_tr[:IDX_SPLIT]
    scores = []
    for estimator in tqdm_notebook((lin_reg, ridge_reg, lasso_reg, lgb_reg)):
        cv_out = cross_validate(estimator, X_cv, y_cv,
                                cv=NFOLDS,
                                scoring='neg_mean_absolute_error')
        # Mean over folds of the negated MAE (closer to 0 is better).
        scores.append(cv_out['test_score'].mean())
    cv_results_L.append((i, scores))