In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor


pd.options.display.float_format = '{:,.2f}'.format
sns.set_theme()
nltk.download("stopwords");

RANDOM_STATE = 44

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv("../data/train_dataset_train.csv", index_col=0)
test_df = pd.read_csv("../data/test_dataset_test.csv", index_col=0)

train_df["is_train"] = 1
test_df["is_train"] = 0

df = pd.concat([train_df, test_df])

In [3]:
class RBKpreprocessor(BaseEstimator):
    def __init__(self):
        self._vectorizer_tags = TfidfVectorizer()
        self._vectorizer_authors = TfidfVectorizer()
        self._category_ohe = OneHotEncoder()
        self._category_from_title_ohe = OneHotEncoder()
        self._stop_words = stopwords.words("russian")
        self._stemmer = SnowballStemmer("russian")
        self._vectorizer_title = TfidfVectorizer()
        
        
    def _clean_list(self, title):
        return(title.
           replace("[", "").
           replace("]", "").
           replace(".", "").
           replace("'", "").
           replace(",", " ")
          )
    
    def _clean_title(self, title):
        if title.find("\n")>0:
            title = title[0:title.find("\n\n")].lower()
        title = " ".join([self._stemmer.stem(w) for w in title.split() if w not in self._stop_words])
        return title

    def _find_category_in_title(self, title):
        if title.find("\n")>0:
            title = title[title.find("\n\n"):].lower().strip()
        else:
            title = ""
        if "," in title:
            title = title[0:title.index(",")]
        else:
            title = ""

        return title
    
    def fit(self, df):
        self._category_ohe.fit(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        self._vectorizer_tags.fit(tags_clean)
        self._vectorizer_authors.fit(authors_clean);
        
        title_clean = df.title.apply(self._clean_title)
        category_from_title =  df.title.apply(self._find_category_in_title)
        self._category_from_title_ohe.fit(category_from_title.values.reshape(-1,1))
        self._vectorizer_title.fit(title_clean)
        
        return(self)
        
        
    def transform(self, df):
        ctr_zero = (df.ctr == 0)
        ctr_log = np.log(df.ctr)
        mean_ctr_log = np.mean(ctr_log.values, where=(ctr_log != -np.inf))
        ctr_log = np.where(df["ctr"] == 0, mean_ctr_log, ctr_log)
        
        category_sparse = self._category_ohe.transform(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        
        authors_count = authors_clean.apply(lambda x: len(x.split()))
        tags_count = tags_clean.apply(lambda x: len(x.split()))
        
        tags_sparse = self._vectorizer_tags.transform(tags_clean)
        authors_sparse = self._vectorizer_authors.transform(authors_clean)
        
        publish_date = pd.to_datetime(df.publish_date)
        publish_year = publish_date.dt.year * 100 + publish_date.dt.month
        publish_day = publish_date.dt.day
        publish_weekday = publish_date.dt.weekday
        publish_hour = publish_date.dt.hour
        
        title_clean = df.title.apply(self._clean_title)
        title_sparse  = self._vectorizer_title.transform(title_clean)
        
        category_from_title =  df.title.apply(self._find_category_in_title)
        category_from_title_sparse = self._category_from_title_ohe.transform(category_from_title.values.reshape(-1,1))
        
        return hstack([
            ctr_zero.values[:,None],
            ctr_log[:,None],
            category_sparse,
            authors_count.values[:,None],
            tags_count.values[:,None],
            tags_sparse,
            authors_sparse,
            publish_year.values[:,None],
            publish_day.values[:,None],
            publish_weekday.values[:,None],
            publish_hour.values[:,None],
            title_sparse,
            category_from_title_sparse
        ])
        
        

In [4]:
%%time
preprocess = RBKpreprocessor()
preprocess.fit(df)

CPU times: total: 2.5 s
Wall time: 3.81 s


# Cделаем wrapper для модели, чтобы обучался и предсказывал на  логарифмах

In [5]:
class LogModel(BaseEstimator):
    def __init__(self, model):
        self._model = model
    
    def fit(self, X, y):
        views_log = np.log(y["views"])
        depth_log = np.log(np.log(y["depth"]))
        full_reads_percent_log = y["full_reads_percent"]
        
        y_log = np.matrix([
            views_log,
            depth_log,
            full_reads_percent_log
        ]).T
        
        self._model.fit(X, y_log)
        
    def predict(self, X):
        preds = self._model.predict(X)
        preds[:,0] = np.exp(preds[:,0])
        preds[:,1] = np.exp(np.exp(preds[:,1]))
        return(preds)
        

In [6]:
class TrimModel(BaseEstimator):
    def __init__(self, model):
        self._model = model
    
    def fit(self, X, y):
        views = (np.where(y["views"] < 500000, y["views"], 500000))
        depth = (np.where(y["depth"] < 1.4, y["depth"], 1.4))
        full_reads_percent = np.where(y["full_reads_percent"] < 63, y["full_reads_percent"], 63)
        
        y_log = np.matrix([
            views,
            depth,
            full_reads_percent
        ]).T
        
        self._model.fit(X, y_log)
        
    def predict(self, X):
        preds = self._model.predict(X)
        return(preds)
        

In [7]:
class LogTrimModel(BaseEstimator):
    def __init__(self, model):
        self._model = model
    
    def fit(self, X, y):
        views_log = np.log(np.where(y["views"] < 500000, y["views"], 500000))
        depth_log = np.log(np.log(np.where(y["depth"] < 1.4, y["depth"], 1.4)))
        full_reads_percent_log = np.where(y["full_reads_percent"] < 63, y["full_reads_percent"], 63)
        
        y_log = np.matrix([
            views_log,
            depth_log,
            full_reads_percent_log
        ]).T
        
        self._model.fit(X, y_log)
        
    def predict(self, X):
        preds = self._model.predict(X)
        preds[:,0] = np.exp(preds[:,0])
        preds[:,1] = np.exp(np.exp(preds[:,1]))
        return(preds)
        

## Сделаем подсчет метрики локально

In [22]:
def my_metric(y_real, preds, detail=True):
    overall = (
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]) + 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]) + 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
    if not detail:
        return overall
    else:
        return (
            overall,
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]), 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]), 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )

## Запишем файл сабмита

In [23]:
def write_down_predictions(preds, output_file = "output.csv"):
    solution = pd.read_csv("data\sample_solution.csv")
    solution.iloc[:,1:4] = preds
    solution.to_csv(output_file, index=False)

## Проерим есть ли смысле предсказывать логарифм, на случайном лесе


In [110]:

model = RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE)
model2 = LogModel(RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE))
model3 = TrimModel(RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE))
model4 = LogTrimModel(RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE))

In [111]:
%%time
X = preprocess.transform(train_df)
y = train_df[["views", "depth", "full_reads_percent"]]


  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 8.28 s


In [112]:
%%time
model.fit(X,y)

Wall time: 34.3 s


RandomForestRegressor(n_estimators=10, random_state=44)

In [113]:
%%time
model2.fit(X,y)

Wall time: 42.7 s


In [114]:
%%time
model3.fit(X,y)

Wall time: 36.7 s


In [117]:
%%time
model4.fit(X,y)

Wall time: 28.8 s


In [121]:
preds1 = model.predict(X)
preds2 = model2.predict(X)
preds3 = model3.predict(X)
preds4 = model4.predict(X)

In [122]:
my_metric(y, preds1)

(0.8938592005968848,
 0.3558952283397556,
 0.28382833050623957,
 0.25413564175088965)

In [123]:
my_metric(y, preds2)

(0.8541198898857254,
 0.32121048725985796,
 0.2720670636551116,
 0.26084233897075587)

In [124]:
my_metric(y, preds3)

(0.6963091484413472,
 0.19231865601416565,
 0.26946807013595553,
 0.23452242229122597)

In [125]:
my_metric(y, preds4)

(0.6616841378266715,
 0.16497621232865287,
 0.25504590577806735,
 0.24166201971995124)

Выводы:
- И логарифмирование целевой переменной, и обрезка выбросов ухудщают метрики, причем больше всего обрезка выбросов
- Пока это не будем использовать, но возможно стоит попробовать на других моделяю

## Train simple random forest with 1000 trees

In [121]:
model = RandomForestRegressor(n_estimators=1000, n_jobs=-1)

In [122]:
%%time
#model.fit(X,y)

Wall time: 16min 1s


RandomForestRegressor(n_estimators=1000, n_jobs=-1)

In [123]:
#preds = model.predict(X)

In [124]:
#my_metric(y, preds)

(0.9317596455378674,
 0.372370229905858,
 0.28945648936304835,
 0.26993292626896104)

In [125]:
#x_test = preprocess.transform(test_df)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [126]:
preds = model.predict(x_test)

In [127]:
write_down_predictions(preds, "local2. random_forest 1000.csv")

## Сделаем локальную валидацию, отделим валидационный набор

In [22]:
valid_train_df, valid_test_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)

In [54]:
model = RandomForestRegressor(n_estimators=20, random_state=RANDOM_STATE)

In [50]:
%%time
X_train = preprocess.transform(valid_train_df)
y_train = valid_train_df[["views", "depth", "full_reads_percent"]]

X_test = preprocess.transform(valid_test_df)
y_test = valid_test_df[["views", "depth", "full_reads_percent"]]



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 3.25 s


In [55]:
%%time
model.fit(X_train, y_train)

Wall time: 22.7 s


RandomForestRegressor(n_estimators=20, random_state=44)

In [56]:
preds = model.predict(X_test)

In [57]:
my_metric(y_test, preds)

(0.5505679603112843,
 0.27710126175703836,
 0.2246971035024428,
 0.048769595051803116)

Ну такой себе скор, причем просел видно что на full_reads_percent, его вообще не угадал

In [153]:
pd.DataFrame(y_test.full_reads_percent- preds[:,2])

Unnamed: 0_level_0,full_reads_percent
document_id,Unnamed: 1_level_1
6263fbeb9a794761fa1c9376GdvEEJMZRXa38h4sJlwntg,5.37
6283aadb9a794705cc4ac991JnuDF1b9TZCuQ3VU56NrcA,-10.29
61fd51029a79478831c73fb6vO1lIZGVSA6bcD86zFD_GQ,1.88
6270fb119a7947ad76f31a99mk49J4pYQJel6lQQ9QNs4g,6.71
625ae6bf9a794712dc71f3beh3_sjwUqRrWd4t2UmMI_KA,10.63
...,...
6245617c9a7947bc739707c6W7orlrmeRMWJyOIhg8AADQ,-14.22
628381d19a794778b3dea1bf6UHf2I9-ShKAFECe99AexQ,2.89
627070f39a79478c90d17e708QhTqUYuQOyA9EcJEYHHjQ,-15.93
62855f7e9a79479a4008576dkSYRUq_GS6CgTPH4kdteoA,7.14


## Попробуем подобрать параметры деревьев
- Глубина дерева
- Количетво фичей в дереве
- Количество деревьев будем брать равное 20, чтобы быстрее считало, потом обучим на 1000

In [65]:
X_train.shape[1]

17377

In [102]:
parameters = dict(
    max_features=[650, 700, 750],
    max_depth =[500, 700, 1000]
)
model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
clf = RandomizedSearchCV(model, parameters, random_state=RANDOM_STATE, n_iter=20)

In [103]:
%%time
X_train = preprocess.transform(train_df)
y_train = train_df[["views", "depth", "full_reads_percent"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 3.76 s


In [104]:
%%time
search = clf.fit(X_train, y_train)



Wall time: 6min 12s


In [105]:
search.best_params_

{'max_features': 700, 'max_depth': 700}

In [106]:
pd.DataFrame(search.cv_results_)[["param_max_features", "param_max_depth", "rank_test_score", "mean_test_score"]]

Unnamed: 0,param_max_features,param_max_depth,rank_test_score,mean_test_score
0,650,500,4,0.52
1,700,500,3,0.52
2,750,500,7,0.5
3,650,700,4,0.52
4,700,700,1,0.52
5,750,700,7,0.5
6,650,1000,4,0.52
7,700,1000,1,0.52
8,750,1000,7,0.5


In [100]:
search.best_params_

{'max_features': 700, 'max_depth': 500}

In [101]:
pd.DataFrame(search.cv_results_)[["param_max_features", "param_max_depth", "rank_test_score", "mean_test_score"]]

Unnamed: 0,param_max_features,param_max_depth,rank_test_score,mean_test_score
0,600,200,12,0.51
1,700,200,4,0.52
2,800,200,5,0.52
3,600,300,9,0.51
4,700,300,2,0.52
5,800,300,6,0.52
6,600,400,9,0.51
7,700,400,2,0.52
8,800,400,6,0.52
9,600,500,9,0.51


In [96]:
pd.DataFrame(search.cv_results_)[["param_max_features", "param_max_depth", "rank_test_score", "mean_test_score"]]

Unnamed: 0,param_max_features,param_max_depth,rank_test_score,mean_test_score
0,300,70,20,0.45
1,400,70,19,0.48
2,500,70,18,0.49
3,700,70,11,0.51
4,1000,70,12,0.5
5,300,120,17,0.49
6,400,120,14,0.5
7,500,120,6,0.51
8,700,120,5,0.51
9,1000,120,8,0.51


In [84]:
search.best_params_

{'max_features': 1000, 'max_depth': 70}

In [79]:
search.best_params_

{'max_features': 5000, 'max_depth': 20}

### Лушчие параметры
**{'max_features': 700, 'max_depth': 200}**

In [88]:
search.best_params_

{'max_features': 700, 'max_depth': 200}

## Попробучем обучить модель на лучших параметрах (700, 700) и на 1000 деревьев, Что за прогноз будет

In [114]:
model = RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_features=700, max_depth=700)

In [115]:
%%time
X = preprocess.transform(train_df)
y = train_df[["views", "depth", "full_reads_percent"]]


  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 3.71 s


In [116]:
%%time
model.fit(X,y)

Wall time: 3min 10s


RandomForestRegressor(max_depth=700, max_features=700, n_estimators=2000,
                      n_jobs=-1)

In [117]:
x_test = preprocess.transform(test_df)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [118]:
preds = model.predict(x_test)

In [120]:
write_down_predictions(preds, "local3. random_forest 2000 + fit params.csv")

Проверим улучшился ли результат на лидерборде:
Да результат улучшился **(0.65)**

In [122]:
preds = model.predict(X)

In [123]:
my_metric(y, preds)

(0.9399306741908753,
 0.37833452036341564,
 0.28952763119932506,
 0.2720685226281346)

In [127]:
with open("rf2000.pkl", "wb") as pkl_file:
    pickle.dump(model, pkl_file)

## Попробуем MultiOutputRegressor 
Делать отдельные модели для каждой 

In [17]:
valid_train_df, valid_test_df = train_test_split(train_df, test_size=0.2, random_state=42, shuffle=True)

In [18]:
model1 = RandomForestRegressor(n_estimators=20, max_features=700, max_depth=700, random_state=RANDOM_STATE)
model2 = MultiOutputRegressor(RandomForestRegressor(n_estimators=20, max_features=700, max_depth=700, random_state=RANDOM_STATE))

In [19]:
%%time
X_train = preprocess.transform(valid_train_df)
y_train = valid_train_df[["views", "depth", "full_reads_percent"]]

X_test = preprocess.transform(valid_test_df)
y_test = valid_test_df[["views", "depth", "full_reads_percent"]]


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 2.94 s


In [20]:
%%time
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)

Wall time: 18 s


MultiOutputRegressor(estimator=RandomForestRegressor(max_depth=700,
                                                     max_features=700,
                                                     n_estimators=20,
                                                     random_state=44))

In [27]:
preds1 = model1.predict(X_train)
preds2 = model2.predict(X_train)


In [28]:
my_metric(y_train, preds1)

(0.9218316140186791,
 0.3712003309810816,
 0.2852932008161508,
 0.26533808222144656)

In [29]:
my_metric(y_train, preds2)

(0.9212989838317761,
 0.366535239605197,
 0.28627478092025055,
 0.26848896330632854)

In [30]:
preds1 = model1.predict(X_test)
preds2 = model2.predict(X_test)


In [31]:
my_metric(y_test, preds1)

(0.5165268324760511,
 0.21239984391713654,
 0.21335739024112113,
 0.09076959831779347)

In [32]:
my_metric(y_test, preds2)

(0.5532161987070281,
 0.20428740366384845,
 0.22703070210431409,
 0.12189809293886558)

**Интересно:**
- Модель 2 лучше предсказывает 2 и 3 параметр, но хуже первый
- Идея попробовать модель MultiOutputRegressor на 2 и 3 колонку, а для первой взять предсказание обученной модели rf2000

In [55]:
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=2000, n_jobs=-1, max_features=700, max_depth=700))

In [56]:
%%time
X = preprocess.transform(train_df)
y = train_df[["depth", "full_reads_percent"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


Wall time: 2.57 s


In [57]:
%%time
model.fit(X,y)

Wall time: 5min 14s


MultiOutputRegressor(estimator=RandomForestRegressor(max_depth=700,
                                                     max_features=700,
                                                     n_estimators=2000,
                                                     n_jobs=-1))

In [58]:
x_test = preprocess.transform(test_df)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [59]:
preds_for_2_3 = model.predict(x_test)

In [42]:
with open("models/rf2000.pkl", "rb") as pkl_file:
    old_model = pickle.load(pkl_file)

In [43]:
preds_for_1 = old_model.predict(x_test)

In [60]:
preds = np.c_[preds_for_1[:,0], preds_for_2_3]

In [61]:
write_down_predictions(preds, "local4. random_forest 2000 + MultiOutputRegressor.csv")

## Эта комбинация показала пока лучший результат на лидерборде