In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from scipy.sparse import hstack

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor, Pool


pd.options.display.float_format = '{:,.2f}'.format
sns.set_theme()
nltk.download("stopwords");

RANDOM_STATE = 44

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_df = pd.read_csv("data/train_dataset_train.csv", index_col=0)
test_df = pd.read_csv("data/test_dataset_test.csv", index_col=0)

train_df["is_train"] = 1
test_df["is_train"] = 0

df = pd.concat([train_df, test_df])

In [3]:
class RBKpreprocessor(BaseEstimator):
    def __init__(self):
        self._vectorizer_tags = TfidfVectorizer()
        self._vectorizer_authors = TfidfVectorizer()
        self._category_ohe = LabelEncoder()
        self._category_from_title_ohe = LabelEncoder()
        self._stop_words = stopwords.words("russian")
        self._stemmer = SnowballStemmer("russian")
        self._vectorizer_title = TfidfVectorizer()
        
        
    def _clean_list(self, title):
        return(title.
           replace("[", "").
           replace("]", "").
           replace(".", "").
           replace("'", "").
           replace(",", " ")
          )
    
    def _clean_title(self, title):
        if title.find("\n")>0:
            title = title[0:title.find("\n\n")].lower()
        title = " ".join([self._stemmer.stem(w) for w in title.split() if w not in self._stop_words])
        return title

    def _find_category_in_title(self, title):
        if title.find("\n")>0:
            title = title[title.find("\n\n"):].lower().strip()
        else:
            title = ""
        if "," in title:
            title = title[0:title.index(",")]
        else:
            title = ""

        return title
    
    def fit(self, df):
        self._category_ohe.fit(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        self._vectorizer_tags.fit(tags_clean)
        self._vectorizer_authors.fit(authors_clean);
        
        title_clean = df.title.apply(self._clean_title)
        category_from_title =  df.title.apply(self._find_category_in_title)
        self._category_from_title_ohe.fit(category_from_title.values.reshape(-1,1))
        self._vectorizer_title.fit(title_clean)
        
        return(self)
        
        
    def transform(self, df):
        ctr_zero = (df.ctr == 0)
        ctr_log = np.log(df.ctr)
        mean_ctr_log = np.mean(ctr_log.values, where=(ctr_log != -np.inf))
        ctr_log = np.where(df["ctr"] == 0, mean_ctr_log, ctr_log)
        
        category_sparse = self._category_ohe.transform(df.category.values)
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        
        authors_count = authors_clean.apply(lambda x: len(x.split()))
        tags_count = tags_clean.apply(lambda x: len(x.split()))
        
        tags_sparse = self._vectorizer_tags.transform(tags_clean)
        authors_sparse = self._vectorizer_authors.transform(authors_clean)
        
        publish_date = pd.to_datetime(df.publish_date)
        publish_year = publish_date.dt.year * 100 + publish_date.dt.month
        publish_day = publish_date.dt.day
        publish_weekday = publish_date.dt.weekday
        publish_hour = publish_date.dt.hour
        
        title_clean = df.title.apply(self._clean_title)
        title_sparse  = self._vectorizer_title.transform(title_clean)
        
        category_from_title =  df.title.apply(self._find_category_in_title)
        category_from_title_sparse = self._category_from_title_ohe.transform(category_from_title.values)
        
        return hstack([
            category_sparse.reshape(-1,1),
            category_from_title_sparse.reshape(-1,1),
            ctr_zero.values[:,None],
            ctr_log[:,None],
            authors_count.values[:,None],
            tags_count.values[:,None],
            tags_sparse,
            authors_sparse,
            publish_year.values[:,None],
            publish_day.values[:,None],
            publish_weekday.values[:,None],
            publish_hour.values[:,None],
            title_sparse            
        ])
        
def my_metric(y_real, preds, detail=True):
    overall = (
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]) + 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]) + 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
    if not detail:
        return overall
    else:
        return (
            overall,
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]), 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]), 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
def write_down_predictions(preds, output_file = "output.csv"):
    solution = pd.read_csv("data\sample_solution.csv")
    solution.iloc[:,1:4] = preds
    solution.to_csv(output_file, index=False)

In [4]:
class RBKpreprocessor(BaseEstimator):
    def __init__(self):
        self._vectorizer_tags = TfidfVectorizer()
        self._vectorizer_authors = TfidfVectorizer()
        self._category_ohe = OneHotEncoder()
        self._category_from_title_ohe = OneHotEncoder()
        self._stop_words = stopwords.words("russian")
        self._stemmer = SnowballStemmer("russian")
        self._vectorizer_title = TfidfVectorizer()
        
        
    def _clean_list(self, title):
        return(title.
           replace("[", "").
           replace("]", "").
           replace(".", "").
           replace("'", "").
           replace(",", " ")
          )
    
    def _clean_title(self, title):
        if title.find("\n")>0:
            title = title[0:title.find("\n\n")].lower()
        title = " ".join([self._stemmer.stem(w) for w in title.split() if w not in self._stop_words])
        return title

    def _find_category_in_title(self, title):
        if title.find("\n")>0:
            title = title[title.find("\n\n"):].lower().strip()
        else:
            title = ""
        if "," in title:
            title = title[0:title.index(",")]
        else:
            title = ""

        return title
    
    def fit(self, df):
        self._category_ohe.fit(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        self._vectorizer_tags.fit(tags_clean)
        self._vectorizer_authors.fit(authors_clean);
        
        title_clean = df.title.apply(self._clean_title)
        category_from_title =  df.title.apply(self._find_category_in_title)
        self._category_from_title_ohe.fit(category_from_title.values.reshape(-1,1))
        self._vectorizer_title.fit(title_clean)
        
        return(self)
        
        
    def transform(self, df):
        ctr_zero = (df.ctr == 0)
        ctr_log = np.log(df.ctr)
        mean_ctr_log = np.mean(ctr_log.values, where=(ctr_log != -np.inf))
        ctr_log = np.where(df["ctr"] == 0, mean_ctr_log, ctr_log)
        
        category_sparse = self._category_ohe.transform(df.category.values.reshape(-1,1))
        
        authors_clean = df.authors.apply(self._clean_list)
        tags_clean = df.tags.apply(self._clean_list)
        
        authors_count = authors_clean.apply(lambda x: len(x.split()))
        tags_count = tags_clean.apply(lambda x: len(x.split()))
        
        tags_sparse = self._vectorizer_tags.transform(tags_clean)
        authors_sparse = self._vectorizer_authors.transform(authors_clean)
        
        publish_date = pd.to_datetime(df.publish_date)
        publish_year = publish_date.dt.year * 100 + publish_date.dt.month
        publish_day = publish_date.dt.day
        publish_weekday = publish_date.dt.weekday
        publish_hour = publish_date.dt.hour
        
        title_clean = df.title.apply(self._clean_title)
        title_sparse  = self._vectorizer_title.transform(title_clean)
        
        category_from_title =  df.title.apply(self._find_category_in_title)
        category_from_title_sparse = self._category_from_title_ohe.transform(category_from_title.values.reshape(-1,1))
        
        return hstack([
            ctr_zero.values[:,None],
            ctr_log[:,None],
            category_sparse,
            authors_count.values[:,None],
            tags_count.values[:,None],
            tags_sparse,
            authors_sparse,
            publish_year.values[:,None],
            publish_day.values[:,None],
            publish_weekday.values[:,None],
            publish_hour.values[:,None],
            title_sparse,
            category_from_title_sparse
        ])
        
def my_metric(y_real, preds, detail=True):
    overall = (
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]) + 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]) + 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
    if not detail:
        return overall
    else:
        return (
            overall,
            0.4*r2_score(y_real.iloc[:,0], preds[:,0]), 
            0.3*r2_score(y_real.iloc[:,1], preds[:,1]), 
            0.3*r2_score(y_real.iloc[:,2], preds[:,2])
        )
    
def write_down_predictions(preds, output_file = "output.csv"):
    solution = pd.read_csv("data\sample_solution.csv")
    solution.iloc[:,1:4] = preds
    solution.to_csv(output_file, index=False)

In [5]:
%%time
preprocess = RBKpreprocessor()
preprocess.fit(df)

Wall time: 7.66 s


RBKpreprocessor()

In [6]:
# %%time
# valid_train_df, valid_test_df = train_test_split(train_df, test_size=0.2, random_state=RANDOM_STATE, shuffle=True)


# X_train = preprocess.transform(valid_train_df)
# y_train = valid_train_df[["views", "depth", "full_reads_percent"]]

# X_test = preprocess.transform(valid_test_df)
# y_test = valid_test_df[["views", "depth", "full_reads_percent"]]



## Tuning for catboost

In [7]:

X_train = preprocess.transform(train_df)
y_train = train_df[["views", "depth", "full_reads_percent"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
model = CatBoostRegressor(iterations=120,
                          early_stopping_rounds=30,
                          random_seed=RANDOM_STATE,
                          eval_metric="R2",
                          loss_function="RMSE")

In [9]:
GOAL_NUM = 0

In [10]:
params = {'depth':[7,8,9,10],
          'learning_rate' : [0.05, 0.1]
         }
clf = RandomizedSearchCV(model, params, random_state=RANDOM_STATE, n_iter=20, n_jobs=-1)

In [None]:
%%time
search = clf.fit(X_train, 
          y_train.iloc[:,GOAL_NUM]
         )


In [12]:
search.best_params_

{'learning_rate': 0.1, 'depth': 8}

In [13]:
pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,87.47,1.4,0.42,0.15,0.05,7,"{'learning_rate': 0.05, 'depth': 7}",0.44,0.53,0.38,0.04,0.44,0.37,0.17,4
1,117.72,36.2,0.55,0.1,0.1,7,"{'learning_rate': 0.1, 'depth': 7}",0.49,0.58,0.42,-0.17,0.46,0.36,0.27,5
2,207.34,1.4,0.39,0.04,0.05,8,"{'learning_rate': 0.05, 'depth': 8}",0.42,0.53,0.37,-0.26,0.47,0.31,0.29,8
3,265.14,27.81,0.89,0.54,0.1,8,"{'learning_rate': 0.1, 'depth': 8}",0.47,0.59,0.42,0.08,0.52,0.42,0.18,1
4,428.3,42.72,0.72,0.16,0.05,9,"{'learning_rate': 0.05, 'depth': 9}",0.46,0.53,0.38,-0.13,0.46,0.34,0.24,6
5,564.37,29.57,1.18,0.31,0.1,9,"{'learning_rate': 0.1, 'depth': 9}",0.53,0.59,0.41,-0.06,0.46,0.38,0.23,2
6,1219.94,39.24,0.97,0.32,0.05,10,"{'learning_rate': 0.05, 'depth': 10}",0.46,0.55,0.38,-0.12,0.41,0.33,0.24,7
7,1003.05,237.18,0.39,0.15,0.1,10,"{'learning_rate': 0.1, 'depth': 10}",0.48,0.6,0.43,-0.09,0.5,0.38,0.24,3


In [9]:
GOAL_NUM = 1
params = {'depth':[7,8,9,10],
          'learning_rate' : [0.1, 0.15]
         } 
clf = RandomizedSearchCV(model, params, random_state=RANDOM_STATE, n_iter=20, n_jobs=3)
search1 = clf.fit(X_train, 
          y_train.iloc[:,GOAL_NUM]
         )
search1.best_params_



0:	learn: 0.1793380	total: 1.07s	remaining: 2m 7s
1:	learn: 0.3117821	total: 1.97s	remaining: 1m 56s
2:	learn: 0.4105699	total: 2.97s	remaining: 1m 55s
3:	learn: 0.4862333	total: 3.97s	remaining: 1m 55s
4:	learn: 0.5470395	total: 4.79s	remaining: 1m 50s
5:	learn: 0.5973565	total: 5.64s	remaining: 1m 47s
6:	learn: 0.6374325	total: 6.61s	remaining: 1m 46s
7:	learn: 0.6690178	total: 7.55s	remaining: 1m 45s
8:	learn: 0.6907904	total: 8.43s	remaining: 1m 43s
9:	learn: 0.7097812	total: 9.29s	remaining: 1m 42s
10:	learn: 0.7237613	total: 10.3s	remaining: 1m 41s
11:	learn: 0.7364248	total: 11.2s	remaining: 1m 41s
12:	learn: 0.7452690	total: 12.2s	remaining: 1m 40s
13:	learn: 0.7551922	total: 13.1s	remaining: 1m 39s
14:	learn: 0.7568141	total: 14.1s	remaining: 1m 38s
15:	learn: 0.7589324	total: 15s	remaining: 1m 37s
16:	learn: 0.7606947	total: 15.9s	remaining: 1m 36s
17:	learn: 0.7689932	total: 16.8s	remaining: 1m 35s
18:	learn: 0.7742968	total: 17.8s	remaining: 1m 34s
19:	learn: 0.7754859	tota

{'learning_rate': 0.15, 'depth': 10}

In [10]:
pd.DataFrame(search1.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,61.12,2.14,0.15,0.03,0.1,7,"{'learning_rate': 0.1, 'depth': 7}",0.74,0.8,0.8,0.77,0.74,0.77,0.02,8
1,58.39,3.71,0.21,0.04,0.15,7,"{'learning_rate': 0.15, 'depth': 7}",0.75,0.8,0.8,0.76,0.75,0.77,0.02,7
2,94.43,1.51,0.29,0.1,0.1,8,"{'learning_rate': 0.1, 'depth': 8}",0.75,0.79,0.8,0.77,0.75,0.77,0.02,6
3,99.87,3.37,0.22,0.05,0.15,8,"{'learning_rate': 0.15, 'depth': 8}",0.75,0.8,0.81,0.78,0.76,0.78,0.02,2
4,166.5,6.9,0.24,0.07,0.1,9,"{'learning_rate': 0.1, 'depth': 9}",0.75,0.8,0.81,0.77,0.75,0.77,0.02,5
5,191.25,16.06,0.29,0.09,0.15,9,"{'learning_rate': 0.15, 'depth': 9}",0.75,0.8,0.82,0.77,0.74,0.78,0.03,3
6,313.67,28.0,0.24,0.08,0.1,10,"{'learning_rate': 0.1, 'depth': 10}",0.75,0.8,0.81,0.78,0.74,0.78,0.03,4
7,235.74,69.76,0.16,0.03,0.15,10,"{'learning_rate': 0.15, 'depth': 10}",0.76,0.81,0.82,0.79,0.74,0.78,0.03,1


In [11]:
GOAL_NUM = 2
params = {'depth':[7,8,9,10],
          'learning_rate' : [0.1, 0.15]
         }
clf = RandomizedSearchCV(model, params, random_state=RANDOM_STATE, n_iter=20, n_jobs=3)

search2 = clf.fit(X_train, 
          y_train.iloc[:,GOAL_NUM]
         )
search2.best_params_



0:	learn: 0.0682912	total: 697ms	remaining: 1m 22s
1:	learn: 0.1122100	total: 1.48s	remaining: 1m 27s
2:	learn: 0.1572043	total: 2.24s	remaining: 1m 27s
3:	learn: 0.1895488	total: 3.16s	remaining: 1m 31s
4:	learn: 0.2209927	total: 3.95s	remaining: 1m 30s
5:	learn: 0.2266515	total: 4.71s	remaining: 1m 29s
6:	learn: 0.2318317	total: 5.5s	remaining: 1m 28s
7:	learn: 0.2494058	total: 6.43s	remaining: 1m 30s
8:	learn: 0.2538562	total: 7.19s	remaining: 1m 28s
9:	learn: 0.2579687	total: 8.03s	remaining: 1m 28s
10:	learn: 0.2737321	total: 8.86s	remaining: 1m 27s
11:	learn: 0.2772531	total: 9.73s	remaining: 1m 27s
12:	learn: 0.2805615	total: 10.5s	remaining: 1m 26s
13:	learn: 0.2834666	total: 11.3s	remaining: 1m 25s
14:	learn: 0.2932175	total: 12.1s	remaining: 1m 24s
15:	learn: 0.2957963	total: 13s	remaining: 1m 24s
16:	learn: 0.2980195	total: 13.8s	remaining: 1m 23s
17:	learn: 0.3003324	total: 14.6s	remaining: 1m 22s
18:	learn: 0.3126008	total: 15.4s	remaining: 1m 22s
19:	learn: 0.3143604	tota

{'learning_rate': 0.15, 'depth': 10}

In [12]:
pd.DataFrame(search2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,29.8,0.36,0.13,0.03,0.1,7,"{'learning_rate': 0.1, 'depth': 7}",0.24,0.38,0.3,0.39,0.33,0.33,0.05,8
1,31.25,2.54,0.12,0.05,0.15,7,"{'learning_rate': 0.15, 'depth': 7}",0.26,0.41,0.33,0.42,0.39,0.36,0.06,4
2,50.62,1.81,0.13,0.02,0.1,8,"{'learning_rate': 0.1, 'depth': 8}",0.25,0.39,0.3,0.4,0.35,0.34,0.06,7
3,54.5,2.28,0.17,0.03,0.15,8,"{'learning_rate': 0.15, 'depth': 8}",0.26,0.41,0.34,0.43,0.38,0.36,0.06,3
4,91.27,3.62,0.19,0.04,0.1,9,"{'learning_rate': 0.1, 'depth': 9}",0.25,0.39,0.31,0.4,0.37,0.34,0.05,6
5,95.8,2.58,0.28,0.1,0.15,9,"{'learning_rate': 0.15, 'depth': 9}",0.27,0.42,0.34,0.42,0.39,0.37,0.06,2
6,213.46,2.4,0.22,0.07,0.1,10,"{'learning_rate': 0.1, 'depth': 10}",0.26,0.39,0.32,0.4,0.36,0.35,0.05,5
7,185.09,40.88,0.27,0.15,0.15,10,"{'learning_rate': 0.15, 'depth': 10}",0.27,0.44,0.34,0.44,0.41,0.38,0.07,1
