In [5]:
import pandas as pd
import numpy as np
# progress bar for long-running loops
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
import pickle 
pd.options.mode.chained_assignment = None
from sklearn.decomposition import TruncatedSVD
import lightgbm as lgb

In [8]:
# Load both competition files the same way, parsing the activation date as a datetime.
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=["activation_date"])
    for csv_path in ("train.csv", "test.csv")
)

In [9]:
# Russian stop-word list from the NLTK corpus; handed to both TfidfVectorizer
# instances as their `stop_words` argument.
stopWords = stopwords.words('russian')

In [4]:
# Regression target as a NumPy array (assigned again in the train/validation split cell).
train_y = train_df["deal_probability"].values
# Item ids of the test rows, captured before "item_id" is dropped from the features.
# presumably needed for the submission file — not used in the visible cells
test_id = test_df["item_id"].values

In [10]:
def _iso_week(dates):
    """Return the ISO week number of a datetime Series on old and new pandas.

    ``Series.dt.week`` was deprecated in pandas 1.1 and removed in 2.0, where
    ``Series.dt.isocalendar().week`` replaces it. The newer accessor is used
    when it exists; otherwise the legacy attribute is read.
    """
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; cast it back to a plain
        # NumPy dtype (float when some dates are missing) like the legacy accessor.
        return week.astype("float64" if week.isna().any() else "int64")
    return accessor.week


# Calendar features derived from the activation date, added to both frames.
for frame in (train_df, test_df):
    frame["activation_weekday"] = frame["activation_date"].dt.weekday
    frame["activation_week"] = _iso_week(frame["activation_date"])
    frame["activation_day"] = frame["activation_date"].dt.day

In [6]:
# Stack the train and test rows into one frame. The following cells read and
# write `data`, so leaving this commented out makes them fail on a fresh kernel.
data = pd.concat([train_df, test_df], axis=0)

In [7]:
# Mark missing image_top_1 codes with a sentinel value. The result is assigned
# back instead of calling fillna(inplace=True) on the selected column, because
# the chained in-place form does not update `data` under pandas Copy-on-Write.
data["image_top_1"] = data["image_top_1"].fillna(-999)

In [8]:
# Log-transform the price (small offset keeps log(0) finite), then fill missing
# values with the mean of the transformed column. The filled Series is assigned
# back rather than modified through a chained fillna(inplace=True), which does
# not update `data` under pandas Copy-on-Write.
# NOTE: re-running this cell applies the log a second time.
log_price = np.log(data["price"] + 0.001)
data["price"] = log_price.fillna(log_price.mean())

In [9]:
# Columns whose per-group statistics (computed on the training frame) are mapped
# onto `data` as new features.
category_column = [
    'region', 'city', 'parent_category_name', 'category_name', 'user_type',
    'image_top_1', 'item_seq_number',
    'activation_weekday', 'activation_week', 'activation_day',
]

# First pass adds the deal_probability mean/std per group, second pass the price
# mean/std per group; each pass shows its own progress bar.
# NOTE(review): the price statistics come from train_df["price"], which is the raw
# price, while data["price"] is log-transformed in an earlier cell — confirm intent.
for stat_source in ('deal_probability', 'price'):
    for item in tqdm(category_column):
        grouped = train_df.groupby(item)[stat_source]
        feature_prefix = item + '_' + stat_source
        data[feature_prefix + '_mean'] = data[item].map(grouped.mean())
        data[feature_prefix + '_std'] = data[item].map(grouped.std())

100%|██████████| 10/10 [00:04<00:00,  2.24it/s]
100%|██████████| 10/10 [00:04<00:00,  2.17it/s]


In [11]:
# Two separate vectorizers: `tfidf` is fitted on descriptions (vocabulary capped
# at 200 terms) and `tfidf_title` on titles (capped at 100 terms). Both drop the
# Russian stop words.
tfidf = TfidfVectorizer(max_features=200, stop_words = stopWords)
tfidf_title = TfidfVectorizer(max_features=100, stop_words = stopWords)

In [13]:
# Fill missing text with a single space before vectorizing.
for text_col in ('description', 'title'):
    for frame in (train_df, test_df):
        frame[text_col] = frame[text_col].fillna(' ')

# Learn each vocabulary on the train and test text combined.
description_corpus = pd.concat([train_df['description'], test_df['description']])
title_corpus = pd.concat([train_df['title'], test_df['title']])
tfidf.fit(description_corpus)
tfidf_title.fit(title_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', '...гда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# TF-IDF matrices of the description text for the train and test frames.
train_des_tfidf, test_des_tfidf = (
    tfidf.transform(frame['description']) for frame in (train_df, test_df)
)

In [15]:
train_title_tfidf = tfidf.transform(train_df['title'])
test_title_tfidf = tfidf.transform(test_df['title'])

In [16]:
n_comp = 5
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_obj.fit(tfidf.transform(pd.concat([train_df['description'], test_df['description']])))

svd_title = TruncatedSVD(n_components=n_comp, algorithm='arpack')
svd_title.fit(tfidf.transform(pd.concat([train_df['title'], test_df['title']])))

TruncatedSVD(algorithm='arpack', n_components=5, n_iter=5, random_state=None,
       tol=0.0)

In [24]:
# Project the description TF-IDF matrices onto the SVD components and append the
# resulting svd_des_1..svd_des_n columns to each frame.
# NOTE: the column-wise concat aligns on index labels; assumes train_df/test_df
# still carry their default 0..n-1 index. Re-running this cell appends the
# columns a second time.
svd_des_names = ['svd_des_' + str(k) for k in range(1, n_comp + 1)]
train_svd = pd.DataFrame(svd_obj.transform(train_des_tfidf), columns=svd_des_names)
test_svd = pd.DataFrame(svd_obj.transform(test_des_tfidf), columns=svd_des_names)
train_df = pd.concat([train_df, train_svd], axis=1)
test_df = pd.concat([test_df, test_svd], axis=1)

In [25]:
# Project the title TF-IDF matrices onto the title SVD components and append the
# resulting svd_title_1..svd_title_n columns to each frame.
# NOTE: the column-wise concat aligns on index labels; assumes train_df/test_df
# still carry their default 0..n-1 index.
svd_title_names = ['svd_title_' + str(k) for k in range(1, n_comp + 1)]
train_title_svd = pd.DataFrame(svd_title.transform(train_title_tfidf), columns=svd_title_names)
test_titile_svd = pd.DataFrame(svd_title.transform(test_title_tfidf), columns=svd_title_names)
train_df = pd.concat([train_df, train_title_svd], axis=1)
test_df = pd.concat([test_df, test_titile_svd], axis=1)

In [29]:
# Integer-encode the categorical columns. Each encoder is fitted on the string
# form of the train and test values together so both frames share one mapping
# (missing values become the string 'nan' and get their own code).
cat_vars = ["region", "city", "parent_category_name", "category_name", "user_type", "param_1", "param_2", "param_3"]
for col in tqdm(cat_vars):
    train_as_str = list(train_df[col].values.astype('str'))
    test_as_str = list(test_df[col].values.astype('str'))
    lbl = LabelEncoder().fit(train_as_str + test_as_str)
    train_df[col] = lbl.transform(train_as_str)
    test_df[col] = lbl.transform(test_as_str)

# Identifier, raw-text and date columns that are not fed to the model.
cols_to_drop = ["item_id", "user_id", "title", "description", "activation_date", "image"]
print(train_df.columns)

# The training matrix additionally loses the target column.
train_only_drop = cols_to_drop + ["deal_probability"]
train_X = train_df.drop(train_only_drop, axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

print(train_X.head())

100%|██████████| 8/8 [00:48<00:00,  6.05s/it]


Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'deal_probability',
       'activation_weekday', 'activation_week', 'activation_day', 'svd_des_1',
       'svd_des_2', 'svd_des_3', 'svd_des_4', 'svd_des_5', 'svd_title_1',
       'svd_title_2', 'svd_title_3', 'svd_title_4', 'svd_title_5'],
      dtype='object')
   region  city  parent_category_name  category_name  param_1  param_2  \
0      11  1156                     4             37      167       16   
1       9   352                     2             15       27       16   
2       8   325                     0             12      355       16   
3      14  1698                     4             37      304       16   
4      22   996                     6              0      199       29   

   param_3    price  item_seq_number  user

In [30]:
# Print the shapes of both feature matrices (their column counts should match).
print (train_X.shape, test_X.shape)

(1503424, 24) (508438, 24)


In [21]:
# Shape of the combined `data` frame that the group-statistic cells write into.
print (data.shape)

(2011862, 61)


In [22]:
# Load the pickled "period" objects for train and test; how they were built is
# not shown in this part of the notebook.
# NOTE(security): unpickling can execute arbitrary code — only load pickle files
# you created yourself or otherwise trust.
train_period = pd.read_pickle('period.p')
test_period = pd.read_pickle('test_period.p')

In [23]:
# Load the pickled "price rank" objects for train and test; how they were built
# is not shown in this part of the notebook.
# NOTE(security): unpickling can execute arbitrary code — only load pickle files
# you created yourself or otherwise trust.
train_price_rank = pd.read_pickle('price_rank_train.p')
test_price_rank = pd.read_pickle('price_rank_test.p')

In [24]:
# Append the loaded period and price-rank objects column-wise to each frame.
# NOTE(review): axis=1 concat aligns on index labels — assumes the pickled objects
# share the row index of train_df/test_df; confirm. Re-running this cell appends
# the same columns again.
train_df = pd.concat([train_df, train_period, train_price_rank], axis=1)
test_df = pd.concat([test_df, test_period, test_price_rank], axis=1)

In [26]:
# Rebuild the feature matrices from the current frames: drop the identifier,
# text and date columns everywhere, plus the target from the training matrix.
train_only_drop = cols_to_drop + ["deal_probability"]
train_X = train_df.drop(train_only_drop, axis=1)
test_X = test_df.drop(cols_to_drop, axis=1)

print(train_X.head())

   region  city  parent_category_name  category_name  param_1  param_2  \
0      19   462                     4             42      249      112   
1      17  1314                     2             22      122      112   
2      16  1290                     0              2       84      112   
3      21   950                     4             42       38      112   
4       4   318                     6              0      278      124   

   param_3    price  item_seq_number  user_type     ...       svd_title_3  \
0     1217    400.0                2          1     ...               0.0   
1     1217   3000.0               19          1     ...               0.0   
2     1217   4000.0                9          1     ...               0.0   
3     1217   2200.0              286          0     ...               0.0   
4       46  40000.0                3          1     ...               0.0   

   svd_title_4  svd_title_5  svd_title_6  svd_title_7  svd_title_8  \
0          0.0        

In [41]:
def run_lgb(train_X, train_y, val_X, val_y, test_X,
            num_boost_round=6000, early_stopping_rounds=150, verbose_eval=20,
            params=None):
    """Train a LightGBM regressor with early stopping and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y : features and target used to fit the model.
    val_X, val_y : features and target of the validation set that is monitored
        for early stopping.
    test_X : features to predict once training has finished.
    num_boost_round : maximum number of boosting rounds (default 6000).
    early_stopping_rounds : passed through to ``lgb.train`` (default 150).
    verbose_eval : passed through to ``lgb.train`` (default 20).
    params : optional dict of LightGBM parameters merged over the defaults
        below; ``None`` keeps the defaults unchanged.

    Returns
    -------
    tuple ``(pred_test_y, model, evals_result)``: the predictions for ``test_X``
    made with ``model.best_iteration``, the trained model, and the dict that
    ``lgb.train`` filled with the recorded validation metrics.

    NOTE(review): ``early_stopping_rounds``, ``verbose_eval`` and ``evals_result``
    are keyword arguments of older ``lgb.train`` releases; newer releases expect
    callbacks instead — confirm against the installed LightGBM version.
    """
    lgb_params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.09,
        # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
        # takes effect when bagging_freq is non-zero, so bagging appears to be
        # inactive with these defaults — confirm before relying on it.
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.7,
        #"bagging_frequency" : 5,
        "bagging_seed" : 2018,
        "verbosity" : -1
    }
    if params:
        # Caller-supplied values win over the defaults.
        lgb_params.update(params)

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(
        lgb_params,
        lgtrain,
        num_boost_round,
        valid_sets=[lgval],
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
        evals_result=evals_result,
    )

    # Predict with the iteration that early stopping selected as best.
    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
    return pred_test_y, model, evals_result

In [35]:
# Hold out the last rows of the training data as the validation set; everything
# before them is the development (fitting) set. The split is positional, not random.
holdout_rows = 200000
train_y = train_df["deal_probability"].values
dev_X, val_X = train_X.iloc[:-holdout_rows, :], train_X.iloc[-holdout_rows:, :]
dev_y, val_y = train_y[:-holdout_rows], train_y[-holdout_rows:]
print(dev_X.shape, val_X.shape, dev_y.shape, val_y.shape, test_X.shape)


(1303424, 24) (200000, 24) (1303424,) (200000,) (508438, 24)


In [42]:
import lightgbm as lgb  # NOTE: duplicates the lightgbm import in the imports cell
# Fit on the dev split, monitor the validation split for early stopping, and
# predict the test set; also returns the trained model and the recorded metrics.
pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, test_X)

Training until validation scores don't improve for 150 rounds.
[20]	valid_0's rmse: 0.235521
[40]	valid_0's rmse: 0.232333
[60]	valid_0's rmse: 0.231138
[80]	valid_0's rmse: 0.23033
[100]	valid_0's rmse: 0.229803
[120]	valid_0's rmse: 0.229427
[140]	valid_0's rmse: 0.22912
[160]	valid_0's rmse: 0.22885
[180]	valid_0's rmse: 0.228633
[200]	valid_0's rmse: 0.228469
[220]	valid_0's rmse: 0.228336
[240]	valid_0's rmse: 0.228177
[260]	valid_0's rmse: 0.228062
[280]	valid_0's rmse: 0.227967
[300]	valid_0's rmse: 0.227872
[320]	valid_0's rmse: 0.22781
[340]	valid_0's rmse: 0.227729
[360]	valid_0's rmse: 0.227643
[380]	valid_0's rmse: 0.227562
[400]	valid_0's rmse: 0.227508
[420]	valid_0's rmse: 0.22741
[440]	valid_0's rmse: 0.227362
[460]	valid_0's rmse: 0.227301
[480]	valid_0's rmse: 0.227266
[500]	valid_0's rmse: 0.227223
[520]	valid_0's rmse: 0.227193
[540]	valid_0's rmse: 0.227144
[560]	valid_0's rmse: 0.227101
[580]	valid_0's rmse: 0.22708
[600]	valid_0's rmse: 0.227044
[620]	valid_0's r