In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load train.csv of the avito-demand-prediction dataset from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/avito-demand-prediction/train.csv')

In [None]:
# Preview the first rows of the loaded training frame.
train.head()

In [None]:
# train

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import re

from scipy.sparse import hstack, csr_matrix

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

In [None]:
# Five stratified folds over the target, bucketed to integer percent.
# random_state pins the shuffle: every later cell reuses FOLD_LIST, so the split
# has to be identical across kernel restarts for the scores to be comparable.
spliter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scale first, then round to the nearest integer. `round(2) * 100` followed by
# `astype(int)` truncates values such as 0.29 * 100 == 28.999999999999996 down
# to 28, which silently merges neighbouring strata.
_y = (train.deal_probability * 100).round().astype(int)

# The bucketed labels are passed as both X and y: no separate feature matrix
# exists yet at this point in the notebook.
FOLD_LIST = list(spliter.split(_y, _y))

In [None]:
def text_preprocession(text):
    """Lower-case ``text`` and replace punctuation runs with a single space.

    The input is first converted with ``str()``, so non-string values
    (numbers, ``None``, NaN) are processed as their string representation.
    Every run of the characters ``, . ; @ # ? ! & $`` together with any spaces
    that directly follow it is replaced by one space.

    Args:
        text: Value to normalise; anything accepted by ``str()``.

    Returns:
        The normalised string.
    """
    lowered = str(text).lower()
    return re.sub(r"[,.;@#?!&$]+\ *", " ", lowered)

In [None]:
# Regression target as a plain NumPy array, in the row order of `train`.
Y = train['deal_probability'].values

In [None]:
# Normalise both free-text columns in place with text_preprocession.
for text_column in ('description', 'title'):
    train[text_column] = train[text_column].apply(text_preprocession)

In [None]:
# Both vectorisers share the same document-frequency cut-offs and vocabulary cap.
_common_tfidf_params = {'max_df': 0.9, 'min_df': 7, 'max_features': 50000}

# Vectoriser for the description column (default analyzer settings).
description_vectorizer = TfidfVectorizer(**_common_tfidf_params)

# Vectoriser for the title column, built on character trigrams.
title_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 3),
    **_common_tfidf_params,
)

In [None]:
# Fit each vectoriser on its text column and keep the resulting TF-IDF matrices.
# NOTE(review): both vectorisers are fitted on all rows of `train` rather than
# per fold — confirm this is acceptable for the validation scheme.
train_description_tfidf = description_vectorizer.fit_transform(train.description )
train_title_tfidf = title_vectorizer.fit_transform(train.title)

In [None]:
# Empty frame that the label-encoded categorical columns are written into.
df = pd.DataFrame()

In [None]:
# Categorical columns to encode. Their order also fixes the order of the
# fitted encoders stored in label_encoder_list.
categorial_features = [
    'user_type',
    'image_top_1',
    'region',
    'city',
    'parent_category_name',
    'category_name',
    'param_1',
    'param_2',
    'param_3',
]
label_encoder_list = []

for feature_name in categorial_features:
    # Missing values become the literal category 'N/A'; everything is cast to
    # str before encoding.
    filled_column = train[feature_name].fillna('N/A').astype(str)

    encoder = LabelEncoder()
    df[feature_name] = encoder.fit_transform(filled_column)
    label_encoder_list.append(encoder)

In [None]:
# Spot-check one encoder: show the name of the feature at index 4 together with
# the original category string that its label encoder assigned to code 5.
categorial_features[4], label_encoder_list[4].inverse_transform([5])

In [None]:
# Display the label-encoded categorical features.
df

In [None]:
# One-hot encode every label-encoded column separately, keeping both the
# encoded matrices and the fitted encoders (same order as categorial_features).
onehot_encoder_list = []
onehot_features_list = []

for feature_name in categorial_features:
    encoder = OneHotEncoder()
    # OneHotEncoder expects a 2-D input, hence the reshape to a single column.
    column_2d = df[feature_name].values.reshape(-1, 1)
    onehot_features_list.append(encoder.fit_transform(column_2d))
    onehot_encoder_list.append(encoder)

In [None]:
# Print the shape of every per-feature one-hot matrix.
for encoded_matrix in onehot_features_list:
    print(encoded_matrix.shape)

In [None]:
# Stack the per-feature one-hot matrices side by side into one sparse matrix and
# convert it to CSR format; the training loop later indexes it by row.
ohehot_features = hstack(onehot_features_list).tocsr()

In [None]:
# np.stack

In [None]:
# Show the combined one-hot matrix (the notebook displays its repr).
ohehot_features

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        The RMSE rounded to five decimal places, as a NumPy float.
    """
    # Round through NumPy instead of calling ``.round`` on the intermediate
    # value: depending on the scikit-learn version, mean_squared_error may
    # return a built-in float, which has no ``.round`` method.
    return np.round(mean_squared_error(y_true, y_pred) ** 0.5, 5)

In [None]:
# First-level models: one Ridge regressor per feature view (description TF-IDF,
# title TF-IDF, one-hot categoricals), fitted fold by fold. Each row is
# predicted by the model of the fold in which it was held out, which yields
# out-of-fold (OOF) predictions for the whole training set.
desccription_models = []
title_models = []
onehot_features_models = []

# Column 0: description model, column 1: title model, column 2: one-hot model.
oof_predictions = np.zeros(shape=[train.shape[0], 3])

# Wrap the fold list itself (not the enumerate object) so the progress bar knows
# how many folds there are; an enumerate object has no length.
for fold_id, (train_idx, val_idx) in enumerate(tqdm_notebook(FOLD_LIST)):

    # Row-slice every feature view and the target for the training part ...
    descr_train, title_train, onehot_train, y_train = (
        train_description_tfidf[train_idx],
        train_title_tfidf[train_idx],
        ohehot_features[train_idx],
        Y[train_idx]
    )

    # ... and for the held-out part of this fold.
    descr_val, title_val, onehot_val, y_val = (
        train_description_tfidf[val_idx],
        train_title_tfidf[val_idx],
        ohehot_features[val_idx],
        Y[val_idx]
    )

    descr_model = Ridge()
    title_model = Ridge()
    onehot_model = Ridge()

    descr_model.fit(descr_train, y_train)
    oof_predictions[val_idx, 0] = descr_model.predict(descr_val)
    desccription_models.append(descr_model)

    title_model.fit(title_train, y_train)
    oof_predictions[val_idx, 1] = title_model.predict(title_val)
    title_models.append(title_model)

    onehot_model.fit(onehot_train, y_train)
    oof_predictions[val_idx, 2] = onehot_model.predict(onehot_val)
    onehot_features_models.append(onehot_model)

    # Per-fold validation scores; arguments follow rmse(y_true, y_pred).
    print('###', 'fold', fold_id,':', '###')
    print('descr_model rmse:', rmse(y_val, oof_predictions[val_idx, 0]))
    print('title_model rmse:', rmse(y_val, oof_predictions[val_idx, 1]))
    print('onehot_model rmse:', rmse(y_val, oof_predictions[val_idx, 2]))
    print('#'*20)

In [None]:
# Tabulate the out-of-fold predictions, one column per first-level model
# (column order matches how the training loop fills oof_predictions).
# The matrix is passed directly: stacking several copies of the 2-D array
# produces a 3-D array, which pandas refuses to turn into a DataFrame.
pd.DataFrame(oof_predictions, columns=['descr_model', 'title_model', 'onehot_model'])

In [None]:
# Baseline: RMSE of predicting the global mean target for every row.
# The constant vector is sized from Y instead of a hard-coded row count, so the
# cell keeps working when the training set is subsampled or replaced.
rmse(np.zeros(Y.shape[0]) + Y.mean(), Y)

In [None]:
# Overall out-of-fold RMSE of each of the three first-level models.
for model_idx in range(3):
    model_oof = oof_predictions[:, model_idx]
    print(rmse(model_oof, Y))

In [None]:
#     print('descr_model rmse:', rmse(oof_predictions[val_idx, 0], y_val))
#     print('title_model rmse:', rmse(oof_predictions[val_idx, 1], y_val))
#     print('onehot_model rmse:', rmse(oof_predictions[val_idx, 2], y_val))

In [None]:
# Histogram of the raw price column (100 bins).
train.price.hist(bins=100)

In [None]:
# Same histogram after filling missing prices with 0 and clipping to [0, 5000000].
train.price.fillna(0).clip(0,5000000).hist(bins=100)

In [None]:
# Histogram of log1p of the filled-and-clipped price.
np.log1p(train.price.fillna(0).clip(0,5000000)).hist(bins=100)

In [None]:
# Store log1p of the price (missing -> 0, clipped to [0, 5000000]) as a new column.
train['log_clip_price'] = np.log1p(train.price.fillna(0).clip(0,5000000))

In [None]:
# Inspect the newly added log_clip_price column.
train['log_clip_price']

In [None]:
# Mean, max and standard deviation of the log-clipped price per parent category.
(
    train
    .groupby('parent_category_name')['log_clip_price']
    .agg(['mean', 'max', 'std'])
)

In [None]:
# Full describe() summary of the log-clipped price per parent category.
train.groupby('parent_category_name')['log_clip_price'].describe()

In [None]:
# Full describe() summary of the raw price per parent category.
train.groupby('parent_category_name')['price'].describe()

In [None]:
# Keep the per-parent-category describe() tables of the raw and the
# log-clipped price under their own names.
_by_parent_category = train.groupby('parent_category_name')
agg_price = _by_parent_category['price'].describe()
agg_log_price = _by_parent_category['log_clip_price'].describe()

In [None]:
# Pull the per-category mean price out of the describe() table; reset_index
# turns the category index into a regular column.
mean_parent_category_price = agg_price['mean'].reset_index()
# Show the column names before they are replaced on the next line.
print(mean_parent_category_price.columns)
# Give the value column a descriptive name.
mean_parent_category_price.columns = ['parent_category_name', 'mean_parent_category_price']

In [None]:
# Attach each row's parent-category mean price as a new column.
# A Series.map lookup is used instead of DataFrame.merge: it leaves the row
# order, index and length of `train` untouched, whereas an inner merge rebuilds
# the frame and can change the number or order of rows. Y, FOLD_LIST and
# oof_predictions were computed on the existing row order and are combined with
# `train` positionally afterwards, so that order must not move.
# The assignment is also safe to re-run.
_category_mean_lookup = mean_parent_category_price.set_index(
    'parent_category_name')['mean_parent_category_price']
train['mean_parent_category_price'] = train['parent_category_name'].map(_category_mean_lookup)

In [None]:
# train

In [None]:
# Difference between each row's filled-and-clipped price and the mean price of
# its parent category.
# NOTE(review): the category mean comes from describe() on the raw `price`
# column (no fill, no clip), while the left-hand side is filled and clipped —
# confirm that this mismatch is intended.
deviation_price = (train.price.fillna(0).clip(0,5000000) - train['mean_parent_category_price'])#.hist(bins=100)

In [None]:
# Histogram of the price deviation (default number of bins).
deviation_price.hist()

In [None]:
# Second-level feature matrix. Columns, left to right: price deviation,
# parent-category mean price, then the columns of oof_predictions.
_deviation_column = deviation_price.values[:, np.newaxis]
_category_mean_column = train['mean_parent_category_price'].values[:, np.newaxis]

X = np.concatenate(
    (_deviation_column, _category_mean_column, oof_predictions),
    axis=1,
)

In [None]:
# Display the stacked feature matrix as a DataFrame.
pd.DataFrame(X)

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
# Second-level model: one CatBoost regressor per fold, trained on the stacked
# feature matrix X; the fitted models are collected in _models.
_models = []

# NOTE: this rebinds `oof_predictions` from the first-level prediction matrix
# to a 1-D vector of second-level predictions. X must already have been built
# from the first-level matrix before this cell runs.
oof_predictions = np.zeros(shape=[train.shape[0]])

# Wrap the fold list itself (not the enumerate object) so the progress bar knows
# how many folds there are; an enumerate object has no length.
for fold_id, (train_idx, val_idx) in enumerate(tqdm_notebook(FOLD_LIST)):

    X_train, Y_train = X[train_idx], Y[train_idx]
    X_val, Y_val = X[val_idx], Y[val_idx]

    # The held-out fold doubles as the eval set. With use_best_model=True the
    # kept iteration is selected on these same rows, so the fold RMSE printed
    # below is an optimistic estimate.
    eval_dataset = Pool(X_val, Y_val)

    model = CatBoostRegressor(
        learning_rate = 0.1,
        iterations=100, depth=16, max_leaves=37, eval_metric='RMSE',
        metric_period=10, use_best_model=True,
        grow_policy='Lossguide',
        max_bin=1024

    )
    model.fit(X_train, Y_train, eval_set = eval_dataset)

    _models.append(model)
    preds = model.predict(X_val)
    oof_predictions[val_idx] = preds
    print('fold_id:', fold_id, rmse(Y_val, preds))

In [None]:
# Overall RMSE of the current oof_predictions against the target
# (after the CatBoost loop these are the second-level out-of-fold predictions).
rmse(oof_predictions, Y)

In [None]:
# With perfect predictions every point would lie on the diagonal.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(Y, oof_predictions, alpha=0.01, s=30)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('deal_probability (true)')
ax.set_ylabel('out-of-fold prediction')
ax.set_title('True vs. out-of-fold predicted deal_probability')
plt.show()

In [None]:
# Feature importances of `model`, rounded to one decimal.
# NOTE: `model` is the variable left behind by the last iteration of the fold
# loop, i.e. only the final fold's regressor; the other entries of _models are
# not shown here.
model.feature_importances_.round(1)

In [None]:
# And now let's apply all of these transformations to the TEST dataset :)