In [None]:
import numpy as np
import pandas as pd
import sys
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from tqdm.notebook import tqdm
import catboost as cb
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.tree import ExtraTreeRegressor

# Show every column when displaying wide frames. The fully qualified key is used
# because the short pattern 'max_columns' is ambiguous in newer pandas releases
# (it also matches 'styler.render.max_columns') and raises OptionError there.
pd.set_option('display.max_columns', None)

In [None]:
!pip freeze > requirements.txt

In [None]:
# Single seed shared by the train/validation split, the K-fold splitter and every model in this notebook.
RANDOM_SEED = 42

# Setup

In [None]:
VERSION    = 2  # suffix embedded in every submission file name written below
DIR_TRAIN  = '../input/autoru-parsed-0603-1304/'  # directory holding the parsed training CSV
#DIR_TRAIN  = '../input/parsing-all-moscow-auto-ru-09-09-2020/'
DIR_TEST   = '../input/sf-dst-car-price/'  # directory holding test.csv and sample_submission.csv
VAL_SIZE   = 0.33  # share of rows held out by train_test_split
N_FOLDS    = 5  # number of K-fold splits used for blending

# CATBOOST
ITERATIONS = 5000  # passed as `iterations` to every CatBoostRegressor

# Data

In [None]:
# Raw training listings (parsed data set).
train = pd.read_csv(DIR_TRAIN+'new_data_99_06_03_13_04.csv')
#train = pd.read_csv(DIR_TRAIN+'all_auto_ru_09_09_2020.csv')
# Rows to predict and the submission template whose 'price' column is filled in later cells.
test = pd.read_csv(DIR_TEST+'test.csv')
sample_submission = pd.read_csv(DIR_TEST+'sample_submission.csv')

# Data Preprocessing

In [None]:
# Keep only BMW listings. NOTE: this rebinds `train`, so the unfiltered frame is
# no longer available after this cell runs (preproc_train applies the same filter again).
train = train.loc[(train.brand == "BMW")]
train.info()

In [None]:
train.head(3)

In [None]:
test.info()

In [None]:
test.head(3)

In [None]:
def preproc_train(df_input):
    """Clean the parsed training frame and derive the model features.

    Steps, in order: keep left-hand-drive BMW rows, drop the unused columns and
    sparse rows, translate coded categorical values (body type, colour,
    transmission, PTS) into Russian labels, derive ``month_ownership``,
    log-transform ``Price`` and add 0/1 equipment flags built from the
    whitespace-separated, lower-cased words of ``description``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Raw training frame. It is not modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        Processed frame. ``Price`` holds the natural logarithm of the price,
        ``vehicleConfiguration`` is renamed to ``vehicleTransmission`` and the
        helper/source text columns are dropped.
    """
    reference_year = 2020  # year against which the ownership start year is measured

    # Order matters: longer names must precede the shorter names they start with
    # (e.g. 'седан-хардтоп' before 'седан').
    body_types = ['внедорожник 5 дв.', 'седан-хардтоп', 'хэтчбек 5 дв.', 'внедорожник 3 дв.',
                  'купе-хардтоп', 'внедорожник открытый', 'хэтчбек 3 дв.', 'пикап двойная кабина',
                  'пикап полуторная кабина', 'пикап одинарная кабина', 'седан 2 дв.', 'универсал 5 дв.',
                  'родстер', 'кабриолет', 'фургон', 'микровэн', 'минивэн', 'компактвэн', 'лифтбек',
                  'купе', 'тарга', 'седан', 'лимузин']

    color_names = {'CACECB': 'серебристый', 'FAFBFB': 'белый', 'EE1D19': 'красный', '97948F': 'серый',
                   '660099': 'пурпурный', '040001': 'чёрный', '4A2197': 'фиолетовый',
                   '200204': 'коричневый', '0000CC': 'синий', '007F00': 'зелёный', 'C49648': 'бежевый',
                   '22A0F8': 'голубой', 'DEA522': 'золотистый', 'FFD600': 'жёлтый', 'FF8649': 'оранжевый',
                   'FFC0CB': 'розовый'}
    transmission_names = {'AUTOMATIC': 'автоматическая', 'MECHANICAL': 'механическая', 'ROBOT': 'роботизированная'}
    pts_names = {'ORIGINAL': 'Оригинал', 'DUPLICATE': 'Дубликат'}

    # feature -> alternative word groups. A flag is 1 when ALL words of at least
    # one group occur in the description. The previous `('a' and 'b') in words`
    # form only tested the last word and `('a' or 'b') in words` only the first.
    # 'rear_view_camera' is read as (камера | видеокамера) + заднего + вида.
    description_rules = {
        'leather': [('темный', 'салон')],
        'carter': [('защита', 'картера')],
        'ABS': [('антиблокировочная', 'система')],
        'airbags': [('подушки', 'безопасности')],
        'immob': [('иммобилайзер',)],
        'central_locking': [('центральный', 'замок')],
        'on_board_computer': [('бортовой', 'компьютер')],
        'cruise_control': [('круиз-контроль',)],
        'climat_control': [('климат-контроль',)],
        'multi_rudder': [('мультифункциональный', 'руль')],
        'power_steering': [('гидроусилитель',), ('гидро',), ('усилитель', 'руля')],
        'light_and_rain_sensors': [('датчики', 'света', 'дождя')],
        'сarbon_body_kits': [('карбоновые', 'обвесы')],
        'rear_diffuser_rkp': [('задний', 'диффузор')],
        'door_closers': [('доводчики', 'дверей')],
        'rear_view_camera': [('камера', 'заднего', 'вида'), ('видеокамера', 'заднего', 'вида')],
        'amg': [('amg',)],
        'bi_xenon_headlights': [('биксеноновые', 'фары')],
        'from_salon': [('рольф',), ('панавто',), ('дилер',), ('кредит',), ('ликвидация',)],
        'alloy_wheels': [('легкосплавные',), ('колесные',), ('диски',)],
        'parking_sensors': [('парктроник',), ('парктронник',)],
        'dents': [('вмятины',), ('вмятина',), ('царапина',), ('царапины',), ('трещина',)],
        'roof_with_panoramic_view': [('панорамная', 'крыша')],
    }

    def body_type(value):
        """Return the canonical body type the text starts with, or None if nothing matches."""
        lowered = value.lower()
        for candidate in body_types:
            if lowered.startswith(candidate):
                # hardtop coupes are merged into the plain coupe category
                return 'купе' if candidate == 'купе-хардтоп' else candidate
        return None

    def months(value):
        """Return the ownership length in months, or None when two numbers cannot be found."""
        numbers = re.findall(r'\d+', value)
        if len(numbers) < 2:
            return None
        years = reference_year - int(numbers[0])
        extra_months = int(numbers[1])
        return extra_months if years <= 0 else years * 12 + extra_months

    # ################### Preprocessing ###################

    df_output = df_input.copy()
    df_output = df_output.loc[(df_output.brand == "BMW")]
    df_output = df_output.loc[(df_output['Руль'] == "LEFT")]
    df_output = df_output.drop(['Unnamed: 0', 'Таможня'], axis=1)
    df_output = df_output.dropna(axis=0, thresh=18)  # keep rows with at least 18 non-null values
    # explicit copy so the column assignments below never write into a view
    df_output = df_output.dropna(subset=['Владельцы', 'ПТС']).copy()

    # ################### Feature engineering ###################

    df_output['bodyType'] = df_output['bodyType'].astype(str).apply(body_type)
    df_output['color'] = df_output['color'].map(color_names)
    df_output['vehicleConfiguration'] = df_output['vehicleConfiguration'].map(transmission_names)
    df_output['ПТС'] = df_output['ПТС'].map(pts_names)

    # Ownership: unparsable or missing values become NaN and are filled with the median
    # (or with 0 when no row could be parsed at all).
    ownership = pd.to_numeric(df_output['Владение'].fillna('nodata').apply(months), errors='coerce')
    ownership_median = ownership.median()
    df_output['month_ownership'] = ownership.fillna(0 if pd.isna(ownership_median) else ownership_median)

    # Target variable: the model is trained on the log of the price.
    df_output['Price'] = np.log(df_output['Price'])

    # Description: one set of lower-cased words per row, then one 0/1 column per rule.
    words_per_row = df_output['description'].fillna('[]').apply(
        lambda text: {str(word).lower() for word in text.split()})
    for feature, alternatives in description_rules.items():
        df_output[feature] = words_per_row.apply(
            lambda words, alternatives=alternatives: int(any(words.issuperset(group) for group in alternatives)))

    # ################### Type fixes ###################

    int_features = ['modelDate', 'numberOfDoors', 'productionDate', 'enginePower', 'mileage',
                    'Владельцы', 'month_ownership']
    df_output = df_output.astype({feature: 'int32' for feature in int_features})
    df_output = df_output.rename(columns={'vehicleConfiguration': 'vehicleTransmission'})

    # ################### Clean ###################

    df_output = df_output.drop(['engineDisplacement', 'Комплектация', 'description', 'Владение', 'name', 'Руль'],
                               axis=1)

    return df_output

In [None]:
def preproc_test(df_input):
    """Clean the test frame and derive the same features as the training pipeline.

    Steps, in order: drop unused columns, parse the leading integer of
    ``enginePower`` and ``Владельцы``, derive ``month_ownership`` from the
    textual ``Владение`` column and add 0/1 equipment flags built from the
    whitespace-separated, lower-cased words of ``description``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Raw test frame. It is not modified; a copy is processed.

    Returns
    -------
    pandas.DataFrame
        Processed frame with the helper/source text columns dropped.
    """
    # feature -> alternative word groups. A flag is 1 when ALL words of at least
    # one group occur in the description. The previous `('a' and 'b') in words`
    # form only tested the last word and `('a' or 'b') in words` only the first.
    # 'rear_view_camera' is read as (камера | видеокамера) + заднего + вида.
    description_rules = {
        'leather': [('темный', 'салон')],
        'carter': [('защита', 'картера')],
        'ABS': [('антиблокировочная', 'система')],
        'airbags': [('подушки', 'безопасности')],
        'immob': [('иммобилайзер',)],
        'central_locking': [('центральный', 'замок')],
        'on_board_computer': [('бортовой', 'компьютер')],
        'cruise_control': [('круиз-контроль',)],
        'climat_control': [('климат-контроль',)],
        'multi_rudder': [('мультифункциональный', 'руль')],
        'power_steering': [('гидроусилитель',), ('гидро',), ('усилитель', 'руля')],
        'light_and_rain_sensors': [('датчики', 'света', 'дождя')],
        'сarbon_body_kits': [('карбоновые', 'обвесы')],
        'rear_diffuser_rkp': [('задний', 'диффузор')],
        'door_closers': [('доводчики', 'дверей')],
        'rear_view_camera': [('камера', 'заднего', 'вида'), ('видеокамера', 'заднего', 'вида')],
        'amg': [('amg',)],
        'bi_xenon_headlights': [('биксеноновые', 'фары')],
        'from_salon': [('рольф',), ('панавто',), ('дилер',), ('кредит',), ('ликвидация',)],
        'alloy_wheels': [('легкосплавные',), ('колесные',), ('диски',)],
        'parking_sensors': [('парктроник',), ('парктронник',)],
        'dents': [('вмятины',), ('вмятина',), ('царапина',), ('царапины',), ('трещина',)],
        'roof_with_panoramic_view': [('панорамная', 'крыша')],
    }

    def leading_int(text):
        """Return the first whitespace-separated token of `text` as an int."""
        return int(text.split()[0])

    def num_of_months(value):
        """Convert an ownership string to months; None when missing or of an unexpected shape."""
        if pd.isnull(value):
            return None
        parts = value.split()
        if len(parts) == 2:
            # "<n> <unit>": the unit decides whether n counts years or months
            amount = int(parts[0])
            return amount * 12 if parts[1] in ('год', 'года', 'лет') else amount
        if len(parts) >= 4:
            # "<years> <unit> <conjunction> <months> ...": years at index 0, months at index 3
            return int(parts[0]) * 12 + int(parts[3])
        return None

    # ################### Preprocessing ###################

    df_output = df_input.copy()
    df_output = df_output.drop(['Таможня', 'Состояние', 'id'], axis=1)

    # ################### Feature engineering ###################

    df_output['enginePower'] = df_output['enginePower'].apply(leading_int)
    df_output['Владельцы'] = df_output['Владельцы'].apply(leading_int)

    # Ownership: unparsable or missing values become NaN and are filled with the median
    # (or with 0 when no row could be parsed at all).
    ownership = pd.to_numeric(df_output['Владение'].apply(num_of_months), errors='coerce')
    ownership_median = ownership.median()
    df_output['month_ownership'] = ownership.fillna(0 if pd.isna(ownership_median) else ownership_median)

    # Description: one set of lower-cased words per row, then one 0/1 column per rule.
    words_per_row = df_output['description'].fillna('[]').apply(
        lambda text: {str(word).lower() for word in text.split()})
    for feature, alternatives in description_rules.items():
        df_output[feature] = words_per_row.apply(
            lambda words, alternatives=alternatives: int(any(words.issuperset(group) for group in alternatives)))

    # ################### Type fixes ###################

    int_features = ['modelDate', 'numberOfDoors', 'productionDate', 'enginePower', 'mileage', 'month_ownership']
    df_output = df_output.astype({feature: 'int32' for feature in int_features})

    # ################### Clean ###################

    df_output = df_output.drop(['engineDisplacement', 'Комплектация', 'description', 'Владение',
                                'vehicleConfiguration', 'name', 'Руль'], axis=1)

    return df_output

* Произведена качественная обработка признаков. 
* Данные в тесте и трейне приведены к единому формату. 
* Целевая переменная прологарифмирована, чтобы приблизить её распределение к нормальному. 
* Сгенерированы дополнительные признаки: 'month_ownership' - месяцы владения, набор новых признаков комплектации на основе переменной 'description'.

In [None]:
# Run the training pipeline; 'brand' is dropped afterwards.
train_preproc = preproc_train(train).drop(columns=['brand'])

In [None]:
# Feature matrix: every processed column except the target.
X = train_preproc.drop(columns=['Price'])
# Target vector as a NumPy array (log-price, see preproc_train).
y = train_preproc['Price'].values

In [None]:
# Run the test pipeline on the submission rows; 'brand' is dropped afterwards.
X_sub = preproc_test(test).drop(columns=['brand'])

In [None]:
# One-hot encode the categorical columns of both matrices.
dummy_columns = ['bodyType', 'color', 'fuelType', 'vehicleTransmission', 'Привод', 'ПТС']
X = pd.get_dummies(X, columns=dummy_columns, dummy_na=False)
X_sub = pd.get_dummies(X_sub, columns=dummy_columns, dummy_na=False)
# The two frames are encoded independently, so a category present in only one of
# them would produce different column sets or orders. Force the submission matrix
# onto the training layout: missing dummies become 0, unseen ones are dropped.
X_sub = X_sub.reindex(columns=X.columns, fill_value=0)

In [None]:
#X.info()

In [None]:
#X_sub.info()

In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error of `y_pred` against `y_true`, returned as a fraction."""
    relative_error = (y_pred - y_true) / y_true
    absolute_error = np.abs(relative_error)
    return np.mean(absolute_error)

# Train Split

In [None]:
# Hold out VAL_SIZE of the rows for validation; the split is shuffled and seeded with RANDOM_SEED.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

# Categorical features

In [None]:
# Positional indices of every training column with fewer than 3000 distinct values.
# NOTE(review): this list is not passed to any fit()/Pool call in the visible cells
# (its only consumer is commented out) — confirm whether it is still needed.
cat_features_ids = np.where(X_train.apply(pd.Series.nunique) < 3000)[0].tolist()

# Fit&Submit Catboost

Для начала, было решено использовать модель CatBoostRegressor отдельно и обучить данные только на ней.

In [None]:
#model = CatBoostRegressor(random_seed = RANDOM_SEED, iterations = ITERATIONS, eval_metric = 'MAPE')

### Tuning Catboost

In [None]:
#model_pool = cb.Pool(X_train, y_train, cat_features_ids)

In [None]:
# Очень долго работает
#from scipy import stats

#class StrangeDistribution:
#    def __init__(self, values):
#        self.values = values

    #def rvs(self):
#        return self.values[0]

#param_distribution = {
#    'one_hot_max_size': stats.bernoulli(p=0.2, loc=2),
#    'learning_rate': StrangeDistribution([0.03, 0.1]),
#    'l2_leaf_reg': [1, 3, 5, 7, 9],
#    'depth': stats.binom(n=10, p=0.2)}

#randomized_search_results = model.randomized_search(
#    param_distribution,
#    model_pool,
#    n_iter=12,
#    shuffle=False,
#    plot=True)

In [None]:
#randomized_search_results['params']

Настройка через randomized_search показала лучшие параметры для модели: 
*                           learning_rate = 0.03,
*                           depth = 3,
*                           l2_leaf_reg = 7,
*                           one_hot_max_size = 2.

##### Bayes

In [None]:
#space = {'depth': Integer(8, 15),
#         'learning_rate': Real(0.01, 1.0, 'log-uniform'),
#         'random_strength': Real(1e-9, 10, 'log-uniform'),
#         'bagging_temperature': Real(0.0, 1.0),
#         'border_count': Integer(1, 255),
#         'ctr_border_count': Integer(1, 255),
#         'l2_leaf_reg': Integer(1, 3),
#        }

In [None]:
#opt = BayesSearchCV(model,space, n_iter=32, random_state=RANDOM_SEED)
#opt.fit(X_train, y_train)

In [None]:
#print(opt.score(X_test, y_test))

Bayes настройка модели не дала ощутимых улучшений.

### Single Model

In [None]:
# Single CatBoost model; learning_rate, depth, l2_leaf_reg and one_hot_max_size are the
# values reported for the randomized search in the tuning section above.
model = CatBoostRegressor(random_seed = RANDOM_SEED, 
                          iterations = ITERATIONS,
                          learning_rate = 0.03,
                          depth = 3,
                          l2_leaf_reg = 7,
                          eval_metric = 'MAPE',
                          one_hot_max_size = 2,
                         )

In [None]:
# Train on the training split, using the hold-out split as the evaluation set.
# Both y_train and y_test hold log-prices, so the MAPE reported here is on the log scale.
model.fit(X_train, y_train,
          eval_set=(X_test, y_test),
          use_best_model=True,
          plot=True,
          verbose=False
         )

In [None]:
model.save_model('catboost_single_model.model')

In [None]:
# Predict log-prices for the submission rows, convert them back to prices,
# round to the nearest thousand and write the submission file.
predict_submission = model.predict(X_sub)
sample_submission['price'] = predict_submission
sample_submission['price'] = sample_submission['price'].apply(
    lambda log_price: round(np.exp(log_price)/1000)*1000)
sample_submission.to_csv(f'catboost_submission_v{VERSION}.csv', index=False)

In [None]:
sample_submission.head()

Вывод: Модель CatBoostRegressor с дополнительными настройками сильно переобучилась. MAPE = 0.9% и на тесте, и на трейне (метрика посчитана по логарифму цены, а не по самой цене).

### Blending

In [None]:
def cat_model(y_train, X_train, X_test, y_test):
    """Fit a CatBoostRegressor on one train/validation pair and return the fitted model.

    Note the argument order: the training target comes first, then the training
    features, then the validation features and validation target.
    """
    hyper_params = {
        'random_seed': RANDOM_SEED,
        'iterations': ITERATIONS,
        'learning_rate': 0.03,
        'depth': 3,
        'l2_leaf_reg': 7,
        'eval_metric': 'MAPE',
        'one_hot_max_size': 2,
    }
    regressor = CatBoostRegressor(**hyper_params)
    # the validation pair is handed over as the evaluation set
    validation_pair = (X_test, y_test)
    regressor.fit(
        X_train,
        y_train,
        eval_set=validation_pair,
        verbose=False,
        use_best_model=True,
        plot=False,
    )
    return regressor

In [None]:
# One column of submission-set price predictions per fold.
submissions = pd.DataFrame(0,columns=["sub_1"], index=sample_submission.index)
score_ls = []
splits = list(KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED).split(X, y))

for idx, (train_idx, test_idx) in tqdm(enumerate(splits), total=N_FOLDS,):
    # Fold-local names: the global X_train/X_test/y_train/y_test and `model`
    # from the earlier cells are left untouched.
    fold_X_train, fold_y_train = X.iloc[train_idx], y[train_idx]
    fold_X_valid, fold_y_valid = X.iloc[test_idx], y[test_idx]
    # model for this fold
    fold_model = cat_model(fold_y_train, fold_X_train, fold_X_valid, fold_y_valid)
    # Score on the validation fold. The targets are log-prices, so both sides are
    # converted back with exp() to report MAPE in price units rather than log units.
    fold_score = mape(np.exp(fold_y_valid), np.exp(fold_model.predict(fold_X_valid)))
    score_ls.append(fold_score)
    print(f"{idx+1} Fold Test MAPE: {fold_score:0.3f}")
    # submissions: back to prices, rounded to the nearest thousand
    fold_prices = np.exp(fold_model.predict(X_sub))
    submissions[f'sub_{idx+1}'] = [round(price/1000)*1000 for price in fold_prices]
    fold_model.save_model(f'catboost_fold_{idx+1}.model')

print(f'Mean Score: {np.mean(score_ls):0.4f}')
print(f'Std Score: {np.std(score_ls):0.4f}')
print(f'Max Score: {np.max(score_ls):0.4f}')
print(f'Min Score: {np.min(score_ls):0.4f}')

In [None]:
# Average only the per-fold columns. Summing every column and dividing by the
# column count would fold a previously computed 'blend' column into the average
# if this cell is run a second time.
fold_columns = [column for column in submissions.columns if str(column).startswith('sub_')]
submissions['blend'] = submissions[fold_columns].mean(axis=1)
sample_submission['price'] = submissions['blend'].values
sample_submission.to_csv(f'catboost_submission_blend_v{VERSION}.csv', index=False)

In [None]:
sample_submission.head()

Вывод: Blending дал результат чуть лучше одиночной модели, однако всё равно идет переобучение.

# Fit&Submit Stacking

In [None]:
# Base regressors that are fitted on the full training matrix and whose predictions are averaged.
models = [RandomForestRegressor(n_estimators =250,random_state = RANDOM_SEED, n_jobs = -1, verbose = 1),
         BaggingRegressor(ExtraTreeRegressor(random_state=RANDOM_SEED), random_state=RANDOM_SEED)]
         #CatBoostRegressor(random_state=RANDOM_SEED, verbose=False)]

def stacking_model_predict(models, X, y, X_sub, sample_submission):
    """Fit every model on (X, y), average their price predictions and write a submission.

    Each model's predictions for ``X_sub`` are converted from log-price back to
    price, rounded to the nearest thousand and stored in ``sample_submission``
    under a column named after the first six characters of the model's repr.
    The ``price`` column becomes the rounded mean of those columns, and the
    ``id``/``price`` pair is written to ``submission_stack_v{VERSION}.csv``.

    Parameters
    ----------
    models : sequence
        Estimators exposing ``fit(X, y)`` and ``predict(X)``.
    X, y : training features and log-price target.
    X_sub : features of the rows to predict.
    sample_submission : pandas.DataFrame
        Submission template with ``id`` and ``price`` columns; modified in place.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError("stacking_model_predict needs at least one model")

    model_columns = []
    for position, model_ in enumerate(tqdm(models)):
        model_.fit(X, y)

        column = str(model_)[:6]
        if column in model_columns:
            # two models sharing a six-character prefix must not overwrite each other
            column = f'{column}_{position}'
        predicted_prices = np.exp(model_.predict(X_sub))
        sample_submission[column] = [round(price/1000)*1000 for price in predicted_prices]
        model_columns.append(column)

    # Average only the columns produced in this call, not every column after the
    # first two: stale columns from an earlier run must not leak into the mean.
    sample_submission['price'] = sample_submission[model_columns].mean(axis=1)
    sample_submission['price'] = sample_submission['price'].apply(lambda x: round(x/1000)*1000)
    sample_submission[['id', 'price']].to_csv(f'submission_stack_v{VERSION}.csv', index=False)

stacking_model_predict(models, X, y, X_sub, sample_submission)

In [None]:
sample_submission.head()

Вывод: Лучший результат был получен в результате стэкинга моделей RandomForestRegressor и BaggingRegressor. CatBoostRegressor ухудшал результат и был исключен из стэкинга.