In [1]:
import numpy as np
import pandas as pd
import logging
import os

from category_encoders import LeaveOneOutEncoder

from sklearn.model_selection import KFold, ParameterSampler
from sklearn.metrics import mean_absolute_error

from utilitis.utils import date_parser
from utilitis.feature_manager import FeatureManager
from utilitis.model import LGBModel
from utilitis.cate_embedding import CateEmbedding

%load_ext autoreload
%autoreload 2

In [2]:
# Load the labelled training table. The file is space-separated and the two
# date columns are parsed with the project's custom `date_parser`.
df = pd.read_csv(
    'data/used_car_train_20200313.csv',
    sep=' ',
    parse_dates=['regDate', 'creatDate'],
    date_parser=date_parser,
)
print(df.shape)

# The model is trained on the natural log of the price.
df['logPrice'] = np.log(df['price'])

(150000, 31)


In [3]:
# The real B-test file can be scored by loading it instead of the hold-out:
# test = pd.read_csv('data/used_car_testB_20200421.csv', sep=' ', parse_dates=['regDate', 'creatDate'], date_parser=date_parser)
# print(test.shape)

# Positional hold-out: the first 120000 rows are used for development and the
# remaining rows stand in for the test set.
dev, test = df.iloc[:120000], df.iloc[120000:]

In [None]:
# Columns handed to CateEmbedding as the targets for each categorical feature.
ebd_targets = [
    'power', 'kilometer', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_11', 'v_12', 'v_13', 
    'v_14', 'logPrice'
]

# One embedding block per categorical feature, in the same order as the loop.
dev_ebd, test_ebd = [], []
for feat in ['name', 'model', 'regionCode']:
    print(feat)
    # Fit on the dev split only, then apply the same encoder to both splits.
    encoder = CateEmbedding(dev[feat], dev[ebd_targets], threshold=50)
    encoder.retrain(epoch_round=10)
    # Fix: the original called the undefined name `cate_embedding_encoder`
    # here, which raised NameError on the first iteration.
    dev_ebd.append(encoder.transform(dev[feat]))
    test_ebd.append(encoder.transform(test[feat]))
    # Drop the reference before building the next feature's encoder.
    del encoder

name


In [5]:
class MineFeatureManager(FeatureManager):
    """Feature pipeline for the used-car price data.

    Adds target-encoded versions of the high-cardinality categoricals, cleans
    a handful of numeric columns, derives date features, and then delegates to
    ``FeatureManager.get_model_features`` for the final column selection.

    Use ``get_model_features`` on the training frame (it fits the encoder) and
    ``transform_feature`` on any frame to be scored afterwards.
    """

    def __init__(self, num_config=None, categorical_config=None):
        """Declare the feature lists and create the (unfitted) encoder.

        Args:
            num_config: forwarded unchanged to ``FeatureManager.__init__``.
            categorical_config: forwarded unchanged to ``FeatureManager.__init__``.
        """
        self.num_features = [
            'power', 'kilometer', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_11', 
            'v_12', 'v_13', 'v_14', 'carAge', 'v_10_1', 'v_10_2', 'v_10_3', 'nameEncode', 
            'modelEncode', 'regionCodeEncode', 'gearbox', 'notRepairedDamage', 'seller', 
            'offerType'
        ]
        self.categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'createMon']
        # Columns that get an extra '<name>Encode' target-encoded feature.
        self.encoded_cates = ['name', 'model', 'regionCode']
        self.cate_encoder = LeaveOneOutEncoder(cols=self.encoded_cates)
        # Index of 'model' values kept as categories; set by get_model_features.
        self.general_model = None
        super().__init__(self.num_features, self.categorical_features, num_config, categorical_config)

    def _feature_engien(self, features):
        """Clean raw columns and add the derived ones.

        Requires ``self.general_model`` to be set (done by
        ``get_model_features``). Returns a new frame; ``replace`` below already
        produces a copy, so the argument is not modified.
        """
        # Zeros in these columns are treated as missing values.
        zero_na = {0: np.nan}
        features = features.replace({'power': zero_na, 'v_5': zero_na, 'v_6': zero_na})

        # Days between registration and listing; vectorised instead of a
        # per-row lambda.
        features['carAge'] = (features['creatDate'] - features['regDate']).dt.days
        features['createMon'] = features['creatDate'].dt.month
        # '-' marks an unknown value in notRepairedDamage.
        features['notRepairedDamage'] = features['notRepairedDamage'].replace('-', np.nan).astype(float)

        # Values above these cut-offs are blanked out; power is then logged.
        features.loc[features['power'] > 600, 'power'] = np.nan
        features['power'] = np.log(features['power'])
        features.loc[features['v_7'] > 0.5, 'v_7'] = np.nan
        features.loc[features['v_11'] > 10, 'v_11'] = np.nan
        features.loc[features['v_13'] > 7.5, 'v_13'] = np.nan
        features.loc[features['v_14'] > 7.5, 'v_14'] = np.nan

        # Split v_10 into three range-specific columns (NaN outside the range).
        # `where` always creates the column, even when no row falls in the
        # range, so small frames still expose every declared numeric feature.
        v_10 = features['v_10']
        features['v_10_1'] = v_10.where(v_10 <= 0)
        features['v_10_2'] = v_10.where((v_10 >= 0) & (v_10 < 6))
        features['v_10_3'] = v_10.where(v_10 > 8)

        # Any 'model' value outside the kept set becomes missing.
        features.loc[~features['model'].isin(self.general_model), 'model'] = np.nan
        return features

    def get_model_features(self, features):
        """Fit the pipeline on ``features`` and return its model matrix.

        ``features`` must contain a 'logPrice' column, which is used as the
        target when fitting the categorical encoder.
        """
        features = features.copy()
        # Derive the kept 'model' values from the frame being fitted rather
        # than from a notebook-global DataFrame, and count only once.
        # NOTE(review): this keeps models with FEWER than 2000 rows - confirm
        # the direction of the threshold is intended.
        model_counts = features['model'].value_counts()
        self.general_model = model_counts[model_counts < 2000].index
        encoded_cate = self.cate_encoder.fit_transform(features[self.encoded_cates], features['logPrice'])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)

    def transform_feature(self, features):
        """Apply the already-fitted pipeline to ``features``.

        Uses the encoder and ``general_model`` produced by a previous
        ``get_model_features`` call; nothing is refitted here.
        """
        features = features.copy()
        encoded_cate = self.cate_encoder.transform(features[self.encoded_cates])
        for cate in self.encoded_cates:
            features[cate + 'Encode'] = encoded_cate[cate]
        features = self._feature_engien(features)
        return super().get_model_features(features)


In [9]:
# Build the model matrices with missing-value indicators switched off for both
# the numeric and the categorical features.
feature_manager = MineFeatureManager(
    num_config=dict(missing_indicator=False),
    categorical_config=dict(missing_indicator=False),
)

# NOTE(review): the pipeline is fitted on the full `df`, while `test` is
# currently a slice of `df` - the held-out rows therefore influence the
# target encoding. Confirm whether `dev` should be used when evaluating
# on the hold-out.
X_dev = feature_manager.get_model_features(df)
X_test = feature_manager.transform_feature(test)
print(X_dev.shape, X_test.shape)

(150000, 29) (50000, 29)


In [10]:
# Preview the first three rows of the engineered feature matrix.
X_dev.head(3)

Unnamed: 0,power,kilometer,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_11,...,regionCodeEncode,gearbox,notRepairedDamage,seller,offerType,model,brand,bodyType,fuelType,createMon
0,4.094345,12.5,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,2.804097,...,8.482204,0.0,0.0,0,0,,6,1.0,0.0,4
1,,15.0,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,2.096338,...,8.683351,0.0,,0,0,,1,2.0,0.0,3
2,5.09375,12.5,-0.998467,-0.996911,0.25141,0.114912,0.165147,0.062173,0.027075,1.803559,...,8.077353,0.0,0.0,0,0,115.0,15,1.0,0.0,4


In [13]:
# Train the project's LGBModel wrapper on the engineered features, using the
# log-price column of `df` as the target.
model = LGBModel()
model.train(X_dev, df['logPrice'], verbose_eval=True)

In [14]:
# Score the test matrix. The model was trained on logPrice, so these values
# are presumably on the log-price scale - TODO confirm before reporting.
model.predict(X_test)

array([7.02861755, 7.37158413, 8.79832039, ..., 8.43389111, 8.36175512,
       8.37362611])