# SF DST Car Price Prediction

Цель проекта: Прогнозирование стоимости автомобиля по характеристикам.

Данный проект в рамках курса "Специализация Data Science" был разделён на два юнита, в каждом из которых нужно было решить отдельные задачи.



In [None]:
import re
import sys

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

from catboost import CatBoostRegressor

import pandas_profiling as PP
import category_encoders as ce

from keras.models import load_model

# Both tqdm imports are kept in their original order: the later one is the name in effect.
from tqdm.notebook import tqdm
from tqdm import tqdm

In [None]:
# Report the interpreter and NumPy versions used for this run.
for label, version in (('Python       :', sys.version.split('\n')[0]),
                       ('Numpy        :', np.__version__)):
    print(label, version)

In [None]:
# Write the list of installed packages to requirements.txt.
!pip freeze > requirements.txt
# Seed passed as random_state / random_seed to the split and the models below.
RANDOM_SEED = 42

# Setup

In [None]:
VERSION    = 1
DIR_TRAIN  = '../input/train-cars/' # attached my own external dataset to the notebook
DIR_TEST   = '../input/sf-dst-car-price/'
VAL_SIZE   = 0.33   # 33% of the training rows are held out as test_size in train_test_split
N_FOLDS    = 5      # number of folds passed as cv to GridSearchCV

# CATBOOST
ITERATIONS = 1000   # passed as iterations to CatBoostRegressor
LR         = 1e-3   # NOTE(review): not referenced in the visible cells (learning_rate is hard-coded there)

# Data

In [None]:
# List what is available under ../input/.
!ls ../input/

In [None]:
# Load a backup CSV; NOTE(review): `prepare` is not used by any other visible cell.
prepare = pd.read_csv('../input/for-preparation/backup_bmw.csv')

In [None]:
prepare

In [None]:
# Load the training set, the test set and the submission template.
train = pd.read_csv(DIR_TRAIN+'mega_train.csv') #really huge train dataset
test = pd.read_csv(DIR_TEST+'test.csv')
sample_submission = pd.read_csv(DIR_TEST+'sample_submission.csv')

In [None]:
train.info()

In [None]:
test.head()

In [None]:
# Treat the literal string 'no_data' as a missing value.
train.replace('no_data', np.nan, inplace=True)

# EDA

In [None]:
train.info()

In [None]:
train.head()

In [None]:
# Build a pandas-profiling report for the training frame.
profile = PP.ProfileReport(train)

In [None]:
profile

По какой-то причине цена определилась как object type.

In [None]:
# Split the training columns by dtype; the target column 'price' is removed
# from the object-typed (categorical) list.
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
object_columns_index = train.select_dtypes(include=['object']).columns
categorical_features = object_columns_index.drop(['price'])

In [None]:
# Show which columns were classified as numeric and which as categorical.
print(f'Numeric features are: {numeric_features} \n',
     f'Categorical features are: {categorical_features}')

# Naive solver + category encoder choosing

In [None]:
# Compare category encoders: for each encoder class build the same preprocessing
# pipeline, fit a random forest classifier and print the macro F1 score.
encoder_list = [ce.backward_difference.BackwardDifferenceEncoder, 
               ce.basen.BaseNEncoder,
               ce.binary.BinaryEncoder,
                ce.cat_boost.CatBoostEncoder,
                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
                ce.leave_one_out.LeaveOneOutEncoder,
                ce.m_estimate.MEstimateEncoder,
                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder
                ]

# NOTE(review): X_train / y_train / X_test / y_test are not assigned anywhere above
# this cell; the only visible assignment is a later train_test_split call, so this
# cell depends on notebook execution order.
# NOTE(review): a classifier scored with F1 is used here although the project goal
# is price prediction — confirm that this is intended.
for encoder in encoder_list:
    
    # Numeric columns: median imputation followed by standardisation.
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
    # Categorical columns: fill gaps with 'missing', then apply the encoder under test
    # (the step is named 'woe' for every encoder, not only for WOEEncoder).
    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('woe', encoder())])
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=500))])
    
    model = pipe.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(encoder)
    print(f1_score(y_test, y_pred, average='macro'))

In [None]:
# Align the train and test frames to one schema and stack them into `data1`.
#
# Column glossary:
# drive - drivetrain
# condition - vehicle condition
# customs - customs clearance
# ownership - duration of ownership
#
# NOTE(review): both frames are renamed by position, so this assumes their columns
# arrive in exactly this order (with 'price' last in train) — confirm against the CSVs.
cols = ['body_type', 'brand', 'color', 'fuel_type', 'model_date', 'name',
        'num_of_doors', 'production_date', 'vehicle_config', 'vehicle_transmission',
        'engine_displacement', 'engine_power', 'description', 'mileage', 'complectation', 'drive',
        'wheel', 'condition', 'owners', 'PTS', 'customs', 'ownership', 'id']
test.columns = cols
test['sample'] = np.ones(len(test)) # flag test with 1
test['price'] = 0

train.drop_duplicates(inplace=True)
train.replace('no_data', np.nan, inplace=True)
# Capture the target only after 'no_data' has been turned into NaN, so the
# missing-price filter below does not depend on whether `target` is a view.
target = train.price
print(train.columns, len(train.columns))
# Train has the same columns as test plus the target.
cols = cols + ['price']
train.columns = cols
train['sample'] = np.zeros(len(train)) # flag train with 0

# some preparation: map the numeric codes used in train to the labels used in test.
# Assign the result back instead of calling replace(inplace=True) on an attribute
# access, which is a chained assignment and does not update the frame under
# pandas copy-on-write.
train['fuel_type'] = train['fuel_type'].replace({1: 'бензин', 2: 'дизель'})
train['vehicle_transmission'] = train['vehicle_transmission'].replace({
    1: 'автоматическая',
    2: 'механическая',
    3: 'вариатор',
    4: 'роботизированная',
})
train['drive'] = train['drive'].replace({
    1: 'передний',
    2: 'задний',
    3: 'полный',
    0: 'не упомянут',
})

# to have same names for the same categories in 'owners' for train and test
# NOTE(review): this relies on test.owners having exactly three categories whose
# frequency order is "3 owners", "1 owner", "2 owners" — verify against the data.
own3, own1, own2 = test.owners.value_counts().index.tolist()
train['owners'] = train['owners'].replace({3: own3,
                                           1: own1,
                                           2: own2})

# Remove training rows without a price.
train.drop(index=target.index[target.isnull()], inplace=True)
# ignore_index gives every stacked row a unique label; without it train and test
# rows share labels and any later label-based drop removes rows from both parts.
data1 = pd.concat([train, test], ignore_index=True)
data1.info()

# EDA

In [None]:
# The package is imported under the alias PP; the bare name `pandas_profiling`
# is not bound in this notebook.
profile = PP.ProfileReport(train)

In [None]:
# Save the profiling report as a standalone HTML file.
profile.to_file("car_price_profiling.html")

In [None]:
profile

## Data Preprocessing

In [None]:
# Compiled regular expressions for processing the `name` feature:
#   pattern  - one or more digits followed by a lowercase letter
#   pattern1 - an uppercase letter followed by a digit
#   pattern2 - an uppercase letter, one or more digits, then a lowercase letter
pattern, pattern1, pattern2 = map(
    re.compile,
    (r'[0-9]+[a-z]', r'[A-Z][0-9]', r'[A-Z][0-9]+[a-z]'),
)

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (not multiplied by 100).

    Args:
        y_true: array-like of reference values.
        y_pred: array-like of predictions, same length as ``y_true``.

    Returns:
        Mean of ``|y_pred - y_true| / |y_true|``. Entries where ``y_true`` is 0
        produce ``inf``/``nan``, as the ratio is undefined there.
    """
    # Convert to plain arrays so lists are accepted and pandas objects are
    # compared by position rather than aligned by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [None]:
def make_other_cat(col, data, thresh):
    """Collapse rare categories of ``data[col]`` into a single 'other' label.

    Args:
        col: name of the column to process.
        data: frame holding the column; it is not modified.
        thresh: frequency threshold in percent; categories whose share of the
            non-missing values is strictly below it are replaced.

    Returns:
        A new Series in which every rare category is replaced by 'other'.
    """
    share = data[col].value_counts(normalize=True) * 100
    # A set gives constant-time membership checks for each row.
    rare = set(share[share < thresh].index)
    return data[col].apply(lambda value: 'other' if value in rare else value)

def fuel_type_prep(other, data):
    """Return the ``fuel_type`` column with the rare fuel types merged away.

    'электро' and 'гибрид' hold too few samples (63) to stand on their own.
    When ``other`` is truthy they become 'other'; otherwise they are treated
    as 'бензин'.
    """
    replacement = 'other' if other else 'бензин'
    rare_fuels = ['гибрид', 'электро']
    return data.fuel_type.apply(
        lambda fuel: replacement if fuel in rare_fuels else fuel
    )
    
def vehicle_transmission_prep(drop, data):
    """Optionally remove, in place, the rows whose transmission is 'вариатор'.

    The test data contains no samples of that category. When ``drop`` is falsy
    the frame is left untouched (merging the category elsewhere is not
    implemented). Always returns None.
    """
    if not drop:
        return
    cvt_labels = data.index[data.vehicle_transmission == 'вариатор']
    data.drop(cvt_labels.tolist(), inplace=True)

def mileage_prep(get_log, data):
    """Replace ``data.mileage`` in place with log(mileage + 1) when ``get_log`` is truthy."""
    if not get_log:
        return

    def shifted_log(km):
        # +1 keeps a zero mileage finite (log(1) == 0).
        return np.log(km + 1)

    data.mileage = data.mileage.apply(shifted_log)



def owners_prepare(method, data=None):
    """Handle missing values of the ``owners`` column in place.

    Args:
        method: one of
            'nan2no'     - fill missing values with the string 'no_data';
            'popularity' - fill missing values with the most frequent category;
            'nan2zero'   - treat the feature as numeric: map the three owner
                           labels to 3/2/1 and fill missing values with 0.
            Any other value leaves the frame unchanged.
        data: frame to modify. Defaults to the module-level ``data1`` so that
            existing calls without this argument keep working.
    """
    if data is None:
        data = data1

    if method == 'nan2no':
        data['owners'] = data['owners'].fillna('no_data')
    elif method == 'popularity':
        most_common = data['owners'].value_counts().idxmax()
        data['owners'] = data['owners'].fillna(most_common)
    elif method == 'nan2zero':
        # consider owners feature as numeric
        recoded = data['owners'].replace({own3: 3, own2: 2, own1: 1})
        # Fill only this column: filling the whole frame would also overwrite
        # missing values in unrelated columns (description, PTS, ...) with 0.
        data['owners'] = recoded.fillna(0)



def PTS_prepare(method, data=None):
    """Fill missing values of the ``PTS`` column in place and return the frame.

    Args:
        method: 'no_data' fills missing values with the string 'no_data';
            'popularity' fills them with the most frequent category.
            Any other value leaves the column unchanged.
        data: frame to modify. Defaults to the module-level ``data1`` so that
            existing calls without this argument keep working.

    Returns:
        The frame that was modified.
    """
    if data is None:
        data = data1

    if method == 'no_data':
        data['PTS'] = data['PTS'].fillna('no_data')
    elif method == 'popularity':
        most_common = data['PTS'].value_counts().idxmax()
        data['PTS'] = data['PTS'].fillna(most_common)

    return data

In [None]:
def _mileage_bucket(value, lower, mean, upper):
    """Map a raw mileage to an ordinal bucket 0-4 in a single pass.

    0: exactly zero; 1: up to ``lower``; 2: (``lower``, ``mean``];
    3: (``mean``, ``upper``]; 4: above ``upper``. A value matching no rule
    (e.g. NaN) is returned unchanged.
    """
    if value == 0:
        return 0
    if value <= lower:
        return 1
    if lower < value <= mean:
        return 2
    if mean < value <= upper:
        return 3
    if value > upper:
        return 4
    return value


def prepare_data(df_old, method):
    """Clean the combined train/test frame and derive model features.

    Args:
        df_old: frame with the renamed raw columns plus 'sample' and 'price';
            it is deep-copied and not modified by this function itself.
        method: 'other' one-hot encodes the categorical columns; 'cat_boost'
            converts 'age_ratio', 'engine_displacement' and 'mean' to integers.

    Returns:
        A new frame without 'id' and without any object-typed column.
    """
    # categorical features to encode
    to_encode = ['use_name', 'brand', 'body_type', 'color', 'fuel_type', 'drive', 'PTS', 'vehicle_config']

    # top configurations: take the category names (the index of value_counts),
    # not the counts, otherwise no configuration ever matches.
    top_configs = set(df_old.vehicle_config.value_counts().index[:10])

    # --- Cleaning part ---
    df = df_old.copy(deep=True)

    mean = df.mileage.mean()
    lower = df.mileage.quantile(0.25)
    upper = df.mileage.quantile(0.75)

    print(df.columns)
    df.body_type = make_other_cat('body_type', data=df, thresh=1)
    df.color = make_other_cat('color', data=df, thresh=1)

    df.fuel_type = fuel_type_prep(other=False, data=df)
    vehicle_transmission_prep(drop=False, data=df)
    df['PTS'] = df['PTS'].fillna('no_data')

    # Bucket the mileage in one pass so an already assigned bucket number can
    # never be compared against the thresholds again.
    df['mileage'] = df['mileage'].apply(_mileage_bucket, args=(lower, mean, upper))

    df['vehicle_config'] = df['vehicle_config'].apply(
        lambda x: x if x in top_configs else 'other')

    df['drive'] = df['drive'].fillna(df['drive'].value_counts().idxmax())
    # NOTE(review): these helpers are called without a frame argument; the visible
    # owners_prepare / PTS_prepare act on the module-level data1 rather than on
    # `df` — confirm that this is intended. ownership_prepare is not defined in
    # the visible part of the file.
    owners_prepare(method='nan2zero')
    PTS_prepare(method='no_data')
    ownership_prepare(method='no_data')

    # deleting all the samples with zero doors (occur only in train part);
    # a boolean mask removes exactly those rows even when index labels repeat
    # between the train and test parts.
    df = df[df.num_of_doors != 0].copy()

    df.engine_displacement = df.engine_displacement.replace({'undefined ': -1, 'undefined LTR': -1})
    df.engine_displacement = df.engine_displacement.apply(lambda x: str(x).replace('LTR', '')).astype(float)
    print(df.engine_displacement.isna().value_counts())
    # converting engine power to float type
    df.engine_power = df.engine_power.apply(lambda x: str(x).replace('N12', '')).astype(float)

    for feature in ['model_date', 'num_of_doors', 'mileage', 'production_date', 'engine_power']:
        df[feature] = df[feature].astype('int32')

    # --- Feature engineering part ---
    # adding age of model and car itself
    df['model_age'] = 2020 - df.model_date
    df['car_age'] = 2020 - df.production_date
    # 0/0 (model and car both from 2020) yields NaN and is treated as ratio 1.
    df['age_ratio'] = (df['car_age'] / df['model_age']).fillna(1.0)

    # define the 'taxes' attribute according to the column 'engine_power'
    # (fe_add_taxes / fe_add_classe / description_features are defined outside
    # the visible part of the file)
    df['taxes'] = fe_add_taxes(df, 'engine_power')
    df['class_'] = fe_add_classe(df, 'name')
    df['is_ownership'] = (df['ownership'] != 'no_data').astype(int)
    df['is_old'] = (df['car_age'] != 0).astype(int)
    df['old_and_mtnd'] = df.is_ownership + df.is_old

    df['description'] = df['description'].fillna('[]')
    df['description_len'] = df['description'].apply(lambda x: len(x.split()))
    df['description_word'] = df['description'].apply(lambda x: [str(i).lower() for i in x.split()])

    df = description_features(df)

    vectorizer = CountVectorizer()
    text_feat = vectorizer.fit_transform(df['description'])
    # The sparse row reductions come back as (n, 1) matrices; flatten them to 1-D.
    df['mean'] = np.asarray(text_feat.mean(axis=1)).ravel()
    df['sum'] = np.asarray(text_feat.sum(axis=1)).ravel()

    brand_tokens = ('BMW', 'Audi', 'Volkswagen', 'Kia', 'Mercedes')

    def _strip_brands(name):
        # Blank out the brand names, in the listed order.
        for token in brand_tokens:
            name = name.replace(token, ' ')
        return str(name)

    df['use_name'] = df['name'].apply(_strip_brands)

    label_encoder = LabelEncoder()
    # Encoding categorical features from objects to numbers
    for column in to_encode:
        print(column)
        df[column] = label_encoder.fit_transform(df[column])

    if method == 'other':
        df = pd.get_dummies(df, columns=['use_name', 'brand', 'body_type', 'color', 'fuel_type', 'vehicle_config',
                                         'drive', 'wheel', 'owners', 'PTS', 'engine_displacement'], dummy_na=True)
    if method == 'cat_boost':
        df['age_ratio'] = df['age_ratio'].apply(lambda x: int(round(x*100)))
        df['engine_displacement'] = df['engine_displacement'].apply(lambda x: int(round(x*100)))
        df['mean'] = df['mean'].apply(lambda x: int(round(x)))

    # --- Dropping unprocessed features ---
    df.drop(['id'], axis=1, inplace=True)
    object_columns = [s for s in df.columns if df[s].dtypes == 'object']
    df.drop(object_columns, axis=1, inplace=True)

    return df

Separate data preparation for CatBoost and for the other models (random forest, etc.)

In [None]:
# Feature set for CatBoost
X_all_cat = prepare_data(data1, 'cat_boost')

# 'sample' is 0 for training rows and 1 for rows to be submitted.
train_cat = X_all_cat[X_all_cat['sample'] == 0]
# Drop the helper columns by assignment: an inplace drop on a filtered slice
# triggers SettingWithCopyWarning and is unreliable under copy-on-write.
sub_cat = X_all_cat[X_all_cat['sample'] == 1].drop(['sample', 'price'], axis=1)

y_cat = train_cat.price.values
X_cat = train_cat.drop(['price', 'sample'], axis=1)
print(X_cat.shape, len(y_cat))



In [None]:
# Feature set for the other models.
# !!! The author notes that the data has to be loaded again before running this cell.
X_all_oth = prepare_data(data1, 'other')

# 'sample' is 0 for training rows and 1 for rows to be submitted.
train_oth = X_all_oth[X_all_oth['sample'] == 0]
# Drop the helper columns by assignment: an inplace drop on a filtered slice
# triggers SettingWithCopyWarning and is unreliable under copy-on-write.
sub_oth = X_all_oth[X_all_oth['sample'] == 1].drop(['sample', 'price'], axis=1)

y_oth = train_oth.price.values
X_oth = train_oth.drop(['price', 'sample'], axis=1)
# Report the size of this feature set's own target (y_oth, not y_cat).
print(X_oth.shape, len(y_oth))

# CV randomized search for CatBoost

In [None]:
# Feature set used for the CatBoost hyper-parameter search.
X_all = prepare_data(data1, 'cat_boost')

# 'sample' is 0 for training rows and 1 for rows to be submitted.
x_train = X_all[X_all['sample'] == 0]
# Drop the helper columns by assignment: an inplace drop on a filtered slice
# triggers SettingWithCopyWarning and is unreliable under copy-on-write.
X_sub = X_all[X_all['sample'] == 1].drop(['sample', 'price'], axis=1)

y = x_train.price.values
X = x_train.drop(['price', 'sample'], axis=1)
print(X.shape, len(y))

In [None]:
# Randomized hyper-parameter search for CatBoost, evaluated with MAPE.
model = CatBoostRegressor(random_seed = RANDOM_SEED, eval_metric='MAPE')

# Search space: 2 learning rates x 4 depths x 5 regularisation strengths = 40 combinations.
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10, 15],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

# NOTE(review): n_iter (1000) exceeds the 40 combinations in the grid — confirm
# how CatBoost treats this.
randomized_search_result = model.randomized_search(grid,
                                                   X=X,
                                                   y=y,
                                                   cv = 5,
                                                   n_iter = 1000,
                                                   train_size = 0.67,
                                                   plot=True)

In [None]:
# Number of distinct values per column.
X.nunique() #for categorical features definition

# Models to stack and blend

# CatBoostRegressor

In [None]:
# Hold out VAL_SIZE of the CatBoost feature set as a validation part.
X_train, X_test, y_train, y_test = train_test_split(X_cat, y_cat, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)

In [None]:
# Positions of the columns CatBoost should treat as categorical: every column
# of X_train that has fewer than 2000 distinct values.
cat_features_ids = np.flatnonzero(X_train.nunique() < 2000).tolist()

In [None]:
# CatBoost regressor with fixed hyper-parameters; MAPE is the evaluation metric,
# R2 and MAE are tracked as additional metrics.
cat = CatBoostRegressor(iterations = ITERATIONS,
                          learning_rate = 0.03,
                          depth = 15,
                          l2_leaf_reg = 1,
                          random_seed = RANDOM_SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE']
                         )
# Fit on the training split, evaluate on the held-out split and log every 100 iterations.
cat.fit(X_train, y_train,
         cat_features=cat_features_ids,
         eval_set=(X_test, y_test),
         verbose_eval=100,
         use_best_model=True,
         plot=True
         )

# Random Forest Regressor

In [None]:
# Random forest tuned by an exhaustive grid search with N_FOLDS-fold cross-validation.
rf = RandomForestRegressor(n_estimators = 1000)

# The split criterion is left at the estimator default (squared error): the
# literal 'mse' is no longer accepted by newer scikit-learn releases, while the
# default selects the same criterion in old and new versions.
grid = {
    'max_depth': [10, 15],
    'min_samples_split': [2, 3],
    'max_features': ['sqrt', 'log2'],
    'oob_score': [True],
    'random_state': [RANDOM_SEED]
}

reg = GridSearchCV(rf, grid, cv = N_FOLDS)
reg.fit(X_oth, y_oth)

In [None]:
# `reg` was fitted on the X_oth feature set, so it must be scored on a hold-out
# taken from X_oth; X_test comes from the CatBoost feature set and has different columns.
# NOTE(review): `reg` was fitted on all of X_oth, so these rows were seen during
# training and the score is optimistic.
X_train_oth, X_test_oth, y_train_oth, y_test_oth = train_test_split(
    X_oth, y_oth, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)
mape(y_test_oth, reg.predict(X_test_oth))

# Gradient Boosting Regressor

In [None]:
# Gradient boosting tuned by grid search; a depth-limited extra tree is passed
# as the `init` estimator.
et = ExtraTreeRegressor(max_depth = 15, random_state = RANDOM_SEED)
boost = GradientBoostingRegressor(random_state = RANDOM_SEED)
# NOTE(review): the loss name 'ls' was renamed in newer scikit-learn releases —
# verify against the installed version.
grid = {
    'loss': ['ls','huber'],
    'learning_rate':[0.05, 0.1],
    'max_depth': [10,15],
    'max_features': ['sqrt', 'log2'],
    'init': [et],
    'n_iter_no_change':[100]
}

gb = GridSearchCV(boost, grid, cv = N_FOLDS)
gb.fit(X_oth,y_oth)

In [None]:
# `gb` was fitted on the X_oth feature set, so it must be scored on a hold-out
# taken from X_oth; X_test comes from the CatBoost feature set and has different columns.
# NOTE(review): `gb` was fitted on all of X_oth, so these rows were seen during
# training and the score is optimistic.
X_train_oth, X_test_oth, y_train_oth, y_test_oth = train_test_split(
    X_oth, y_oth, test_size=VAL_SIZE, shuffle=True, random_state=RANDOM_SEED)
mape(y_test_oth, gb.predict(X_test_oth))

# AdaBoost Regressor

In [None]:
# AdaBoost over a depth-limited extra tree, with the learning rate tuned by grid search.
et = ExtraTreeRegressor(max_depth = 15, random_state = RANDOM_SEED)
ada_bst = AdaBoostRegressor(et, n_estimators=1000, random_state=RANDOM_SEED)
grid = {
    'loss': ['square'],
    'learning_rate': [0.1, 0.05, 0.03]
}
ada = GridSearchCV(ada_bst, grid, cv = N_FOLDS)
ada.fit(X_oth,y_oth)

# Stacking and Blending

In [None]:
# Fitted models combined below: random forest search, CatBoost, gradient
# boosting search and AdaBoost search.
models = [reg, cat, gb, ada]

In [None]:
sample_submission = pd.read_csv(DIR_TEST+'sample_submission.csv')
def blending_pred(models, sample_submission):
    """Average the predictions of all models and write 'submission_blending.csv'.

    Args:
        models: fitted models; a CatBoostRegressor predicts on ``sub_cat``,
            every other model on ``sub_oth``.
        sample_submission: submission template with an 'id' column. It is
            modified in place: one prediction column per model is added and
            'price' is set to their row-wise mean.
    """
    pred_columns = []
    for position, model in enumerate(tqdm(models)):
        # Select the feature set by model type; comparing against a repr that
        # contains a memory address never matches in a new session.
        features = sub_cat if isinstance(model, CatBoostRegressor) else sub_oth
        # The position keeps column names unique, so several models of the same
        # type do not overwrite each other's predictions.
        column = f'{type(model).__name__}_{position}'
        sample_submission[column] = model.predict(features)
        pred_columns.append(column)
    sample_submission['price'] = sample_submission[pred_columns].mean(axis=1)
    sample_submission[['id', 'price']].to_csv('submission_blending.csv', index=False)
    print(sample_submission.head(10))

blending_pred(models, sample_submission)

In [None]:
sample_submission = pd.read_csv(DIR_TEST+'sample_submission.csv')
def stacking_pred(models, sample_submission):
    """Fit a random-forest meta-model on base-model predictions and write 'submission_stacking.csv'.

    Args:
        models: fitted base models; a CatBoostRegressor predicts on the
            CatBoost feature sets (``X_cat`` / ``sub_cat``), every other model
            on ``X_oth`` / ``sub_oth``.
        sample_submission: submission template; its 'price' column is
            overwritten in place with the meta-model predictions.
    """
    meta_set = pd.DataFrame()
    meta_sub = pd.DataFrame()
    for position, model in enumerate(models):
        # Select the feature set by model type; comparing against a repr that
        # contains a memory address never matches in a new session.
        is_catboost = isinstance(model, CatBoostRegressor)
        column = f'{type(model).__name__}_{position}'
        # NOTE(review): the base models predict on rows they were trained on,
        # so the meta-features are not out-of-fold.
        meta_set[column] = model.predict(X_cat if is_catboost else X_oth)
        meta_sub[column] = model.predict(sub_cat if is_catboost else sub_oth)
    # Seed the meta-model so the submission is reproducible.
    regr = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_SEED)
    regr.fit(meta_set, y_cat)
    sample_submission['price'] = regr.predict(meta_sub)
    print(sample_submission[:10])
    sample_submission.to_csv('submission_stacking.csv', index=False)
stacking_pred(models, sample_submission)

![](https://i.ibb.co/6rmj4Kh/photo-2020-09-18-16-39-34.jpg)

In [None]:
# Submission made from the CatBoost model alone, written to cat.csv.
sample_submission = pd.read_csv(DIR_TEST+'sample_submission.csv')
sample_submission['price'] = cat.predict(sub_cat)
sample_submission.to_csv('cat.csv', index = False)