### Predicting Used Car Prices with DeepTables and Embeddings

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# The bundled seaborn styles were renamed with a 'seaborn-v0_8-' prefix in Matplotlib 3.6
# and the old names were removed later, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape,  Conv1D, MaxPooling1D, Flatten
#Merge,
from keras.layers import Concatenate
# Import Embedding from keras.layers: the keras.layers.embeddings submodule is not
# available in recent Keras releases, while this path works on old and new versions.
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint

import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from lightgbm import LGBMRegressor

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.simplefilter('ignore')

In [None]:
import tensorflow as tf
import tensorflow._api.v2.compat.v1 as tf

In [None]:
# Read the vehicles CSV into `df` and preview the first two rows.
VEHICLES_CSV = '../input/craigslist-carstrucks-data/vehicles.csv'
df = pd.read_csv(VEHICLES_CSV)
df.head(n=2)

In [None]:
# Display (rows, columns) of the frame as loaded.
df.shape

### Data Cleaning

In [None]:
# Columns that are not used as model inputs anywhere in this notebook.
unused_columns = [
    'id', 'url', 'region_url', 'image_url', 'description',
    'lat', 'long', 'region', 'posting_date', 'Unnamed: 0', 'paint_color',
]
df = df.drop(columns=unused_columns)

'vin','county',

In [None]:
# Column(s) selected from `df` as the prediction target when the data is split.
TARGET_COLS = ['price']

In [None]:
# Every column whose dtype is not one of the listed numeric dtypes is treated as categorical.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = df.columns.values.tolist()
categorical_columns = [name for name in features if df[name].dtype not in numerics]

In [None]:
# Report the number of distinct non-null values of each categorical column.
for name in categorical_columns:
    n_levels = df[name].nunique()
    print(name, n_levels)

In [None]:
# Show how many rows report an odometer of 3,000,000 or more, then remove them.
odometer_too_high = df["odometer"] >= 3000000.0
df.loc[odometer_too_high].shape
df.drop(df.index[odometer_too_high.values], inplace=True)

In [None]:
# Mean odometer reading of the listings carrying each condition label.
condition_labels = df['condition']
excellent_odo_mean = df.loc[condition_labels == 'excellent', 'odometer'].mean()
good_odo_mean = df.loc[condition_labels == 'good', 'odometer'].mean()
like_new_odo_mean = df.loc[condition_labels == 'like new', 'odometer'].mean()
salvage_odo_mean = df.loc[condition_labels == 'salvage', 'odometer'].mean()
fair_odo_mean = df.loc[condition_labels == 'fair', 'odometer'].mean()

In [None]:
# Listings from model year 2019 onwards with no condition recorded are labelled 'new'.
recent_model_year = df.year >= 2019
df.loc[recent_model_year, 'condition'] = df.loc[recent_model_year, 'condition'].fillna('new')

In [None]:
# Impute missing 'condition' labels from mileage. The rules run in the order listed and
# each one only fills rows whose condition is still missing when it runs, so an earlier
# rule wins wherever two odometer ranges overlap.
odo = df['odometer']
condition_rules = [
    (odo <= like_new_odo_mean, 'like new'),
    (odo >= fair_odo_mean, 'fair'),
    ((odo > good_odo_mean) & (odo <= excellent_odo_mean), 'excellent'),
    ((odo > like_new_odo_mean) & (odo <= good_odo_mean), 'good'),
    ((odo > good_odo_mean) & (odo <= fair_odo_mean), 'salvage'),
]
for rows, label in condition_rules:
    df.loc[rows, 'condition'] = df.loc[rows, 'condition'].fillna(label)

In [None]:
# Odometer: fill gaps with the mean odometer of listings from the same model year.
# The per-year means are computed once and mapped back through 'year', so the result is
# always aligned with df's own index. (groupby(...).apply(...) returns a group-keyed
# index on pandas >= 2.0, which breaks assigning the result back as a column.)
odometer_mean_by_year = df.groupby('year', sort=False)['odometer'].mean()
df['odometer'] = df['odometer'].fillna(df['year'].map(odometer_mean_by_year))
# Rows that are still missing (no year, or a year without any odometer reading) take the
# previous row's value. Series.ffill() replaces the deprecated fillna(method="ffill").
df['odometer'] = df['odometer'].ffill()
df['odometer'].isnull().sum()

In [None]:
# Plain forward fill for these three columns.
df['drive'] = df['drive'].ffill()
df['type'] = df['type'].ffill()
df['cylinders'] = df['cylinders'].ffill()


def _first_mode(values):
    """Return the most frequent non-null value of `values`, or NaN when there is none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Fill 'condition' and 'type' with the most common value among listings of the same
# model year, then forward-fill whatever is still missing.
# Series.fillna(Series) aligns on index labels, so passing x.mode() directly (a Series
# indexed 0..k-1) fills almost nothing; a scalar per year is mapped onto the rows instead.
for column in ('condition', 'type'):
    mode_by_year = df.groupby('year', sort=False)[column].agg(_first_mode)
    df[column] = df[column].fillna(df['year'].map(mode_by_year))
    df[column] = df[column].ffill()

In [None]:
# Remove rows without a model year, then store the year as an integer offset from 1900
# and the odometer as an integer.
year_missing = df["year"].isna()
df.drop(df.index[year_missing.values], inplace=True)
df['year'] = df['year'].sub(1900).astype(int)
df['odometer'] = df['odometer'].astype(int)

In [None]:
## Price
# 10th and 90th percentiles of the listed prices; the filter below uses fixed bounds
# rather than these two values.
rr = sorted(df["price"])
quantile1, quantile3 = np.percentile(rr, [10, 90])

# Keep listings priced at 390 or more and below 31500.
price_in_range = (df.price >= 390) & (df.price < 31500)
df = df[price_in_range]

In [None]:
# The 'size' column is not used further.
df = df.drop(columns=["size"])

In [None]:
# Store the three numeric columns as floats.
for numeric_column in ('year', 'odometer', 'price'):
    df[numeric_column] = df[numeric_column].astype(float)


In [None]:
#import pandas_profiling as pp
#pp.ProfileReport(df)

In [None]:
# Remove every row that lacks a value in any of these columns, one column at a time.
for required in ("manufacturer", "model", "fuel", "title_status", "transmission"):
    df.drop(df.index[df[required].isna().values], inplace=True)

In [None]:
# The VIN column is not used as a feature.
df = df.drop(columns=['VIN'])

In [None]:
# Percentage of missing values per column, largest first.
null_fraction = df.isnull().sum() / df.shape[0]
null_values_per_variable = 100 * null_fraction.round(3)
null_values_per_variable.sort_values(ascending=False)

In [None]:
# Display (rows, columns) of the frame after cleaning.
df.shape

In [None]:
##

In [None]:
# Hold out 20% of the rows as a test set, with a fixed random_state.
# NOTE(review): the feature frame passed here is the full `df`, which still contains the
# TARGET_COLS column(s), so the target travels along with the features. Confirm whether
# that is intended before changing it: the network's numeric input width is hard-coded.
X_train, X_test, y_train, y_test = train_test_split(df, df[TARGET_COLS], test_size=0.2, random_state=0)

### Embeddings from Fast AI

### Neural Network with Embeddings

In [None]:
# Seed NumPy's global random generator.
np.random.seed(10)
from keras.models import Model
# Embedding comes from keras.layers: the keras.layers.embeddings submodule is not
# available in recent Keras releases, while this path works on old and new versions.
from keras.layers import Input, Dense, Concatenate, Reshape, Dropout, Embedding

from sklearn.model_selection import StratifiedKFold

In [None]:
# Recompute the categorical column list on the cleaned frame and print each cardinality.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = df.columns.values.tolist()
categorical_columns = [name for name in features if not (df[name].dtype in numerics)]

for name in categorical_columns:
    print (name, df[name].nunique())

In [None]:
# Columns of X_train that are not in the categorical list.
other_cols = [column for column in X_train.columns if column not in categorical_columns]

In [None]:
# Display the non-categorical column names.
other_cols

In [None]:
# Order the feature frames as: categorical columns first, then the remaining columns.
cols_use = categorical_columns + other_cols

X_train = X_train[cols_use]
X_test = X_test[cols_use]

# Distinct values seen in the training frame, per column.
col_vals_dict = {}
for column in X_train.columns:
    col_vals_dict[column] = list(X_train[column].unique())

# Columns with more than two distinct values; the counts help pick embedding dimensions.
embed_cols = [column for column, levels in col_vals_dict.items() if len(levels) > 2]
for column in embed_cols:
    print(column + ': %d values' % len(col_vals_dict[column]))

print('\n')

In [None]:
def getVar(categorical_var):
    """Derive embedding dimensions for one categorical column of the notebook-level `df`.

    Returns a tuple (vocab, embedding_size, no_of_unique_cat):
      * vocab            -- number of distinct non-null values in the column,
      * embedding_size   -- half of that count rounded up, capped at 50,
      * no_of_unique_cat -- the distinct count again.
    """
    distinct = df[categorical_var].nunique()
    half_rounded_up = -(-distinct // 2)  # integer ceiling of distinct / 2
    width = half_rounded_up if half_rounded_up < 50 else 50
    return distinct, int(width), distinct

In [None]:
def build_embedding_network():
    """Build and compile the entity-embedding regression network.

    One single-integer input is created per categorical column, each followed by an
    Embedding layer sized by getVar() and flattened with Reshape. A further input takes
    three numeric values and goes through a Dense(8) layer. All branches are concatenated
    and passed through three Dense/Dropout stages to a single-unit output.

    Returns:
        The compiled Keras Model (mean squared error loss, Adam optimizer). Its inputs
        are ordered as the categorical columns listed below, followed by the numeric
        input.
    """
    # Layers are created in exactly this order; code that feeds the model or reads the
    # embedding layers back by position relies on it.
    embedded_columns = ('manufacturer', 'model', 'condition', 'cylinders', 'fuel',
                        'title_status', 'transmission', 'drive', 'type', 'state')

    inputs = []
    embeddings = []

    for column in embedded_columns:
        vocab, embedding_size, _ = getVar(column)
        categorical_input = Input(shape=(1,))
        embedding = Embedding(vocab, embedding_size, input_length=1)(categorical_input)
        # (batch, 1, embedding_size) -> (batch, embedding_size)
        embedding = Reshape(target_shape=(embedding_size,))(embedding)
        inputs.append(categorical_input)
        embeddings.append(embedding)

    # Numeric variables
    input_numeric = Input(shape=(3,))
    embedding_numeric = Dense(8)(input_numeric)
    inputs.append(input_numeric)
    embeddings.append(embedding_numeric)

    x = Concatenate()(embeddings)
    x = Dense(80, activation='relu')(x)
    x = Dropout(.35)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(.15)(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(.15)(x)
    # Linear output: the model is trained with MSE against the price target, and a
    # sigmoid would confine every prediction to the interval (0, 1).
    output = Dense(1, activation='linear')(x)

    model = Model(inputs, output)

    model.compile(loss='mean_squared_error', optimizer='adam')

    return model

In [None]:
# Training constants.
K = 8  # presumably the number of cross-validation folds — TODO confirm
runs_per_fold = 3  # presumably repeated fits per fold — TODO confirm
n_epochs = 15  # NOTE(review): the fit call in this notebook passes epochs=2 directly


In [None]:
# Instantiate the compiled embedding network.
NN = build_embedding_network()

In [None]:
# Print the layer-by-layer summary of the network.
NN.summary()

In [None]:
#NN.summary()
# Draw the layer graph of NN, left to right, with tensor shapes shown.
from tensorflow import keras
keras.utils.plot_model(NN, show_shapes=True, rankdir="LR")

In [None]:
# #X_train['paint_color'].value_counts()
# X_train['paint_color']=X_train['paint_color'].astype('object')
# X_train['paint_color'] = X_train['paint_color'].fillna(method='ffill')
# np.unique(X_train['type'])

In [None]:
## https://www.kaggle.com/aquatic/entity-embedding-neural-net

#converting data to list format to match the network structure
def preproc(X_train, X_val, X_test, embed_cols, other_cols=None):
    """Convert three feature frames into the list-of-arrays format the network expects.

    Args:
        X_train, X_val, X_test: DataFrames with identical columns. They are not modified.
        embed_cols: names of the columns to integer-encode, in model input order.
        other_cols: names of the numeric columns to scale. When omitted, every column of
            X_train that is not in the notebook-level `categorical_columns` list is used.

    Returns:
        (input_list_train, input_list_val, input_list_test). Each list holds one 1-D
        array of integer codes per entry of `embed_cols`, followed by one 2-D float array
        with the scaled numeric columns.

    Notes:
        * Codes are assigned from the order of X_train[c].unique(). Values of X_val or
          X_test that never occur in X_train are mapped to code 0.
        * The MinMaxScaler is fitted on X_train only and then applied to X_val and
          X_test, so all three splits share one scale.
    """
    from sklearn.preprocessing import MinMaxScaler

    if other_cols is None:
        other_cols = [c for c in X_train.columns if c not in categorical_columns]

    input_list_train = []
    input_list_val = []
    input_list_test = []

    # The columns to be embedded: rescale to the range [0, number of values).
    for c in embed_cols:
        print("NEW COL :" + c)
        raw_vals = X_train[c].unique()
        val_map = {value: code for code, value in enumerate(raw_vals)}
        input_list_train.append(X_train[c].map(val_map).values)
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)

    # The rest of the columns: scale into new arrays instead of writing back into the
    # callers' frames, which would leave them altered for every later cell.
    scaler = MinMaxScaler()
    train_numeric = scaler.fit_transform(X_train[other_cols])
    val_numeric = scaler.transform(X_val[other_cols])
    test_numeric = scaler.transform(X_test[other_cols])

    # np.float was removed from NumPy; np.float64 is the dtype it used to alias.
    input_list_train.append(np.asarray(train_numeric, dtype=np.float64))
    input_list_val.append(np.asarray(val_numeric, dtype=np.float64))
    input_list_test.append(np.asarray(test_numeric, dtype=np.float64))

    return input_list_train, input_list_val, input_list_test

In [None]:
# Split the training rows once more into fit / validation parts and encode all three sets.
trainX, ValX, trainy, Valy = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(trainX, ValX, X_test,categorical_columns)

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# validation_data has to be an (inputs, targets) pair; passing only the list of input
# arrays leaves Keras without validation targets.
NN.fit(proc_X_train_f, trainy, epochs=2, validation_data=(proc_X_val_f, Valy))


In [None]:
# len(X_train.columns)
# other_cols = [c for c in X_train.columns if (not c in categorical_columns)]
# X_train[other_cols].columns

# proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(trainX, ValX, X_test,categorical_columns)


In [None]:
# Root-mean-squared error on the validation split.
prediction = NN.predict(proc_X_val_f)
validation_mse = mean_squared_error(prediction, Valy.values)
rms = sqrt(validation_mse)

In [None]:
# Root-mean-squared error on the held-out test split (overwrites the validation value).
prediction = NN.predict(proc_X_test_f)
test_mse = mean_squared_error(prediction, y_test.values)
rms = sqrt(test_mse)

In [None]:
# Display the most recently computed RMSE.
rms

In [None]:
# Check that the number of predictions matches the number of test targets.
len(prediction),len(y_test)

In [None]:
# Display the categorical column names.
categorical_columns

In [None]:
# Attach the learned embedding vectors to X_train as extra numeric columns
# (<column>0, <column>1, ...) plus a <column>orig key column.
# The embedding layers are located by type instead of by a fixed position in NN.layers;
# they are assumed to appear in the same order as categorical_columns.
embedding_layers = [layer for layer in NN.layers if isinstance(layer, Embedding)]
for each, layer in zip(categorical_columns, embedding_layers):
    weights = layer.get_weights()[0]
    # Row i of the weight matrix belongs to the i-th entry of trainX[each].unique():
    # that is the order preproc() used to assign the integer codes fed to the network.
    # Labelling the rows with df[each].unique() would pair vectors with the wrong values.
    raw_vals = trainX[each].unique()
    n_rows = min(len(raw_vals), weights.shape[0])
    dftemp = pd.DataFrame(weights[:n_rows],
                          columns=[each + str(a) for a in range(weights.shape[1])])
    dftemp[each + 'orig'] = raw_vals[:n_rows]
    # A left join keeps every row of X_train in its original order, so the frame stays
    # aligned with y_train; values without a learned vector get NaN in the new columns.
    X_train = pd.merge(X_train, dftemp, how='left', left_on=each, right_on=each + 'orig')

In [None]:
# Attach the learned embedding vectors to X_test as extra numeric columns
# (<column>0, <column>1, ...) plus a <column>orig key column.
# The embedding layers are located by type instead of by a fixed position in NN.layers;
# they are assumed to appear in the same order as categorical_columns.
embedding_layers = [layer for layer in NN.layers if isinstance(layer, Embedding)]
for each, layer in zip(categorical_columns, embedding_layers):
    weights = layer.get_weights()[0]
    # Row i of the weight matrix belongs to the i-th entry of trainX[each].unique():
    # that is the order preproc() used to assign the integer codes fed to the network.
    # Labelling the rows with df[each].unique() would pair vectors with the wrong values.
    raw_vals = trainX[each].unique()
    n_rows = min(len(raw_vals), weights.shape[0])
    dftemp = pd.DataFrame(weights[:n_rows],
                          columns=[each + str(a) for a in range(weights.shape[1])])
    dftemp[each + 'orig'] = raw_vals[:n_rows]
    # A left join keeps every row of X_test in its original order, so the frame stays
    # aligned with y_test; values without a learned vector get NaN in the new columns.
    X_test = pd.merge(X_test, dftemp, how='left', left_on=each, right_on=each + 'orig')

In [None]:
# Display the shapes of the two frames after the embedding columns were merged in.
X_test.shape,X_train.shape


In [None]:
# manufacturer: 40 values
# model: 15288 values
# condition: 6 values
# cylinders: 8 values
# fuel: 5 values
# title_status: 6 values
# transmission: 3 values
# drive: 3 values
# type: 13 values
# paint_color: 12 values
# state: 51 values

# manufacturer = NN.layers[0].get_weights()[0]
# model = models[1].layers[0].get_weights()[0]
# condition = models[2].layers[0].get_weights()[0]
# cylinders = models[3].layers[0].get_weights()[0]
# fuel = models[4].layers[0].get_weights()[0]

### Light GBM

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM training parameters for a gradient-boosted regression model.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'l2', 'l1'},
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [None]:
# Key columns ("<name>orig") added by the embedding merges; they are not model features.
_merged_key_sources = ('state', 'manufacturer', 'condition', 'cylinders',
                       'fuel', 'title_status', 'model',
                       'transmission', 'drive', 'type', 'state')
excludecols = [source + 'orig' for source in _merged_key_sources]

In [None]:
# Also exclude the raw categorical columns, then keep every remaining X_train column
# (in X_train's column order) as a LightGBM feature.
excludecols = list(excludecols) + list(categorical_columns)
excluded_names = set(excludecols)
cols = [name for name in X_train.columns if name not in excluded_names]
cols

In [None]:
# Split the selected feature columns into fit / validation parts for LightGBM.
lgb_split = train_test_split(X_train[cols], y_train, test_size=0.2, random_state=0)
trainX, ValX, trainy, Valy = lgb_split

# Wrap both parts as LightGBM datasets; the validation set references the training set.
lgb_train = lgb.Dataset(data=trainX, label=trainy)
lgb_eval = lgb.Dataset(data=ValX, label=Valy, reference=lgb_train)


In [None]:
print('Starting training...')
# Train for at most 20 boosting rounds and stop once the validation metric has not
# improved for 5 consecutive rounds. Early stopping is requested through the callback
# because the early_stopping_rounds keyword of lgb.train() was removed in LightGBM 4.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
# Predict on the test features using the best iteration recorded during training.
y_pred = gbm.predict(X_test[cols], num_iteration=gbm.best_iteration)

In [None]:
# Root-mean-squared error of the LightGBM predictions on the test split.
rms = sqrt(mean_squared_error(y_pred, y_test.values))

In [None]:
# Display the LightGBM test RMSE.
rms

# Approach 2 - DeepTables

In [None]:
# Number of distinct non-null manufacturers in the cleaned frame.
no_of_unique_cat  = df['manufacturer'].nunique()
no_of_unique_cat

In [None]:
!pip install deeptables

In [None]:
import numpy as np
from deeptables.models import deeptable, deepnets
from deeptables.datasets import dsutils
from sklearn.model_selection import train_test_split

In [None]:
%%time
#y = df.pop('price')
X = df
y = y.astype('float64')
conf = deeptable.ModelConfig(
    metrics=['RootMeanSquaredError'],
    nets=['dnn_nets'],
    #fixed_embedding_dim=True,
    #stacking_op = 'add',
    #output_use_bias = False,
    categorical_columns = categorical_columns,
    embeddings_output_dim = 20,
    dnn_params={
        'hidden_units': ((300, 0.3, True), (300, 0.3, True)),
        'dnn_activation': 'relu',
    },
    earlystopping_patience=5,
)

dt = deeptable.DeepTable(config=conf)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model, history = dt.fit(X_train, y_train, epochs=100)

score = dt.evaluate(X_test, y_test)

In [None]:
# Display the DeepTables evaluation result on the test split.
score

In [None]:
## finish