In [82]:
#Libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer, BatchNormalization, Dropout, Input, Reshape, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import relu, linear
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model, load_model

import optuna

In [29]:
train_data_path = os.path.join('..', 'data','raw','train.csv')
test_data_path = os.path.join('..', 'data','raw','test.csv')

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
train_data = train_data.drop('id', axis= 1)

In [30]:
print(train_data.shape, used_cars_df.shape)
train_data.head()

(188533, 12) (4009, 12)


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [31]:
print(train_data.dtypes)

brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
clean_title     object
price            int64
dtype: object


There is a lot of categorical variables so we have to decide what to do with them, the first thing it occurs to me is see how many categories does have every categorical variable. Also we have missing values.

## Other values

High cardinality problems

## Creata data pre-proccesing function

In [32]:
def preproccess(df, threshold=0.03):
    df = df.copy()
    df['fuel_type'] = df['fuel_type'].apply(lambda x: 1.0 if x == 'Gasoline' else 0.0)
    df['accident'] = df['accident'].apply(lambda x: 1.0 if x == 'At least 1 accident or damage reported' else 0.0)
    df['clean_title'] = df['clean_title'].apply(lambda x: 1.0 if x == 'Yes' else 0.0)
    df['model_year'] = df['model_year'].astype('float64')
    df['milage'] = df['model_year'].astype('float64')
    return df

In [35]:
train_df = preproccess(train_data)
train_df['price'] = train_df['price'].astype('float64')
test_df = preproccess(test_data)
train_df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007.0,2007.0,1.0,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,0.0,1.0,4200.0
1,Lincoln,LS V8,2002.0,2002.0,1.0,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,1.0,1.0,4999.0
2,Chevrolet,Silverado 2500 LT,2002.0,2002.0,0.0,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,0.0,1.0,13900.0
3,Genesis,G90 5.0 Ultimate,2017.0,2017.0,1.0,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,0.0,1.0,45000.0
4,Mercedes-Benz,Metris Base,2021.0,2021.0,1.0,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,0.0,1.0,97500.0


## Entity Embeddings

In [40]:
n_categories_df = train_df.nunique()
n_categories_df

brand             57
model           1897
model_year        34
milage            34
fuel_type          2
engine          1117
transmission      52
ext_col          319
int_col          156
accident           2
clean_title        2
price           1569
dtype: int64

### Train test split 

In [54]:
X = train_df[train_df.columns[:-1]]
y = train_df['price']
encoding_variables = ['brand', 'model', 'engine', 'transmission', 'ext_col', 'int_col']

encoders = {}

def train_encoders(X, vars):
    X = X.copy()
    for var in vars:
        lb_encoder = LabelEncoder()
        X[var] = lb_encoder.fit_transform(X[var])
        
        # Guarda el encoder en el diccionario
        encoders[var] = lb_encoder
    return X
X = train_encoders(X, encoding_variables)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size= 0.1, random_state= 3004)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

(169679, 11) (169679,) (18854, 11) (18854,)


In [55]:
X_train.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
22655,44,10,2021.0,2021.0,1.0,789,31,29,14,0.0,1.0
55006,50,1758,2015.0,2015.0,1.0,412,20,29,10,0.0,1.0
92481,9,510,2020.0,2020.0,1.0,939,38,304,14,0.0,1.0
153001,4,992,2023.0,2023.0,1.0,1058,31,29,14,0.0,1.0
69643,0,1646,2016.0,2016.0,1.0,478,38,29,14,1.0,1.0


In [56]:
categorical_cardinalities = [57, 1897, 1117, 52, 319, 156]

In [64]:
inputs = []
embeddings = []
for cardinality in categorical_cardinalities:
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(input_dim=cardinality, output_dim= 20)(input_layer)
    flat_embedding = Flatten()(embedding_layer)
    
    inputs.append(input_layer)
    embeddings.append(flat_embedding)

continuous_input_layer = Input(shape=(5,))
inputs.append(continuous_input_layer)

combined = Concatenate()(embeddings + [continuous_input_layer])
x = Dense(64, activation='relu')(combined)
x = Dense(32, activation='relu')(x)
output = Dense(1)(x)

model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='mean_squared_error')

model.summary()



In [72]:
mm_scaler = MinMaxScaler()
rb_scaler = RobustScaler()
y_train_scaled = rb_scaler.fit_transform(y_train.values.reshape(-1,1))
X_train_continuous = mm_scaler.fit_transform(X_train_encoded.drop(encoding_variables, axis= 1))
model.fit([X_train_encoded['brand'], X_train_encoded['model'], X_train_encoded['engine'],
           X_train_encoded['transmission'], X_train_encoded['ext_col'], X_train_encoded['int_col']
           , X_train_continuous], y_train_scaled, epochs=100, batch_size=32)

Epoch 1/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 377us/step - loss: 377839712.0000
Epoch 2/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 376us/step - loss: 44760532.0000
Epoch 3/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 377us/step - loss: 43721764.0000
Epoch 4/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 376us/step - loss: 4582785.0000
Epoch 5/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 366us/step - loss: 6531117.5000
Epoch 6/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 372us/step - loss: 16070328.0000
Epoch 7/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 370us/step - loss: 5640041.5000
Epoch 8/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 367us/step - loss: 2363011.5000
Epoch 9/100
[1m5303/5303[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 376us/step - loss: 

<keras.src.callbacks.history.History at 0x2ee67fe50>

In [79]:
embedding_weights = model.layers[6].get_weights()[0]
embedding_weights.shape

(57, 20)

In [74]:
model.layers

[<InputLayer name=input_layer_6, built=True>,
 <InputLayer name=input_layer_7, built=True>,
 <InputLayer name=input_layer_8, built=True>,
 <InputLayer name=input_layer_9, built=True>,
 <InputLayer name=input_layer_10, built=True>,
 <InputLayer name=input_layer_11, built=True>,
 <Embedding name=embedding_6, built=True>,
 <Embedding name=embedding_7, built=True>,
 <Embedding name=embedding_8, built=True>,
 <Embedding name=embedding_9, built=True>,
 <Embedding name=embedding_10, built=True>,
 <Embedding name=embedding_11, built=True>,
 <Flatten name=flatten_6, built=True>,
 <Flatten name=flatten_7, built=True>,
 <Flatten name=flatten_8, built=True>,
 <Flatten name=flatten_9, built=True>,
 <Flatten name=flatten_10, built=True>,
 <Flatten name=flatten_11, built=True>,
 <InputLayer name=input_layer_12, built=True>,
 <Concatenate name=concatenate, built=True>,
 <Dense name=dense, built=True>,
 <Dense name=dense_1, built=True>,
 <Dense name=dense_2, built=True>]

In [80]:
layers = model.layers[6:12]  # Obtén las capas de la 6 a la 11
weights = [layer.get_weights() for layer in layers]

In [81]:
for i, w in enumerate(weights):
    np.save(f'embeddings_layer_{i+6}.npy', w)

In [83]:
with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)