In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [12]:
# Cargamos el dataset
data = pd.read_csv('datos.csv')
data.shape

  data = pd.read_csv('datos.csv')


(1510000, 34)

In [13]:
# Seleccionamos solo las columnas de interés
columns_of_interest = ['make', 'model', 'trimlevel', 'car_year', 'km', 'bodytype', 'fueltype', 'transmission', 'color', 'horse_power', 'cyl_capacity', 'price']
data_selected = data[columns_of_interest]

# Eliminar filas con valores nulos en 'price' ya que es nuestra variable objetivo
data_selected = data_selected.dropna(subset=['price'])

# Eliminar filas con valores nulos en 'horse_power' y 'cyl_capacity'

data_selected = data_selected.dropna(subset=['horse_power', 'cyl_capacity'])

# Eliminar filas con valores nulos en 'make', 'model', 'trimlevel', 'bodytype', 'fueltype', 'transmission', 'color'

data_selected = data_selected.dropna(subset=['make', 'model', 'trimlevel', 'bodytype', 'fueltype', 'transmission', 'color'])

# Conversión de 'car_year' a edad del vehículo
current_year = pd.to_datetime('today').year
data_selected['car_age'] = current_year - data_selected['car_year']
data_selected.drop('car_year', axis=1, inplace=True)

# Imprimir las primeras filas para verificar la selección
print(data_selected.head())

# Verificación de valores nulos
print(data_selected.isnull().sum())

# Verificación de duplicados
print(data_selected.duplicated().sum())

# Ver numero de columnas y filas
print(data_selected.shape)

      make    model          trimlevel        km          bodytype fueltype  \
0     Fiat     Egea    1.4 Fire Street  110000.0  Hatchback 5 kapi   Benzin   
1     Fiat     Egea    1.4 Fire Street   14200.0  Hatchback 5 kapi   Benzin   
2   Nissan  Qashqai   1.5 dCi Sky Pack  103000.0               SUV    Dizel   
3  Renault   Symbol        1.5 DCI Joy  180000.0             Sedan    Dizel   
4     Fiat     Egea  1.3 Multijet Easy   74461.0             Sedan    Dizel   

  transmission  color horse_power cyl_capacity   price  car_age  
0       Manuel  Beyaz       95 hp      1368 cc  449000        5  
1       Manuel  Beyaz       95 hp      1368 cc  485900        2  
2       Manuel  Beyaz      110 hp      1461 cc  704900        9  
3       Manuel  Beyaz       90 hp      1461 cc  246000       10  
4       Manuel  Beyaz       95 hp      1248 cc  439500        4  
make            0
model           0
trimlevel       0
km              2
bodytype        0
fueltype        0
transmission    0
col

In [14]:
# División de datos
X = data_selected.drop('price', axis=1)  # Características
y = data_selected['price']  # Variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Transformaciones para las características categóricas
categorical_features = ['make', 'model', 'trimlevel', 'fueltype', 'transmission', 'bodytype', 'color']

# Definimos los transformadores
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combinamos los transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

In [16]:
# Inicialización de los modelos de regresión
modelos_regresion = {
    'Regresión Lineal': LinearRegression(),
    'Gradient Boosting Regresor': GradientBoostingRegressor(random_state=42, n_estimators=100),
}

In [17]:
# Entrenamiento y evaluación de modelos de regresión
for name, model in modelos_regresion.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    pipeline.fit(X_train, y_train)  # Entrenamiento
    y_pred = pipeline.predict(X_test)  # Predicción
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print(f"Modelo: {name}")
    print('Error cuadrático medio (MSE):', mse)
    print('Coeficiente de determinación (R^2):', r2)
    print('Error absoluto medio (MAE):', mae)
    print('\n')

Modelo: Regresión Lineal
Error cuadrático medio (MSE): 23710768075979.46
Coeficiente de determinación (R^2): 0.006061041228429653
Error absoluto medio (MAE): 140044.23828436367


Modelo: Gradient Boosting Regresor
Error cuadrático medio (MSE): 23756833142893.535
Coeficiente de determinación (R^2): 0.0041300255608814895
Error absoluto medio (MAE): 205549.15328921593


