In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# Cargamos el dataset
data = pd.read_csv('datos.csv')

  data = pd.read_csv('datos.csv')


In [4]:
# Seleccionamos solo las columnas de interés
columns_of_interest = ['make', 'model', 'trimlevel', 'car_year', 'km', 'bodytype', 'fueltype', 'transmission', 'color', 'horse_power', 'cyl_capacity', 'price']
data_selected = data[columns_of_interest]

# Eliminar filas con valores nulos en 'price' ya que es nuestra variable objetivo
data_selected = data_selected.dropna(subset=['price'])

# Tratamiento de valores nulos en características
for column in data_selected.columns:
    # Para características categóricas, usamos la moda
    if data_selected[column].dtype == 'object':
        mode = data_selected[column].mode()[0]
        data_selected[column].fillna(mode, inplace=True)
    # Para características numéricas, usamos la mediana
    else:
        median = data_selected[column].median()
        data_selected[column].fillna(median, inplace=True)

# Conversión de 'car_year' a edad del vehículo
current_year = pd.to_datetime('today').year
data_selected['car_age'] = current_year - data_selected['car_year']
data_selected.drop('car_year', axis=1, inplace=True)

# Imprimir las primeras filas para verificar la selección
print(data_selected.head())

# Verificación de valores nulos
print(data_selected.isnull().sum())

# Verificación de duplicados
print(data_selected.duplicated().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_selected[column].fillna(mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_selected[column].fillna(median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

      make    model          trimlevel        km          bodytype fueltype  \
0     Fiat     Egea    1.4 Fire Street  110000.0  Hatchback 5 kapi   Benzin   
1     Fiat     Egea    1.4 Fire Street   14200.0  Hatchback 5 kapi   Benzin   
2   Nissan  Qashqai   1.5 dCi Sky Pack  103000.0               SUV    Dizel   
3  Renault   Symbol        1.5 DCI Joy  180000.0             Sedan    Dizel   
4     Fiat     Egea  1.3 Multijet Easy   74461.0             Sedan    Dizel   

  transmission  color horse_power cyl_capacity   price  car_age  
0       Manuel  Beyaz       95 hp      1368 cc  449000        5  
1       Manuel  Beyaz       95 hp      1368 cc  485900        2  
2       Manuel  Beyaz      110 hp      1461 cc  704900        9  
3       Manuel  Beyaz       90 hp      1461 cc  246000       10  
4       Manuel  Beyaz       95 hp      1248 cc  439500        4  
make            0
model           0
trimlevel       0
km              0
bodytype        0
fueltype        0
transmission    0
col

In [5]:
# División de datos
X = data_selected.drop('price', axis=1)  # Características
y = data_selected['price']  # Variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Transformaciones para las características categóricas
categorical_features = ['make', 'model', 'trimlevel', 'fueltype', 'transmission', 'bodytype', 'color']

# Definimos los transformadores
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combinamos los transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

In [7]:
# Inicialización de los modelos de regresión
modelos_regresion = {
    'Regresión Lineal': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Random Forest Regresor': RandomForestRegressor(random_state=42, n_estimators=100),
    'Gradient Boosting Regresor': GradientBoostingRegressor(random_state=42, n_estimators=100),
    'Árbol de decisión': DecisionTreeRegressor(random_state=42),
    'SVR': SVR()
}

In [8]:
# Entrenamiento y evaluación de modelos de regresión
for name, model in modelos_regresion.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    pipeline.fit(X_train, y_train)  # Entrenamiento
    y_pred = pipeline.predict(X_test)  # Predicción
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print(f"Modelo: {name}")
    print('Error cuadrático medio (MSE):', mse)
    print('Coeficiente de determinación (R^2):', r2)
    print('Error absoluto medio (MAE):', mae)
    print('\n')

Modelo: Regresión Lineal
Error cuadrático medio (MSE): 18434456882551.375
Coeficiente de determinación (R^2): 0.007777415658875908
Error absoluto medio (MAE): 140713.917079941


Modelo: Ridge
Error cuadrático medio (MSE): 18435366564842.44
Coeficiente de determinación (R^2): 0.007728452604552016
Error absoluto medio (MAE): 141155.32875139787


