In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [71]:
# Cargamos el dataset
data = pd.read_csv('datos.csv')

#sample de 10000 registros

#data = data.sample(10000)
data.shape

  data = pd.read_csv('datos.csv')


(1510000, 34)

In [72]:
# Seleccionar solo las columnas de interés
columns_of_interest = ['make', 'model', 'car_year', 'km', 'fueltype', 'transmission', 'horse_power', 'price']
data_selected = data[columns_of_interest].dropna()

# Limpiar la columna 'horse_power' para eliminar texto y convertir a flotante
data_selected = data_selected.dropna(subset=['horse_power'])

# QUITAR TODO EL TEXTO DE LA COLUMNA HORSE_POWER

data_selected['horse_power'] = data_selected['horse_power'].str.extract('(\d+)').astype(float)

# quitar todas las filas que tengan nans

data_selected = data_selected.dropna()

# Conversión de 'car_year' a edad del vehículo
current_year = pd.to_datetime('today').year
data_selected['car_age'] = current_year - data_selected['car_year']
data_selected.drop('car_year', axis=1, inplace=True)

# Imprimir las primeras filas para verificar la selección
print(data_selected.head())

# Verificación de valores nulos
print(data_selected.isnull().sum())

# Verificación de duplicados
print(data_selected.duplicated().sum())

# Ver numero de columnas y filas
print(data_selected.shape)

      make    model        km fueltype transmission  horse_power   price  \
0     Fiat     Egea  110000.0   Benzin       Manuel         95.0  449000   
1     Fiat     Egea   14200.0   Benzin       Manuel         95.0  485900   
2   Nissan  Qashqai  103000.0    Dizel       Manuel        110.0  704900   
3  Renault   Symbol  180000.0    Dizel       Manuel         90.0  246000   
4     Fiat     Egea   74461.0    Dizel       Manuel         95.0  439500   

   car_age  
0        5  
1        2  
2        9  
3       10  
4        4  
make            0
model           0
km              0
fueltype        0
transmission    0
horse_power     0
price           0
car_age         0
dtype: int64
78822
(1483344, 8)


In [73]:
# División de datos
X = data_selected.drop('price', axis=1)  # Características
y = data_selected['price']  # Variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [74]:
# Preprocesamiento

# Columnas categóricas

categorical_features = ['make', 'model', 'fueltype', 'transmission']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Columnas numéricas

numeric_features = ['km', 'horse_power', 'car_age']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Transformador de columnas

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])




# Prueba Modelos

In [62]:
modelos_regresion = {
    'Regresión Lineal': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor()
}

parametros = {
    'Regresión Lineal': {},
    'K-Nearest Neighbors': {'regressor__n_neighbors': [3, 5, 7, 10]},
    'Random Forest': {'regressor__n_estimators': [100, 200], 'regressor__max_depth': [10, 20, None]}
}


In [63]:
for name, model in modelos_regresion.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    pipeline.fit(X_train, y_train)  
    y_pred = pipeline.predict(X_test) 

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print(f"Modelo: {name}")
    print('Coeficiente de determinación (R^2):', r2)
    print('Error absoluto medio (MAE):', mae)
    print('\n')

Modelo: Regresión Lineal
Coeficiente de determinación (R^2): -0.45194629632602434
Error absoluto medio (MAE): 273350.49053850863


Modelo: K-Nearest Neighbors
Coeficiente de determinación (R^2): 0.8598952340210707
Error absoluto medio (MAE): 95575.02258392675


Modelo: Random Forest
Coeficiente de determinación (R^2): 0.7686752691439864
Error absoluto medio (MAE): 99702.89269347963




In [64]:
for name, model in modelos_regresion.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])
    if parametros[name]:  # Si hay parámetros para optimizar
        grid_search = GridSearchCV(pipeline, parametros[name], cv=5, scoring='neg_mean_absolute_error')
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_  # Obtener los mejores parámetros
        y_pred = best_model.predict(X_test)
        print(f"Modelo Optimizado: {name}")
        print(f"Mejores Hiperparámetros: {best_params}")  # Imprimir los mejores hiperparámetros
    else:  # Si no hay parámetros, simplemente ajustar el modelo
        best_model = pipeline.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)
        print(f"Modelo: {name}")

Modelo: Regresión Lineal
Modelo Optimizado: K-Nearest Neighbors
Mejores Hiperparámetros: {'regressor__n_neighbors': 3}
Modelo Optimizado: Random Forest
Mejores Hiperparámetros: {'regressor__max_depth': 20, 'regressor__n_estimators': 100}


In [65]:
y_pred = best_model.predict(X_test)

In [66]:
# Métricas de evaluación
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print('Coeficiente de determinación (R^2):', r2)
print('Error absoluto medio (MAE):', mae)
print('\n')

Coeficiente de determinación (R^2): 0.7651053234907648
Error absoluto medio (MAE): 99257.6744053366




In [67]:
print(best_model['regressor'])

RandomForestRegressor(max_depth=20)


# Modelo Final

In [69]:
# Metricas Mejor Modelo Random Forest

mejormodelo = RandomForestRegressor(max_depth=20, n_estimators=100)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', mejormodelo)])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)

mae = mean_absolute_error(y_test, y_pred)

print('Coeficiente de determinación (R^2):', r2)

print('Error absoluto medio (MAE):', mae)

Coeficiente de determinación (R^2): 0.7487768423851631
Error absoluto medio (MAE): 100764.50686449114


In [75]:
# Metricas Mejor Modelo K-Nearest Neighbors

mejormodelo = KNeighborsRegressor(n_neighbors=3)

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', mejormodelo)])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)

mae = mean_absolute_error(y_test, y_pred)

print('Coeficiente de determinación (R^2):', r2)

print('Error absoluto medio (MAE):', mae)

KeyboardInterrupt: 