In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [9]:
# Cargamos el dataset
data = pd.read_csv('datos.csv')

#sample de 10000 registros

data = data.sample(10000)
data.shape

  data = pd.read_csv('datos.csv')


(10000, 34)

In [10]:
# Seleccionar solo las columnas de interés
columns_of_interest = ['make', 'model', 'car_year', 'km', 'fueltype', 'transmission', 'horse_power', 'price']
data_selected = data[columns_of_interest].dropna()

# Limpiar la columna 'horse_power' para eliminar texto y convertir a flotante
data_selected = data_selected.dropna(subset=['horse_power'])

# QUITAR TODO EL TEXTO DE LA COLUMNA HORSE_POWER

data_selected['horse_power'] = data_selected['horse_power'].str.extract('(\d+)').astype(float)

# quitar todas las filas que tengan nans

data_selected = data_selected.dropna()

# Conversión de 'car_year' a edad del vehículo
current_year = pd.to_datetime('today').year
data_selected['car_age'] = current_year - data_selected['car_year']
data_selected.drop('car_year', axis=1, inplace=True)

# Imprimir las primeras filas para verificar la selección
print(data_selected.head())

# Verificación de valores nulos
print(data_selected.isnull().sum())

# Verificación de duplicados
print(data_selected.duplicated().sum())

# Ver numero de columnas y filas
print(data_selected.shape)

                    make     model        km      fueltype   transmission  \
1295727          Renault    Captur   90000.0        Benzin  Yari Otomatik   
1065411          Hyundai      Kona   29500.0         Dizel       Otomatik   
550213   Mercedes - Benz  C Serisi  195000.0        Benzin  Yari Otomatik   
1119884            Honda     Civic   87500.0  Benzin & LPG       Otomatik   
749467           Hyundai       i20  165000.0  Benzin & LPG         Manuel   

         horse_power   price  car_age  
1295727        120.0  504900        6  
1065411        136.0  825000        4  
550213         156.0  840000       10  
1119884        125.0  645000        6  
749467          85.0  356000       12  
make            0
model           0
km              0
fueltype        0
transmission    0
horse_power     0
price           0
car_age         0
dtype: int64
42
(9816, 8)


In [11]:
# División de datos
X = data_selected.drop('price', axis=1)  # Características
y = data_selected['price']  # Variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Preprocesamiento

# Columnas categóricas

categorical_features = ['make', 'model', 'fueltype', 'transmission']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Columnas numéricas

numeric_features = ['km', 'horse_power', 'car_age']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Transformador de columnas

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])




# Prueba Modelos

In [13]:
modelos_regresion = {
    'Regresión Lineal': LinearRegression(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor()
}

parametros = {
    'K-Nearest Neighbors': {'regressor__n_neighbors': [20, 30,40, 50]},
}


In [15]:
for name, model in modelos_regresion.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    pipeline.fit(X_train, y_train)  
    y_pred = pipeline.predict(X_test) 

    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    print(f"Modelo: {name}")
    print('Coeficiente de determinación (R^2):', r2)
    print('Error absoluto medio (MAE):', mae)
    print('\n')

Modelo: Regresión Lineal
Coeficiente de determinación (R^2): 0.4449242751529865
Error absoluto medio (MAE): 166051.00564948027


Modelo: K-Nearest Neighbors
Coeficiente de determinación (R^2): 0.6541619417493498
Error absoluto medio (MAE): 113655.12199592667


Modelo: Random Forest
Coeficiente de determinación (R^2): 0.6513285554714369
Error absoluto medio (MAE): 110672.86318902952




In [14]:
for name, params in parametros.items():
    for n_neighbors in params['regressor__n_neighbors']:
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', KNeighborsRegressor(n_neighbors=n_neighbors))])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        print(f"Modelo: {name}, Vecinos: {n_neighbors}")
        print('Coeficiente de determinación (R^2):', r2)
        print('Error absoluto medio (MAE):', mae)
        print('\n')

Modelo: K-Nearest Neighbors, Vecinos: 20
Coeficiente de determinación (R^2): 0.5961609485663689
Error absoluto medio (MAE): 119964.80066191446


Modelo: K-Nearest Neighbors, Vecinos: 30
Coeficiente de determinación (R^2): 0.548277944051315
Error absoluto medio (MAE): 124406.35984385609


Modelo: K-Nearest Neighbors, Vecinos: 40
Coeficiente de determinación (R^2): 0.5158184237119294
Error absoluto medio (MAE): 127747.48447046844


Modelo: K-Nearest Neighbors, Vecinos: 50
Coeficiente de determinación (R^2): 0.4880179651909652
Error absoluto medio (MAE): 130938.75960285132




# Modelo Final