In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
from xgboost import XGBRegressor
import joblib
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [3]:
data = pd.read_csv('datos.csv')

  data = pd.read_csv('datos.csv')


In [4]:
data['car_age'] = pd.to_datetime('today').year - data['car_year']
data.drop(['car_year'], axis=1, inplace=True)
data['horse_power'] = data['horse_power'].str.extract('(\d+)').astype(float)

In [5]:
columns_of_interest = ['make', 'model', 'km', 'fueltype', 'transmission', 'horse_power', 'car_age', 'price']
data = data[columns_of_interest]

In [6]:
def remove_outliers(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return data

data = remove_outliers(data, 'price')
data = remove_outliers(data, 'km')
data = remove_outliers(data, 'car_age')

In [7]:
data.dropna(inplace=True)

data.shape

data.to_csv('datos_procesados.csv', index=False)

In [8]:
X = data.drop('price', axis=1)
y = data['price']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [10]:
categorical_features = ['make', 'model', 'fueltype', 'transmission']
numeric_features = ['km', 'horse_power', 'car_age']

In [11]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [11]:
modelos = [
    ('XGB Regressor', XGBRegressor(objective='reg:squarederror')),
    ('Gradient Boosting Regressor', GradientBoostingRegressor()),
    ('K-nearest Neighbors Regressor', KNeighborsRegressor()),
    #('Random Forest Regressor', RandomForestRegressor()),
]

In [36]:
resultados = []
for nombre, modelo in modelos:
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', modelo)])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    resultados.append((nombre, mae, r2))
    print(f'{nombre} - Error Absoluto Medio: {mae}, R2 Score: {r2}')

XGB Regressor - Error Absoluto Medio: 60058.59873405982, R2 Score: 0.8586882224023054
Gradient Boosting Regressor - Error Absoluto Medio: 73527.72965957753, R2 Score: 0.7937418432289671
K-nearest Neighbors Regressor - Error Absoluto Medio: 58004.632573787596, R2 Score: 0.8613889905016296


In [12]:
# guardar k-nearest neighbors
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('regressor', XGBRegressor())])

pipeline.fit(X, y)

y_pred = pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f'Error Absoluto Medio: {mae}, R2 Score: {r2}')

#guardar modelo

joblib.dump(pipeline, '../webpage/modelo.pkl')

Error Absoluto Medio: 59853.62993355811, R2 Score: 0.8599547922819029


['../webpage/modelo.pkl']