In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

## Load Data

In [None]:
# Read the Craigslist vehicle listings CSV from the Kaggle input directory
# and preview the first rows.
vehicles_csv_path = "/kaggle/input/craigslist-carstrucks-data/vehicles.csv"
vehicles = pd.read_csv(vehicles_csv_path)
vehicles.head()

In [None]:
# Dimensions of the raw table as a (rows, columns) tuple.
vehicles.shape

In [None]:
# Count of distinct values in each column of the raw table.
vehicles.nunique()

In [None]:
# Number of missing (NaN) entries in each column of the raw table.
vehicles.isna().sum()

In [None]:
# Columns the analysis does not use as predictors; they are removed from
# `vehicles` in place.
bad_predictors = [
    'id',
    'url',
    'region',
    'region_url',
    'VIN',
    'drive',
    'size',
    'county',
    'state',
    'paint_color',
    'image_url',
    'description',
    'lat',
    'long',
    'posting_date',
]
vehicles.drop(columns=bad_predictors, inplace=True)

In [None]:
# Missing-value counts per column for the columns that remain in `vehicles`.
vehicles.isna().sum()

## Primera Parte

### 1. Elimine todos los NaN

In [None]:
# Keep only rows with no missing value in any remaining column. The .copy()
# yields an independent frame, so later edits do not touch `vehicles`.
vehicles_dropna = vehicles.dropna().copy()

## Limpiar los datos de los cilindros para hacerlos numéricos

In [None]:
# Turn the textual cylinder field into integers:
#   1. strip every non-digit character,
#   2. replace values left empty (no digits at all) with '1',
#   3. cast the remaining digit strings to int.
cylinder_digits = vehicles_dropna['cylinders'].str.replace(r'[^0-9]', '', regex=True)
cylinder_digits = cylinder_digits.str.replace(r'^\s*$', '1', regex=True)
vehicles_dropna['cylinders'] = cylinder_digits.astype(int)

In [None]:
# Count of distinct values per column after dropping rows with NaN.
vehicles_dropna.nunique()

In [None]:
# Keep only listings whose price lies strictly between the 10th and 90th
# percentiles of the current price column.
price_low = vehicles_dropna['price'].quantile(.10)
price_high = vehicles_dropna['price'].quantile(.90)
# .copy() makes the filtered frame independent, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
vehicles_dropna = vehicles_dropna[
    (vehicles_dropna['price'] > price_low) &
    (vehicles_dropna['price'] < price_high)
].copy()


### Codificar categorías

In [None]:
# Replace the manufacturer names with ordinal codes; the fitted encoder is kept
# in `manufacturer_encoder`.
manufacturer_encoder = OrdinalEncoder()
manufacturer_codes = manufacturer_encoder.fit_transform(vehicles_dropna[['manufacturer']])
vehicles_dropna['manufacturer'] = manufacturer_codes

In [None]:
# Replace the condition labels with ordinal codes. The column is fitted with
# its own encoder (not manufacturer_encoder) so that `condition_encoder` holds
# the condition categories and the manufacturer mapping is not overwritten.
condition_encoder = OrdinalEncoder()
vehicles_dropna['condition'] = condition_encoder.fit_transform(vehicles_dropna[['condition']])

In [None]:
# Replace the fuel labels with ordinal codes. The column is fitted with its
# own encoder (not manufacturer_encoder) so that `fuel_encoder` holds the fuel
# categories and the manufacturer mapping is not overwritten.
fuel_encoder = OrdinalEncoder()
vehicles_dropna['fuel'] = fuel_encoder.fit_transform(vehicles_dropna[['fuel']])

In [None]:
# Replace the title-status labels with ordinal codes. The column is fitted
# with its own encoder (not manufacturer_encoder) so that
# `title_status_encoder` holds these categories and the manufacturer mapping
# is not overwritten.
title_status_encoder = OrdinalEncoder()
vehicles_dropna['title_status'] = title_status_encoder.fit_transform(vehicles_dropna[['title_status']])

In [None]:
# Replace the transmission labels with ordinal codes. The column is fitted
# with its own encoder (not manufacturer_encoder) so that
# `transmission_encoder` holds these categories and the manufacturer mapping
# is not overwritten.
transmission_encoder = OrdinalEncoder()
vehicles_dropna['transmission'] = transmission_encoder.fit_transform(vehicles_dropna[['transmission']])

In [None]:
# Replace the vehicle-type labels with ordinal codes. The column is fitted
# with its own encoder (not manufacturer_encoder) so that `type_encoder` holds
# these categories and the manufacturer mapping is not overwritten.
type_encoder = OrdinalEncoder()
vehicles_dropna['type'] = type_encoder.fit_transform(vehicles_dropna[['type']])

In [None]:
# The 'model' column is left out of the feature set for now.
vehicles_dropna.drop(columns=['model'], inplace=True)

### Limpiar los datos del odómetro y convertirlos en categorías

In [None]:
# Histogram of the odometer column taken from the `vehicles` frame.
raw_odometer_hist = go.Histogram(x=vehicles['odometer'], name='odometer')
fig = make_subplots(rows=1, cols=1)
fig.add_trace(raw_odometer_hist, row=1, col=1)
fig.show()

In [None]:
# Work on a separate copy and cap odometer readings at 250000: any value at or
# above the cap becomes exactly 250000, smaller values are left unchanged.
vehicles_bins = vehicles_dropna.copy()
vehicles_bins['odometer'] = vehicles_bins['odometer'].clip(upper=250000)

In [None]:
# Histogram of the odometer column in `vehicles_bins`.
capped_odometer_hist = go.Histogram(x=vehicles_bins['odometer'], name='odometer')
fig = make_subplots(rows=1, cols=1)
fig.add_trace(capped_odometer_hist, row=1, col=1)
fig.show()

In [None]:
# Replace the odometer values with six equal-width bins labelled 1 to 6.
# NOTE(review): pd.cut with labels yields a categorical column rather than a
# plain integer one - confirm downstream steps (corr, models) handle that.
vehicles_bins['odometer'] = pd.cut(vehicles_bins['odometer'], bins=6, labels=[1,2,3,4,5,6])

In [None]:
# Scatter of odometer against price, coloured by price. The x values come
# from `vehicles_dropna` (the numeric odometer), while y and the colour come
# from `vehicles_bins`.
raw_odometer = vehicles_dropna['odometer']
listing_price = vehicles_bins['price']
fig = px.scatter(
    vehicles_bins,
    x=raw_odometer,
    y=listing_price,
    color="price",
    color_continuous_scale='portland',
)
fig.show()

In [None]:
# Count of distinct values per column in the binned frame.
vehicles_bins.nunique()

In [None]:
# Heatmap of the pairwise column correlations of `vehicles_bins`.
correlation_matrix = vehicles_bins.corr()
fig = px.imshow(correlation_matrix)
fig.show()

### 2. Ajuste un Bosque Aleatorio

In [None]:
# Separate the target (price) from the features and hold out 20% of the rows
# for testing, with a fixed seed so the split is repeatable.
target_column = 'price'
X = vehicles_bins.drop(columns=[target_column])
y = vehicles_bins[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit a random forest on the training split.
# random_state is fixed (same seed as the train/test split) so the reported
# metrics are reproducible across runs; n_jobs=-1 only parallelises the fit
# across CPU cores and does not change the result.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

### 3. [Extra] Puede utilizar GridSearchCV o RandomizedSearchCV para ajustar los hiperparámetros

### 4. Muestre sus resultados utilizando las métricas: Mean Absolute Error y Mean Squared Error

In [None]:
# Score the random forest on the held-out split, reporting each metric as soon
# as it is computed.
y_predict = model.predict(X_test)

mae = mean_absolute_error(y_test, y_predict)
print(f"mean absolute error: {mae}")

mse = mean_squared_error(y_test, y_predict)
print(f"mean squared error: {mse}")

maxerror = max_error(y_test, y_predict)
print(f"max error: {maxerror}")

## Segunda Parte

### 1. Proponga un método para completar los valores vacíos, o justifique por qué no hacerlo.

En mi opinión, no se pueden completar los datos vacíos hasta que los datos actuales estén limpios. El dataset tiene problemas en dos columnas críticas: odómetro y modelo. Además, el dato VIN, que debería ser único, está repetido.

### 2. Ajuste otro Bosque Aleatorio 

N/A, igual que el árbol anterior

### 3. Compare sus nuevos resultados utilizando las métricas: Mean Absolute Error y Mean Squared Error

N/A, igual que el árbol anterior

### 4. Ajuste uno o varios modelos de los vistos en clase para mejorar sus resultados

In [None]:
def _report_regression_metrics(y_true, y_pred):
    """Print and return the MAE, MSE and max error of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        Tuple ``(mae, mse, maxerror)`` with the three computed metrics.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    maxerror = max_error(y_true, y_pred)
    print(f"mean absolute error: {mae}")
    print(f"mean squared error: {mse}")
    print(f"max error: {maxerror}")
    return mae, mse, maxerror


# LinearRegression and Lasso are imported in the notebook's top import cell.
# Previously the Lasso import was commented out, so this cell stopped with a
# NameError before the Lasso model could be fitted.

# Ordinary least-squares fit on the same train/test split as the forest.
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)
y_predict = linear_regression_model.predict(X_test)
mae, mse, maxerror = _report_regression_metrics(y_test, y_predict)

# Lasso (L1-regularised) fit on the same split.
lasso_model = Lasso(alpha=1, max_iter=10000, tol=1e-5)
lasso_model.fit(X_train, y_train)
y_predict = lasso_model.predict(X_test)
mae, mse, maxerror = _report_regression_metrics(y_test, y_predict)