In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.ensemble import RandomForestRegressor

# Show NumPy floats with four digits after the decimal point when arrays are printed.
np.set_printoptions(precision=4)

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

# Segunda Parte

In [None]:
# Read the vehicle listings CSV into the working frame used by every later cell.
dfna = pd.read_csv(
    filepath_or_buffer='../input/craigslist-carstrucks-data/vehicles.csv',
)

In [None]:
# Columns the author judged to be poor predictors; they are removed before
# any encoding or modelling takes place.
bad_predictors = [
    'id', 'url', 'region', 'region_url', 'VIN', 'drive', 'size', 'county', 'state',
    'paint_color', 'image_url', 'description', 'lat', 'long', 'posting_date', 'model'
]

dfna = dfna.drop(columns=bad_predictors)

# Keep only rows with at most 3 missing values; rows with more are treated
# as too incomplete to be useful.
# The explicit .copy() makes the result an independent frame: later cells
# assign to its columns, which on a filtered slice triggers
# SettingWithCopyWarning (and ambiguous view/copy behaviour) in pandas < 3.
dfna = dfna[dfna.isna().sum(axis=1) <= 3].copy()

In [None]:
# Per-column count of missing values after the row/column pruning above.
dfna.isna().sum(axis=0)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
manufacturer_missing = dfna['manufacturer'].isna()

# The encoder is fitted on a filled copy: 'None' stands in for missing values
# so that every cell is a string. Assigning the result back (instead of
# calling fillna(..., inplace=True) on a column selection) keeps working
# under pandas Copy-on-Write, where chained in-place calls are no-ops.
manufacturer_encoder = OrdinalEncoder()
dfna['manufacturer'] = manufacturer_encoder.fit_transform(
    dfna[['manufacturer']].fillna('None')
).ravel()

print(manufacturer_encoder.categories_)

# Turn the placeholder rows back into NaN so the imputer can fill them later.
# Same result as replace(0, NaN) whenever 'None' sorts first, but it does not
# null a real category when the column has no missing values.
dfna['manufacturer'] = dfna['manufacturer'].mask(manufacturer_missing)

In [None]:
# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
condition_missing = dfna['condition'].isna()

# Fit on a filled copy ('None' stands in for missing values). Assigning the
# result back avoids the chained fillna(..., inplace=True) pattern, which is
# deprecated and has no effect under pandas Copy-on-Write.
condition_encoder = OrdinalEncoder()
dfna['condition'] = condition_encoder.fit_transform(
    dfna[['condition']].fillna('None')
).ravel()

print(condition_encoder.categories_)

# Restore NaN on the originally-missing rows so the imputer can fill them.
# Matches replace(0, NaN) when 'None' sorts first, without depending on it.
dfna['condition'] = dfna['condition'].mask(condition_missing)

In [None]:
# Convert the textual cylinder field to a number in one assignment.
# The previous version used chained in-place calls on a column selection
# (fillna/replace with inplace=True); under pandas Copy-on-Write those do
# nothing, which would leave NaN in place and make astype(int) fail.
dfna['cylinders'] = (
    dfna['cylinders']
    .fillna('0')                                   # missing -> temporary '0' marker
    .str.replace(r'[^0-9]', '', regex=True)        # keep digits only
    .str.replace(r'^\s*$', '1', regex=True)        # values with no digits become '1'
    .astype(int)
    .replace(0, np.nan)                            # marker back to NaN for the imputer
)

In [None]:
# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
fuel_missing = dfna['fuel'].isna()

# Fit on a filled copy ('None' stands in for missing values). Assigning the
# result back avoids the chained fillna(..., inplace=True) pattern, which is
# deprecated and has no effect under pandas Copy-on-Write.
fuel_encoder = OrdinalEncoder()
dfna['fuel'] = fuel_encoder.fit_transform(
    dfna[['fuel']].fillna('None')
).ravel()

print(fuel_encoder.categories_)

# Restore NaN on the originally-missing rows so the imputer can fill them.
# Matches replace(0, NaN) when 'None' sorts first, without depending on it.
dfna['fuel'] = dfna['fuel'].mask(fuel_missing)

In [None]:
# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
title_status_missing = dfna['title_status'].isna()

# Fit on a filled copy ('None' stands in for missing values). Assigning the
# result back avoids the chained fillna(..., inplace=True) pattern, which is
# deprecated and has no effect under pandas Copy-on-Write.
title_status_encoder = OrdinalEncoder()
dfna['title_status'] = title_status_encoder.fit_transform(
    dfna[['title_status']].fillna('None')
).ravel()

print(title_status_encoder.categories_)

# Restore NaN on the originally-missing rows so the imputer can fill them.
# Matches replace(0, NaN) when 'None' sorts first, without depending on it.
dfna['title_status'] = dfna['title_status'].mask(title_status_missing)

In [None]:
# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
transmission_missing = dfna['transmission'].isna()

# Fit on a filled copy ('None' stands in for missing values). Assigning the
# result back avoids the chained fillna(..., inplace=True) pattern, which is
# deprecated and has no effect under pandas Copy-on-Write.
transmission_encoder = OrdinalEncoder()
dfna['transmission'] = transmission_encoder.fit_transform(
    dfna[['transmission']].fillna('None')
).ravel()

print(transmission_encoder.categories_)

# Restore NaN on the originally-missing rows so the imputer can fill them.
# Matches replace(0, NaN) when 'None' sorts first, without depending on it.
dfna['transmission'] = dfna['transmission'].mask(transmission_missing)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Remember which rows were missing *before* filling, so NaN can be restored
# afterwards without assuming which code the 'None' placeholder receives.
type_missing = dfna['type'].isna()

# Fit on a filled copy ('None' stands in for missing values). Assigning the
# result back avoids the chained fillna(..., inplace=True) pattern, which is
# deprecated and has no effect under pandas Copy-on-Write.
type_encoder = OrdinalEncoder()
dfna['type'] = type_encoder.fit_transform(
    dfna[['type']].fillna('None')
).ravel()

print(type_encoder.categories_)

# Restore NaN on the originally-missing rows so the imputer can fill them.
# Matches replace(0, NaN) when 'None' sorts first, without depending on it.
dfna['type'] = dfna['type'].mask(type_missing)

In [None]:
# Per-column count of missing values once the categorical columns are encoded.
dfna.isna().sum(axis=0)

In [None]:
# Histogram of prices restricted to the 9th-98th percentile band.
price_low = dfna['price'].quantile(.09)
price_high = dfna['price'].quantile(.98)

df1 = dfna.loc[dfna['price'].between(price_low, price_high), 'price']

# Single-panel figure holding one histogram trace.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Histogram(x=df1), row=1, col=1)
fig.update_layout(height=400)
fig.show()

In [None]:
# Histogram of model years for the listings kept by the outlier filter.
# The cutoff is inclusive (>= 1990) so the preview shows the same rows the
# outlier cell keeps; the earlier strict '>' silently left out year 1990.
df2 = dfna.loc[dfna['year'] >= 1990, 'year']

fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=df2), row=1, col=1)

# Title and axis labels let the figure stand on its own when skimmed.
fig.update_layout(
    height=400,
    title_text='Vehicle year (1990 and newer)',
    xaxis_title='year',
    yaxis_title='count',
)

fig.show()

In [None]:
# Histogram of odometer readings restricted to the 2nd-98th percentile band.
odometer_low = dfna['odometer'].quantile(.02)
odometer_high = dfna['odometer'].quantile(.98)

df2 = dfna.loc[dfna['odometer'].between(odometer_low, odometer_high), 'odometer']

# Single-panel figure holding one histogram trace.
fig = make_subplots(rows=1, cols=1)
fig.add_trace(go.Histogram(x=df2), row=1, col=1)
fig.update_layout(height=400)
fig.show()

In [None]:
# Outlier removal, step 1: keep listings from 1990 onwards.
# Rows with a missing year fail the comparison and are dropped as well.
dfna = dfna[dfna['year'] >= 1990]

# Step 2: the percentile bounds are computed on the year-filtered frame,
# then only rows inside both the price band and the odometer band are kept.
price_in_range = dfna['price'].between(
    dfna['price'].quantile(.09),
    dfna['price'].quantile(.98),
)
odometer_in_range = dfna['odometer'].between(
    dfna['odometer'].quantile(.02),
    dfna['odometer'].quantile(.98),
)

dfna = dfna[price_in_range & odometer_in_range]

In [None]:
# Per-column count of missing values remaining after outlier removal.
dfna.isna().sum(axis=0)

## Completing missing values with Imputer

In [None]:
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fill the remaining NaN values using every column of dfna at once.
# NOTE(review): the frame still contains 'price' at this point, so the target
# takes part in the imputation - confirm this is intended.
imp = IterativeImputer(max_iter=10, random_state=0)
imputed_values = imp.fit_transform(dfna)

# Rebuild a frame with the original column names (the row index is reset).
dfna1 = pd.DataFrame(imputed_values, columns=dfna.columns)

In [None]:
# Per-column count of missing values in the imputed frame.
dfna1.isna().sum(axis=0)

## 2. Ajuste otro Bosque Aleatorio (5 pts)

In [None]:
# Target is the price column; the features are every other column.
y = dfna1['price']
X = dfna1.drop(columns=['price'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

from sklearn.ensemble import RandomForestRegressor

# random_state makes the forest reproducible across runs (it was unseeded, so
# the reported errors changed on every execution). n_jobs=-1 trains the trees
# on all available cores; with a fixed seed it does not change the fitted model.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import max_error

# Predict once and reuse the result for both metrics.
rf_predictions = model.predict(X_test)

# The metric computed is the mean *absolute* error; the variable name and the
# printed label previously called it mean_squared_error.
mae = mean_absolute_error(y_test, rf_predictions)
maxerror = max_error(y_test, rf_predictions)

print(f'[Random Forest Regressor] mean_absolute_error: {mae}')
print(f'[Random Forest Regressor] maxerror: {maxerror}')

## Ajuste otro modelo

In [None]:
from sklearn.decomposition import PCA

# Target for the second model is the price column.
svm_y = dfna1['price']

# Project every non-price column onto two principal components.
# NOTE(review): the features are not scaled before PCA and the projection is
# fitted on all rows before the train/test split - confirm both are intended.
pca = PCA(n_components=2)
svm_X = pca.fit_transform(dfna1.drop(columns=['price']))

# Same 80/20 split parameters as the random-forest cell.
svm_X_train, svm_X_test, svm_y_train, svm_y_test = train_test_split(
    svm_X, svm_y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression fitted on the two PCA components.
mlrmodel_l2 = Ridge(alpha=1, max_iter=10000, tol=1e-5).fit(svm_X_train, svm_y_train)

# Predict once and reuse the result for both metrics.
ridge_predictions = mlrmodel_l2.predict(svm_X_test)

mae = mean_absolute_error(svm_y_test, ridge_predictions)
maxerror = max_error(svm_y_test, ridge_predictions)

# The labels previously read '[Random Forest Regressor] mean_squared_error',
# copied from the forest cell; this model is Ridge and the metric is MAE.
print(f'[Ridge] mean_absolute_error: {mae}')
print(f'[Ridge] maxerror: {maxerror}')