# Imports

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

np.set_printoptions(precision=4)

In [None]:
import ipywidgets as widgets
from ipywidgets import interact

# Data

In [None]:
vehicles = pd.read_csv("../input/craigslist-carstrucks-data/vehicles.csv")
vehicles.head()

In [None]:
vehicles.shape

# Nulls

In [None]:
vehicles.isna().sum()

In [None]:
# Columns discarded before modelling: identifiers, URLs, location fields,
# free text and the other attributes listed below.
bad_predictors = [
    'id', 'url', 'region', 'region_url', 'VIN',
    'drive', 'size', 'county', 'state', 'paint_color',
    'image_url', 'description', 'lat', 'long', 'posting_date',
]

# Rebind the name to the reduced frame rather than mutating it in place.
vehicles = vehicles.drop(columns=bad_predictors)

# # # Elimine todos los NaN (1 pt)

In [None]:
# Keep only the rows without any missing value. The explicit .copy() gives an
# independent frame, so the column assignments in later cells work on
# `vehicles_dropna` only and leave `vehicles` untouched.
vehicles_dropna = vehicles.dropna().copy()
vehicles_dropna.head()

**Dataset dtype**

In [None]:
vehicles.info()


## Clean cylinders data

In [None]:
# Turn the textual cylinder description into an integer:
#   1. strip every character that is not a digit,
#   2. entries left empty (no digits at all) become '1',
#   3. cast the cleaned strings to int.
cylinders_text = vehicles_dropna['cylinders']
digits_only = cylinders_text.str.replace(r'[^0-9]', '', regex=True)
with_default = digits_only.str.replace(r'^\s*$', '1', regex=True)
vehicles_dropna['cylinders'] = with_default.astype(int)

## Encoding Categoricals

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Ordinal-encode every categorical column.
#
# Each column gets its own encoder, stored in `categorical_encoders`, so the
# fitted category list of every column stays available afterwards (for
# example to map codes back with `inverse_transform`). Refitting one shared
# encoder column after column would keep only the mapping of the last column.
categorical_columns = [
    'manufacturer', 'model', 'condition', 'fuel',
    'title_status', 'transmission', 'type',
]

categorical_encoders = {}
for column in categorical_columns:
    column_encoder = OrdinalEncoder()
    # fit_transform returns an (n_rows, 1) array; ravel() flattens it to the
    # 1-D shape expected for a single-column assignment.
    vehicles_dropna[column] = column_encoder.fit_transform(vehicles_dropna[[column]]).ravel()
    categorical_encoders[column] = column_encoder

# Keep the original variable name available; it now really holds the
# encoder fitted on 'manufacturer'.
manufacturer_encoder = categorical_encoders['manufacturer']

vehicles_dropna.head()

In [None]:
# 'model' is left out of the feature set (for now!).
vehicles_dropna = vehicles_dropna.drop(columns=['model'])

## Outliers

In [None]:
vehicles_dropna.describe()

Cars > 2007

In [None]:
# Keep only vehicles whose model year is after 2007 (2007 itself is excluded).
is_recent = vehicles_dropna['year'] > 2007
vehicles_dropna = vehicles_dropna.loc[is_recent]

In [None]:
# Drop the price extremes: keep rows strictly between the 10th and the 90th
# percentile of the current price distribution.
price_column = vehicles_dropna['price']
lower_bound = price_column.quantile(.10)
upper_bound = price_column.quantile(.90)

vehicles_dropna = vehicles_dropna[(price_column > lower_bound) & (price_column < upper_bound)]

In [None]:
vehicles_dropna.describe()

## Correlation Matrix

In [None]:
# Heatmap of the pairwise column correlations returned by DataFrame.corr().
fig = px.imshow(vehicles_dropna.corr())
fig.show()

## Train and Test

In [None]:
from sklearn.model_selection import train_test_split

# Target is the listing price; every remaining column is a feature.
y = vehicles_dropna['price']
X = vehicles_dropna.drop(columns=['price'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error

def PrintMetrics(y, y_pred, title=''):
    """Print the regression error metrics for one set of predictions.

    Args:
        y: True target values.
        y_pred: Predicted values, in the same order as ``y``.
        title: Heading printed before the metrics (empty by default).

    Prints the title, then mean absolute error, mean squared error and
    maximum error on separate lines, followed by blank lines.
    """
    abs_err = mean_absolute_error(y, y_pred)
    sq_err = mean_squared_error(y, y_pred)
    worst_err = max_error(y, y_pred)

    report_lines = [
        title,
        f"mean_absolute_error: {abs_err}",
        f"mean_squared_error: {sq_err}",
        f"maxerror: {worst_err}",
        '\n',
    ]
    for line in report_lines:
        print(line)

In [None]:
vehicles.dtypes

# Ajuste un Bosque Aleatorio (5 pts)

# Muestre sus resultados utilizando las métricas: Mean Absolute Error y Mean Squared Error (3 pts)

In [None]:
def iscorrect(x):
    """Map a truthy value to the marker symbol "circle" and a falsy one to "x"."""
    if x:
        return "circle"
    return "x"


# Element-wise version of `iscorrect` for arrays of flags.
iscorrectv = np.vectorize(iscorrect)


import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error 
from sklearn.ensemble import RandomForestRegressor

@interact
def effect_of_depth(depth=(2,20)):
    """Fit a random forest limited to ``depth`` levels and plot its predictions.

    The ``(2, 20)`` default makes ``interact`` expose ``depth`` as a widget
    ranging from 2 to 20. The function reads the notebook globals
    ``X_train``, ``y_train``, ``X_test``, ``y_test``, ``X`` and ``y``.

    Args:
        depth: Maximum depth allowed for every tree of the forest.

    Prints the test-set metrics through ``PrintMetrics`` and shows a plotly
    figure of the predictions over the 'condition' / 'transmission' plane.
    """
    # The widget value has to reach the estimator, otherwise every slider
    # position fits the same unconstrained forest. The fixed seed makes the
    # differences between positions come from the depth, not from the RNG.
    model = RandomForestRegressor(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)

    PrintMetrics(y_test, model.predict(X_test), title='Random Forest Regressor: ')
    y_pred = model.predict(X)

    fig = make_subplots(rows=1, cols=1)

    # Background: predicted price as a contour over two of the features.
    fig.add_trace(go.Contour(
        x=X['condition'],
        y=X['transmission'],
        z=y_pred,
        showscale=False,
        opacity=0.40,
        colorscale='portland'
    ), row=1, col=1)

    # Foreground: the actual rows, coloured by true price. The marker is a
    # circle only where the prediction equals the true value exactly.
    fig.add_trace(go.Scatter(
        x=X['condition'],
        y=X['transmission'],
        text=y,
        mode='markers',
        marker_symbol=iscorrectv(y==y_pred),
        marker=dict(color=y, colorscale='portland')
    ), row=1, col=1)

    # Score computed on the full X / y (training rows included).
    dtscore = model.score(X, y)

    fig.update_layout(showlegend=False, title_text=f"score={dtscore}, max_depth={depth}")

    fig.show()

# # [Extra] Puede utilizar GridSearchCV o RandomizedSearchCV para ajustar los hiperparámetros

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Two candidate grids: bootstrapped forests (the estimator default) and
# forests with bootstrap sampling disabled.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Fixed seed so the search gives the same ranking on every run.
forest = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest, param_grid, cv=5, return_train_score=True)

# Tune on the training split only, so the held-out test rows never take part
# in choosing the hyper-parameters.
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

# Mean cross-validated score of the best parameter combination.
print(grid_search.best_score_)

# # # # # Segunda parte

# # # Proponga un método para completar los valores vacíos, o justifique por qué no hacerlo. (5 pts)

In [None]:
vehicles.isna().sum()

Fill empty values

In [None]:
# Sentinel written into each column wherever a value is missing.
# NOTE(review): the sentinels are strings, including those for the numeric
# columns price / year / odometer; filling a real gap would turn such a
# column into mixed types. Confirm the intended sentinels.
# NOTE(review): `vehicles_dropna` was built with dropna(), so there appears to
# be nothing left to fill here - confirm whether this step was meant to run
# on the raw `vehicles` frame instead.
fill_values = {
    'price': 'None',
    'year': '9999',
    'manufacturer': '0',
    'condition': '0',
    'fuel': '0',
    'odometer': '0000',
    'title_status': '0',
    'transmission': '0',
    'type': '0',
}

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on `frame[col]`: the chained in-place form is deprecated and no longer
# updates the frame once pandas copy-on-write is active.
for column, sentinel in fill_values.items():
    vehicles_dropna[column] = vehicles_dropna[column].fillna(sentinel)



Encode the columns whose missing values were filled

In [None]:

# One dedicated encoder per re-encoded column, so every fitted mapping stays
# available after the cell has run.
manufacturer_encoder = OrdinalEncoder()
fuel_encoder = OrdinalEncoder()
title_encoder = OrdinalEncoder()
transmission_encoder = OrdinalEncoder()
type_encoder = OrdinalEncoder()

# Created but not used: 'model' has been dropped and 'odometer' is numeric.
# Kept only so the names remain defined for any cell that refers to them.
model_encoder = OrdinalEncoder()
odometer_encoder = OrdinalEncoder()

vehicles_dropna['manufacturer'] = manufacturer_encoder.fit_transform(vehicles_dropna[['manufacturer']])
vehicles_dropna['fuel'] = fuel_encoder.fit_transform(vehicles_dropna[['fuel']])
vehicles_dropna['title_status'] = title_encoder.fit_transform(vehicles_dropna[['title_status']])
vehicles_dropna['transmission'] = transmission_encoder.fit_transform(vehicles_dropna[['transmission']])
vehicles_dropna['type'] = type_encoder.fit_transform(vehicles_dropna[['type']])

# Show the frame that was just modified.
vehicles_dropna.head()


Replace the encoded value 0 with NaN

In [None]:
# Columns in which the encoded value 0 is turned back into NaN so that the
# imputer can estimate it.
# NOTE(review): `vehicles_dropna` was built with dropna(), so code 0 here
# looks like a genuine first category rather than a missing-value marker -
# confirm before relying on the imputed values.
zero_coded_columns = ['manufacturer', 'fuel', 'title_status', 'transmission', 'type']

# Assign the result back instead of calling replace(..., inplace=True) on
# `frame[col]`: the chained in-place form is deprecated and no longer updates
# the frame once pandas copy-on-write is active.
for column in zero_coded_columns:
    vehicles_dropna[column] = vehicles_dropna[column].replace(0, np.nan)

# A short preview is enough; rendering 1000 rows only bloats the notebook.
vehicles_dropna.head()

In [None]:
from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer
# Multivariate imputation over every column of the frame (at most 10 rounds,
# fixed seed). fit_transform returns a plain array, so the result is wrapped
# in a new DataFrame that reuses the original column names.
imp = IterativeImputer(max_iter=10, random_state=0)
imputed_values = imp.fit_transform(vehicles_dropna)
vehicles_dropna = pd.DataFrame(imputed_values, columns=vehicles_dropna.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild features and target from the imputed frame. Without this, the split
# below would reuse the X / y created before imputation and the second forest
# would never see the completed data.
X = vehicles_dropna.drop(columns=['price'])
y = vehicles_dropna['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# # Ajuste otro Bosque Aleatorio (5 pts)

In [None]:
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns

# Fixed seed so the fitted forest, and every metric derived from it, is
# reproducible across runs.
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions on the training rows ("fitted values").
y_pred = clf.predict(X_train)

density_fig, ax = plt.subplots(figsize=(5, 7))

# kdeplot replaces the deprecated distplot(hist=False) and draws the same
# kind of density curve.
sns.kdeplot(y_train, color="r", label="Actual Value", ax=ax)
sns.kdeplot(y_pred, color="b", label="Fitted Values", ax=ax)

ax.set_title('Actual vs Fitted Values for Price')
# The labels only become visible once the legend is drawn.
ax.legend()

plt.show()
plt.close(density_fig)



# # Compare sus nuevos resultados utilizando las métricas: Mean Absolute Error y Mean Squared Error (3 pts)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error

# Evaluate the forest on the held-out rows. Each metric is computed with the
# matching function and printed under its own name with the model's label.
rf_test_pred = clf.predict(X_test)

mae = mean_absolute_error(y_test, rf_test_pred)
mse = mean_squared_error(y_test, rf_test_pred)
maxerror = max_error(y_test, rf_test_pred)

print(f"[random forest] mean_absolute_error: {mae}")
print(f"[random forest] mean_squared_error: {mse}")
print(f"[random forest] maxerror: {maxerror}")


# # Ajuste uno o varios modelos de los vistos en clase para mejorar sus resultados (5 pts)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns to standardise (zero mean, unit variance).
# The target `price` is deliberately left out: scaling it would put every
# error metric in standard-deviation units and make the results impossible
# to compare with metrics computed on real prices.
need_scaling = ['year', 'manufacturer', 'fuel', 'cylinders', 'odometer', 'condition', 'title_status', 'transmission', 'type']

# NOTE(review): the scaler is fitted on all rows, test rows included; fit it
# on the training split only if strict train/test separation is required.
standard_scaler = StandardScaler()
vehicles_dropna[need_scaling] = standard_scaler.fit_transform(vehicles_dropna[need_scaling])


Linear Regression

In [None]:
X = vehicles_dropna.drop(columns=["price"])
Y = vehicles_dropna["price"]

# Split again so the models below train on the scaled features. Without this
# they would silently reuse the X_train / X_test created before scaling.
# Same test size and seed as the previous split.
# NOTE(review): if `price` was standardised together with the features, the
# errors reported below are in standard-deviation units, not in price units.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error

model = LinearRegression()
model.fit(X_train, y_train)

# Predict once and reuse the result for every metric.
lr_test_pred = model.predict(X_test)

# Each metric is computed with the matching function and printed under its
# own name.
mae = mean_absolute_error(y_test, lr_test_pred)
mse = mean_squared_error(y_test, lr_test_pred)
maxerror = max_error(y_test, lr_test_pred)

print(f"[linear regression] mean_absolute_error: {mae}")
print(f"[linear regression] mean_squared_error: {mse}")
print(f"[linear regression] maxerror: {maxerror}")


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error
from sklearn.preprocessing import PolynomialFeatures

# Degree-4 polynomial expansion: fitted on the training features, then
# applied unchanged to the test features.
poly = PolynomialFeatures(4)
plr_X_train = poly.fit_transform(X_train)
plr_X_test = poly.transform(X_test)

model = LinearRegression()
model.fit(plr_X_train, y_train)

# Predict once and reuse the result for every metric.
plr_test_pred = model.predict(plr_X_test)

# Each metric is computed with the matching function and printed under its
# own name.
mae = mean_absolute_error(y_test, plr_test_pred)
mse = mean_squared_error(y_test, plr_test_pred)
maxerror = max_error(y_test, plr_test_pred)

print(f"[polynomial regression] mean_absolute_error: {mae}")
print(f"[polynomial regression] mean_squared_error: {mse}")
print(f"[polynomial regression] maxerror: {maxerror}")

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=1)
model.fit(X_train, y_train)

# Score this model's own predictions. Printing `mse` / `maxerror` without
# recomputing them would repeat the numbers of whichever model ran before.
knn_test_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, knn_test_pred)
mse = mean_squared_error(y_test, knn_test_pred)
maxerror = max_error(y_test, knn_test_pred)

print(f"[KNeighborsRegressor] mean_absolute_error: {mae}")
print(f"[KNeighborsRegressor] mean_squared_error: {mse}")
print(f"[KNeighborsRegressor] maxerror: {maxerror}")