In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Limit NumPy's floating-point print precision to 4 digits.
np.set_printoptions(precision=4)

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Read the vehicle listings CSV into `df_db`, the frame later cells derive from.
df_db = pd.read_csv(filepath_or_buffer='../input/craigslist-carstrucks-data/vehicles.csv')

## Feature Engineering

In [None]:
# Columns excluded from the feature set before modelling.
bad_predictors = [
    'id',
    'url',
    'region',
    'region_url',
    'VIN',
    'drive',
    'size',
    'county',
    'state',
    'paint_color',
    'image_url',
    'description',
    'lat',
    'long',
    'posting_date',
    'model',
]

# Build the modelling frame in a single pipeline:
#   1. remove the excluded columns,
#   2. discard every row that still contains a NaN,
#   3. keep vehicles with year >= 2000,
#   4. keep rows whose price lies between the 10th and 95th percentile and
#      whose odometer lies between the 2nd and 99th percentile (bounds
#      inclusive; the percentiles are computed on the output of step 3).
df = (
    df_db
    .drop(columns=bad_predictors)
    .dropna()
    .loc[lambda d: d['year'] >= 2000]
    .loc[lambda d: (
        d['price'].between(d['price'].quantile(.10), d['price'].quantile(.95))
        & d['odometer'].between(d['odometer'].quantile(.02), d['odometer'].quantile(.99))
    )]
)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Names of every object-dtype column still present in the frame.
cat_cols = df.select_dtypes(include='object').columns.tolist()

# Switch those columns to pandas' category dtype.
df[cat_cols] = df[cat_cols].astype('category')

# Fit the ordinal encoder on the categorical columns, then overwrite them
# with the encoder's numeric output.
encoder = OrdinalEncoder().fit(df[cat_cols])
df[cat_cols] = encoder.transform(df[cat_cols])

## Plot histograms

In [None]:
# Disabled cell: one histogram trace per column, laid out on a 3-column subplot grid.
# NOTE(review): the traces read from `df[col]`, not `df_sample[col]`, so the
# 2000-row sample only determines the grid layout and column list — confirm
# which frame was intended before re-enabling.
# df_sample = df.sample(2000)

# fig = make_subplots(rows=(df_sample.shape[1]//3)+1, cols=3)

# for i, col in enumerate(df_sample.columns):
#     fig.add_trace(go.Histogram(x=df[col], name=col), row=(i//3)+1, col=(i%3)+1)
    
# fig.update_layout(height=1000,)
    
# fig.show()

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# The target is the price column; every other remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=42
)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters.
# The forest is seeded with the same value as the train/test split so that a
# fresh Restart & Run All reproduces the reported scores instead of drifting
# with each run's bootstrap samples.
rfmodel = RandomForestRegressor(random_state=42)
rfmodel.fit(X_train, y_train)

In [None]:
# `RandomForestRegressor.score` returns the coefficient of determination (R^2),
# not a classification accuracy. The variable names are kept so later cells
# that read them keep working; only the printed label is corrected.
accuracy_train = rfmodel.score(X_train, y_train)
accuracy_test = rfmodel.score(X_test, y_test)
print(f"Random Forest Regressor: R^2 (Train): {accuracy_train}, R^2 (Test): {accuracy_test}")

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import max_error

# Predict once and reuse the result for both metrics.
y_pred = rfmodel.predict(X_test)

# Mean absolute error on the held-out split. The original stored and printed
# this value as "mean_squared_error"; the label now matches the metric computed.
mae = mean_absolute_error(y_test, y_pred)
# Backward-compatible alias: any other cell that still reads `mse` gets the
# same value as before (a mean ABSOLUTE error, despite the name).
mse = mae
# Largest single absolute residual on the held-out split.
maxerror = max_error(y_test, y_pred)

print(f'[Random Forest Regressor] mean_absolute_error: {mae}')
print(f'[Random Forest Regressor] maxerror: {maxerror}')

## GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 values of n_estimators x 4 values of max_features.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
#     {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

# Seeded so the selected parameters and scores are reproducible across runs.
forest = RandomForestRegressor(random_state=42)

# 5-fold cross-validation for every grid point; train scores are kept as well.
grid_search = GridSearchCV(forest, param_grid, cv=5, return_train_score=True)

# Tune on the training split only. Fitting on the full X, y would let the
# held-out test rows influence hyper-parameter selection, so X_test / y_test
# would no longer give an unbiased estimate for the tuned model.
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

print(grid_search.best_score_)