## Import Data

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Build the CSV location relative to the notebook's working directory and
# load it into a DataFrame.
data_path = os.path.join('datasets', 'CarPrice_Assignment.csv')
cars = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the raw data.
cars.head()

In [None]:
# Column dtypes and non-null counts (used to check for missing values).
cars.info()

In [None]:
# Summary statistics of the numeric columns.
cars.describe()

The dataset appears to contain no null values.

## Data Cleaning

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# `car_ID` is only a row identifier, so it is removed before analysis.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
cars = cars.drop(columns='car_ID', errors='ignore')

In [None]:
# Reduce CarName to the brand, i.e. the first space-separated token.
# The previous `.str.split(' ', expand=True)` produced a multi-column
# DataFrame; assigning that to a single column is rejected by newer pandas
# versions, so the first token is selected explicitly instead.
cars['CarName'] = cars['CarName'].str.split(' ', n=1).str[0]

In [None]:
# List the distinct brand values to spot misspellings / case variants.
cars['CarName'].unique()

In [None]:
# Map misspelled or inconsistently written brand names onto one canonical
# spelling each; values not listed here are left unchanged.
brand_fixes = {
    'maxda': 'mazda',
    'Nissan': 'nissan',
    'porcshce': 'porsche',
    'vokswagen': 'volkswagen',
    'vw': 'volkswagen',
    'toyouta': 'toyota',
}
cars['CarName'] = cars['CarName'].replace(brand_fixes)

In [None]:
# Inspect the distinct door-count values before converting them to numbers.
cars['doornumber'].unique()

In [None]:
# Convert the spelled-out door counts to integers.
# The dtype is set explicitly: relying on `replace` to turn an object column
# numeric depends on implicit downcasting, which newer pandas deprecates, and
# the later dtype-based numeric/categorical split needs a real integer column.
# Values that are already numeric pass through unchanged, so the cell can be
# re-run; an unexpected word fails loudly in astype().
door_words = {'two': 2, 'four': 4}
cars['doornumber'] = (
    cars['doornumber']
    .map(lambda value: door_words.get(value, value))
    .astype('int64')
)

In [None]:
# Inspect the distinct cylinder-count values before converting them to numbers.
cars['cylindernumber'].unique()

In [None]:
# Convert the spelled-out cylinder counts to integers.
# The dtype is set explicitly: relying on `replace` to turn an object column
# numeric depends on implicit downcasting, which newer pandas deprecates, and
# `cylindernumber` is later used as a numeric feature.
# Values that are already numeric pass through unchanged, so the cell can be
# re-run; an unexpected word fails loudly in astype().
cylinder_words = {
    'four': 4,
    'six': 6,
    'five': 5,
    'three': 3,
    'twelve': 12,
    'two': 2,
    'eight': 8,
}
cars['cylindernumber'] = (
    cars['cylindernumber']
    .map(lambda value: cylinder_words.get(value, value))
    .astype('int64')
)

In [None]:
# Partition the column names by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_col = cars.select_dtypes(include='object').columns
num_col = cars.select_dtypes(exclude='object').columns

## Data Visualization

In [None]:
# Bar chart of how many cars each brand contributes.
# Note: the rcParams assignment changes the default figure size for every
# later figure in the notebook, not just this one.
plt.rcParams['figure.figsize'] = [15, 8]
brand_counts = cars['CarName'].value_counts()
ax = brand_counts.plot(kind='bar', stacked=True, colormap='Set1')
ax.set_title('Brands')
ax.set_xlabel('Brand', fontweight='bold')
ax.set_ylabel('Count of Cars', fontweight='bold')

In [None]:
# Distribution of the target variable.
# `sns.distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation is its documented replacement and draws the same
# kind of histogram + density curve.
plt.figure(figsize=(15, 8))
plt.title('Price Distribution')
sns.histplot(cars['price'], kde=True, stat='density')

### Visualising Numeric Variables

In [None]:
# Regress price against every numeric column, a few columns per figure so
# the panels stay readable.  The batch starts are derived from len(num_col)
# rather than a fixed count, so no column is silently left out (and no empty
# batch is plotted) if the set of numeric columns changes.
batch_size = 5
for start in range(0, len(num_col), batch_size):
    sns.pairplot(
        data=cars,
        y_vars=['price'],
        x_vars=num_col[start:start + batch_size],
        kind="reg"
    )

In [None]:
# Pairwise correlation of the numeric columns, then each column's
# correlation with price, strongest positive first.
corr = cars.loc[:, num_col].corr()
corr.loc[:, 'price'].sort_values(ascending=False)

- Relevant positive correlation: 
    - enginesize 
    - curbweight 
    - horsepower 
    - carwidth 
    - cylindernumber 
    - carlength
    - wheelbase
    - boreratio
    
- Relevant negative correlation: 
    - citympg
    - highwaympg

In [None]:
# Numeric columns kept for modelling, plus the target (`price`).
# NOTE(review): the markdown above also lists citympg and highwaympg as
# relevant (negative) correlations, but they are not included here —
# confirm whether that omission is intentional.
num_col_rel = [
    'enginesize',
    'curbweight',
    'horsepower',
    'carwidth',
    'cylindernumber',
    'carlength',
    'wheelbase',
    'boreratio',
    'price',
]
num_col_rel

### Visualising Categorical Variables

In [None]:
# One price box plot per categorical column.
# Subplot positions are 1-based while cat_col is 0-based; the previous loop
# used the same index for both and therefore never plotted cat_col[0].
# enumerate(start=1) keeps position and column in step.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(cat_col) // n_plot_cols))  # ceiling division
plt.figure(figsize=(20, 5 * n_plot_rows))

for position, column in enumerate(cat_col, start=1):
    plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.boxplot(data=cars, x=column, y='price')
    # Tilt tick labels so columns with many categories stay legible.
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

All of the categorical variables appear to have an impact on the price.

In [None]:
# Final column selection: the chosen numeric columns (including price)
# followed by every categorical column.
# A new list is built instead of extending `num_col_rel` in place, so
# re-running this cell does not append the categorical columns a second time.
columns = num_col_rel + [col for col in cat_col if col not in num_col_rel]
columns

In [None]:
# Keep only the selected columns; all other columns are dropped from `cars`
# from here on.
cars = cars[columns]

## Create Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
train_set, test_set = train_test_split(cars, test_size=0.2, random_state=42)

## Prepare Data

In [None]:
# Preview the training split.
train_set.head()

In [None]:
# Separate the training split into the target (price) and the features
# (every other column).
cars_label = train_set['price']
cars_data = train_set.drop(columns='price')

In [None]:
# Preview the training features (price removed).
cars_data.head()

In [None]:
# Preview the training target values.
cars_label.head()

### Pipelines

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Recompute the column lists on the feature frame (which no longer contains
# price): object columns go to the categorical branch of the preprocessing,
# all remaining columns to the numeric branch.
# This rebinds the `cat_col` / `num_col` names defined earlier.
cat_col = cars_data.select_dtypes(include='object').columns
num_col = cars_data.select_dtypes(exclude='object').columns

In [None]:
# Preprocessing for numeric features: a single standardisation step.
num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

In [None]:
# Column-wise preprocessing: scale the numeric columns, one-hot encode the
# categorical ones (dropping the first level of each feature).
# NOTE(review): the encoder uses the default handling of unknown categories,
# so a category that appears only in the test split would make transform()
# fail — confirm that every category is present in the training split.
categorical_encoder = OneHotEncoder(drop='first')
column_steps = [
    ('num', num_pipeline, num_col),
    ('cat', categorical_encoder, cat_col),
]
full_pipeline = ColumnTransformer(column_steps)

In [None]:
# Fit the preprocessing on the training features only and transform them.
cars_prepared = full_pipeline.fit_transform(cars_data)

## Select and Train a Model

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Prepare the held-out split for evaluation: split off the target, then
# apply the already-fitted preprocessing (transform only, no refit).
test_labels = test_set['price']
test_data = test_set.drop(columns='price')
test_prepared = full_pipeline.transform(test_data)

In [None]:
def display_scores(model, features=None, labels=None):
    """Print MSE, RMSE and R2 of a fitted model on an evaluation set.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict``.
    features : array-like, optional
        Preprocessed inputs to predict on. Defaults to the notebook-level
        ``test_prepared``.
    labels : array-like, optional
        True target values matching ``features``. Defaults to the
        notebook-level ``test_labels``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    # Fall back to the notebook's held-out split so existing one-argument
    # calls keep working unchanged.
    if features is None:
        features = test_prepared
    if labels is None:
        labels = test_labels

    predictions = model.predict(features)
    mse = mean_squared_error(labels, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(labels, predictions)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2:', r2)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Baseline: ordinary linear regression fitted on the prepared training data.
lin_reg = LinearRegression()
lin_reg.fit(cars_prepared, cars_label)

In [None]:
# Evaluate the linear baseline on the held-out split.
display_scores(lin_reg)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest with default hyperparameters.
# random_state is fixed (same seed as the train/test split) because the
# forest is randomised: without it the reported scores, the comparison in
# the conclusion and the saved model change on every run.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(cars_prepared, cars_label)

In [None]:
# Evaluate the default random forest on the held-out split.
display_scores(forest_reg)

### XGBRegressor

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted trees (XGBoost) with default hyperparameters.
xg_reg = XGBRegressor()
xg_reg.fit(cars_prepared, cars_label)

In [None]:
# Evaluate the default XGBoost model on the held-out split.
display_scores(xg_reg)

### Grid Search Random Forest Regressor

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for the random forest.
# max_features is capped at the number of prepared input columns: the
# previous fixed upper bound of 200 produced many candidates above that
# count, which are at best redundant and, depending on the scikit-learn
# version, can make the fit fail — while multiplying the search time.
n_prepared_features = cars_prepared.shape[1]
n_estimators_grid = np.arange(10, 200, 10)
max_features_grid = np.arange(2, n_prepared_features + 1, 2)

param_grid = [
    # Default bootstrap sampling.
    {'n_estimators': n_estimators_grid, 'max_features': max_features_grid},
    # Same grid with bootstrap sampling turned off.
    {'bootstrap': [False], 'n_estimators': n_estimators_grid, 'max_features': max_features_grid}
]

# Seeded so the search result is reproducible between runs.
forest_reg_grid = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg_grid, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)

In [None]:
# Run the exhaustive search (every grid candidate is cross-validated), so
# this cell can take a long time.
grid_search.fit(cars_prepared, cars_label)

In [None]:
# Show the best random-forest configuration found by the search.
grid_search.best_estimator_

In [None]:
# Keep a dedicated reference to the tuned forest: `grid_search` is
# reassigned by the XGBoost search further down.
forest_best_estimator = grid_search.best_estimator_

In [None]:
# Evaluate the tuned random forest on the held-out split.
display_scores(forest_best_estimator)

### Grid Search XGBRegressor

In [None]:
# Hyperparameter grid for XGBoost: number of trees x maximum tree depth.
# Note: this rebinds `param_grid` and `grid_search` from the random-forest
# search above.
xgb_search_space = {
    'n_estimators': np.arange(10, 200, 10),
    'max_depth': np.arange(5, 10),
}
param_grid = [xgb_search_space]

xgb_reg_grid = XGBRegressor()
grid_search = GridSearchCV(
    xgb_reg_grid,
    param_grid,
    cv=10,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Run the exhaustive XGBoost search; every grid candidate is cross-validated.
grid_search.fit(cars_prepared, cars_label)

In [None]:
# Show the best XGBoost configuration found by the search.
grid_search.best_estimator_

In [None]:
# Keep a dedicated reference to the tuned XGBoost model.
xgb_best_estimator = grid_search.best_estimator_

In [None]:
# Evaluate the tuned XGBoost model on the held-out split.
display_scores(xgb_best_estimator)

## Conclusion

The best-performing model is the `RandomForestRegressor` trained without hyperparameter tuning (`forest_reg`); its scores are shown again below.

In [None]:
# Repeat the scores of the selected model for reference.
display_scores(forest_reg)

## Save Best Model

In [None]:
import joblib

In [None]:
# Persist the selected model to disk.
# NOTE(review): only the regressor is saved. It was trained on the output of
# `full_pipeline`, so whoever loads this file also needs that fitted
# preprocessing step to prepare raw inputs — consider saving it as well.
joblib.dump(forest_reg, 'car_price_estimator.pkl')