In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

# Reading the data and first info

In [None]:
# Load the vehicles CSV from the Kaggle input directory, print its column
# names, and summarise the frame (dtypes and non-null counts).
cars = pd.read_csv('/kaggle/input/craigslist-carstrucks-data/vehicles.csv')
print('Columns:',cars.columns.tolist())
cars.info()

In [None]:
# Preview the first rows of the raw frame.
cars.head()

# EDA

## Removing unnecessary data

In [None]:
# Columns that are not used in the analysis below.
unused_columns = ['url', 'id', 'size', 'county', 'region_url', 'image_url', 'vin', 'description', 'state', 'lat', 'long', 'region', 'title_status']
# Rebind instead of mutating in place, and ignore columns that are already gone,
# so that re-running this cell does not raise a KeyError.
cars = cars.drop(columns=unused_columns, errors='ignore')

In [None]:
# Summarise the frame again now that the unused columns have been dropped.
cars.info()

In [None]:
# Preview the first rows of the reduced frame.
cars.head()

In [None]:
# Descriptive statistics of the reduced frame.
cars.describe()

In [None]:
# Count the missing values in each column.
cars.isnull().sum()

In [None]:
# Impute missing values by assigning the filled Series back to the frame.
# Calling `fillna(..., inplace=True)` on `cars[col]` is chained assignment:
# it is deprecated in recent pandas and has no effect under Copy-on-Write,
# which would leave NaNs behind and make the int32 cast below fail.
cars['year'] = cars['year'].fillna(cars['year'].median()).astype('int32')
cars['odometer'] = cars['odometer'].fillna(cars['odometer'].median())
# Missing colours get an explicit 'Unknown' category.
cars['paint_color'] = cars['paint_color'].fillna('Unknown')

Drop offers before 1960

In [None]:
# Keep only listings from 1960 onwards. The explicit copy makes `cars` an
# independent frame, so the later column assignments do not act on a view of
# the unfiltered data (SettingWithCopyWarning).
cars = cars[cars['year'] >= 1960].copy()
# Number of listings per year.
cars.year.value_counts()

## Sales per year

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot of the number of listings for each value of `year`.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(30, 15))
sns.countplot(data=cars, x='year', ax=ax)
# Rotate the year labels so they stay readable on the dense x axis.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Year')
ax.set_ylabel('Number of offers')

2016 was the year with the most offers.

## Sales per manufacturer

In [None]:
# Horizontal count plot: manufacturers on the y axis (most frequent first),
# number of listings on the x axis.
plt.figure(figsize=(30,15))
sns.countplot(y='manufacturer', data=cars, order=cars['manufacturer'].value_counts().index)
# The labels were swapped: with y='manufacturer' the counts are on the x axis.
plt.xlabel('Number of offers')
plt.ylabel('Manufacturer')

Ford is the manufacturer with the most offers, followed by Chevrolet and Toyota.

## Evolution of type with year

In [None]:
# Box plot of `year` (x axis) for each vehicle `type` (y axis).
fig, ax = plt.subplots(figsize=(30, 15))
sns.boxplot(data=cars, x='year', y='type', ax=ax)

## Evolution of paint color with years

In [None]:
# Number of listings for each paint colour.
cars.paint_color.value_counts()

In [None]:
# Count plot of paint colours, ordered from most to least frequent.
color_order = cars['paint_color'].value_counts().index
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=cars, x='paint_color', order=color_order, ax=ax)

We can explore the evolution of car's color with year:

In [None]:
# Paint colour counts within each year.
cars.groupby('year').paint_color.value_counts()

We can create a pivot table with the paint_color and year columns, aggregated with the count:

In [None]:
# Count listings for every (year, paint_color) pair: rows are years, columns
# are colours, and pairs that never occur are NaN.
# The previous pivot_table call passed 'paint_color' as both `values` and
# `columns`, leaving only grouping keys to aggregate; counting group sizes
# directly avoids relying on that.
reduced_cars_year_color = cars[['paint_color', 'year']]
table2 = (
    reduced_cars_year_color
    .groupby(['year', 'paint_color'])
    .size()
    .unstack('paint_color')
)

Heatmap using seaborn for the year and paint_color:

In [None]:
# Annotated heatmap of the year x paint_color count table.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(table2, annot=True, fmt='g', ax=ax)

## Exploring the car's condition

The list of the different conditions for the cars dataset:

In [None]:
# Replace missing conditions with an explicit 'Unknown' category.
# `assign` builds a new frame instead of calling `fillna(inplace=True)` on a
# column of the filtered frame, which is chained assignment (deprecated, and a
# no-op under pandas Copy-on-Write).
cars = cars.assign(condition=cars['condition'].fillna('Unknown'))
cars.condition.unique()

And the list of car manufacturers:

In [None]:
# Replace missing manufacturers with an explicit 'Unknown' category.
# `assign` builds a new frame instead of calling `fillna(inplace=True)` on a
# column of the filtered frame, which is chained assignment (deprecated, and a
# no-op under pandas Copy-on-Write).
cars = cars.assign(manufacturer=cars['manufacturer'].fillna('Unknown'))
cars.manufacturer.unique()

We plot a histogram (count plot) of the cars' condition:

In [None]:
# Count plot of listing conditions, ordered from most to least frequent.
condition_order = cars['condition'].value_counts().index
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=cars, x='condition', order=condition_order, ax=ax)
ax.set_xlabel('Condition')
ax.set_ylabel('Number of cars')

Let's create a pivot table with the condition and manufacturer columns:

In [None]:
# Count listings for every (manufacturer, condition) pair: rows are
# manufacturers, columns are conditions, and pairs that never occur are NaN.
# The previous pivot_table call passed 'condition' as both `values` and
# `columns`, leaving only grouping keys to aggregate; counting group sizes
# directly avoids relying on that.
reduced_cars = cars[['condition', 'manufacturer']]
table = (
    reduced_cars
    .groupby(['manufacturer', 'condition'])
    .size()
    .unstack('condition')
)

Using seaborn's heatmap function, create a heatmap for the condition and manufacturer features with the annotated count in each box:

In [None]:
# Annotated heatmap of the manufacturer x condition count table.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(table, annot=True, fmt='g', ax=ax)

# ML

## Preparing the dataset

In [None]:
# Check dtypes and non-null counts before preparing the data for modelling.
cars.info()

Let's drop rows with a NaN value:

In [None]:
# Drop every row that still has a missing value. Rebinding the result avoids
# mutating, in place, a frame that was derived from a filtered slice.
cars = cars.dropna()
cars.info()

In [None]:
# Descriptive statistics after removing incomplete rows.
cars.describe()

Now we encode the feature columns (year, drive, odometer, manufacturer, model, condition, cylinders, fuel, type, paint_color, transmission) using OrdinalEncoder, so that they can be fed to the ML algorithms:

In [None]:
# Replace the listed columns with their ordinal codes.
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
# NOTE(review): despite its name this list also contains 'year' and 'odometer',
# which were treated as numeric earlier (median imputation); confirm that
# ordinal-encoding them is intended. The list is reused later as the feature list.
categorical_columns=['year', 'drive', 'odometer', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel', 'type', 'paint_color', 'transmission']
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook.
cars[categorical_columns] = ordinal_encoder.fit_transform(cars[categorical_columns])
cars.info()

Next, we look at the standard correlation coefficient (Pearson's r) for every pair of attributes, and specifically at the 'price' column:

In [None]:
# Pairwise correlation matrix of the encoded frame; show the 'price' column,
# i.e. the correlation of every attribute with the price.
corr_matrix = cars.corr()
corr_matrix['price']

Now we are ready to create the train and test sets. We first select y (the 'price' column) and X (the encoded feature columns), and then split both into train and test sets using train_test_split from sklearn:

In [None]:
# Separate the target (price) from the encoded feature columns and hold out
# 20% of the rows as a test set. random_state is fixed for a repeatable split.
from sklearn.model_selection import train_test_split
cars_y = cars['price']
cars_X = cars[categorical_columns]
cars_X_train, cars_X_test, cars_y_train, cars_y_test = train_test_split(cars_X, cars_y, test_size=0.2, random_state=42)

As the input features (the ordinal-encoded columns) have very different scales, we use StandardScaler to bring all attributes to the same scale. Standardization works by subtracting the mean value (so standardized values always have zero mean) and then dividing by the standard deviation, so the resulting distribution has unit variance.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training set only and
# then applied unchanged to the test set: refitting on the test set
# (fit_transform) would scale it with different means/variances and leak
# test-set statistics into the evaluation.
scaler = StandardScaler()
# Keep the original row index so the features stay aligned with the targets.
cars_X_train = pd.DataFrame(
    scaler.fit_transform(cars_X_train),
    columns=cars_X_train.columns,
    index=cars_X_train.index,
)
cars_X_test = pd.DataFrame(
    scaler.transform(cars_X_test),
    columns=cars_X_test.columns,
    index=cars_X_test.index,
)

In [None]:
# Preview the scaled training features.
cars_X_train.head()

## Linear Regression model

We will start by training a Linear Regression model, feeding it with the train set:

In [None]:
# Fit a linear regression on the scaled training features and training prices.
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
lin_reg.fit(cars_X_train, cars_y_train)

Once the model is trained with the train set, let's predict the labels of the X_train set, compute the RMSE of the model and display it:

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the training set (the data it was fitted on).
predictions = lin_reg.predict(cars_X_train)
lin_mse = mean_squared_error(cars_y_train, predictions)
lin_rmse = np.sqrt(lin_mse)
# Display the value, as the accompanying text promises.
lin_rmse

## Decision Tree Regressor model

Now we try a Decision Tree Regressor model, which is a powerful model capable of finding complex nonlinear relationships in the data:

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit an unconstrained decision tree on the training set. The random_state is
# fixed (same seed as the train/test split) so repeated runs build the same
# tree and report the same scores.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(cars_X_train, cars_y_train)

Again, we predict the label values for the X_train dataset and compute the RMSE:

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the decision tree on the training set (the data it was fitted on).
tree_predictions = tree_reg.predict(cars_X_train)
tree_mse = mean_squared_error(y_true=cars_y_train, y_pred=tree_predictions)
tree_rmse = np.sqrt(tree_mse)

## Cross-Validation with Decision Tree Regressor model

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, cars_X_train, cars_y_train, scoring='neg_mean_squared_error', cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_score(scores):
    """Print the scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    """
    summary = (
        ('Scores:', scores),
        ('Mean:', scores.mean()),
        ('Standard deviation:', scores.std()),
    )
    for label, value in summary:
        print(label, value)
# Summarise the cross-validated RMSE values of the decision tree.
display_score(tree_rmse_scores)

## Cross-Validation with Linear Regressor

In [None]:
# Cross-validation of the linear regression on the training set.
# NOTE(review): this uses cv=30 while the decision tree above was evaluated
# with cv=10 — confirm the different fold count is intentional before
# comparing the two sets of scores.
scores = cross_val_score(lin_reg, cars_X_train, cars_y_train, scoring='neg_mean_squared_error', cv=30)
# The scorer returns negated MSE values, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(-scores)
display_score(lin_rmse_scores)

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training set. The forest is stochastic, so the
# random_state is fixed (same seed as the train/test split) to make the
# reported RMSE reproducible between runs.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(cars_X_train, cars_y_train)

In [None]:
# RMSE of the random forest on the training set (the data it was fitted on).
forest_predictions = forest_reg.predict(cars_X_train)
forest_mse = mean_squared_error(y_true=cars_y_train, y_pred=forest_predictions)
forest_rmse = np.sqrt(forest_mse)

## Comparing the RMSE

In [None]:
# Print the RMSE computed for each of the three models, one per line.
print("Computed RMSE's for the different models:")
for label, rmse in (
    ('Linear Regression Model:', lin_rmse),
    ('Decision Tree Regressor Model:', tree_rmse),
    ('Random Forest Regressor Model:', forest_rmse),
):
    print(label, rmse)

## Tuning the Decision Tree Model

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: every combination of tree depth and number of features
# considered per split is evaluated with 5-fold cross-validation.
param_grid = [
    {'max_depth': [2, 4, 6, 8, 10], 'max_features': [2, 3, 4]}
]
# max_features < n_features makes each split sample features at random, so the
# seed is fixed to keep the search results reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(tree_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(cars_X_train, cars_y_train)

In [None]:
# Cross-validation results recorded by the grid search.
grid_search.cv_results_

In [None]:
# Best estimator selected by the grid search.
grid_search.best_estimator_

In [None]:
from sklearn.metrics import mean_squared_error

# Evaluate the tuned tree on the training set. `tree_reg` was re-created just
# before the grid search and GridSearchCV only fits clones of it, so
# `tree_reg` itself is unfitted and calling predict on it raises
# NotFittedError. The refitted best estimator is the model to evaluate.
tuned_tree = grid_search.best_estimator_
tree_predictions = tuned_tree.predict(cars_X_train)
tree_mse = mean_squared_error(cars_y_train, tree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

# This kernel is still WIP. Any comments aimed at improving it would be very helpful.