In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. EDA

In [None]:
# Read skoda.csv from the Kaggle input directory into a DataFrame and preview
# the first rows.
data = pd.read_csv('../input/used-car-dataset-ford-and-mercedes/skoda.csv')
data.head()

In [None]:
data.info()

In [None]:
data["model"].value_counts()

In [None]:
data.describe()

In [None]:
import matplotlib.pyplot as plt

In [None]:
data.hist(bins=20, figsize=(20,15));

In [None]:
from sklearn.model_selection import train_test_split

# 2. Train/test sample & stratification

In [None]:
# Plain random 80/20 split with a fixed seed.
# NOTE(review): train_set / test_set are not referenced again in the visible cells;
# the stratified split built further down is what the rest of the notebook uses.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Bucket the continuous `tax` column into five ordered categories (labels 1-5) that
# can be used as a stratification key. pd.cut bins are right-inclusive by default,
# so the -1 lower edge keeps tax == 0 inside the first bin; np.inf leaves the last
# bin open-ended. Note that this adds a column to `data` in place.
data["tax_cat"] = pd.cut(data["tax"], bins=[-1, 65, 130, 195, 260, np.inf], labels=[1, 2, 3, 4, 5])

In [None]:
data.sample(5)

In [None]:
data["tax_cat"].hist();

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split so that train and test keep the tax_cat proportions
# of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row indices, so rows must be selected with .iloc.
# Label-based .loc only happens to work while `data` has the default 0..n-1 index
# and would pick the wrong rows (or raise) after any filtering or re-indexing.
train_index, test_index = next(split.split(data, data["tax_cat"]))

# .copy() makes both sets independent frames, so later column drops or additions
# neither touch `data` nor trigger SettingWithCopyWarning.
strat_train_set = data.iloc[train_index].copy()
strat_test_set = data.iloc[test_index].copy()

In [None]:
# Share of each tax category in the stratified test set.
strat_test_set["tax_cat"].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper column that was only needed for stratification.
# Rebinding to new frames (instead of inplace=True on frames sliced out of `data`)
# avoids SettingWithCopyWarning, and errors="ignore" keeps the cell safe to re-run:
# the in-place version raised KeyError on a second execution.
strat_train_set = strat_train_set.drop(columns="tax_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="tax_cat", errors="ignore")

# 3. Visualisation

In [None]:
data_train = strat_train_set.copy()

In [None]:
import seaborn as sns

# Bar chart of mileage per car model on the training copy.
# NOTE(review): catplot(kind="bar") aggregates each group (mean by default per the
# seaborn docs) - confirm this is the intended statistic.
sns.catplot(kind="bar", x="model", y="mileage", data=data_train, alpha=.25, height=8, palette="bright")
# Rotate the model names so the x tick labels do not overlap.
plt.xticks(rotation=70)
plt.tight_layout();

# 4. Correlations

In [None]:
# Pairwise correlations between the numeric columns only. data_train still holds
# the string columns model / transmission / fuelType; since pandas 2.0 DataFrame.corr()
# no longer drops those silently and raises instead. select_dtypes works on every
# pandas version (the numeric_only keyword of corr() only exists from 1.5 on).
corr_matrix = data_train.select_dtypes(include="number").corr()

In [None]:
corr_matrix["price"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["price", "year", "engineSize", "tax", "mpg", "mileage"]
scatter_matrix(data_train[attributes], figsize=(16,10));

In [None]:
data_train.plot(kind="scatter", x="year", y="price", alpha=0.1);

In [None]:
data_train.head(2)

In [None]:
# Derived feature: average mileage per year of age, measured against a fixed
# reference year (named here instead of being a magic number in the formula).
REFERENCE_YEAR = 2021
car_age = REFERENCE_YEAR - data_train["year"]

# A car registered in (or after) the reference year has age <= 0, which would give
# inf or a negative rate. Mask those ages so the ratio becomes NaN instead; rows
# with a positive age are computed exactly as before.
data_train["miles_per_year"] = data_train["mileage"] / car_age.where(car_age > 0)

In [None]:
data_train.head(2)

In [None]:
# Recompute the correlations now that miles_per_year has been added. Restrict to
# numeric columns: with the string columns (model / transmission / fuelType) still
# present, DataFrame.corr() raises on pandas >= 2.0 instead of ignoring them.
corr_matrix = data_train.select_dtypes(include="number").corr()

In [None]:
corr_matrix["price"].sort_values(ascending=False)

In [None]:
data_train.plot(kind="scatter", x="miles_per_year", y="price", alpha=0.1);

# 5. Data Cleaning

In [None]:
# Start again from the untouched stratified training set and separate the target
# (price) from the predictors.
data_train_labels = strat_train_set["price"].copy()
data_train = strat_train_set.drop(columns=["price"])

In [None]:
data_train.info()

The dataset is complete: no column has missing values, so no imputation is needed.

### 5.1 Creating numbers for categories

In [None]:
data_train_cat = data_train[["model", "transmission", "fuelType"]]
data_train_cat.sample(10)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each category by an integer code, column by column, and show the first
# five encoded rows. The fitted encoder is kept so its categories can be inspected.
ordinal_encoder = OrdinalEncoder()
data_train_cat_encoded = ordinal_encoder.fit_transform(data_train_cat)
data_train_cat_encoded[:5]

In [None]:
ordinal_encoder.categories_

### 5.2 Use OneHotEncoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One binary column per category value. With default settings the result is a
# SciPy sparse matrix (it is converted with .toarray() in a later cell).
cat_encoder = OneHotEncoder()
data_train_cat_1hot = cat_encoder.fit_transform(data_train_cat)
data_train_cat_1hot

In [None]:
data_train_cat_1hot.toarray()

### 5.3 Using a pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

cat_attribs = ["model", "transmission", "fuelType"]

# ColumnTransformer drops every column it is not told about (remainder="drop" is
# the default), so listing only the categorical columns silently discarded year,
# mileage, tax, mpg and engineSize - the models never saw them. Route the numeric
# columns through the pipeline as well, standardised.
num_attribs = list(data_train.select_dtypes(include="number").columns)

full_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attribs),
    # handle_unknown="ignore": a category that only occurs outside the training
    # rows is encoded as all zeros instead of raising during transform().
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# NOTE: adding the numeric features changes every score computed from
# data_train_prepared; re-run the cells below after applying this.
data_train_prepared = full_pipeline.fit_transform(data_train)

# 6. Run regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression fitted on the prepared training matrix with
# price as the target.
lin_reg = LinearRegression()
lin_reg.fit(data_train_prepared, data_train_labels)

### 6.1 Trial on some instances of the training set

In [None]:
# Spot check: run the first few training rows through the fitted pipeline and
# print the model's predictions next to the true prices.
n_preview = 5
some_data = data_train.head(n_preview)
some_labels = data_train_labels.head(n_preview)
some_data_prepared = full_pipeline.transform(some_data)

some_predictions = lin_reg.predict(some_data_prepared)
print("Predictions: ", some_predictions)
print("Labels: ", list(some_labels))

# 7. Measuring prediction error with RMSE

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the linear model on the training data. The model is scored on the same
# rows it was fitted on, so this is an optimistic estimate of its error.
data_predictions = lin_reg.predict(data_train_prepared)
lin_mse = mean_squared_error(data_train_labels, data_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

# 8. Use a different Model (Decision Tree)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fixed random_state (same seed as the data splits) so the fitted tree - and every
# score derived from it - is identical on each Restart & Run All.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(data_train_prepared, data_train_labels)

In [None]:
# RMSE of the decision tree on the training data (same rows it was fitted on).
# The MSE gets its own name, mirroring lin_mse / lin_rmse, so tree_rmse never
# temporarily holds a squared error.
data_predictions = tree_reg.predict(data_train_prepared)
tree_mse = mean_squared_error(data_train_labels, data_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

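#### => Looks like a better model, as the RMSE is lower than with linear regression (note: this is measured on the training set, so it may simply reflect overfitting; the cross-validation below checks this).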

# 9. Split the training set in a smaller training set and validation set (K-fold cross-validation)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the decision tree. The scorer returns *negative* MSE
# (scikit-learn scorers follow "higher is better"), hence the sign flip before
# taking the square root to get one RMSE per fold.
scores = cross_val_score(tree_reg, data_train_prepared, data_train_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

### Results:

In [None]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array of
    per-fold RMSE values). Nothing is returned; output goes to stdout.
    """
    print(f"Scores: {scores}")
    summary = (
        ("Mean: ", scores.mean()),
        ("Standard deviation: ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

display_scores(tree_rmse_scores)

### 9.1 Comparing these results to the Linear Regression model

In [None]:
# Same 10-fold cross-validation for the linear model; negate the negative-MSE
# scores before the square root to obtain per-fold RMSE.
lin_scores = cross_val_score(lin_reg, data_train_prepared, data_train_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

display_scores(lin_rmse_scores)

# 10. Use a different Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forests bootstrap rows and sample features at random; without a seed the
# scores change on every run. Use the same seed as the data splits.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(data_train_prepared, data_train_labels)

In [None]:
# 10-fold cross-validation of the random forest; scores are negative MSE, so
# negate them before the square root to obtain per-fold RMSE.
forest_scores = cross_val_score(forest_reg, data_train_prepared, data_train_labels, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

display_scores(forest_rmse_scores)

# 11. Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrapping switched off.
param_grid = [
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]},
]

# Seed the forest: with an unseeded estimator the ranking of parameter
# combinations (and therefore best_params_) can differ from run to run, because
# differences between candidates may be smaller than the run-to-run noise.
forest_reg = RandomForestRegressor(random_state=42)

# 5-fold CV per combination, scored by negative MSE (higher is better).
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error", return_train_score=True)

grid_search.fit(data_train_prepared, data_train_labels)

In [None]:
grid_search.best_params_

### 11.1 Get best estimator directly

In [None]:
grid_search.best_estimator_

In [None]:
# List the cross-validated RMSE of every parameter combination that was tried.
# mean_test_score holds negative MSE, so flip the sign before the square root.
cvres = grid_search.cv_results_
mean_scores = cvres["mean_test_score"]
for position, params in enumerate(cvres["params"]):
    print(np.sqrt(-mean_scores[position]), params)

# 12. Selecting the best model

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

# Build one label per column that full_pipeline emits, in output order, by walking
# its fitted transformers. The previous hand-written list ("miles_per_year" plus
# only the `model` categories) did not correspond to the pipeline's columns, so
# zip() truncated silently and importances were paired with the wrong names.
cat_encoder = full_pipeline.named_transformers_["cat"]  # kept for interactive inspection
attributes = []
for _name, fitted, columns in full_pipeline.transformers_:
    if isinstance(fitted, str) and fitted == "drop":
        continue  # dropped columns contribute nothing to the output matrix
    if hasattr(fitted, "categories_"):
        # One-hot encoder: one output column per (input column, category) pair.
        attributes.extend(
            f"{column}={category}"
            for column, categories in zip(columns, fitted.categories_)
            for category in categories
        )
    else:
        # Transformers that emit exactly one output column per input column.
        attributes.extend(str(column) for column in columns)

# Fail loudly instead of letting zip() hide a length mismatch.
assert len(attributes) == len(feature_importances), (len(attributes), len(feature_importances))
sorted(zip(feature_importances, attributes), reverse=True)

# 13. Evaluation on the test set

In [None]:
# Hold-out evaluation of the tuned model on the stratified test set.
final_model = grid_search.best_estimator_

y_test = strat_test_set["price"].copy()
X_test = strat_test_set.drop(columns=["price"])

# transform() only: the pipeline keeps what it learned from the training data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
final_rmse

### 13.1 How confident are we in generalising?

In [None]:
from scipy import stats

# 95% t-interval for the mean squared error on the test set; taking the square
# root of both bounds turns it into an interval for the RMSE.
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
degrees_of_freedom = len(squared_errors) - 1
mse_interval = stats.t.interval(
    confidence,
    degrees_of_freedom,
    loc=squared_errors.mean(),
    scale=stats.sem(squared_errors),
)
np.sqrt(mse_interval)

# 14. Result

### Based on the test set, the model predicts prices for Skoda cars with an RMSE of ~3,085, i.e. its predictions are off by about 22%.