In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory
for path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. EDA

In [None]:
# Load the Ford listings into a DataFrame and preview the first rows
data = pd.read_csv(
    filepath_or_buffer='../input/used-car-dataset-ford-and-mercedes/ford.csv',
)
data.head()

In [None]:
# Column dtypes and non-null counts: a quick check for missing values
data.info()

In [None]:
# Number of listings per car model (bracket access avoids clashing with DataFrame attributes)
data["model"].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

# 2. Stratified sampling

### 2.1 Alternative 1

In [None]:
from sklearn.model_selection import train_test_split

# Bucket the continuous `tax` column into five ordinal categories that can be
# used as stratification labels. pd.cut intervals are right-inclusive by
# default, so the -1 lower edge keeps tax == 0 inside category 1.
data["tax_cat"] = pd.cut(
    data["tax"],
    bins=[-1, 116, 232, 348, 464, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Purely random 80/20 split. It is taken only after `tax_cat` exists so that
# train_set/test_set carry the category column and can be compared per category.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

data.sample(5)

In [None]:
# Number of rows in each tax category, ordered by category label
data["tax_cat"].value_counts().sort_index()

### 2.2 Alternative 2

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single 80/20 split that preserves the tax_cat proportions in both parts
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) index pair
train_index, test_index = next(split.split(data, data["tax_cat"]))

# split() returns *positional* row indices, so select with .iloc. Label-based
# .loc only gives the same rows while `data` has its default 0..n-1 index.
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [None]:
# Result Alternative 2: share of each tax category within the stratified test set
strat_test_set["tax_cat"].value_counts().sort_index() / len(strat_test_set)

In [None]:
# Result Alternative 1: share of each tax category in the full dataset.
# No split is involved here; these are the reference proportions a split should reproduce.
data["tax_cat"].value_counts().sort_index() / len(data)

### 2.3 Compare stratification results to default

In [None]:
def tax_cat_proportions(test_values):
    """Return the share of rows that fall into each ``tax_cat`` category.

    Parameters
    ----------
    test_values : pandas.DataFrame
        Frame containing a ``tax_cat`` column.

    Returns
    -------
    pandas.Series
        Indexed by category; each value is that category's row count divided
        by the total number of rows in ``test_values``.
    """
    category_counts = test_values["tax_cat"].value_counts()
    n_rows = len(test_values)
    return category_counts / n_rows

# Baseline: a purely random 80/20 split to compare against the stratified one
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# One column of category proportions per sample, aligned on the category label
compare_props = pd.DataFrame({
    "Overall": tax_cat_proportions(data),
    "Stratified": tax_cat_proportions(strat_test_set),
    "Random": tax_cat_proportions(test_set),
}).sort_index()

# Relative deviation (in percent) of each test set from the overall proportions
for source_col, error_col in (("Random", "Rand. %error"), ("Stratified", "Strat. %error")):
    compare_props[error_col] = 100 * compare_props[source_col] / compare_props["Overall"] - 100

In [None]:
# Show the comparison table; the "%error" columns are percentage deviations from "Overall"
compare_props

In [None]:
# tax_cat was only needed to build the stratified split, so remove it from both
# sets before modelling. Reassigning (instead of an in-place drop) and using
# errors="ignore" keeps this cell safe to re-run: a second run is a no-op
# rather than a KeyError on the already-removed column.
strat_train_set = strat_train_set.drop(columns="tax_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="tax_cat", errors="ignore")

# 3. Encoding Categorical and Numerical variables

In [None]:
# Separate the regression target (price) from the predictor columns
y_labels = strat_train_set["price"].copy()
x_predictors = strat_train_set.drop(columns="price")

# Predictors without the three string-valued columns
ford_num = x_predictors.drop(columns=["model", "transmission", "fuelType"])

In [None]:
# Preprocessing for NUMERICAL columns: a single standardisation step
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline(steps=[("std_scaler", StandardScaler())])

In [None]:
# One-hot encode the CATEGORICAL columns on their own and inspect the result
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
ford_cat = x_predictors[["model", "transmission", "fuelType"]]
ford_cat_1hot = cat_encoder.fit_transform(ford_cat)
ford_cat_1hot

In [None]:
# Do all at once (i.e. NUMERICAL/CATEGORICAL variables)
from sklearn.compose import ColumnTransformer

num_attribs = list(ford_num)  # column names of the numeric predictors
cat_attribs = ["model", "transmission", "fuelType"]

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    # handle_unknown="ignore": a category that appears only in the test set is
    # encoded as all zeros instead of making transform() raise. Training output
    # is unchanged because every training category is known at fit time.
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])

# Fit the scaler/encoder on the training predictors and transform them in one go
ford_prepared = full_pipeline.fit_transform(x_predictors)

# 4. Using a Support Vector Machine (SVM) regressor

In [None]:
from sklearn.svm import SVR

# Baseline model: support vector regressor with a linear kernel
svr_linear = SVR(C=100, kernel="linear")
svr_linear.fit(X=ford_prepared, y=y_labels)

### 4.1 Evaluation of results

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE on the same data the model was fitted on, so this is a training error
# and an optimistic estimate of how the model generalises
data_predictions = svr_linear.predict(ford_prepared)
svr_mse = mean_squared_error(y_true=y_labels, y_pred=data_predictions)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

### 4.2 Testing different kernels and hyperparameter combinations with GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Every kernel is combined with every C value (2 x 4 = 8 candidates).
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score() method.
parameters = {
    "kernel": ["linear", "rbf"],
    "C": [1, 10, 100, 1000],
}
svr_eval = SVR()
clf = GridSearchCV(estimator=svr_eval, param_grid=parameters)
clf.fit(ford_prepared, y_labels)

# Names of the result arrays collected during the search
sorted(clf.cv_results_)

In [None]:
# Hyperparameter combination with the best mean cross-validated score
clf.best_params_

In [None]:
# Estimator refitted with the best parameters (available because GridSearchCV was left at its default refit=True)
clf.best_estimator_

In [None]:
# Mean cross-validated score of the best combination. No `scoring` was given to
# GridSearchCV, so this is the estimator's default score (R^2 for SVR), not an error metric.
clf.best_score_

In [None]:
# Position of the best combination within the clf.cv_results_ arrays
clf.best_index_

### 4.3 Using optimised parameters

In [None]:
# Refit an SVR with the hyperparameters selected by the grid search. Reading
# them from clf.best_params_ (instead of hard-coding kernel/C) keeps this cell
# in sync with whatever the search actually found.
# NOTE: the variable keeps its historical name even though the selected kernel
# is not necessarily linear.
svr_linear_opt = SVR(**clf.best_params_)
svr_linear_opt.fit(ford_prepared, y_labels)

# Training-set RMSE of the tuned model (same data it was fitted on)
data_predictions = svr_linear_opt.predict(ford_prepared)
svr_mse = mean_squared_error(y_labels, data_predictions)
svr_rmse = np.sqrt(svr_mse)
svr_rmse

### 4.4 Overview of all combinations

In [None]:
# Print the mean cross-validated score of every hyperparameter combination.
# The grid search was run without a `scoring` argument, so mean_test_score is
# the estimator's default score (R^2 for SVR; higher is better, can be
# negative). It is not an MSE, so it is shown as-is: taking its square root
# would not give an RMSE and would produce NaN for negative scores.
clf_results = clf.cv_results_
for mean_score, params in zip(clf_results["mean_test_score"], clf_results["params"]):
    print(mean_score, params)


# 5. Evaluation on the test set

In [None]:
# Use the estimator selected by the grid search as the final model
final_model = clf.best_estimator_

# Separate target and predictors of the held-out stratified test set
y_test = strat_test_set["price"].copy()
X_test = strat_test_set.drop(columns="price")

In [None]:
# Apply the preprocessing fitted on the training predictors; transform() only
# (no fit), so the test set does not influence the scaling or the encoding
X_test_prepared = full_pipeline.transform(X_test)

In [None]:
# Predict prices for the held-out test set and measure the error
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

In [None]:
# Root mean squared error on the held-out test set (same units as price)
final_rmse

In [None]:
from scipy import stats

# 95% confidence interval for the test RMSE: build a t-interval around the
# mean squared error, then take the square root of both bounds
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(
    stats.t.interval(
        confidence,
        len(squared_errors) - 1,
        loc=squared_errors.mean(),
        scale=stats.sem(squared_errors),
    )
)

Based on the test set, the SVM model predicts Ford car prices with an RMSE of ~1,247, i.e. a typical error of roughly 10% of the average price.