In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the car listings CSV into a DataFrame; every later cell works on `data`.
data = pd.read_csv("../input/cars-moldova/cars.csv")

First we get rid of potential duplicates. Some people post their cars multiple times.

In [None]:
# Count the rows that are exact repeats of an earlier row.
data.duplicated().sum()

So there are quite a lot of them.

In [None]:
# Keep only the first occurrence of each repeated row, then show the last rows
# (their index labels still carry the numbering from before the drop).
data = data.drop_duplicates(keep="first")
data.tail(5)

In [None]:
# Print the column dtypes, non-null counts and index range of the frame.
data.info()

After dropping the duplicates we need to reset the index: the last entry still has index 41008, but there are only 37264 entries.

In [None]:
# Renumber the rows from 0 and discard the old index labels (drop=True keeps
# them from being added back as a column), then show the last rows.
data = data.reset_index(drop=True)
data.tail()

In [None]:
# Summary statistics of the numeric columns, used to spot implausible values.
data.describe()

The minimum values for Distance, Engine capacity and Price look suspicious. Some sellers may have been reluctant to state the real values, so a little cleaning needs to be done.

Let's start with the distance. It is implausible for a car manufactured before 2021 to have, let's say, less than 1000 km on it. So we should drop all the cars that have less than 1000 km and were made before 2021.

In [None]:
# Listings made before 2021 that report a distance below 1000.
made_before_2021 = data["Year"] < 2021
under_1000 = data["Distance"] < 1000
question_dist = data.loc[made_before_2021 & under_1000]
question_dist.describe()

We can see that there are quite a lot of cars that need to be dropped. Also, the 75th percentile of Year is 2010, so three quarters of these cars were made in 2010 or earlier and supposedly have less than 1000 km. That is unlikely, so this drop seems necessary.

In [None]:
# Remove the implausible low-distance rows and renumber the index in one chain.
data = data.drop(index=question_dist.index).reset_index(drop=True)

Now the engine capacity. It may differ by country, but where I live the smallest engine capacity for a car is 200 cm³, so all cars with less than that will be dropped.

In [None]:
# Listings whose declared engine capacity is below 200 cm3.
engine_too_small = data["Engine_capacity(cm3)"] < 200
question_engine = data.loc[engine_too_small]
question_engine.describe()

Some sellers might have been lazy and written 100 instead of 1000, or 150 instead of 1500, so it might make sense to multiply values >= 20 by 10 so that they respect the legal definition of a car. But the number of cars that will be dropped is too low to matter, and my supposition might be false. So just drop them.

In [None]:
# Remove the rows with an engine capacity under 200 cm3, renumber the index,
# and display the resulting frame.
data = data.drop(index=question_engine.index).reset_index(drop=True)
data

Now the price. It's quite common practice to set its value to 1 euro so that the buyer has to call and the price can then be negotiated. Also, it seems very unlikely for a car to be sold for 100 euros or less.

In [None]:
# Listings priced below 101 euros, i.e. placeholder or implausibly low prices.
price_too_low = data["Price(euro)"] < 101
question_price = data.loc[price_too_low]
question_price.describe()

All the quantiles seem to be more or less random numbers. And there are just 10 cars in this subset. Drop them!

In [None]:
# Remove the suspiciously cheap listings, renumber the index, and re-check the
# summary statistics.
data = data.drop(index=question_price.index).reset_index(drop=True)
data.describe()

The last thing that bothers me is the minimum year.

In [None]:
# Show the ten listings with the earliest manufacturing year.
oldest_first = data.sort_values(by="Year", ascending=True)
oldest_first.head(10)

Wow. I was thinking that these were collector cars. To choose a cut-off year below which cars are dropped, I should think about how this data will be used. It might be used to predict the price of a car a person wants to buy — so a typical car, not an old one. Let's leave that task to the experts. I think 1980 is a reasonable cut-off.

In [None]:
# Listings manufactured before the 1980 cut-off.
made_before_1980 = data["Year"] < 1980
question_year = data.loc[made_before_1980]
question_year.describe()

Only 94 cars out of approx. 33000. No big deal.

In [None]:
# Remove the pre-1980 listings, renumber the index, and re-check the summary
# statistics.
data = data.drop(index=question_year.index).reset_index(drop=True)
data.describe()

This looks better.

In [None]:
%matplotlib inline
import matplotlib as plt

hist1 = data.hist(column = ["Year"])

In [None]:
# Histogram of the Distance column.
hist2 = data.hist(column = ["Distance"])

In [None]:
# Histogram of the engine capacity column.
hist3 = data.hist(column=["Engine_capacity(cm3)"])

The Year feature is skewed to the left, so we can raise it to the power of 2 (or a higher power) to make it more symmetric. Engine capacity and distance are skewed to the right, so in our transformation pipeline I will log-transform them. Also, creating a distance-per-year metric seems to make sense and might be helpful.

In [None]:
# Separate the target (price) from the predictor columns.
y = data["Price(euro)"]
X = data.drop(columns=["Price(euro)"])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 3000 listings for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=3000, random_state=42
)

In [None]:
from sklearn.base import TransformerMixin, BaseEstimator

# Custom transformer to create the distance-per-year column.
class Dist_year(TransformerMixin, BaseEstimator):
    """Append a ``Km_year`` column: distance divided by the car's age in years.

    Parameters
    ----------
    current_year : int, default=2022
        Reference year; a car's age is ``current_year - Year``, clamped to a
        minimum of one year.
    """

    def __init__(self, current_year=2022):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params / set_params / clone keep working.
        self.current_year = current_year

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame equal to ``X`` plus a ``Km_year`` column.

        ``X`` must be a DataFrame with ``Distance`` and ``Year`` columns.
        """
        # Clamp the age to at least 1: a car made in `current_year` would
        # otherwise divide by zero (inf), and a later year would give a
        # negative ratio; both break the downstream log transform and scaler.
        age_years = (self.current_year - X["Year"]).clip(lower=1)
        # assign() returns a new frame and simply overwrites Km_year if it is
        # already present, so applying the transformer twice does not raise.
        return X.assign(Km_year=X["Distance"] / age_years)

In [None]:
# Custom transformer to raise its input to a power.
class Power(TransformerMixin, BaseEstimator):
    """Raise every value of the input to a fixed power.

    Parameters
    ----------
    power : int or float, default=3
        Exponent applied element-wise. The default reproduces the original
        cubic transform; exposing it lets a parameter search tune it.
    """

    def __init__(self, power=3):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params / set_params / clone keep working.
        self.power = power

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` raised element-wise to ``self.power``."""
        return X ** self.power

In [None]:
# Numeric columns, excluding Year.
num_cols = [
    "Distance",
    "Engine_capacity(cm3)",
    "Km_year",
]

# Categorical columns.
cat_cols = [
    "Make",
    "Model",
    "Fuel_type",
    "Transmission",
    "Style",
]

In [None]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder

# log1p followed by standard scaling.
num_pipeline = Pipeline([
    ("LogTransform", FunctionTransformer(np.log1p)),
    ("Scaler", StandardScaler())
])

# Power transform followed by standard scaling.
num_pipeline_year = Pipeline([
    ("Power", Power()),
    ("Scaler", StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back
# to the legacy one so the notebook runs on both old and new versions.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

# Dense one-hot encoding; categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ("OHE", one_hot_encoder)
])

In [None]:
# Route each group of columns to its own sub-pipeline; any column not listed
# is passed through untouched.
column_steps = [
    ("NumTrans", num_pipeline, num_cols),
    ("NumTransYear", num_pipeline_year, ["Year"]),
    ("CatTrans", cat_pipeline, cat_cols),
]
preprocessing_pipe = ColumnTransformer(transformers=column_steps, remainder="passthrough")

# Full preprocessing: first add the Km_year column, then apply the per-column
# transformations.
final_pipe = Pipeline(steps=[
    ("DistYear", Dist_year()),
    ("ColTransLog", preprocessing_pipe),
])

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Preprocessing followed by a k-nearest-neighbours regressor.
final_pipe_for_real = Pipeline(steps=[
    ("preprocessing", final_pipe),
    ("model", KNeighborsRegressor(n_jobs=-1)),
])

# Candidate neighbour counts: 10, 15, ..., 80.
nr_neigh = [k for k in range(10, 81, 5)]

params = dict(
    model__n_neighbors=nr_neigh,
    model__weights=["uniform", "distance"],
)

# Exhaustive search over the grid, scored by negated MSE with 10-fold CV.
grid = GridSearchCV(
    estimator=final_pipe_for_real,
    param_grid=params,
    cv=10,
    scoring="neg_mean_squared_error",
)

In [None]:
# Run the cross-validated grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Build a dataframe with one row per grid point: the square root of the mean
# cross-validated MSE, plus the weights and neighbour count that produced it.
cv_results = grid.cv_results_
test_scores = [np.sqrt(-1 * score) for score in cv_results["mean_test_score"]]
weights = [combo["model__weights"] for combo in cv_results["params"]]
neigh = [combo["model__n_neighbors"] for combo in cv_results["params"]]

metric = pd.DataFrame({"test_scores": test_scores, "weights": weights, "neighbors": neigh})


In [None]:
# Cross-validated error against the number of neighbours, one line per
# weighting scheme.
sns.lineplot(data=metric, x="neighbors", y="test_scores", hue="weights")

In [None]:
from sklearn.metrics import mean_squared_error

# RMSE of the best estimator found by the grid search on the training split.
train_predictions = grid.predict(X_train)
# mean_squared_error expects (y_true, y_pred); pass them in that order.
train_erorr = np.sqrt(mean_squared_error(y_train, train_predictions))
train_erorr

In [None]:
# RMSE of the best estimator found by the grid search on the held-out split.
test_predictions = grid.predict(X_test)
# mean_squared_error expects (y_true, y_pred); pass them in that order.
test_erorr = np.sqrt(mean_squared_error(y_test, test_predictions))
test_erorr

This is the best result a KNeighbors model can get.