In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Data and Preliminary Data Analysis

In [None]:
# Read the Toyota listings CSV into `df` and preview the first rows.
toyota_csv = '/kaggle/input/used-car-dataset-ford-and-mercedes/toyota.csv'
df = pd.read_csv(toyota_csv)
df.head()

In [None]:
# Display pandas' summary statistics for df as this cell's output.
df.describe()

In [None]:
# Print the DataFrame.info() report for df.
df.info()

In [None]:
# Column groups used by the plotting and preprocessing cells below.
# Columns treated as categorical features.
categ = [
    'model',
    'year',
    'transmission',
    'fuelType',
    'engineSize',
]
# Subset of `categ` that excludes 'year' and 'engineSize'; these are the
# columns later passed to the encoder.
str_categ = [col for col in categ if col not in ('year', 'engineSize')]
# Columns treated as numeric; 'price' is the regression target.
numer = [
    'price',
    'mileage',
    'tax',
    'mpg',
]

# Visualising Data

In [None]:
# One 50-bin histogram per column in `numer`, laid out row by row on a 2x2 grid.
fig2, axes2 = plt.subplots(nrows=2, ncols=2)
fig2.set_size_inches(18, 10)  # width 18, height 10
for panel, column in zip(axes2.flat, numer):
    panel.hist(df[column], bins=50)
    panel.set_title(column)

In [None]:
# Plot each column in `numer` against the row index, one panel per column
# on a 2x2 grid filled row by row.
fig2, axes2 = plt.subplots(nrows=2, ncols=2)
fig2.set_size_inches(18, 10)  # width 18, height 10
for idx, column in enumerate(numer):
    row, col = divmod(idx, 2)
    panel = axes2[row, col]
    panel.plot(df[column])
    panel.set_title(column)

In [None]:
# Bar charts showing how often each value of the categorical columns occurs,
# with the count written above every bar.
#
# Layout on a 4-row figure: 'model' and 'year' each take a full-width row,
# 'transmission' and 'fuelType' share the third row, 'engineSize' takes the last.
fig = plt.figure(figsize=(15,25))
ax1 = fig.add_subplot(411)
ax2 = fig.add_subplot(412)
ax3 = fig.add_subplot(425)
ax4 = fig.add_subplot(426)
ax5 = fig.add_subplot(414)

# (axes, column, title, x tick label rotation; None keeps the default rotation)
bar_panels = [
    (ax1, 'model', 'Model', 30),
    (ax2, 'year', 'Year', 60),
    (ax3, 'transmission', 'Transmission', None),
    (ax4, 'fuelType', 'Fuel Type', None),
    (ax5, 'engineSize', 'Engine Size', None),
]

# Vertical gap (in count units) between the top of a bar and its text label.
label_offset = 15

for ax, column, title, rotation in bar_panels:
    # Compute the counts once per column instead of once per bar.
    counts = df[column].value_counts()
    # Stringify the categories so numeric ones (year, engineSize) are drawn as
    # evenly spaced categorical bars; string categories are unchanged by str().
    categories = [str(value) for value in counts.index]
    heights = counts.tolist()

    ax.bar(categories, heights)
    ax.set_title(title)
    if rotation is not None:
        ax.tick_params(axis='x', labelrotation=rotation)

    # Categorical bars sit at x = 0, 1, 2, ..., so the bar position is the index.
    for position, height in enumerate(heights):
        ax.text(position, height + label_offset, height, ha='center', va='bottom')

In [None]:
# Annotated heatmap of the pairwise correlations between the columns in `numer`,
# with the colour scale pinned to [-1, 1].
corr_matrix = df[numer].corr()
plt.figure(figsize=(4, 4))
sns.heatmap(
    corr_matrix,
    vmin=-1,
    vmax=1,
    cmap='vlag',
    annot=True,
    linewidths=1,
);

# Preprocessing and Modeling

**Using OrdinalEncoder for categorical features with "str" type**

In [None]:
import sklearn
from sklearn.preprocessing import OrdinalEncoder

# Build `df_oe`: a copy of `df` in which the columns listed in `str_categ`
# are replaced by the output of an OrdinalEncoder fitted on those same columns.
# `df` itself is left untouched.
oe = OrdinalEncoder()
df_oe = df.copy()
df_oe[str_categ] = oe.fit_transform(df[str_categ])

**Split Data**

In [None]:
from sklearn.model_selection import train_test_split

# Target is 'price'; every other column of df_oe is a feature.
# 33% of the rows are held out for testing, with a fixed seed.
y = df_oe['price']
X = df_oe.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

**Modeling.**
**First set of models on data with OrdinalEncoder for categorical features**

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lars, Lasso

# Linear models, all constructed with their default hyperparameters.
linreg = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('LARS', Lars()),
    ('LASSO', Lasso()),
]

# Fit each model on the training split and report RMSE and R^2 on the test split.
for label, estimator in linreg:
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = estimator.score(X_test, y_test)
    print(f'{label} RMSE: {round(rmse,3)}')
    print(f'       R^2: {round(r2,3)}\n')

**Second set of models on data with OrdinalEncoder for categorical features**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

# Non-linear regressors with default hyperparameters.
# The random forest is seeded (same seed as the train/test split) so that its
# reported metrics are identical on every Restart & Run All.
regrmodel = [ ('KNeighborsReg', KNeighborsRegressor()),
              ('SVR', SVR()),
              ('RandomForestReg', RandomForestRegressor(random_state=42)),
             ]

# Fit each model on the training split and report RMSE and R^2 on the test split.
for name, model in regrmodel:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    rs = model.score(X_test, y_test)
    print(f'{name} RMSE: {round(rmse,3)}')
    print(f'       R^2: {round(rs,3)}\n')

* Good results with Random Forest, but some of the other models don't work well.
* Let's scale the data and use a different encoder for the categorical features.

**Using One Hot Encoding (pandas get_dummies() to simplify) and RobustScaler to scale data.**

In [None]:
from sklearn.preprocessing import RobustScaler

# One-hot encode `df` with pandas, then pass the whole encoded frame through a
# RobustScaler and wrap the result back into a DataFrame with the same columns.
# NOTE: every column is scaled, including 'price' (the next cell takes the
# target from this scaled frame), so metrics computed downstream are expressed
# in scaled price units rather than the original currency.
data_ohe = pd.get_dummies(df)
scl = RobustScaler()
data_scl_ohe = pd.DataFrame(
    scl.fit_transform(data_ohe),
    columns=data_ohe.columns,
)
data_scl_ohe.head()

In [None]:
# Re-split using the one-hot-encoded, scaled frame: 'price' is the target,
# every other column is a feature. Same test fraction and seed as before.
target_col = 'price'
y = data_scl_ohe[target_col]
X = data_scl_ohe.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

**First set of models on data with OHE and RobustScaler**

In [None]:
# Fresh instances of the same four linear models, now evaluated on the
# one-hot-encoded, scaled split.
linreg = [
    ('LR', LinearRegression()),
    ('Ridge', Ridge()),
    ('LARS', Lars()),
    ('LASSO', Lasso()),
]

for name, model in linreg:
    # fit() returns the estimator itself, so predict can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)
    rs = model.score(X_test, y_test)
    print(f'{name} RMSE: {round(rmse,3)}')
    print(f'       R^2: {round(rs,3)}\n')

Better result on Linear Regression, R^2: 0.926

**Second set of models on data with OHE and RobustScaler**

In [None]:
# Fresh instances of the non-linear regressors, evaluated on the
# one-hot-encoded, scaled split.
# The random forest is seeded (same seed as the train/test split) so that its
# reported metrics are identical on every Restart & Run All.
regrmodel = [ ('KNeighborsReg', KNeighborsRegressor()),
              ('SVR', SVR()),
              ('RandomForestReg', RandomForestRegressor(random_state=42)),
             ]

# Fit each model on the training split and report RMSE and R^2 on the test split.
for name, model in regrmodel:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    rs = model.score(X_test, y_test)
    print(f'{name} RMSE: {round(rmse,3)}')
    print(f'       R^2: {round(rs,3)}\n')

Best result with the Support Vector Regression model, R^2: 0.968

**Hyperparameter optimization**

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over SVR's C (2 through 11) and epsilon, scored by R^2,
# followed by the R^2 of the refit search object on the held-out test split.
model = SVR()
h_param = {
    'C': list(range(2, 12)),
    'epsilon': [0.05, 0.1, 0.15, 0.2],
}
grid = GridSearchCV(estimator=model, param_grid=h_param, scoring='r2')
grid.fit(X_train, y_train);

test_r2 = grid.score(X_test, y_test)
print(f'Best params: {grid.best_params_}')
print(f'R^2: {round(test_r2,3)}')