In [1]:
from itertools import combinations
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import normalize
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import SGDRegressor, Lasso, LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.svm import LinearSVR, SVR
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the car dataset from "carbi.data.csv" (path is relative to the working directory).
df = pd.read_csv("carbi.data.csv")

In [3]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.sample(2)

Unnamed: 0,cylinders,displayments,horsepower,weight,acceleration,model year,cluster,mpg,origin_1,origin_2,origin_3,origin,car name,brand,mpg_scaled,cylinders_str,origin_str
134,6,258.0,110.0,3632,18.0,74,1,16.0,1,0,0,1,amc matador,amc,0.18617,6,1
345,4,81.0,60.0,1760,16.1,81,0,35.1,0,0,1,3,honda civic 1300,honda,0.694149,4,3


In [4]:
# get_dummies replaces 'cylinders' with cylinders_<n> indicator columns,
# so set the numeric column aside first and attach it again afterwards.
cylinders_numeric = df['cylinders']
df = pd.get_dummies(df, columns=['cylinders'])
df['cylinders'] = cylinders_numeric
df.sample(2)

Unnamed: 0,displayments,horsepower,weight,acceleration,model year,cluster,mpg,origin_1,origin_2,origin_3,origin,car name,brand,mpg_scaled,cylinders_str,origin_str,cylinders_4,cylinders_6,cylinders_8,cylinders
30,140.0,90.0,2264,15.5,71,0,28.0,1,0,0,1,chevrolet vega 2300,chevrolet,0.505319,4,1,1,0,0,4
150,108.0,93.0,2391,15.5,74,0,26.0,0,0,1,3,subaru,subaru,0.452128,4,3,1,0,0,4


In [5]:
# Encode each brand as the integer code of its category.
df['brand_code'] = df['brand'].astype('category').cat.codes
df.sample(2)

Unnamed: 0,displayments,horsepower,weight,acceleration,model year,cluster,mpg,origin_1,origin_2,origin_3,...,car name,brand,mpg_scaled,cylinders_str,origin_str,cylinders_4,cylinders_6,cylinders_8,cylinders,brand_code
374,151.0,80.67068,3035,20.5,82,0,23.0,1,0,0,...,amc concord dl,amc,0.37234,4,1,1,0,0,4,0
119,114.0,91.0,2582,14.0,73,0,20.0,0,1,0,...,audi 100ls,audi,0.292553,4,2,1,0,0,4,1


In [6]:
# Display the column index of df as left by the preprocessing cells above.
df.columns

Index(['displayments', 'horsepower', 'weight', 'acceleration', 'model year',
       'cluster', 'mpg', 'origin_1', 'origin_2', 'origin_3', 'origin',
       'car name', 'brand', 'mpg_scaled', 'cylinders_str', 'origin_str',
       'cylinders_4', 'cylinders_6', 'cylinders_8', 'cylinders', 'brand_code'],
      dtype='object')

In [7]:
# Candidate predictor columns for the regressors.
# Disabled alternatives: 'origin_1', 'origin_2', 'origin_3',
# 'cylinders_4', 'cylinders_6', 'cylinders_8' and 'brand'.
attributes = [
    'displayments',
    'horsepower',
    'weight',
    'acceleration',
    'model year',
    'cluster',
    'origin',
    'cylinders',
    'brand_code',
]

# Every subset of the attributes with at least two members, ordered by
# subset size (pairs first, the full attribute set last).
attribute_combinations = [
    subset
    for size in range(2, len(attributes) + 1)
    for subset in combinations(attributes, size)
]

len(attribute_combinations)

502

In [None]:
# k-fold evaluation of every regressor (default hyper-parameters) on every
# attribute combination. For each model one CSV with the mean score per
# combination is written next to the notebook.
normalized = False  # set to True to run the sweep on normalized values
folds = 5  # number of k-fold splits

yData = (df[['mpg']]).values.ravel()
if normalized:
    yData = normalize(yData.reshape(1, -1))[0]

models = [
    DummyRegressor,

    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,

    KernelRidge,

    SVR,
    LinearSVR,

#     LogisticRegression,
    SGDRegressor,
    Lasso,

    MLPRegressor
]

# The splitter does not depend on the model or the columns, so build it once.
# NOTE(review): folds are taken in row order (no shuffling). If the rows are
# ordered, train and test folds differ systematically; the constant-predictor
# score of about -1.0 in the output hints at this. Consider
# KFold(n_splits=folds, shuffle=True, random_state=...) - confirm intent.
kfold = KFold(n_splits=folds)

for modelFactory in models:
    results = []
    # The loop variable must NOT be called `attributes`: that would replace the
    # notebook-level attribute list with a tuple and break later cells that
    # index the frame with it.
    for attributeCombination in attribute_combinations:
        xData = df[list(attributeCombination)].values
        if normalized:
            # NOTE(review): sklearn's normalize rescales each row (sample) by
            # default, not each feature column - confirm this is intended.
            xData = normalize(xData)
        sumScore = 0
        for trainIndex, testIndex in kfold.split(xData):
            # Fresh, default-configured estimator for every fold.
            model = modelFactory()
            model.fit(xData[trainIndex], yData[trainIndex].ravel())
            sumScore += model.score(xData[testIndex], yData[testIndex])
        meanScore = sumScore / folds
        results.append([str(model), str(attributeCombination), meanScore])
        print(f"{model} - {attributeCombination} - {meanScore}")
    resultDf = pd.DataFrame(results)
    resultDf.to_csv(f"{modelFactory()}.csv") # edit file to avoid duplicates

DummyRegressor() - ('displayments', 'horsepower') - -1.0151704813239477
DummyRegressor() - ('displayments', 'weight') - -1.0151704813239477
DummyRegressor() - ('displayments', 'acceleration') - -1.0151704813239477
DummyRegressor() - ('displayments', 'model year') - -1.0151704813239477
DummyRegressor() - ('displayments', 'cluster') - -1.0151704813239477
DummyRegressor() - ('displayments', 'origin') - -1.0151704813239477
DummyRegressor() - ('displayments', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displayments', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'weight') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'acceleration') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'model year') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'cluster') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'origin') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('hor

DummyRegressor() - ('model year', 'cluster', 'origin') - -1.0151704813239477
DummyRegressor() - ('model year', 'cluster', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('model year', 'cluster', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('model year', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('model year', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('model year', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('cluster', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'model year') - -1.0151704813239477
DummyRegressor

DummyRegressor() - ('horsepower', 'model year', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'cluster', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('horsepower', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'model year', 'cluster') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'model year', 'origin') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'model year', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'model year', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'cluster', 'origin') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'cluster', 'cylinders') 

DummyRegressor() - ('displayments', 'acceleration', 'model year', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'acceleration', 'cluster', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displayments', 'acceleration', 'cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'acceleration', 'cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'acceleration', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'model year', 'cluster', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displayments', 'model year', 'cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'model year', 'cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'model year', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegresso

DummyRegressor() - ('displayments', 'horsepower', 'model year', 'cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'model year', 'cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'model year', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'cluster', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'weight', 'acceleration', 'model year', 'cluster', 'origin') - -1.0151704813239477
DummyRegressor() - ('displayments', 'weight', 'acceleration', 'model year', 'cluster', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displayments', 'weight', 'acceleration', 'model year', 'cluster', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'weight', 'acceleration', 'model year', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displa

DummyRegressor() - ('horsepower', 'acceleration', 'model year', 'cluster', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('weight', 'acceleration', 'model year', 'cluster', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration', 'model year', 'cluster', 'origin', 'cylinders') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration', 'model year', 'cluster', 'origin', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration', 'model year', 'cluster', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration', 'model year', 'origin', 'cylinders', 'brand_code') - -1.0151704813239477
DummyRegressor() - ('displayments', 'horsepower', 'weight', 'acceleration', 'cluster', 'origin', 'cylinders', 'brand_code') - -1.015170481323947

In [14]:
# Trials with hyper-parameters to find the ideal combination.
yData = (df[['mpg']]).values.ravel()
# list(...) makes the column selection work whether `attributes` is a list or
# a tuple: a tuple passed to df[...] is looked up as ONE column label and
# raises KeyError.
xData = df[list(attributes)].values

model = GradientBoostingRegressor()
# Candidate grids; enable the entries that apply to the estimator under test.
parameters = {
#     'loss': ('huber', 'absolute_error', 'quantile'),
#     'penalty': ('l2', 'l1', 'elasticnet')
#     'solver': ['lbfgs', 'sgd', 'adam'],
#     'activation': ('identity', 'logistic', 'tanh', 'relu')
}
scoring = ('max_error', 'r2', 'explained_variance', 'neg_mean_absolute_error', 'neg_mean_squared_error')
# Every search evaluates all metrics in `scoring`; only `refit` changes, i.e.
# which metric selects the best parameters and is reported as best_score_.
for scorer in scoring:
    clf = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        scoring=scoring,
        refit=scorer,
        cv=5
    )
    clf.fit(xData, yData)

    print((scorer, clf.best_params_, clf.best_score_))

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 239, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'absolute_error' not supported. 

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/skl

('max_error', {'loss': 'huber'}, -9.066762900023125)


Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 239, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'absolute_error' not supported. 

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/skl

('r2', {'loss': 'huber'}, 0.7875486793293822)


Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 239, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'absolute_error' not supported. 

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/skl

('explained_variance', {'loss': 'huber'}, 0.7944547619553458)


Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 239, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'absolute_error' not supported. 

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/skl

('neg_mean_absolute_error', {'loss': 'huber'}, -2.0302382533208294)


Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 239, in _check_params
    raise ValueError("Loss '{0:s}' not supported. ".format(self.loss))
ValueError: Loss 'absolute_error' not supported. 

Traceback (most recent call last):
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/sklearn/ensemble/_gb.py", line 448, in fit
    self._check_params()
  File "/home/karatakis/miniconda3/lib/python3.9/site-packages/skl

('neg_mean_squared_error', {'loss': 'huber'}, -8.142123554450595)
