In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')



In [2]:
# Define column names and load the dataset
column_names = ['MPG','Cylinders','Displacement','Horsepower','Weight',
                'Acceleration', 'Model Year', 'Origin']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

def load_data(url, column_names):
    return pd.read_csv(url, names=column_names, na_values="?", comment='\t', sep=" ", skipinitialspace=True)

data = load_data(url, column_names)


In [3]:
data.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


Data Preprocessing

In [4]:
class CustomAttrAdder(BaseEstimator, TransformerMixin):
    def __init__(self, acc_on_power=True):
        self.acc_on_power = acc_on_power
        self.acc_ix = 4
        self.hpower_ix = 2
        self.cyl_ix = 0
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        acc_on_cyl = X[:, self.acc_ix] / X[:, self.cyl_ix]
        if self.acc_on_power:
            acc_on_power = X[:, self.acc_ix] / X[:, self.hpower_ix]
            return np.c_[X, acc_on_power, acc_on_cyl]
        return np.c_[X, acc_on_cyl]




In [5]:
def preprocess_origin_cols(df):
    df["Origin"] = df["Origin"].map({1: "India", 2: "USA", 3: "Germany"})
    return df

def num_pipeline_transformer(data):
    numerics = ['float64', 'int64']
    num_attrs = data.select_dtypes(include=numerics)
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attrs_adder', CustomAttrAdder()),
        ('std_scaler', StandardScaler()),
    ])
    return num_attrs, num_pipeline

def pipeline_transformer(data):
    cat_attrs = ["Origin"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
    ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

def preprocess_data(data):
    data_preprocessed = preprocess_origin_cols(data)
    prepared_data = pipeline_transformer(data_preprocessed)
    return prepared_data

def stratified_split(data, test_size=0.2, random_state=42):
    split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    for train_index, test_index in split.split(data, data["Cylinders"]):
        strat_train_set = data.loc[train_index]
        strat_test_set = data.loc[test_index]
    return strat_train_set, strat_test_set

In [6]:
strat_train_set, strat_test_set = stratified_split(data)
train_data = strat_train_set.drop("MPG", axis=1)
train_labels = strat_train_set["MPG"].copy()
test_data = strat_test_set.drop("MPG", axis=1)
test_labels = strat_test_set["MPG"].copy()

train_prepared = preprocess_data(train_data)
test_prepared = preprocess_data(test_data)

Model Training and Evaluation

In [7]:
def train_model(model, train_data, train_labels):
    model.fit(train_data, train_labels)
    return model

def evaluate_model(model, test_data, test_labels):
    predictions = model.predict(test_data)
    mse = mean_squared_error(test_labels, predictions)
    rmse = np.sqrt(mse)
    return rmse

def cross_validate(model, train_data, train_labels, cv=10):
    scores = cross_val_score(model, train_data, train_labels, scoring="neg_mean_squared_error", cv=cv)
    rmse_scores = np.sqrt(-scores)
    return rmse_scores.mean()

lin_reg = train_model(LinearRegression(), train_prepared, train_labels)
tree_reg = train_model(DecisionTreeRegressor(), train_prepared, train_labels)
forest_reg = train_model(RandomForestRegressor(), train_prepared, train_labels)
svm_reg = train_model(SVR(kernel='linear'), train_prepared, train_labels)

lin_rmse = evaluate_model(lin_reg, test_prepared, test_labels)
tree_rmse = evaluate_model(tree_reg, test_prepared, test_labels)
forest_rmse = cross_validate(forest_reg, train_prepared, train_labels)
svm_rmse = cross_validate(svm_reg, train_prepared, train_labels)

print(f'Linear Regression RMSE: {lin_rmse}')
print(f'Decision Tree RMSE: {tree_rmse}')
print(f'Random Forest CV RMSE: {forest_rmse}')
print(f'SVM CV RMSE: {svm_rmse}')


Linear Regression RMSE: 3.2820016228546476
Decision Tree RMSE: 4.405649781814256
Random Forest CV RMSE: 2.593864800593029
SVM CV RMSE: 3.086591620802809


Hyperparameter tuning

In [8]:
# Perform cross-validation and print CV scores for different models

def print_cv_scores(model, prepared_data, data_labels, cv=10):
    scores = cross_val_score(model, prepared_data, data_labels, scoring="neg_mean_squared_error", cv=cv)
    rmse_scores = np.sqrt(-scores)
    print(f'{model.__class__.__name__} CV RMSE Scores: {rmse_scores}')
    print(f'{model.__class__.__name__} CV RMSE Mean: {rmse_scores.mean()}')
    print(f'{model.__class__.__name__} CV RMSE Standard Deviation: {rmse_scores.std()}')
    return rmse_scores



In [9]:
# Cross-validate models
lin_reg_scores = print_cv_scores(LinearRegression(), train_prepared, train_labels)
tree_reg_scores = print_cv_scores(DecisionTreeRegressor(), train_prepared, train_labels)
forest_reg_scores = print_cv_scores(RandomForestRegressor(), train_prepared, train_labels)
svm_reg_scores = print_cv_scores(SVR(kernel='linear'), train_prepared, train_labels)

LinearRegression CV RMSE Scores: [3.43254597 3.45157629 3.6621715  2.59652976 2.48023405 2.74798115
 3.32524647 2.42208917 3.78133275 2.8573747 ]
LinearRegression CV RMSE Mean: 3.075708179370932
LinearRegression CV RMSE Standard Deviation: 0.483654669078112
DecisionTreeRegressor CV RMSE Scores: [2.9375266  2.95148268 2.9460779  3.48819438 2.33057128 2.89946116
 3.4700054  4.20026041 4.23449734 2.47138462]
DecisionTreeRegressor CV RMSE Mean: 3.192946176257158
DecisionTreeRegressor CV RMSE Standard Deviation: 0.6163121638153273
RandomForestRegressor CV RMSE Scores: [2.12772414 2.50380232 2.71243648 2.41408267 2.04792082 2.528754
 2.66922799 2.63846972 4.06249368 1.89640034]
RandomForestRegressor CV RMSE Mean: 2.5601312167883963
RandomForestRegressor CV RMSE Standard Deviation: 0.5668942226556964
SVR CV RMSE Scores: [3.52759194 3.10435954 3.6573445  2.83823604 2.56665123 2.68448844
 3.4378112  2.34802163 3.85665021 2.84476148]
SVR CV RMSE Mean: 3.086591620802809
SVR CV RMSE Standard Devia

In [10]:
# Hyperparameter Tuning
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, scoring='neg_mean_squared_error', return_train_score=True, cv=10)
grid_search.fit(train_prepared, train_labels)

best_model = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

# Final Evaluation on Test Data
final_model = best_model
final_rmse = evaluate_model(final_model, test_prepared, test_labels)
print(f'Final Model RMSE on Test Data: {final_rmse}')


Best Parameters: {'max_features': 8, 'n_estimators': 30}
Final Model RMSE on Test Data: 3.1643660502539848


In [11]:
def save_model(model, filename="model.bin"):
    with open(filename, 'wb') as f_out:
        pickle.dump(model, f_out)

def load_model(filename="model.bin"):
    with open(filename, 'rb') as f_in:
        return pickle.load(f_in)

save_model(final_model)
loaded_model = load_model()


Prediction Function

In [12]:
def predict_mpg(config, model):
    if isinstance(config, dict):
        df = pd.DataFrame(config)
    else:
        df = config

    preprocessed_df = preprocess_origin_cols(df)
    prepared_df = pipeline_transformer(preprocessed_df)
    return model.predict(prepared_df)

vehicle_config = {
    'Cylinders': [4, 6, 8],
    'Displacement': [155.0, 160.0, 165.5],
    'Horsepower': [93.0, 130.0, 98.0],
    'Weight': [2500.0, 3150.0, 2600.0],
    'Acceleration': [15.0, 14.0, 16.0],
    'Model Year': [81, 80, 78],
    'Origin': [3, 2, 1]
}

predictions = predict_mpg(vehicle_config, loaded_model)
print(f'Predictions: {predictions}')


Predictions: [32.58333333 17.48666667 20.15333333]
