In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
warnings.filterwarnings("ignore")

In [46]:
# Initialize an empty list to hold the 
data = [] 
# Open and read the file 
with open("C:\\Datasets\\auto-mpg.data", "r") as f: 
    for line in f.readlines(): 
        # Remove extra spaces and split by spaces or tab 
        cleaned_line = re.split(r'\s{2,}|\t', line.strip()) 
        data.append(cleaned_line) 
# Convert the list of lists to a DataFrame 
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
           'acceleration', 'model_year', 'origin', 'car_name'] 
df = pd.DataFrame(data, columns=columns) 
# Convert numeric columns to appropriate types 
df['mpg'] = df['mpg'].astype(float) 
df['cylinders'] = df['cylinders'].astype(int) 
df['displacement'] = df['displacement'].astype(float) 
df['horsepower'] = pd.to_numeric(df['horsepower'],errors="coerce")
df['weight'] = df['weight'].astype(float) 
df['acceleration'] = df['acceleration'].astype(float) 
df['model_year'] = df['model_year'].astype(int) 
df['origin'] = df['origin'].astype(int) 
# Display the first few rows of the DataFrame
df.head(2)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""


In [60]:
df["horsepower"].unique()

array([130., 165., 150., 140., 198., 220., 215., 225., 190., 170., 160.,
        95.,  97.,  85.,  88.,  46.,  87.,  90., 113., 200., 210., 193.,
        nan, 100., 105., 175., 153., 180., 110.,  72.,  86.,  70.,  76.,
        65.,  69.,  60.,  80.,  54., 208., 155., 112.,  92., 145., 137.,
       158., 167.,  94., 107., 230.,  49.,  75.,  91., 122.,  67.,  83.,
        78.,  52.,  61.,  93., 148., 129.,  96.,  71.,  98., 115.,  53.,
        81.,  79., 120., 152., 102., 108.,  68.,  58., 149.,  89.,  63.,
        48.,  66., 139., 103., 125., 133., 138., 135., 142.,  77.,  62.,
       132.,  84.,  64.,  74., 116.,  82.])

In [78]:
# There are only 6 missing values 
df = df.dropna(axis=0)

In [88]:
# Taking only Company name
df["car_name"] = df["car_name"].apply(lambda x: x.split()[0].replace('"',''))

df["car_name"].unique()

array(['chevrolet', 'buick', 'plymouth', 'amc', 'ford', 'pontiac',
       'dodge', 'toyota', 'datsun', 'volkswagen', 'peugeot', 'audi',
       'saab', 'bmw', 'chevy', 'hi', 'mercury', 'opel', 'fiat',
       'oldsmobile', 'chrysler', 'mazda', 'volvo', 'renault', 'toyouta',
       'maxda', 'honda', 'subaru', 'chevroelt', 'capri', 'vw',
       'mercedes-benz', 'cadillac', 'mercedes', 'vokswagen', 'triumph',
       'nissan'], dtype=object)

In [92]:
car_name_corrections = {
    'chevy': "chevrolet",
    'hi': "mercedes_benz",
    'toyouta': "Toyota",
    'maxda': "Mazda",
    'vw': "volkswagen",
    'vokswagen': "volkswagen"
}

df["car_name"] = df["car_name"].apply(lambda x: car_name_corrections[x] if x in car_name_corrections else x)
df["car_name"].unique()

array(['chevrolet', 'buick', 'plymouth', 'amc', 'ford', 'pontiac',
       'dodge', 'toyota', 'datsun', 'volkswagen', 'peugeot', 'audi',
       'saab', 'bmw', 'mercedes_benz', 'mercury', 'opel', 'fiat',
       'oldsmobile', 'chrysler', 'mazda', 'volvo', 'renault', 'Toyota',
       'Mazda', 'honda', 'subaru', 'chevroelt', 'capri', 'mercedes-benz',
       'cadillac', 'mercedes', 'triumph', 'nissan'], dtype=object)

In [184]:
# Create a mask for car names that appear less than 4 times
mask = df['car_name'].map(df['car_name'].value_counts()) < 2

# Replace those car names with "others"
df.loc[mask, 'car_name'] = 'others'

df.sample(4)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
203,29.5,4,97.0,71.0,1825.0,12.2,76,2,volkswagen
226,20.5,6,231.0,105.0,3425.0,16.9,77,1,buick
156,16.0,8,400.0,170.0,4668.0,11.5,75,1,pontiac
351,34.4,4,98.0,65.0,2045.0,16.2,81,1,ford


In [188]:
df["car_name"].value_counts()

car_name
ford             48
chevrolet        46
plymouth         31
dodge            28
amc              27
toyota           25
datsun           23
volkswagen       22
buick            17
pontiac          16
honda            13
mercury          11
mazda            10
oldsmobile       10
fiat              8
peugeot           8
others            7
audi              7
chrysler          6
volvo             6
opel              4
saab              4
subaru            4
renault           3
bmw               2
Mazda             2
mercedes-benz     2
cadillac          2
Name: count, dtype: int64

In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int32  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    float64
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int32  
 7   origin        392 non-null    int32  
 8   car_name      392 non-null    object 
dtypes: float64(5), int32(3), object(1)
memory usage: 26.0+ KB


In [192]:
## Creating Train and test data
X,y = df.drop(columns="mpg"),df["mpg"]
X.shape, y.shape

((392, 8), (392,))

In [118]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [194]:
# Splitting Train and test data
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42,stratify=X["car_name"])
X_train.shape, X_test.shape

((313, 8), (79, 8))

In [198]:
# Define columns
numeric_columns = ['displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
categorical_col = X.select_dtypes(include="object").columns  

ohe = OneHotEncoder(drop="first")
stScaler = StandardScaler()

# Create a column transformer
preprocess = ColumnTransformer(
    [
        ("Standard Scaler", stScaler, numeric_columns),
        ("One Hot Encoder", ohe, categorical_col)
    ],
    remainder="passthrough"
)


In [200]:
# Define models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoost": GradientBoostingRegressor(),
    "SVR": SVR()
}

# Define parameter grids
param_grids = {
    "LinearRegression": {
        "LinearRegression__fit_intercept": [True, False]
    },
    "RandomForest": {
        "RandomForest__n_estimators": [100, 150, 200, 250],
        "RandomForest__max_depth": [None, 4, 6],
        "RandomForest__min_samples_split": [2, 4, 5],
        "RandomForest__min_samples_leaf": [2, 4]
    },
    "GradientBoost": {
        "GradientBoost__learning_rate": [0.1, 0.2, 0.4],
        "GradientBoost__n_estimators": [100, 50, 150],
        "GradientBoost__min_samples_split": [2, 4, 6],
        "GradientBoost__min_samples_leaf": [2, 4],
        "GradientBoost__max_depth": [3, 4],
        "GradientBoost__alpha": [0.9, 0.7, 0.5]
    },
    "SVR": {
        'SVR__kernel': ['linear', 'poly', 'rbf'],
        'SVR__C': [0.1, 1, 10, 100],
        'SVR__gamma': ['scale', 'auto'],
        'SVR__degree': [2, 3, 4]
    }
}

In [204]:
# this will contain best pipelines for each model
best_pipelines = {}

kf = KFold(n_splits=3) #3 split per epochs
for name, model in models.items():
    # create a pipeline
    pipeline = Pipeline([("preprocess",preprocess),(name,model)])

    # creating GridSearchCV
    gridcv = GridSearchCV(estimator=pipeline,
                          param_grid=param_grids[name],
                          cv=kf,verbose=2,
                          n_jobs=-1,
                          return_train_score=True,
                          scoring="accuracy"
                         )
    # Trainning best model
    gridcv.fit(X_train,y_train)
    #Saving best model
    best_pipelines[name]=gridcv.best_estimator_

    #Print Best value and best parameters
    print("="*20)
    print(f"{name}: {gridcv.best_score_}")
    print(gridcv.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
LinearRegression: nan
{'LinearRegression__fit_intercept': True}
Fitting 3 folds for each of 72 candidates, totalling 216 fits
RandomForest: nan
{'RandomForest__max_depth': None, 'RandomForest__min_samples_leaf': 2, 'RandomForest__min_samples_split': 2, 'RandomForest__n_estimators': 100}
Fitting 3 folds for each of 324 candidates, totalling 972 fits
GradientBoost: nan
{'GradientBoost__alpha': 0.9, 'GradientBoost__learning_rate': 0.1, 'GradientBoost__max_depth': 3, 'GradientBoost__min_samples_leaf': 2, 'GradientBoost__min_samples_split': 2, 'GradientBoost__n_estimators': 100}
Fitting 3 folds for each of 72 candidates, totalling 216 fits
SVR: nan
{'SVR__C': 0.1, 'SVR__degree': 2, 'SVR__gamma': 'scale', 'SVR__kernel': 'linear'}


In [205]:
for name,model in best_pipelines.items():
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)

    print("="*20)
    print(f"{name}: train accuracy: {train_score}, test accuracy: {test_score}")

LinearRegression: train accuracy: 0.8430806152458042, test accuracy: 0.8191147492652976
RandomForest: train accuracy: 0.9707014278928051, test accuracy: 0.8557281533328229
GradientBoost: train accuracy: 0.9686133414348465, test accuracy: 0.8523774265997387
SVR: train accuracy: 0.7928004892348843, test accuracy: 0.8224632661794927
