In [87]:
import pandas as pd
import numpy as np
import random
import pickle
import re

from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Pin both global random number generators (NumPy's legacy global state and
# Python's `random` module) to the same seed so reruns are repeatable.
np.random.seed(42)
random.seed(42)

In [88]:
# Raw train / test tables are fetched straight from the course repository.
df_train = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/Murcha1990/MLDS_ML_2022/main/Hometasks/HT1/cars_test.csv')

# Target: the selling price column of each split.
y_train = df_train['selling_price']
y_test = df_test['selling_price']

# Features: everything except the target and the 'name' / 'torque' columns.
X_train = df_train.drop(columns=['selling_price', 'name', 'torque'])
X_test = df_test.drop(columns=['selling_price', 'name', 'torque'])

In [89]:
# Columns routed to the numeric branch of the preprocessor.
NUM_COLS = [
    'year',
    'km_driven',
    'mileage',
    'engine',
    'max_power',
    'seats',
]

# Columns routed to the categorical (one-hot) branch of the preprocessor.
CAT_COLS = [
    'fuel',
    'seller_type',
    'transmission',
    'owner',
]

# Final estimator: ridge regression with regularization strength alpha=4.
model = Ridge(alpha=4, random_state=42)

In [90]:
def clean_text_columns(df):
    """Convert the 'mileage', 'engine' and 'max_power' columns to floats.

    For each of the three columns, the first run of digits / dots found in
    the cell (e.g. ``'23.4'`` in ``'23.4 kmpl'``) is parsed as a float.
    Cells with no such run, missing cells, and runs that are not a valid
    number become NaN.

    The input frame is NOT modified: a copy is returned. Columns that are
    already numeric are passed through as floats, so the function can be
    applied repeatedly (e.g. when a pipeline is refit or asked to predict
    on the same frame more than once).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'mileage', 'engine' and 'max_power' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns as float dtype; all other
        columns are unchanged.

    Raises
    ------
    KeyError
        If any of the three columns is absent from ``df``.
    """
    # Work on a copy: mutating the caller's frame would turn the text columns
    # into floats behind its back and make a second call fail on `.str`.
    cleaned = df.copy()
    for col in ['mileage', 'engine', 'max_power']:
        column = cleaned[col]
        if pd.api.types.is_numeric_dtype(column):
            # Already cleaned (or numeric from the start): nothing to extract.
            cleaned[col] = column.astype('float')
            continue
        # Casting to str first lets `.str` work on mixed-type object columns;
        # 'nan' / 'None' contain no digits or dots, so they stay missing.
        extracted = column.astype(str).str.extract(r'([\d\.]+)', expand=False)
        # errors='coerce' turns malformed captures such as '.' into NaN.
        cleaned[col] = pd.to_numeric(extracted, errors='coerce').astype('float')
    return cleaned


# Numeric branch: fill missing values with the column median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])


# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


# Route each column group to its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, NUM_COLS),
    ('cat', categorical_transformer, CAT_COLS),
])


# End-to-end pipeline: text cleanup -> preprocessing -> regression model.
final_pipeline = Pipeline(steps=[
    ('clean_text_columns', FunctionTransformer(clean_text_columns)),
    ('preprocessor', preprocessor),
    ('model', model),
])

In [91]:
# Fit every pipeline step on the training split; the fitted pipeline is the
# cell's displayed value.
final_pipeline.fit(X=X_train, y=y_train)

In [92]:
# Score the fitted pipeline on the held-out test split.
pred_pipeline = final_pipeline.predict(X_test)
print(
    f'MSE (test): {MSE(y_test, pred_pipeline)}',
    f'R^2 (test): {r2_score(y_test, pred_pipeline)}',
    sep='\n',
)

MSE (test): 179934891818.5655
R^2 (test): 0.6869764209630864


In [93]:
# Persist the fitted pipeline to disk.
MODEL_NAME = "ridge.pickle"

# The `with` block guarantees the file is flushed and closed even if
# pickling raises; a bare `open(...)` handle is never explicitly closed.
# NOTE(review): the pipeline contains FunctionTransformer(clean_text_columns),
# and pickle stores functions by reference, so the loading process presumably
# needs `clean_text_columns` importable under the same module path — confirm.
with open(MODEL_NAME, "wb") as model_file:
    pickle.dump(final_pipeline, model_file)