In [59]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import mlflow
import mlflow.sklearn

In [60]:
data=pd.read_csv("data.csv")
data.head(5)

Unnamed: 0,car_name,registration_year,insurance_validity,fuel_type,ownsership,transmission,manufacturing_year,seats,kms_driven,mileage(kmpl),engine(cc),torque(Nm),price(in lakhs),short_carname
0,2017 Mercedes-Benz S-Class S400,2017,Comprehensive,Petrol,First Owner,Automatic,2017,5.0,56000.0,7.81,2996.0,333.0,63.75,Mercedes-Benz S-Class
1,2020 Nissan Magnite Turbo CVT XV Premium Opt BSVI,2021,Comprehensive,Petrol,First Owner,Automatic,2020,5.0,30615.0,17.4,999.0,9863.0,8.99,Nissan Magnite
2,2018 BMW X1 sDrive 20d xLine,2018,Comprehensive,Diesel,First Owner,Automatic,2018,5.0,24000.0,20.68,1995.0,188.0,23.75,BMW X1
3,2019 Kia Seltos GTX Plus,2019,Comprehensive,Petrol,First Owner,Manual,2019,5.0,18378.0,16.5,1353.0,13808.0,13.56,Kia Seltos
4,2019 Skoda Superb LK 1.8 TSI AT,2019,Comprehensive,Petrol,First Owner,Automatic,2019,5.0,44900.0,14.67,1798.0,17746.0,24.0,Skoda Superb


In [61]:
df=data.copy()
df.shape

(1085, 14)

In [62]:
df.drop(['car_name','registration_year'],inplace=True,axis=1)
df.shape

(1085, 12)

In [63]:
df.dtypes

insurance_validity     object
fuel_type              object
ownsership             object
transmission           object
manufacturing_year      int64
seats                 float64
kms_driven            float64
mileage(kmpl)         float64
engine(cc)            float64
torque(Nm)            float64
price(in lakhs)       float64
short_carname          object
dtype: object

# Encoding

In [64]:
df_categorical_features=df.select_dtypes(include="object")
df_numerical_features=df.select_dtypes(include="number")

In [65]:
df_categorical_features.dtypes

insurance_validity    object
fuel_type             object
ownsership            object
transmission          object
short_carname         object
dtype: object

In [66]:
df_numerical_features.dtypes

manufacturing_year      int64
seats                 float64
kms_driven            float64
mileage(kmpl)         float64
engine(cc)            float64
torque(Nm)            float64
price(in lakhs)       float64
dtype: object

In [67]:
# Splitting the data into training and test sets
X = df.drop('price(in lakhs)', axis=1)
y = df['price(in lakhs)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [68]:
type(X_train)

pandas.core.frame.DataFrame

In [69]:
# Preprocessing for numerical features
numeric_features = ['manufacturing_year', 'seats', 'kms_driven',
       'mileage(kmpl)', 'engine(cc)', 'torque(Nm)']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

In [70]:
df['insurance_validity'].value_counts()

insurance_validity
Comprehensive    778
Third Party      256
Zero Dep          50
Not Available      1
Name: count, dtype: int64

In [71]:
# Preprocessing for categorical features excluding target encoding
categorical_features = ['insurance_validity', 'fuel_type', 'ownsership', 'transmission']
categorical_transformer = ColumnTransformer(
    transformers=[
        # Label Encoding for insurance_validity
        ('insurance_validity', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['insurance_validity']),

        # One-Hot Encoding for fuel_type and transmission
        ('fuel_type', OneHotEncoder(handle_unknown='ignore'), ['fuel_type']),
        ('transmission', OneHotEncoder(handle_unknown='ignore'), ['transmission']),

        # Ordinal Encoding for ownership
        ('ownsership', OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner', 'Fourth Owner','Fifth Owner']]), ['ownsership'])
    ],
    remainder='passthrough'  # Pass the numeric features through without transformation
)

In [72]:
# Create the full pipeline with categorical and regressor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [73]:
# Custom transformer for Target Encoding
class TargetEncodingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        self.encoder = TargetEncoder(cols=self.cols)
    
    def fit(self, X, y=None):
        self.encoder.fit(X[self.cols], y)
        return self
    
    def transform(self, X, y=None):
        X[self.cols] = self.encoder.transform(X[self.cols])
        return X


In [77]:
target_encoder= TargetEncodingTransformer(cols=['short_carname'])
X_transformed=target_encoder.fit(X_train,y_train)
X_transformed.transform(X_train)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


Unnamed: 0,insurance_validity,fuel_type,ownsership,transmission,manufacturing_year,seats,kms_driven,mileage(kmpl),engine(cc),torque(Nm),short_carname
522,Comprehensive,Petrol,Second Owner,Automatic,2017,4.0,31900.0,16.47,1998.0,231.0,15.968585
851,Comprehensive,Petrol,First Owner,Manual,2018,7.0,35000.0,15.40,1497.0,1173.0,12.593560
215,Comprehensive,Petrol,First Owner,Automatic,2022,4.0,4000.0,11.86,2998.0,33525.0,15.054416
628,Zero Dep,Petrol,First Owner,Manual,2017,5.0,103209.0,21.40,1197.0,831.0,6.637297
593,Comprehensive,Petrol,First Owner,Manual,2015,5.0,75499.0,1197.00,82.0,114.0,5.024833
...,...,...,...,...,...,...,...,...,...,...,...
330,Third Party,Diesel,Second Owner,Automatic,2015,5.0,43600.0,17.11,1968.0,17433.0,14.932899
466,Comprehensive,Diesel,Second Owner,Automatic,2017,7.0,52000.0,12.90,2755.0,1745.0,16.866601
121,Comprehensive,Petrol,First Owner,Manual,2010,5.0,75000.0,19.00,998.0,661.0,11.440456
1044,Comprehensive,Diesel,First Owner,Automatic,2020,5.0,24000.0,24.30,1248.0,885.0,11.977645


In [57]:
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        
        print(y_train)
        # Create the full pipeline with the model
        full_pipeline = Pipeline(steps=[
            ('target_encoder', TargetEncodingTransformer(cols=['short_carname'])),
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Fit the model
        full_pipeline.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = full_pipeline.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Log model and metrics in MLflow
        mlflow.sklearn.log_model(full_pipeline, model_name)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)



# Set the experiment
mlflow.set_experiment("Car Price Prediction")



<Experiment: artifact_location='file:///home/reby/Desktop/personal_Projects/car_price_prediction/notebooks/mlruns/507330318150376776', creation_time=1719420685111, experiment_id='507330318150376776', last_update_time=1719420685111, lifecycle_stage='active', name='Car Price Prediction', tags={}>

In [58]:
# List of models to train
models = [
    (SGDRegressor(max_iter=1000, tol=1e-3), "SGDRegressor"),
    (RandomForestRegressor(n_estimators=100), "RandomForestRegressor"),
    # (CatBoostRegressor(iterations=100, silent=True), "CatBoostRegressor"),
    (AdaBoostRegressor(n_estimators=100), "AdaBoostRegressor"),
    (XGBRegressor(n_estimators=100, objective='reg:squarederror'), "XGBRegressor")
]

# Train and log each model
for model, model_name in models:
    train_and_log_model(model, model_name, X_train, y_train, X_test, y_test)


<class 'pandas.core.series.Series'> <class 'pandas.core.frame.DataFrame'>
    insurance_validity fuel_type    ownsership transmission  \
522      Comprehensive    Petrol  Second Owner    Automatic   
851      Comprehensive    Petrol   First Owner       Manual   
215      Comprehensive    Petrol   First Owner    Automatic   
628           Zero Dep    Petrol   First Owner       Manual   
593      Comprehensive    Petrol   First Owner       Manual   

     manufacturing_year  seats  kms_driven  mileage(kmpl)  engine(cc)  \
522                2017    4.0     31900.0          16.47      1998.0   
851                2018    7.0     35000.0          15.40      1497.0   
215                2022    4.0      4000.0          11.86      2998.0   
628                2017    5.0    103209.0          21.40      1197.0   
593                2015    5.0     75499.0        1197.00        82.0   

     torque(Nm)  short_carname  
522       231.0         Mini 3  
851      1173.0     Honda BR-V  
215     3

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [18]:
!mlflow ui

[2024-07-07 14:19:47 +0530] [11192] [INFO] Starting gunicorn 22.0.0
[2024-07-07 14:19:47 +0530] [11192] [INFO] Listening at: http://127.0.0.1:5000 (11192)
[2024-07-07 14:19:47 +0530] [11192] [INFO] Using worker: sync
[2024-07-07 14:19:47 +0530] [11193] [INFO] Booting worker with pid: 11193
[2024-07-07 14:19:48 +0530] [11194] [INFO] Booting worker with pid: 11194
[2024-07-07 14:19:48 +0530] [11195] [INFO] Booting worker with pid: 11195
[2024-07-07 14:19:48 +0530] [11196] [INFO] Booting worker with pid: 11196
^C
[2024-07-07 14:22:17 +0530] [11192] [INFO] Handling signal: int
[2024-07-07 14:22:17 +0530] [11196] [INFO] Worker exiting (pid: 11196)
[2024-07-07 14:22:17 +0530] [11193] [INFO] Worker exiting (pid: 11193)
[2024-07-07 14:22:17 +0530] [11195] [INFO] Worker exiting (pid: 11195)
[2024-07-07 14:22:17 +0530] [11194] [INFO] Worker exiting (pid: 11194)


In [None]:
!pkill -f gunicorn