In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Sample data
data = {
    'insurance_validity': ['valid', 'expired', 'valid', 'expiring', 'valid'],
    'fuel_type': ['petrol', 'diesel', 'electric', 'petrol', 'diesel'],
    'ownership': ['first_owner', 'second_owner', 'first_owner', 'third_owner', 'second_owner'],
    'transmission': ['manual', 'automatic', 'manual', 'automatic', 'manual'],
    'manufacturing_year': [2015, 2016, 2017, None, 2016],  # Including a missing value
    'short_carname': ['carA', 'carB', 'carA', 'carC', 'carB'],
    'seats': [5, 5, 4, None, 5],  # Including a missing value
    'kms_driven': [50000, 30000, 40000, 35000, None],  # Including a missing value
    'mileage(kmpl)': [18.0, 20.0, 15.0, 17.5, None],  # Including a missing value
    'engine(cc)': [1500, 1600, 1200, 1400, 1300],
    'torque(Nm)': [200, 250, 180, 190, 210],
    'price(in lakhs)': [5.5, 6.0, 4.0, 5.0, 5.8]
}

df = pd.DataFrame(data)

# Splitting the data into training and test sets
X = df.drop('price(in lakhs)', axis=1)
y = df['price(in lakhs)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical features
numeric_features = ['seats', 'kms_driven', 'mileage(kmpl)', 'engine(cc)', 'torque(Nm)', 'manufacturing_year']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features excluding target encoding
categorical_features = ['insurance_validity', 'fuel_type', 'ownership', 'transmission']
categorical_transformer = ColumnTransformer(
    transformers=[
        # Label Encoding for insurance_validity
        ('insurance_validity', OrdinalEncoder(), 'insurance_validity'),

        # One-Hot Encoding for fuel_type and transmission
        ('fuel_type', OneHotEncoder(handle_unknown='ignore'), 'fuel_type'),
        ('transmission', OneHotEncoder(handle_unknown='ignore'), 'transmission'),

        # Ordinal Encoding for ownership
        ('ownership', OrdinalEncoder(categories=[['first_owner', 'second_owner', 'third_owner']]), 'ownership')
    ],
    remainder='passthrough'  # Pass the numeric features through without transformation
)

# Create the full pipeline with target encoding and regressor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Separate pipeline for target encoding
target_encoder = TargetEncoder(cols=['short_carname'])

def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        # Create the full pipeline with the model
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('target_encoder', target_encoder),
            ('regressor', model)
        ])
        
        # Fit the model
        full_pipeline.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = full_pipeline.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Log model and metrics in MLflow
        mlflow.sklearn.log_model(full_pipeline, model_name)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)

# Set the experiment
mlflow.set_experiment("Car Price Prediction")

# List of models to train
models = [
    (SGDRegressor(max_iter=1000, tol=1e-3), "SGDRegressor"),
    (RandomForestRegressor(n_estimators=100), "RandomForestRegressor"),
    (CatBoostRegressor(iterations=100, silent=True), "CatBoostRegressor"),
    (AdaBoostRegressor(n_estimators=100), "AdaBoostRegressor"),
    (XGBRegressor(n_estimators=100, objective='reg:squarederror'), "XGBRegressor")
]

# Train and log each model
for model, model_name in models:
    train_and_log_model(model, model_name, X_train, y_train, X_test, y_test)


ValueError: 1D data passed to a transformer that expects 2D data. Try to specify the column selection as a list of one item instead of a scalar.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Sample data
data = {
    'insurance_validity': ['valid', 'expired', 'valid', 'expiring', 'valid'],
    'fuel_type': ['petrol', 'diesel', 'electric', 'petrol', 'diesel'],
    'ownership': ['first_owner', 'second_owner', 'first_owner', 'third_owner', 'second_owner'],
    'transmission': ['manual', 'automatic', 'manual', 'automatic', 'manual'],
    'manufacturing_year': [2015, 2016, 2017, None, 2016],  # Including a missing value
    'short_carname': ['carA', 'carB', 'carA', 'carC', 'carB'],
    'seats': [5, 5, 4, None, 5],  # Including a missing value
    'kms_driven': [50000, 30000, 40000, 35000, None],  # Including a missing value
    'mileage(kmpl)': [18.0, 20.0, 15.0, 17.5, None],  # Including a missing value
    'engine(cc)': [1500, 1600, 1200, 1400, 1300],
    'torque(Nm)': [200, 250, 180, 190, 210],
    'price(in lakhs)': [5.5, 6.0, 4.0, 5.0, 5.8]
}

df = pd.DataFrame(data)

# Splitting the data into training and test sets
X = df.drop('price(in lakhs)', axis=1)
y = df['price(in lakhs)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing for numerical features
numeric_features = ['seats', 'kms_driven', 'mileage(kmpl)', 'engine(cc)', 'torque(Nm)', 'manufacturing_year']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features excluding target encoding
categorical_features = ['insurance_validity', 'fuel_type', 'ownership', 'transmission']
categorical_transformer = ColumnTransformer(
    transformers=[
        # Label Encoding for insurance_validity
        ('insurance_validity', OrdinalEncoder(), ['insurance_validity']),

        # One-Hot Encoding for fuel_type and transmission
        ('fuel_type', OneHotEncoder(handle_unknown='ignore'), ['fuel_type']),
        ('transmission', OneHotEncoder(handle_unknown='ignore'), ['transmission']),

        # Ordinal Encoding for ownership
        ('ownership', OrdinalEncoder(categories=[['first_owner', 'second_owner', 'third_owner']]), ['ownership'])
    ],
    remainder='passthrough'  # Pass the numeric features through without transformation
)

# Create the full pipeline with target encoding and regressor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Function to train and log models
def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        # Create the full pipeline with the model
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('target_encoder', TargetEncoder(cols=['short_carname'])),
            ('regressor', model)
        ])
        
        # Fit the model
        full_pipeline.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = full_pipeline.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Log model and metrics in MLflow
        mlflow.sklearn.log_model(full_pipeline, model_name)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)

# Set the experiment
mlflow.set_experiment("Car Price Prediction")

# List of models to train
models = [
    (SGDRegressor(max_iter=1000, tol=1e-3), "SGDRegressor"),
    (RandomForestRegressor(n_estimators=100), "RandomForestRegressor"),
    (CatBoostRegressor(iterations=100, silent=True), "CatBoostRegressor"),
    (AdaBoostRegressor(n_estimators=100), "AdaBoostRegressor"),
    (XGBRegressor(n_estimators=100, objective='reg:squarederror'), "XGBRegressor")
]

# Train and log each model
for model, model_name in models:
    train_and_log_model(model, model_name, X_train, y_train, X_test, y_test)


ValueError: X does not contain the columns listed in cols

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Sample data
data = {
    'insurance_validity': ['valid', 'expired', 'valid', 'expiring', 'valid'],
    'fuel_type': ['petrol', 'diesel', 'electric', 'petrol', 'diesel'],
    'ownership': ['first_owner', 'second_owner', 'first_owner', 'third_owner', 'second_owner'],
    'transmission': ['manual', 'automatic', 'manual', 'automatic', 'manual'],
    'manufacturing_year': [2015, 2016, 2017, None, 2016],  # Including a missing value
    'short_carname': ['carA', 'carB', 'carA', 'carC', 'carB'],
    'seats': [5, 5, 4, None, 5],  # Including a missing value
    'kms_driven': [50000, 30000, 40000, 35000, None],  # Including a missing value
    'mileage(kmpl)': [18.0, 20.0, 15.0, 17.5, None],  # Including a missing value
    'engine(cc)': [1500, 1600, 1200, 1400, 1300],
    'torque(Nm)': [200, 250, 180, 190, 210],
    'price(in lakhs)': [5.5, 6.0, 4.0, 5.0, 5.8]
}

df = pd.DataFrame(data)

# Splitting the data into training and test sets
X = df.drop('price(in lakhs)', axis=1)
y = df['price(in lakhs)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Target encoding for 'short_carname'
target_encoder = TargetEncoder(cols=['short_carname'])
X_train['short_carname'] = target_encoder.fit_transform(X_train['short_carname'], y_train)
X_test['short_carname'] = target_encoder.transform(X_test['short_carname'])

# Preprocessing for numerical features
numeric_features = ['seats', 'kms_driven', 'mileage(kmpl)', 'engine(cc)', 'torque(Nm)', 'manufacturing_year', 'short_carname']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features excluding target encoding
categorical_features = ['insurance_validity', 'fuel_type', 'ownership', 'transmission']
categorical_transformer = ColumnTransformer(
    transformers=[
        # Label Encoding for insurance_validity
        ('insurance_validity', OrdinalEncoder(), ['insurance_validity']),

        # One-Hot Encoding for fuel_type and transmission
        ('fuel_type', OneHotEncoder(handle_unknown='ignore'), ['fuel_type']),
        ('transmission', OneHotEncoder(handle_unknown='ignore'), ['transmission']),

        # Ordinal Encoding for ownership
        ('ownership', OrdinalEncoder(categories=[['first_owner', 'second_owner', 'third_owner']]), ['ownership'])
    ],
    remainder='passthrough'  # Pass the numeric features through without transformation
)

# Create the full pipeline with target encoding and regressor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        # Create the full pipeline with the model
        full_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Fit the model
        full_pipeline.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = full_pipeline.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Log model and metrics in MLflow
        mlflow.sklearn.log_model(full_pipeline, model_name)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)

# Set the experiment
mlflow.set_experiment("Car Price Prediction")

# List of models to train
models = [
    (SGDRegressor(max_iter=1000, tol=1e-3), "SGDRegressor"),
    (RandomForestRegressor(n_estimators=100), "RandomForestRegressor"),
    (CatBoostRegressor(iterations=100, silent=True), "CatBoostRegressor"),
    (AdaBoostRegressor(n_estimators=100), "AdaBoostRegressor"),
    (XGBRegressor(n_estimators=100, objective='reg:squarederror'), "XGBRegressor")
]

# Train and log each model
for model, model_name in models:
    train_and_log_model(model, model_name, X_train, y_train, X_test, y_test)


ValueError: Found unknown categories ['expired'] in column 0 during transform

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
from sklearn.base import BaseEstimator, TransformerMixin

# Sample data
data = {
    'insurance_validity': ['valid', 'expired', 'valid', 'expiring', 'valid'],
    'fuel_type': ['petrol', 'diesel', 'electric', 'petrol', 'diesel'],
    'ownership': ['first_owner', 'second_owner', 'first_owner', 'third_owner', 'second_owner'],
    'transmission': ['manual', 'automatic', 'manual', 'automatic', 'manual'],
    'manufacturing_year': [2015, 2016, 2017, None, 2016],  # Including a missing value
    'short_carname': ['carA', 'carB', 'carA', 'carC', 'carB'],
    'seats': [5, 5, 4, None, 5],  # Including a missing value
    'kms_driven': [50000, 30000, 40000, 35000, None],  # Including a missing value
    'mileage(kmpl)': [18.0, 20.0, 15.0, 17.5, None],  # Including a missing value
    'engine(cc)': [1500, 1600, 1200, 1400, 1300],
    'torque(Nm)': [200, 250, 180, 190, 210],
    'price(in lakhs)': [5.5, 6.0, 4.0, 5.0, 5.8]
}

df = pd.DataFrame(data)

# Splitting the data into training and test sets
X = df.drop('price(in lakhs)', axis=1)
y = df['price(in lakhs)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom transformer for Target Encoding
class TargetEncodingTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols=None):
        self.cols = cols
        self.encoder = TargetEncoder(cols=self.cols)
    
    def fit(self, X, y=None):
        self.encoder.fit(X[self.cols], y)
        return self
    
    def transform(self, X, y=None):
        X[self.cols] = self.encoder.transform(X[self.cols])
        return X

# Preprocessing for numerical features
numeric_features = ['seats', 'kms_driven', 'mileage(kmpl)', 'engine(cc)', 'torque(Nm)', 'manufacturing_year', 'short_carname']
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),  # Using KNNImputer
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features excluding target encoding
categorical_features = ['insurance_validity', 'fuel_type', 'ownership', 'transmission']
categorical_transformer = ColumnTransformer(
    transformers=[
        # Label Encoding for insurance_validity
        ('insurance_validity', OrdinalEncoder(), ['insurance_validity']),

        # One-Hot Encoding for fuel_type and transmission
        ('fuel_type', OneHotEncoder(handle_unknown='ignore'), ['fuel_type']),
        ('transmission', OneHotEncoder(handle_unknown='ignore'), ['transmission']),

        # Ordinal Encoding for ownership
        ('ownership', OrdinalEncoder(categories=[['first_owner', 'second_owner', 'third_owner']]), ['ownership'])
    ],
    remainder='passthrough'  # Pass the numeric features through without transformation
)

# Create the full pipeline with target encoding and regressor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

def train_and_log_model(model, model_name, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        # Create the full pipeline with the model
        full_pipeline = Pipeline(steps=[
            ('target_encoder', TargetEncodingTransformer(cols=['short_carname'])),
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])
        
        # Fit the model
        full_pipeline.fit(X_train, y_train)
        
        # Predict on the test data
        y_pred = full_pipeline.predict(X_test)
        
        # Evaluate the model
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        r2 = r2_score(y_test, y_pred)
        
        # Log model and metrics in MLflow
        mlflow.sklearn.log_model(full_pipeline, model_name)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)

# Set the experiment
mlflow.set_experiment("Car Price Prediction")

# List of models to train
models = [
    (SGDRegressor(max_iter=1000, tol=1e-3), "SGDRegressor"),
    (RandomForestRegressor(n_estimators=100), "RandomForestRegressor"),
    (CatBoostRegressor(iterations=100, silent=True), "CatBoostRegressor"),
    (AdaBoostRegressor(n_estimators=100), "AdaBoostRegressor"),
    (XGBRegressor(n_estimators=100, objective='reg:squarederror'), "XGBRegressor")
]

# Train and log each model
for model, model_name in models:
    train_and_log_model(model, model_name, X_train, y_train, X_test, y_test)


ValueError: Found unknown categories ['expired'] in column 0 during transform