In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    ConfusionMatrixDisplay,
)

In [None]:
# Classification: Can we predict whether a particular car model will be a "best-seller" (yes/no) based on
# features, historical sales data, and marketing efforts.
file_path = "../data/processed/car_sales_data_with_synthetic_features.csv"
df = pd.read_csv(file_path)

df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23906 entries, 0 to 23905
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Car_id            23906 non-null  object
 1   Date              23906 non-null  object
 2   Customer Name     23905 non-null  object
 3   Gender            23906 non-null  object
 4   Annual Income     23906 non-null  int64 
 5   Dealer_Name       23906 non-null  object
 6   Company           23906 non-null  object
 7   Model             23906 non-null  object
 8   Engine            23906 non-null  object
 9   Transmission      23906 non-null  object
 10  Color             23906 non-null  object
 11  Price ($)         23906 non-null  int64 
 12  Dealer_No         23906 non-null  object
 13  Body Style        23906 non-null  object
 14  Phone             23906 non-null  int64 
 15  Dealer_Region     23906 non-null  object
 16  Mileage_km_total  23906 non-null  int64 
 17  Crash_Test_S

Unnamed: 0,Car_id,Date,Customer Name,Gender,Annual Income,Dealer_Name,Company,Model,Engine,Transmission,Color,Price ($),Dealer_No,Body Style,Phone,Dealer_Region,Mileage_km_total,Crash_Test_Score,Family_Size
0,C_CND_000001,1/2/2022,Geraldine,Male,13500,Buddy Storbeck's Diesel Service Inc,Ford,Expedition,DoubleÂ Overhead Camshaft,Auto,Black,26000,06457-3834,SUV,8264678,Middletown,68468,4,2
1,C_CND_000002,1/2/2022,Gia,Male,1480000,C & M Motors Inc,Dodge,Durango,DoubleÂ Overhead Camshaft,Auto,Black,19000,60504-7114,SUV,6848189,Aurora,64053,3,2
2,C_CND_000003,1/2/2022,Gianna,Male,1035000,Capitol KIA,Cadillac,Eldorado,Overhead Camshaft,Manual,Red,31500,38701-8047,Passenger,7298798,Greenville,24942,5,3


In [49]:
# Extract year and month from the 'date' column
df["Date"] = pd.to_datetime(df["Date"])
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month

# Combine company and model into a single 'Model' column
df["Full_Model"] = df["Company"] + " " + df["Model"]

# Select relevant features for classification
features = [
    "year",
    "month",
    "Full_Model",
    "Engine",
    "Transmission",
    "Color",
    "Body Style",
    "Mileage_km_total",
    "Crash_Test_Score",
    "Family_Size",
]

df = df[features]

# Rename columns for clarity
df2 = df.rename(
    columns={
        "Full_Model": "full_model",
        "Engine": "engine",
        "Transmission": "transmission",
        "Color": "color",
        "Body Style": "body_style",
        "Mileage_km_total": "mileage_km",
        "Crash_Test_Score": "crash_test_score",
        "Family_Size": "family_size",
    }
)

# Parse types of features
df2 = df2.astype(
    {
        "year": "int",
        "month": "int",
        "crash_test_score": "int",
        "family_size": "int",
        "mileage_km": "int",
    }
)

print(df2.info())
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23906 entries, 0 to 23905
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   year              23906 non-null  int64 
 1   month             23906 non-null  int64 
 2   full_model        23906 non-null  object
 3   engine            23906 non-null  object
 4   transmission      23906 non-null  object
 5   color             23906 non-null  object
 6   body_style        23906 non-null  object
 7   mileage_km        23906 non-null  int64 
 8   crash_test_score  23906 non-null  int64 
 9   family_size       23906 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 1.8+ MB
None


Unnamed: 0,year,month,full_model,engine,transmission,color,body_style,mileage_km,crash_test_score,family_size
0,2022,1,Ford Expedition,DoubleÂ Overhead Camshaft,Auto,Black,SUV,68468,4,2
1,2022,1,Dodge Durango,DoubleÂ Overhead Camshaft,Auto,Black,SUV,64053,3,2
2,2022,1,Cadillac Eldorado,Overhead Camshaft,Manual,Red,Passenger,24942,5,3
3,2022,1,Toyota Celica,Overhead Camshaft,Manual,Pale White,SUV,108673,3,3
4,2022,1,Acura TL,DoubleÂ Overhead Camshaft,Auto,Red,Hatchback,51660,4,1


In [50]:
# Add a new feature to indicate if the car model is a best-seller
# For simplicity, let's assume a model is a best-seller if it is in top 3 models sold in a given year and month.

# Aggregate by year, month, and full_model to get total number of sales
df2_agg = (
    df2.groupby(["year", "month", "full_model"])["year"].count().reset_index(name="total_sales")
)
# Find rank of each full_model within a year and a month based on total_sales
df2_agg["rank"] = df2_agg.groupby(["year", "month"])["total_sales"].rank(method="first", ascending=False)
df2_agg["monthly_best_seller"] = (df2_agg["rank"] <= 3).astype(int)
df2_agg.drop(columns=["rank", "total_sales"], inplace=True)

# Merge the aggregated data back to the original DataFrame
df3 = df2.merge(df2_agg, on=["year", "month", "full_model"], how="left")

df3.head()

Unnamed: 0,year,month,full_model,engine,transmission,color,body_style,mileage_km,crash_test_score,family_size,monthly_best_seller
0,2022,1,Ford Expedition,DoubleÂ Overhead Camshaft,Auto,Black,SUV,68468,4,2,0
1,2022,1,Dodge Durango,DoubleÂ Overhead Camshaft,Auto,Black,SUV,64053,3,2,0
2,2022,1,Cadillac Eldorado,Overhead Camshaft,Manual,Red,Passenger,24942,5,3,1
3,2022,1,Toyota Celica,Overhead Camshaft,Manual,Pale White,SUV,108673,3,3,0
4,2022,1,Acura TL,DoubleÂ Overhead Camshaft,Auto,Red,Hatchback,51660,4,1,0


In [51]:
# Prepare data for model training and testing

test_cond = (df3["year"] == 2023) & (df3["month"] >= 10)
train_df = df3[~test_cond]
test_df = df3[test_cond]
X_train = train_df.drop(columns=["monthly_best_seller"])
y_train = train_df["monthly_best_seller"]
X_test = test_df.drop(columns=["monthly_best_seller"])
y_test = test_df["monthly_best_seller"]

print("Shapes:")
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

dist = np.bincount(y_train)
print("\nClass distribution:", dist)
print("First class:", 100 - dist[1] / dist[0] * 100)
print("Second class:", dist[1] / dist[0] * 100)

Shapes:
(19140, 10) (19140,)
(4766, 10) (4766,)

Class distribution: [17935  1205]
First class: 93.28129356007805
Second class: 6.71870643992194


In [52]:
numeric_features = ["year", "month", "mileage_km", "crash_test_score", "family_size"]
categorical_features = ["full_model", "engine", "transmission", "color", "body_style"]
weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(y_train), y=y_train
)
class_weights = dict(zip(np.unique(y_train), weights))

In [53]:
def grid_search_pipeline(pipeline, extra_params: dict = None):
    param_grid = {
        # OneHotEncoder hyperparameters
        "preprocessor__cat__onehot__drop": [
            "first",
            None,
        ],  # Drop first column to avoid collinearity
    }
    if extra_params is not None:
        param_grid.update(extra_params)
    grid = GridSearchCV(pipeline, param_grid, scoring="f1", cv=5)
    grid.fit(X_train, y_train)

    print("Best params:", grid.best_params_)
    print("Best AUC:", grid.best_score_)
    return grid.best_estimator_

In [54]:
def evaluate_model(model):
    y_pred = model.predict(X_test)

    # Predict probabilities (for ROC, AUC, etc.)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_proba))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)

In [55]:
pipeline_1 = Pipeline(
    [
        (
            "preprocessor",
            ColumnTransformer(
                [
                    ("num", Pipeline([("scaler", StandardScaler())]), numeric_features),
                    (
                        "cat",
                        Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
                        categorical_features,
                    ),
                ]
            ),
        ),
        ("classifier", LogisticRegression(max_iter=1000, class_weight=class_weights)),
    ]
)
pipeline_1

In [56]:
from sklearn.ensemble import RandomForestClassifier

pipeline_2 = Pipeline(
    [
        (
            "preprocessor",
            ColumnTransformer(
                [
                    ("num", Pipeline([("scaler", StandardScaler())]), numeric_features),
                    (
                        "cat",
                        Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
                        categorical_features,
                    ),
                ]
            ),
        ),
        (
            "classifier",
            RandomForestClassifier(),
        ),
    ]
)
pipeline_2

In [57]:
from xgboost import XGBClassifier

pipeline_3 = Pipeline(
    [
        (
            "preprocessor",
            ColumnTransformer(
                [
                    ("num", Pipeline([("scaler", StandardScaler())]), numeric_features),
                    (
                        "cat",
                        Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
                        categorical_features,
                    ),
                ]
            ),
        ),
        (
            "classifier",
            XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
        ),
    ]
)
pipeline_3

In [58]:
from lightgbm import LGBMClassifier

pipeline_4 = Pipeline(
    [
        (
            "preprocessor",
            ColumnTransformer(
                [
                    ("num", Pipeline([("scaler", StandardScaler())]), numeric_features),
                    (
                        "cat",
                        Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore"))]),
                        categorical_features,
                    ),
                ]
            ),
        ),
        (
            "classifier",
            LGBMClassifier(verbose=-1, random_state=42),
        ),
    ]
)
pipeline_4

In [None]:
model_1 = grid_search_pipeline(
    pipeline_1,
    # extra_params={
    #     "classifier__C": [0.01, 0.1, 1, 100],
    #     "classifier__penalty": ["l2", "elasticnet"],
    #     "classifier__solver": ["liblinear", "lbfgs"],
    #     "classifier__fit_intercept": [True, False],
    #     "classifier__class_weight": [class_weights, "balanced", None],
    #     "classifier__l1_ratio": [None, 0.25, 0.5, 0.75],
    # },
    extra_params={
        # LogisticRegression hyperparameters
        "classifier__class_weight": [class_weights, "balanced", None],
        "classifier__penalty": ["l1", "l2", "elasticnet"],
        "classifier__C": [0.01, 0.1, 1.0, 10.0],
        "classifier__solver": ["liblinear", "saga", "lbfgs"],
        "classifier__l1_ratio": [None, 0.25, 0.5, 0.75],  # Only used with elasticnet
        "classifier__fit_intercept": [True, False],
        # OneHotEncoder options
        "preprocessor__cat__onehot__drop": [None, "first"],
        "preprocessor__cat__onehot__sparse_output": [
            False
        ],  # Needed for dense matrix ops
        "preprocessor__cat__onehot__min_frequency": [None, 5],
        "preprocessor__cat__onehot__max_categories": [None, 20, 50],
        # StandardScaler options (less critical but tunable)
        "preprocessor__num__scaler__with_mean": [True],
        "preprocessor__num__scaler__with_std": [True],
    },
)
evaluate_model(model_1)



In [None]:
model_2 = grid_search_pipeline(
    pipeline_2,
    extra_params={
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [None, 10, 20],
        "classifier__min_samples_split": [2, 5],
        "classifier__min_samples_leaf": [1, 2],
        "classifier__class_weight": [None, "balanced", class_weights],
    },
)
evaluate_model(model_2)

In [None]:
model_3 = grid_search_pipeline(
    pipeline_3,
    extra_params={
        "classifier__n_estimators": [100, 300],
        "classifier__max_depth": [3, 6, 10],
        "classifier__learning_rate": [0.01, 0.1],
        "classifier__subsample": [0.8, 1],
        "classifier__colsample_bytree": [0.8, 1],
        "classifier__scale_pos_weight": [1, 3],  # if imbalance
    },
)
evaluate_model(model_3)

In [None]:
model_4 = grid_search_pipeline(
    pipeline_4,
    extra_params={
        "classifier__n_estimators": [100, 300],
        "classifier__learning_rate": [0.01, 0.1],
        "classifier__num_leaves": [15, 31, 63],
        "classifier__max_depth": [-1, 10],
        "classifier__min_child_samples": [10, 20],
        "classifier__subsample": [0.8, 1],
        "classifier__colsample_bytree": [0.8, 1],
    },
)
evaluate_model(model_4)

In [None]:
import pickle

def dump_model(model, filename):
    with open(filename, "wb") as f:
        pickle.dump(model, f)

def load_model(filename):
    with open(filename, "rb") as f:
        return pickle.load(f)


dump_model(model=None, filename="../model/model.pkl")
# Load with
# load_model(filename="../model/model.pkl")