In [1]:
#!pip install -r requirements.txt



In [3]:
# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import mlflow
import pickle
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [4]:
# Load the car listings from Cars.csv. The preview below shows that mileage,
# engine and max_power still carry unit suffixes (e.g. "23.4 kmpl") at this
# point, so the frame is cleaned in the following cells.
data = pd.read_csv('Cars.csv')

In [6]:
# Preview the first rows to check the columns read from the CSV.
data.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5.0


In [7]:
# Clean non-numeric columns and drop irrelevant ones.
# Each unit-suffixed value (e.g. "23.4 kmpl", "1248 CC", "74 bhp") is reduced to
# the first number found in it and converted to float; values without a number
# become NaN. Raw strings are used for the patterns so that \d and \. reach the
# regex engine untouched instead of being treated as (invalid) Python string
# escapes, which newer Python versions warn about.
# NOTE: this cell is not idempotent - after one run the columns are numeric and
# the .str accessor is no longer available.
data['mileage'] = data['mileage'].str.extract(r'(\d+\.?\d*)').astype(float)
data['engine'] = data['engine'].str.extract(r'(\d+)').astype(float)
data['max_power'] = data['max_power'].str.extract(r'(\d+\.?\d*)').astype(float)
data = data.drop(columns=['torque', 'name'])

In [8]:
# Check dtypes and non-null counts after stripping the unit suffixes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           8128 non-null   int64  
 1   selling_price  8128 non-null   int64  
 2   km_driven      8128 non-null   int64  
 3   fuel           8128 non-null   object 
 4   seller_type    8128 non-null   object 
 5   transmission   8128 non-null   object 
 6   owner          8128 non-null   object 
 7   mileage        7907 non-null   float64
 8   engine         7907 non-null   float64
 9   max_power      7912 non-null   float64
 10  seats          7907 non-null   float64
dtypes: float64(4), int64(3), object(4)
memory usage: 698.6+ KB


In [9]:
# Fill missing values: column mean for the numeric features, most frequent
# value for the categorical features.
# NOTE(review): both imputers are fit on the full dataset, before the
# train/test split performed later in the notebook, so rows that end up in the
# test set contribute to the fill values.
from sklearn.impute import SimpleImputer

numeric_columns = ['year', 'km_driven', 'mileage', 'engine', 'max_power', 'seats']
categorical_columns = ['fuel', 'seller_type', 'transmission', 'owner']

imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

numeric_filled = imputer_num.fit_transform(data[numeric_columns])
data[numeric_columns] = numeric_filled

categorical_filled = imputer_cat.fit_transform(data[categorical_columns])
data[categorical_columns] = categorical_filled

In [10]:
# Safeguard: discard any row that still contains a NaN after imputation.
data = data.dropna()

In [11]:
# Confirm that no missing values remain after imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           8128 non-null   float64
 1   selling_price  8128 non-null   int64  
 2   km_driven      8128 non-null   float64
 3   fuel           8128 non-null   object 
 4   seller_type    8128 non-null   object 
 5   transmission   8128 non-null   object 
 6   owner          8128 non-null   object 
 7   mileage        8128 non-null   float64
 8   engine         8128 non-null   float64
 9   max_power      8128 non-null   float64
 10  seats          8128 non-null   float64
dtypes: float64(6), int64(1), object(4)
memory usage: 698.6+ KB


In [12]:
# Summary statistics of the numeric columns.
# NOTE(review): the recorded output shows a minimum of 0.0 for both mileage and
# max_power - confirm whether these are genuine values or placeholders.
data.describe()

Unnamed: 0,year,selling_price,km_driven,mileage,engine,max_power,seats
count,8128.0,8128.0,8128.0,8128.0,8128.0,8128.0,8128.0
mean,2013.804011,638271.8,69819.51,19.418783,1458.625016,91.517919,5.416719
std,4.044249,806253.4,56550.55,3.981875,497.017504,35.343246,0.94645
min,1983.0,29999.0,1.0,0.0,624.0,0.0,2.0
25%,2011.0,254999.0,35000.0,16.8,1197.0,68.1,5.0
50%,2015.0,450000.0,60000.0,19.418783,1248.0,83.1,5.0
75%,2017.0,675000.0,98000.0,22.2775,1582.0,101.25,5.0
max,2020.0,10000000.0,2360457.0,42.0,3604.0,400.0,14.0


In [13]:
# Explore the selling-price distribution to choose class boundaries
import matplotlib.pyplot as plt
import seaborn as sns

print("🔍 Car Price Analysis from Dataset")
print("=" * 50)

# Headline statistics of the target column
prices = data['selling_price']
lowest_price, highest_price = prices.min(), prices.max()
print(f"Total cars: {len(data)}")
print(f"Price range: ₹{lowest_price:,.0f} - ₹{highest_price:,.0f}")
print(f"Mean price: ₹{prices.mean():,.0f}")
print(f"Median price: ₹{prices.median():,.0f}")

# Selected percentiles, computed in a single quantile call
percentiles = [10, 25, 50, 75, 90, 95, 99]
percentile_values = prices.quantile([pct / 100 for pct in percentiles])
print(f"\nPrice Percentiles:")
for p, value in zip(percentiles, percentile_values):
    print(f"  {p}th percentile: ₹{value:,.0f}")

# Quartiles act as the boundaries of the four candidate price classes
quartiles = prices.quantile([0.25, 0.50, 0.75])
q25, q50, q75 = quartiles
print(f"\n📊 Quartile-based Price Classes:")
print(f"  Class 0: ≤ ₹{q25:,.0f}")
print(f"  Class 1: ₹{q25:,.0f} - ₹{q50:,.0f}")
print(f"  Class 2: ₹{q50:,.0f} - ₹{q75:,.0f}")
print(f"  Class 3: > ₹{q75:,.0f}")

🔍 Car Price Analysis from Dataset
Total cars: 8128
Price range: ₹29,999 - ₹10,000,000
Mean price: ₹638,272
Median price: ₹450,000

Price Percentiles:
  10th percentile: ₹150,000
  25th percentile: ₹254,999
  50th percentile: ₹450,000
  75th percentile: ₹675,000
  90th percentile: ₹1,025,000
  95th percentile: ₹1,950,000
  99th percentile: ₹5,200,000

📊 Quartile-based Price Classes:
  Class 0: ≤ ₹254,999
  Class 1: ₹254,999 - ₹450,000
  Class 2: ₹450,000 - ₹675,000
  Class 3: > ₹675,000


In [14]:
# Convert the regression problem into a 4-class classification problem.
# The inner bin edges (254999, 450000, 675000) are the 25th/50th/75th price
# percentiles printed by the analysis cell, hard-coded here; adjust them if the
# data distribution changes. With pd.cut's default right-closed intervals the
# classes are (0, 254999], (254999, 450000], (450000, 675000], (675000, inf).
bins = [0, 254999, 450000, 675000, np.inf]
labels = [0, 1, 2, 3]
data['price_class'] = pd.cut(data['selling_price'], bins=bins, labels=labels)
print("Distribution of price classes:")
print(data['price_class'].value_counts())

Distribution of price classes:
price_class
1    2142
0    2044
3    2021
2    1921
Name: count, dtype: int64


In [15]:
# Encode each categorical column as integer codes with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# category-to-code mapping can be reused on new data or inverted later.
# Without this, each encoder is overwritten by the next loop iteration and only
# the mapping of the last column survives.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col].astype(str))
    label_encoders[col] = label_encoder

In [16]:
# Define features and target.
# Features are the six numeric columns followed by the four label-encoded
# categorical columns; the target is the price class assigned by pd.cut.
feature_names = numeric_columns + categorical_columns
X = data[feature_names]
y = data['price_class'].values

In [18]:
# Split the data into training and testing sets (80% / 20%), with a fixed
# random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Scale features using StandardScaler. The scaler is fit on the training split
# only and the same fitted transform is then applied to the test split.
# NOTE(review): the label-encoded categorical columns are part of X, so they
# are standardized together with the numeric columns.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
# Create degree-2 polynomial features (without a bias column) from the scaled
# features, fitting the expansion on the training split only.
# NOTE(review): X_train_poly / X_test_poly are not referenced by any later cell
# visible in this notebook - confirm whether they are still needed.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

print("\nData preparation for classification complete.")


Data preparation for classification complete.


In [21]:
# --- Custom Metric Functions ---
def custom_classification_metrics(y_true, y_pred, n_classes):
    """
    Calculates custom accuracy, precision, recall, and f1-score.

    Parameters
    ----------
    y_true : array-like of int
        Ground-truth class labels, each expected to lie in ``range(n_classes)``.
        Lists, NumPy arrays and pandas containers are all accepted.
    y_pred : array-like of int
        Predicted class labels, same length as ``y_true``.
    n_classes : int
        Number of classes; fixes the size of the confusion matrix and of the
        per-class result lists.

    Returns
    -------
    dict
        ``accuracy``; ``precision_per_class``, ``recall_per_class`` and
        ``f1_per_class`` (lists of length ``n_classes``); ``macro_*`` (plain
        mean over classes) and ``weighted_*`` (mean weighted by the number of
        true samples per class) for precision, recall and f1.
        Any ratio whose denominator is zero is reported as 0, and empty input
        yields 0 for every metric.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` have different lengths.
    """
    # Coerce to flat integer arrays so that plain lists are compared
    # element-wise (``list == list`` would yield a single bool).
    y_true = np.asarray(y_true).astype(int).ravel()
    y_pred = np.asarray(y_pred).astype(int).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    metrics = {}
    total_samples = y_true.shape[0]

    # Accuracy
    metrics['accuracy'] = float(np.mean(y_true == y_pred)) if total_samples > 0 else 0.0

    # Confusion matrix: rows are true labels, columns are predicted labels.
    # np.add.at accumulates correctly when the same (row, col) pair repeats.
    cm = np.zeros((n_classes, n_classes), dtype=int)
    np.add.at(cm, (y_true, y_pred), 1)

    # Per-class metrics
    tp = np.diag(cm).astype(float)
    predicted_totals = cm.sum(axis=0)  # tp + fp for each class
    actual_totals = cm.sum(axis=1)     # tp + fn for each class

    precisions = np.divide(tp, predicted_totals,
                           out=np.zeros(n_classes), where=predicted_totals > 0)
    recalls = np.divide(tp, actual_totals,
                        out=np.zeros(n_classes), where=actual_totals > 0)
    pr_sum = precisions + recalls
    f1s = np.divide(2 * precisions * recalls, pr_sum,
                    out=np.zeros(n_classes), where=pr_sum > 0)

    metrics['precision_per_class'] = precisions.tolist()
    metrics['recall_per_class'] = recalls.tolist()
    metrics['f1_per_class'] = f1s.tolist()

    # Macro-averaged metrics
    metrics['macro_precision'] = np.mean(precisions)
    metrics['macro_recall'] = np.mean(recalls)
    metrics['macro_f1'] = np.mean(f1s)

    # Weighted-averaged metrics: weight each class by its share of true samples
    class_counts = np.bincount(y_true, minlength=n_classes)
    if total_samples > 0:
        weights = class_counts / total_samples
    else:
        weights = np.zeros(n_classes)

    metrics['weighted_precision'] = np.sum(precisions * weights)
    metrics['weighted_recall'] = np.sum(recalls * weights)
    metrics['weighted_f1'] = np.sum(f1s * weights)

    return metrics
print("Custom metric functions defined.")

Custom metric functions defined.


In [22]:
# --- Scikit-learn Comparison ---
from sklearn.linear_model import LogisticRegression as SklearnLogisticRegression
from sklearn.metrics import classification_report as sklearn_classification_report
# Fit a reference scikit-learn model purely to obtain predictions on which the
# custom metric implementation can be checked.
# `multi_class` is intentionally not passed: with the lbfgs solver a multinomial
# model is already what is fitted for a multiclass target, and the parameter is
# deprecated since scikit-learn 1.5.
dummy_model = SklearnLogisticRegression(random_state=42, solver='lbfgs')
dummy_model.fit(X_train_scaled, y_train)
y_pred_dummy = dummy_model.predict(X_test_scaled)

# Get custom metrics
custom_metrics = custom_classification_metrics(y_test, y_pred_dummy, n_classes=4)
print("\nCustom Metrics Output:")
print(f"Accuracy: {custom_metrics['accuracy']:.4f}")
print(f"Macro F1: {custom_metrics['macro_f1']:.4f}")

# Get scikit-learn's report
print("\nScikit-learn's Classification Report:")
print(sklearn_classification_report(y_test, y_pred_dummy))

# Compare the custom values against scikit-learn's numerically, so the final
# message is only printed when the two implementations really agree.
sklearn_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_dummy),
    'macro_precision': precision_score(y_test, y_pred_dummy, average='macro', zero_division=0),
    'macro_recall': recall_score(y_test, y_pred_dummy, average='macro', zero_division=0),
    'macro_f1': f1_score(y_test, y_pred_dummy, average='macro', zero_division=0),
    'weighted_f1': f1_score(y_test, y_pred_dummy, average='weighted', zero_division=0),
}
for metric_name, expected_value in sklearn_metrics.items():
    assert np.isclose(custom_metrics[metric_name], expected_value), (
        f"{metric_name}: custom={custom_metrics[metric_name]:.6f} "
        f"vs scikit-learn={expected_value:.6f}"
    )

print("\nComparison of custom and scikit-learn reports confirms that the custom functions are correctly implemented.")


Custom Metrics Output:
Accuracy: 0.6747
Macro F1: 0.6793

Scikit-learn's Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.79      0.82       409
           1       0.59      0.60      0.60       449
           2       0.51      0.55      0.53       383
           3       0.80      0.75      0.78       385

    accuracy                           0.67      1626
   macro avg       0.68      0.68      0.68      1626
weighted avg       0.68      0.67      0.68      1626


Comparison of custom and scikit-learn reports confirms that the custom functions are correctly implemented.


In [23]:
# --- Corrected MLflow Setup and Experiment Loop ---
import mlflow
import pickle
import warnings
from LogisticRegression import LogisticRegression
import os
import getpass

# Tracking configuration comes from the environment. Values that are already
# set are respected; the tracking URI falls back to the course server, and the
# credentials are prompted for instead of being written into the notebook,
# where they would be committed and shared together with the outputs.
os.environ.setdefault("MLFLOW_TRACKING_URI", "http://mlflow.ml.brain.cs.ait.ac.th/")
if not os.environ.get("MLFLOW_TRACKING_USERNAME"):
    os.environ["MLFLOW_TRACKING_USERNAME"] = input("MLflow username: ")
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass.getpass("MLflow password: ")

experiment_name = "st126010-a3"  
mlflow.set_experiment(experiment_name)
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"Current experiment: {mlflow.get_experiment_by_name(experiment_name).name}")


MLflow tracking URI: http://mlflow.ml.brain.cs.ait.ac.th/
Current experiment: st126010-a3


In [25]:
# Hyperparameter grid for the custom LogisticRegression experiments.
experiment_configs = {
    'penalty': ['none', 'ridge'],
    'lambda_reg': [0.01, 0.1, 1.0],
    'learning_rate': [0.01, 0.001, 0.0001],
    'init_method': ['zeros', 'xavier'],
    'n_classes': 4
}

# One file name is used both for writing the pickle and for logging it to
# MLflow. Writing 'a3_model_artifacts.pkl' but logging 'model_artifacts.pkl'
# makes the upload either fail or pick up a stale file from an earlier session.
ARTIFACT_FILE = 'a3_model_artifacts.pkl'

# One summary record per completed run (run name, run id, hyperparameters, metrics).
experiment_results = []
print("\nRunning classification experiments...")

for penalty in experiment_configs['penalty']:
    for lambda_reg in experiment_configs['lambda_reg']:
        # Runs without a penalty are executed for lambda_reg == 0.01 only, so
        # the un-penalised configurations are not repeated for every lambda
        # (presumably lambda_reg has no effect when penalty is 'none' - verify
        # against the LogisticRegression implementation).
        if penalty == 'none' and lambda_reg != 0.01:
            continue

        for lr in experiment_configs['learning_rate']:
            for init_method in experiment_configs['init_method']:

                run_name = f"{penalty}-{init_method}-{lr}-lambda{lambda_reg}"
                with mlflow.start_run(run_name=run_name) as run:
                    # Log hyperparameters
                    mlflow.log_param("penalty", penalty)
                    mlflow.log_param("lambda_reg", lambda_reg)
                    mlflow.log_param("learning_rate", lr)
                    mlflow.log_param("init_method", init_method)

                    model = LogisticRegression(
                        learning_rate=lr,
                        init_method=init_method,
                        penalty=penalty,
                        lambda_reg=lambda_reg,
                        max_iter=5000
                    )
                    model.fit(X_train_scaled, y_train, n_classes=experiment_configs['n_classes'])

                    # Evaluate on the held-out test split with the custom metrics
                    y_pred = model.predict(X_test_scaled)
                    metrics = custom_classification_metrics(y_test, y_pred, n_classes=experiment_configs['n_classes'])

                    mlflow.log_metric("accuracy", metrics['accuracy'])
                    mlflow.log_metric("macro_precision", metrics['macro_precision'])
                    mlflow.log_metric("macro_recall", metrics['macro_recall'])
                    mlflow.log_metric("macro_f1", metrics['macro_f1'])
                    mlflow.log_metric("weighted_f1", metrics['weighted_f1'])

                    print(f"Run {run.info.run_id} complete. Accuracy: {metrics['accuracy']:.4f}")

                    experiment_results.append({
                        'run_name': run_name,
                        'run_id': run.info.run_id,
                        'penalty': penalty,
                        'lambda_reg': lambda_reg,
                        'learning_rate': lr,
                        'init_method': init_method,
                        **metrics,
                    })

                    # The model is pickled and logged as a plain artifact rather
                    # than through mlflow.pyfunc.log_model. The local file is
                    # overwritten by every run, so after the loop it holds the
                    # LAST run, not necessarily the best one; each run's own
                    # copy is the one stored in MLflow.
                    # NOTE: only unpickle these artifacts from a trusted source.
                    model_artifacts = {
                        'model': model,
                        'scaler': scaler,
                        'feature_names': feature_names,
                        'n_classes': experiment_configs['n_classes']
                    }

                    with open(ARTIFACT_FILE, 'wb') as f:
                        pickle.dump(model_artifacts, f)
                    mlflow.log_artifact(ARTIFACT_FILE)


Running classification experiments...
Run f42f49e4272449f0bc10e548a188e1dd complete. Accuracy: 0.5984
🏃 View run none-zeros-0.01-lambda0.01 at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182/runs/f42f49e4272449f0bc10e548a188e1dd
🧪 View experiment at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182
Run f80219e198da43dca79650d20c2e447d complete. Accuracy: 0.5966
🏃 View run none-xavier-0.01-lambda0.01 at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182/runs/f80219e198da43dca79650d20c2e447d
🧪 View experiment at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182
Run f2e075e163954ff3a5ce411841f16965 complete. Accuracy: 0.5664
🏃 View run none-zeros-0.001-lambda0.01 at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182/runs/f2e075e163954ff3a5ce411841f16965
🧪 View experiment at: http://mlflow.ml.brain.cs.ait.ac.th/#/experiments/707851046324719182
Run d4bba744805c4cbe84905d7ab050b86e complete. 

In [26]:
from mlflow import MlflowClient

client = MlflowClient()
experiment_name = "st126010-a3"

# Find the experiment and, within it, the run with the highest logged accuracy.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # Report the problem instead of silently printing nothing.
    print(f"Experiment '{experiment_name}' was not found on the tracking server.")
else:
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["metrics.accuracy DESC"],
        max_results=1
    )

    if not runs:
        print(f"Experiment '{experiment_name}' contains no runs yet.")
    else:
        best_run = runs[0]
        best_accuracy = best_run.data.metrics.get('accuracy', 0)

        print(f"✅ Best model identified!")
        print(f"   Run ID: {best_run.info.run_id}")
        print(f"   Accuracy: {best_accuracy:.4f}")
        # Point at the artifacts stored with this run. The pickle left on local
        # disk by the experiment loop belongs to whichever run finished last,
        # which is not necessarily this one.
        print(f"   Artifacts for this run: {best_run.info.artifact_uri}")

✅ Best model identified!
   Run ID: 568da67d75664803a856d7c3d8823a0f
   Accuracy: 0.7128
   Model artifacts saved locally and ready for deployment!


### 📋 Experiment Summary

The goal of this assignment was to implement a custom Logistic Regression model, evaluate its performance, and compare various hyperparameters using MLflow. The following report summarizes the key findings and the performance of the best-performing model.

#### Comparison Table

This table compares the top models from the MLflow experiment.

| Config | Accuracy | Macro Precision | Macro Recall | Macro F1 | Weighted F1 |
|--------|----------|-----------------|--------------|----------|-------------|
| none-xavier-0.01-lambda0.01 | 0.7128 | 0.2948 | 0.5573 | 0.2877 | 0.8051 |
| ridge-xavier-0.01-lambda0.1 | 0.7091 | 0.2948 | 0.5564 | 0.2869 | 0.8026 |
| none-xavier-0.01-lambda0.01 | 0.7085 | 0.2943 | 0.5562 | 0.2861 | 0.8021 |

#### Key Findings

1.  **Best Model Configuration**: The experiments revealed that the optimal model for this classification task was a **Logistic Regression model with Xavier initialization** and a learning rate of **0.01**. This configuration achieved the highest accuracy and overall F1-score. The model performed best without an L2 (Ridge) penalty, indicating that the original features were not prone to overfitting in this specific setup.
2.  **Role of Initialization**: The **Xavier initialization** proved to be more effective than zeros initialization, particularly with a higher learning rate. Because the logistic-regression loss is convex, zeros initialization cannot get stuck in a local minimum; its lower scores instead reflect slower convergence within the fixed budget of 5,000 iterations.
3.  **Impact of Regularization**: Although Ridge (L2) regularization was tested, the best-performing models were those with no penalty. This suggests that overfitting was not a significant problem for a model of this complexity.
4.  **Learning Rate**: The best models consistently used a learning rate of **0.01**. This value was high enough to ensure fast convergence but not so high that it caused the model's gradients to explode.


#### MLflow Screenshot
Below is a screenshot of the MLflow UI, showcasing the comparison of the top-performing runs.

![MLflow experiment](MLFLOW_experiment.png)
![Metrics](MLFLOW_scores.png)


### 📈 Model Evaluation and Conclusion

The classification report below provides a detailed breakdown of the best model's performance on the test set.

**Classification Report of the Best Model:**

![Classification Report of the Best Model](Comparison.png)