In [None]:
import os

import joblib
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load dataset
# The CSV location can be overridden with the CARDEKHO_CSV environment variable
# so the notebook can run on machines other than the author's. When the variable
# is unset, the original hard-coded path is used unchanged.
DATA_PATH = os.environ.get(
    'CARDEKHO_CSV',
    'E://AIT-PhD//2025 Sem1//Sem1 Subjects//Machine Learning//Machine learning//Assignment//cardekho.csv',
)
df = pd.read_csv(DATA_PATH)
df.head(10)


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage(km/ltr/kg),engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14,1197.0,81.86,5.0
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3,1061.0,57.5,5.0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1,796.0,37.0,4.0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59,1364.0,67.1,5.0
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0,1399.0,68.1,5.0


In [None]:
# Convert selling price into four categories labelled 0-3.
# With pd.cut's default right-closed intervals the bins are
# (0, 300000], (300000, 600000], (600000, 900000] and (900000, inf].
df['price_category'] = pd.cut(
    df['selling_price'],
    bins=[0, 300000, 600000, 900000, np.inf],
    labels=[0, 1, 2, 3],
)

# Features and target.
# 'Unnamed: 0' appears to be the row number saved into the CSV (0, 1, 2, ... in
# the preview above); it says nothing about the car, so it is excluded from the
# features. errors='ignore' keeps the cell working if the CSV lacks that column.
X = df.drop(columns=['selling_price', 'price_category', 'Unnamed: 0'], errors='ignore')  # Features
y = df['price_category']  # Target



In [None]:
# Identify categorical (object-dtype) columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Apply OneHotEncoding to categorical variables.
# sparse_output=False makes fit_transform return a dense array that can be
# wrapped in a DataFrame directly.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Build the encoded frame with its column names and, importantly, X's own index:
# the next cell joins it to X with pd.concat(axis=1), which aligns on index
# labels. A default RangeIndex here would silently misalign (and introduce NaN
# rows) as soon as X's index is anything other than 0..n-1.
X_encoded = pd.DataFrame(
    encoder.fit_transform(X[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)



In [None]:
# Drop original categorical columns and concatenate encoded ones
X = X.drop(columns=categorical_columns)
X = pd.concat([X, X_encoded], axis=1)

# Handle missing values: every NaN is replaced by its column's most frequent value.
# index=X.index keeps the imputed frame aligned with y; without it the result
# would get a fresh 0..n-1 index regardless of X's original labels.
# NOTE(review): the imputer is fit on all rows before the train/test split, so
# test rows influence the fill values - consider fitting on the training split only.
imputer = SimpleImputer(strategy='most_frequent')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

# The train/test split is performed once, stratified, in the following cell.
# The unstratified split that used to sit here was overwritten immediately and
# has been removed.



In [None]:
# Hold out 20% of the rows for testing. stratify=y is passed so both splits are
# stratified on the price-category target; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training size: {X_train.shape}, Testing size: {X_test.shape}")

Training size: (6502, 2398), Testing size: (1626, 2398)


In [None]:
class CustomLogisticRegression:
    """Wrapper around scikit-learn's ``LogisticRegression`` plus hand-written metrics.

    Parameters
    ----------
    penalty : str, default 'none'
        ``'l2'`` trains an L2-regularised model; any other value trains an
        unpenalised model (``penalty=None`` is passed to scikit-learn).
    C : float, default 1.0
        Forwarded to scikit-learn as ``C``; only used when ``penalty == 'l2'``.

    Attributes
    ----------
    model : LogisticRegression or None
        The fitted estimator, or ``None`` until :meth:`fit` has been called.

    Notes
    -----
    All metric methods accept any array-like of labels (lists, tuples, NumPy
    arrays, pandas Series); inputs are converted with ``np.asarray`` before
    comparison so that plain Python sequences are compared element-wise.
    """

    def __init__(self, penalty='none', C=1.0):
        self.penalty = penalty
        self.C = C
        self.model = None  # populated by fit()

    def fit(self, X_train, y_train):
        """Create the underlying estimator and fit it on the training data.

        Returns ``self`` so calls can be chained.
        """
        # NOTE(review): `multi_class` is reported as deprecated by recent
        # scikit-learn releases - confirm against the installed version before
        # removing it, since dropping it may change the two-class behaviour.
        shared_options = dict(solver='lbfgs', multi_class='multinomial', max_iter=1000)
        if self.penalty == 'l2':
            self.model = LogisticRegression(penalty='l2', C=self.C, **shared_options)
        else:
            self.model = LogisticRegression(penalty=None, **shared_options)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X_test):
        """Return the fitted estimator's predictions for ``X_test``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.model is None:
            raise RuntimeError("CustomLogisticRegression.predict() called before fit().")
        return self.model.predict(X_test)

    @staticmethod
    def _as_label_arrays(y_true, y_pred):
        """Convert both label sequences to NumPy arrays and check the shapes match."""
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
            )
        return y_true, y_pred

    def accuracy(self, y_true, y_pred):
        """Fraction of positions where ``y_pred`` equals ``y_true``."""
        y_true, y_pred = self._as_label_arrays(y_true, y_pred)
        return np.mean(y_true == y_pred)

    def precision(self, y_true, y_pred, class_label):
        """TP / (TP + FP) for ``class_label``; 0.0 when the class is never predicted."""
        y_true, y_pred = self._as_label_arrays(y_true, y_pred)
        predicted = y_pred == class_label
        tp = np.sum(predicted & (y_true == class_label))
        fp = np.sum(predicted & (y_true != class_label))
        return tp / (tp + fp) if (tp + fp) > 0 else 0.0

    def recall(self, y_true, y_pred, class_label):
        """TP / (TP + FN) for ``class_label``; 0.0 when the class never occurs in ``y_true``."""
        y_true, y_pred = self._as_label_arrays(y_true, y_pred)
        actual = y_true == class_label
        tp = np.sum((y_pred == class_label) & actual)
        fn = np.sum((y_pred != class_label) & actual)
        return tp / (tp + fn) if (tp + fn) > 0 else 0.0

    def f1_score(self, y_true, y_pred, class_label):
        """Harmonic mean of precision and recall for ``class_label``; 0.0 when both are zero."""
        p = self.precision(y_true, y_pred, class_label)
        r = self.recall(y_true, y_pred, class_label)
        return (2 * p * r) / (p + r) if (p + r) > 0 else 0.0

    def macro_avg(self, y_true, y_pred):
        """Unweighted mean of per-class precision, recall and F1.

        The classes averaged over are the distinct labels in ``y_true``.
        Returns a ``(precision, recall, f1)`` tuple.
        """
        y_true, y_pred = self._as_label_arrays(y_true, y_pred)
        classes = np.unique(y_true)
        precision = np.mean([self.precision(y_true, y_pred, c) for c in classes])
        recall = np.mean([self.recall(y_true, y_pred, c) for c in classes])
        f1 = np.mean([self.f1_score(y_true, y_pred, c) for c in classes])
        return precision, recall, f1

    def weighted_avg(self, y_true, y_pred):
        """Per-class precision, recall and F1 averaged with class-frequency weights.

        Each class in ``y_true`` is weighted by its share of ``y_true``.
        Returns a ``(precision, recall, f1)`` tuple.
        """
        y_true, y_pred = self._as_label_arrays(y_true, y_pred)
        classes, counts = np.unique(y_true, return_counts=True)
        weights = counts / len(y_true)
        precision = np.sum([w * self.precision(y_true, y_pred, c) for w, c in zip(weights, classes)])
        recall = np.sum([w * self.recall(y_true, y_pred, c) for w, c in zip(weights, classes)])
        f1 = np.sum([w * self.f1_score(y_true, y_pred, c) for w, c in zip(weights, classes)])
        return precision, recall, f1

# Example usage: fit an L2-penalised model on the training split, predict the
# held-out split, then print the hand-computed metrics one per line.
clf = CustomLogisticRegression(C=1.0, penalty='l2')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

for label, score in (
    ("Accuracy:", clf.accuracy(y_test, y_pred)),
    ("Macro Precision, Recall, F1:", clf.macro_avg(y_test, y_pred)),
    ("Weighted Precision, Recall, F1:", clf.weighted_avg(y_test, y_pred)),
):
    print(label, score)



Accuracy: 0.5947109471094711
Macro Precision, Recall, F1: (0.579514570284325, 0.5579483530384859, 0.547386364366482)
Weighted Precision, Recall, F1: (0.5808210435914234, 0.594710947109471, 0.5697266654311377)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# scikit-learn's own report for the same predictions, as a cross-check of the
# hand-computed metrics printed above. classification_report is already
# imported in the notebook's first cell, so it is not re-imported here.
print("Sklearn Classification Report:")
print(classification_report(y_test, y_pred))

Sklearn Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.74      0.72       527
           1       0.52      0.68      0.59       585
           2       0.44      0.15      0.22       310
           3       0.66      0.67      0.66       204

    accuracy                           0.59      1626
   macro avg       0.58      0.56      0.55      1626
weighted avg       0.58      0.59      0.57      1626



In [None]:
# NOTE(review): the recorded output of this cell is an SSL hostname-mismatch
# error for this tracking URI. Confirm the server address / certificate rather
# than disabling certificate verification.
mlflow.set_tracking_uri("https://mlflow.cs.ait.ac.th/")
mlflow.set_experiment("st125675-a3")

# Compute every metric once, before opening the run, so a failure in the metric
# code cannot leave a half-logged run behind.
accuracy = clf.accuracy(y_test, y_pred)
macro_p, macro_r, macro_f1 = clf.macro_avg(y_test, y_pred)
weighted_p, weighted_r, weighted_f1 = clf.weighted_avg(y_test, y_pred)

with mlflow.start_run():
    # Batch calls: one request for the parameters, one for the metrics.
    mlflow.log_params({"penalty": clf.penalty, "C": clf.C})
    mlflow.log_metrics({
        "accuracy": accuracy,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
        "macro_f1": macro_f1,
        "weighted_precision": weighted_p,
        "weighted_recall": weighted_r,
        "weighted_f1": weighted_f1,
    })

    # Store the fitted scikit-learn estimator as a run artifact.
    mlflow.sklearn.log_model(clf.model, "logistic_regression_model")

print("Model and experiment logged in MLflow!")

MlflowException: API request to https://mlflow.cs.ait.ac.th/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPSConnectionPool(host='mlflow.cs.ait.ac.th', port=443): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=st125675-a3 (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'mlflow.cs.ait.ac.th'. (_ssl.c:1000)")))

In [None]:
# Register the best model in MLflow
import mlflow.pyfunc

model_name = "st125675-a3-model"

# Log AND register the model.
# The second positional argument of log_model is only the artifact path; the
# model is added to the Model Registry only when registered_model_name is
# given. Without it, the registry lookup below has no registered model to find.
# The explicit run context also ensures the run is closed afterwards instead of
# leaving an implicitly started run active.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        clf.model,
        model_name,
        registered_model_name=model_name,
    )

# Transition the newly registered version (still in stage "None") to 'Staging'.
# NOTE(review): model stages may be deprecated in newer MLflow releases -
# confirm against the server's version.
client = mlflow.tracking.MlflowClient()
latest_version = client.get_latest_versions(model_name, stages=["None"])[0].version

client.transition_model_version_stage(
    name=model_name,
    version=latest_version,
    stage="Staging"
)

print(f"Model {model_name} version {latest_version} set to Staging!")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



MlflowException: API request to https://mlflow.cs.ait.ac.th/api/2.0/mlflow/runs/create failed with exception HTTPSConnectionPool(host='mlflow.cs.ait.ac.th', port=443): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'mlflow.cs.ait.ac.th'. (_ssl.c:1000)")))

In [None]:
import unittest
import numpy as np

class TestLogisticRegression(unittest.TestCase):
    """Smoke tests for CustomLogisticRegression on a four-row, two-class dataset."""

    def setUp(self):
        """Fit a fresh L2-penalised model on the sample data before each test."""
        # Creating a small dataset for testing
        self.X_sample = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])
        self.y_sample = np.array([0, 1, 1, 0])

        # Initialize and train model
        self.model = CustomLogisticRegression(penalty='l2', C=1.0)
        self.model.fit(self.X_sample, self.y_sample)

    def test_model_input(self):
        """The model accepts input shaped like the training data."""
        # No try/except here: if predict() raises, unittest reports the test as
        # an error with the full traceback, which is more useful than a printed
        # message followed by a generic assertion failure.
        y_pred = self.model.predict(self.X_sample)
        self.assertTrue(
            set(np.unique(y_pred)).issubset(set(np.unique(self.y_sample))),
            "Predictions should only contain labels seen during training",
        )

    def test_output_shape(self):
        """predict() returns one label per input row."""
        y_pred = self.model.predict(self.X_sample)
        self.assertEqual(y_pred.shape, self.y_sample.shape, "Output shape should match input labels")

# Run the tests in-process. argv=[''] stops unittest from parsing the kernel's
# own command-line arguments, and exit=False stops it from calling sys.exit()
# once the run has finished.
if __name__ == '__main__':
    unittest.main(exit=False, argv=[''])

### Setting Up GitHub Actions for CI/CD

1. **Create a `.github/workflows/ci.yml` file** in your repository.
2. **Paste the following script** into `ci.yml`:

```yaml
# CI workflow: run the unit tests on every push, then run the deploy job only
# if the tests succeeded.
name: ML Model CI/CD

on: [push]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
    # Current major versions of the official actions; the v2 releases are
    # outdated.
    - name: Checkout Repository
      uses: actions/checkout@v4

    - name: Set Up Python
      uses: actions/setup-python@v5
      with:
        # NOTE(review): confirm this matches the Python version the notebook
        # was developed with and that the pinned dependencies support it.
        python-version: '3.8'

    - name: Install Dependencies
      run: |
        pip install -r requirements.txt

    # Only files named test_*.py in the repository are discovered.
    - name: Run Unit Tests
      run: |
        python -m unittest discover -s . -p "test_*.py"

  deploy:
    # Runs only after the test job has completed successfully.
    needs: test
    runs-on: ubuntu-latest

    steps:
    # Placeholder: replace the echo with the real deployment command.
    - name: Deploy Model if Tests Pass
      run: echo "Deployment step here (e.g., update MLflow, push Docker image)"
```

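3. **Commit & Push** the `.github/workflows/ci.yml` file to your repository. The workflow only discovers tests in files named `test_*.py`, so first export the unit tests from this notebook into such a file (e.g. `test_model.py`) and list the project's dependencies in `requirements.txt`.
4. **Now, every push will trigger automatic testing**, and the `deploy` job runs once the tests pass — it is only a placeholder `echo` until you replace it with a real deployment command. 🚀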
