In [3]:
%pip install pandas scikit-learn xgboost mlflow

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.1.2-cp311-cp311-win_amd64.whl.metadata (59 kB)
     ---------------------------------------- 0.0/59.7 kB ? eta -:--:--
     -------------------- ------------------- 30.7/59.7 kB 1.4 MB/s eta 0:00:01
     ---------------------------------------- 59.7/59.7 kB 1.1 MB/s eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.1-cp311-cp311-win_am


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from time import time

import mlflow
import pandas as pd
from pandas import DataFrame, concat
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [5]:
# URI handed to mlflow.set_tracking_uri() in the experiment cell below.
# It points at port 5000 on the local machine (127.0.0.1).
REMOTE_SERVER_URI = "http://127.0.0.1:5000"

In [6]:
def handle_outliers(data: DataFrame, columns: list[str], iqr_multiplier: float = 3.0) -> DataFrame:
    """Drop rows that hold a severe outlier in any of the given numeric columns.

    Columns are processed one after another; the fences of each column are
    computed on the rows that survived the previous columns. A row is kept when
    its value lies inside the closed interval
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``.

    Args:
        data: Frame to filter. It is not modified; a filtered frame is returned.
        columns: Names of the numeric columns to check. The list is only read,
            never mutated.
        iqr_multiplier: Width of the fences in IQR units. The default of 3.0
            keeps the original "severe outlier" factor.

    Returns:
        The filtered frame, or ``data`` itself when nothing had to be removed
        (including when ``columns`` is empty).
    """
    for column in columns:
        quartiles = data[column].quantile([0.25, 0.75])
        lower_quartile, upper_quartile = quartiles[0.25], quartiles[0.75]
        iqr = upper_quartile - lower_quartile

        # With no spread (e.g. a constant or heavily imbalanced 0/1 column) the
        # fences collapse onto a single value and would discard every row that
        # differs from it; an undefined IQR (empty / all-missing column) cannot
        # define fences at all. Leave such columns untouched.
        if pd.isna(iqr) or iqr <= 0:
            continue

        severe_lower_fence = lower_quartile - iqr_multiplier * iqr
        # The upper fence is anchored on Q3 (not Q1).
        severe_upper_fence = upper_quartile + iqr_multiplier * iqr

        # Values sitting exactly on a fence are not outliers, hence inclusive.
        data = data[data[column].between(severe_lower_fence, severe_upper_fence)]
    return data

In [7]:
def perform_feature_eng(data: DataFrame) -> DataFrame:
    """Replace every ``string``-dtype column with an integer label-encoded copy.

    Each selected column is encoded independently with its own ``LabelEncoder``.
    The encoded columns are appended as ``encoded_<original name>`` and the
    original string columns are removed from the result.

    Args:
        data: Input frame. Only columns picked up by
            ``select_dtypes(include=["string"])`` are encoded; all other
            columns pass through unchanged.

    Returns:
        A new frame holding the untouched columns followed by the encoded ones.
    """

    def _encode(series):
        # A fresh encoder per column, so codes are assigned per column.
        return LabelEncoder().fit_transform(series)

    text_columns = data.select_dtypes(include=["string"])
    label_codes = text_columns.apply(_encode).add_prefix("encoded_")
    widened = concat([data, label_codes], axis=1)
    leftover_text = widened.select_dtypes(include=["string"])
    return widened.drop(leftover_text, axis=1)

In [13]:
# Inputs/outputs and knobs of this run, named once so each value is written a
# single time and is easy to change.
RAW_DATA_PATH = r"C:\Users\Admin\Downloads\credit_risk_dataset.csv"
PROCESSED_DATA_PATH = r"C:\Users\Admin\Downloads\processed_credit_risk_dataset.csv"
TARGET_COLUMN = "loan_status"
TRAIN_SIZE = 0.7
RANDOM_STATE = 42  # fixed seed so the train/test split (and the metrics) are reproducible

mlflow.set_tracking_uri(REMOTE_SERVER_URI)
mlflow.set_experiment("/xgb_experiment")
with mlflow.start_run():
    # --- Load the raw data and record what was loaded -----------------------
    df = pd.read_csv(RAW_DATA_PATH)
    mlflow.log_param("rawdataset_shape", df.shape)
    # Log a plain list of names; a pandas Index would be logged as its repr.
    mlflow.log_param("rawdataset_columns", df.columns.to_list())
    mlflow.log_artifact(RAW_DATA_PATH)

    # --- Clean: nullable dtypes, drop missing rows, duplicates, outliers ----
    df = df.convert_dtypes()
    df = df.dropna()
    df = df.drop_duplicates()
    # The target is a label, not a feature: never filter rows on it.
    numeric_feature_columns = [
        column
        for column in df.select_dtypes(include=["Int64", "Float64"]).columns
        if column != TARGET_COLUMN
    ]
    df = handle_outliers(df, columns=numeric_feature_columns)

    mlflow.log_param("preprocesseddataset_shape", df.shape)
    mlflow.log_param("preprocesseddataset_columns", df.columns.to_list())

    # --- Feature engineering: label-encode the string columns ---------------
    df = perform_feature_eng(df)
    mlflow.log_param("processeddataset_shape", df.shape)
    mlflow.log_param("processeddataset_columns", df.columns.to_list())
    df.to_csv(PROCESSED_DATA_PATH)
    mlflow.log_artifact(PROCESSED_DATA_PATH)

    # --- Train ---------------------------------------------------------------
    x, y = df.drop([TARGET_COLUMN], axis=1), df[TARGET_COLUMN]
    mlflow.log_param("train_size", TRAIN_SIZE)
    mlflow.log_param("random_state", RANDOM_STATE)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=TRAIN_SIZE, random_state=RANDOM_STATE
    )
    xgb = XGBClassifier()
    xgb.fit(x_train, y_train)
    mlflow.xgboost.log_model(xgb, "xgb_model")

    # --- Evaluate on the held-out split and log the results -----------------
    accuracy = xgb.score(x_test, y_test)
    y_pred = xgb.predict(x_test)
    report = classification_report(y_test, y_pred)
    report_path = r"C:\Users\Admin\Downloads\classification_report.txt"
    with open(report_path, "w", encoding="utf-8") as file:
        file.write(report)

    mlflow.log_metric("xgb_experiment/accuracy", accuracy)
    mlflow.log_artifact(report_path)

2024/10/12 23:17:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run placid-sheep-845 at: http://127.0.0.1:5000/#/experiments/559483722190579680/runs/61815c0edeff4339bd93c8d75181a813.
2024/10/12 23:17:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/559483722190579680.
