In [None]:
from time import time

import mlflow
import pandas as pd
from pandas import DataFrame, concat
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [None]:
# URI of the MLflow tracking server that runs in this notebook are logged to
# (passed to mlflow.set_tracking_uri below); 127.0.0.1 is the local machine.
REMOTE_SERVER_URI = "http://127.0.0.1:5000"

In [None]:
def handle_outliers(data: DataFrame, columns: list[str], iqr_multiplier: float = 3.0) -> DataFrame:
    """Drop the rows that hold a severe outlier in any of the given columns.

    Columns are processed in the order given, each one against the rows that
    survived the previous columns. For every column the fences are
    ``Q1 - iqr_multiplier * IQR`` and ``Q3 + iqr_multiplier * IQR``, and only
    rows whose value lies strictly between the two fences are kept.

    Args:
        data: Frame to filter. It is not modified; filtering produces new frames.
        columns: Names of the numeric columns to check. The list is only read,
            so the caller's list is left intact.
        iqr_multiplier: Distance of the fences from the quartiles, in IQR
            units. The default of 3 targets "severe" outliers.

    Returns:
        The filtered frame. ``data`` itself is returned when no column was
        filtered (empty ``columns`` or every column skipped).
    """
    for column in columns:
        quartiles = data[column].quantile([0.25, 0.75])
        lower_quartile, upper_quartile = quartiles[0.25], quartiles[0.75]
        iqr = upper_quartile - lower_quartile
        # With a zero (or undefined) IQR both fences collapse onto one value and
        # the strict comparison below would discard every row, so such columns
        # are left alone.
        if pd.isna(iqr) or iqr == 0:
            continue
        lower_fence = lower_quartile - iqr_multiplier * iqr
        upper_fence = upper_quartile + iqr_multiplier * iqr
        # The mask is a no-op when the column has no outliers, so no separate
        # "are there outliers?" check is needed.
        data = data[(data[column] > lower_fence) & (data[column] < upper_fence)]
    return data

In [None]:
def perform_feature_eng(data: DataFrame) -> DataFrame:
    """Replace every string-dtype column with a label-encoded counterpart.

    Each column picked up by ``select_dtypes(include=["string"])`` is encoded
    with its own ``LabelEncoder`` and added as ``encoded_<original name>``;
    the original string column is dropped. All other columns are kept as they
    are and stay in front of the encoded ones.

    Args:
        data: Input frame. It is not modified.

    Returns:
        A new frame with the non-string columns followed by the encoded
        columns. When ``data`` has no string columns, a copy of ``data``.
    """
    string_cols = data.select_dtypes(include=["string"]).columns.to_list()
    if not string_cols:
        # Nothing to encode; return early so the empty selection is never
        # pushed through the encoder.
        return data.copy()

    # Build on data.index so the encoded values line up row-for-row with the
    # source frame when concatenated.
    encoded_cols = DataFrame(
        {f"encoded_{col}": LabelEncoder().fit_transform(data[col]) for col in string_cols},
        index=data.index,
    )
    return concat([data.drop(columns=string_cols), encoded_cols], axis=1)

In [None]:
mlflow.set_tracking_uri(REMOTE_SERVER_URI)
mlflow.set_experiment("/xgb_experiment")

# Windows-style relative paths. Each is defined once as a raw string so the
# backslashes are never parsed as escape sequences, and so reading/writing a
# file and logging it as an artifact always use the same path.
raw_data_path = r"..\..\data\credit_risk_dataset.csv"
processed_data_path = r"..\..\data\processed_credit_risk_dataset.csv"
report_path = r"..\..\data\classification_report.txt"

# Fixed seed so the train/test split, and therefore the logged accuracy, is
# reproducible from one run to the next.
split_seed = 42

with mlflow.start_run():
    # --- Load the raw data and record what went in ---
    df = pd.read_csv(raw_data_path)
    mlflow.log_param("rawdataset_shape", df.shape)
    mlflow.log_param("rawdataset_columns", df.columns)
    mlflow.log_artifact(raw_data_path)

    # --- Clean: nullable dtypes, drop missing rows and duplicates, remove outliers ---
    df = df.convert_dtypes()
    df = df.dropna()
    df = df.drop_duplicates()
    numeric_columns = df.select_dtypes(include=["Int64", "Float64"]).columns.to_list()
    df = handle_outliers(df, columns=numeric_columns)

    mlflow.log_param("preprocesseddataset_shape", df.shape)
    mlflow.log_param("preprocesseddataset_columns", df.columns)

    # --- Encode string columns and persist the processed dataset ---
    df = perform_feature_eng(df)
    mlflow.log_param("processeddataset_shape", df.shape)
    mlflow.log_param("processeddataset_columns", df.columns)
    df.to_csv(processed_data_path)
    mlflow.log_artifact(processed_data_path)

    # --- Train: 70/30 split with "loan_status" as the target ---
    x, y = df.drop(["loan_status"], axis=1), df["loan_status"]
    mlflow.log_param("split_random_state", split_seed)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=split_seed
    )
    xgb = XGBClassifier()
    xgb.fit(x_train, y_train)
    mlflow.xgboost.log_model(xgb, "xgb_model")

    # --- Evaluate on the held-out split and log the results ---
    accuracy = xgb.score(x_test, y_test)
    y_pred = xgb.predict(x_test)
    report = classification_report(y_test, y_pred)
    with open(report_path, "w") as file:
        file.write(report)

    mlflow.log_metric("xgb_experiment/accuracy", accuracy)
    mlflow.log_artifact(report_path)