# Desenvolvimento do modelo para prediçao de laptop 

Exploracao inicial de diferentes tipos de modelos

In [43]:
#%pip install dagshub
#%pip install catboost
#%pip install xgboost
#%pip install lightgbm

In [60]:
import pandas as pd
import mlflow
import dagshub
from dagshub.data_engine import datasources
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


from mlflow.models import infer_signature

## Obtendo dados do dataset

In [61]:
ds = datasources.get('rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring', 'processed')

In [62]:
ds.all().dataframe

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,credit-score-processed.csv,103597627,https://dagshub.com/api/v1/repos/rrmoreira/fia...,text/plain,14128312


In [63]:
res = ds.head()

for dp in res:
    dataset_url = dp.download_url

In [64]:
dataset_url

'https://dagshub.com/api/v1/repos/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring/raw/main/data/processed/credit-score-processed.csv'

In [65]:
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,age,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,...,payment_behaviour_high_spent_large_value_payments,payment_behaviour_high_spent_medium_value_payments,payment_behaviour_high_spent_small_value_payments,payment_behaviour_low_spent_large_value_payments,payment_behaviour_low_spent_medium_value_payments,payment_behaviour_low_spent_small_value_payments,payment_behaviour_other,payment_of_min_amount_no,payment_of_min_amount_other,payment_of_min_amount_yes
0,23,19114.12,1824.843333,3,4,3,4,3,7,11.27,...,0,0,1,0,0,0,0,1,0,0
1,23,19114.12,0.0,3,4,3,4,0,0,11.27,...,0,0,0,1,0,0,0,1,0,0
2,500,19114.12,0.0,3,4,3,4,3,7,0.0,...,0,0,0,0,1,0,0,1,0,0
3,23,19114.12,0.0,3,4,3,4,5,4,6.27,...,0,0,0,0,0,1,0,1,0,0
4,23,19114.12,1824.843333,3,4,3,4,6,0,11.27,...,0,1,0,0,0,0,0,1,0,0


In [66]:
dagshub.init(repo_owner="rrmoreira", repo_name="fiap-ds-mlops-9dtsr-credit-scoring", mlflow=True)

In [67]:
mlflow.autolog()

2025/08/02 13:29:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/08/02 13:29:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/02 13:29:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [68]:
features = list(df.columns)
features.remove('credit_score') #target variable

In [69]:
features

['age',
 'annual_income',
 'monthly_inhand_salary',
 'num_bank_accounts',
 'num_credit_card',
 'interest_rate',
 'num_of_loan',
 'delay_from_due_date',
 'num_of_delayed_payment',
 'changed_credit_limit',
 'num_credit_inquiries',
 'outstanding_debt',
 'credit_utilization_ratio',
 'total_emi_per_month',
 'amount_invested_monthly',
 'monthly_balance',
 'credit_mix_bad',
 'credit_mix_good',
 'credit_mix_other',
 'credit_mix_standard',
 'payment_behaviour_high_spent_large_value_payments',
 'payment_behaviour_high_spent_medium_value_payments',
 'payment_behaviour_high_spent_small_value_payments',
 'payment_behaviour_low_spent_large_value_payments',
 'payment_behaviour_low_spent_medium_value_payments',
 'payment_behaviour_low_spent_small_value_payments',
 'payment_behaviour_other',
 'payment_of_min_amount_no',
 'payment_of_min_amount_other',
 'payment_of_min_amount_yes']

In [72]:
X = df[features]

In [73]:
len(features)

30

In [74]:
y = df['credit_score']
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    1
99996    1
99997    1
99998    2
99999    1
Name: credit_score, Length: 100000, dtype: int64

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=42)

In [76]:


def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
   predictions = model.predict(X_test)
   proba = None
   if hasattr(model, "predict_proba"):
      proba = model.predict_proba(X_test)[:, 1] if len(model.classes_) == 2 else None

   accuracy = accuracy_score(y_test, predictions)
   precision = precision_score(y_test, predictions, average='weighted')
   recall = recall_score(y_test, predictions, average='weighted')
   f1 = f1_score(y_test, predictions, average='weighted')
   mlflow.log_metric("Accuracy", accuracy)
   mlflow.log_metric("Precision", precision)
   mlflow.log_metric("Recall", recall)
   mlflow.log_metric("F1", f1)
   if proba is not None:
      auc = roc_auc_score(y_test, proba)
      mlflow.log_metric("ROC_AUC", auc)

   signature = infer_signature(X_test, predictions)
   if kind == "catboost":
      mlflow.catboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   elif kind == "xgboost":
      mlflow.xgboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   elif kind == "lightgbm":
      mlflow.lightgbm.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   else:
      mlflow.sklearn.log_model(model, model_name, signature=signature, input_example=X_test[:5])

   print(f"Model {model_name} logged with Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

## Experimentos

### RandomForest Classifier

In [78]:
with mlflow.start_run(run_name="RandomForest Classifier"):
    param_grid = {
        'n_estimators': [200, 300, 500],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }

    rf = RandomForestClassifier(random_state=42)
    rfc_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=15,
              cv=3, scoring=make_scorer(accuracy_score, greater_is_better=False), n_jobs=-1, random_state=42)
    rfc_search.fit(X_train, y_train)
    best_model = rfc_search.best_estimator_
    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "RandomForest Classifier", best_model, X_test, y_test)

2025/08/02 13:46:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, 10 runs will be omitted.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 87.79it/s] 


Model RandomForest Classifier logged with Accuracy: 0.7064, Precision: 0.7155461725618537, Recall: 0.7064, F1: 0.7085011173160016
🏃 View run RandomForest Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/d3f321d5361446289055e1b33b5db018
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


### Decision Tree Classifier

In [79]:
with mlflow.start_run(run_name="DecisionTree_Classifier"):
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    }
    tree = DecisionTreeClassifier(random_state=42)
    
    grid_search = GridSearchCV(tree, param_grid, scoring=make_scorer(accuracy_score, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "Decision Tree Classifier", best_model, X_test, y_test)


2025/08/02 13:53:08 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Downloading artifacts: 100%|██████████| 7/7 [00:00<?, ?it/s]


Model Decision Tree Classifier logged with Accuracy: 0.6467333333333334, Precision: 0.6896280841575652, Recall: 0.6467333333333334, F1: 0.6515327649968193
🏃 View run DecisionTree_Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/0f7b401b07284abaa85251105ce91124
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


### Logistic Regression

In [80]:
with mlflow.start_run(run_name="Logistic Regression"):
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet', None],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'saga'],
        'max_iter': [50, 100, 150]
    }
    logreg = LogisticRegression(random_state=42, multi_class='auto')
    grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring=make_scorer(accuracy_score))
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    mlflow.log_param("best_penalty", best_model.penalty)
    mlflow.log_param("best_C", best_model.C)
    mlflow.log_param("best_solver", best_model.solver)
    mlflow.log_param("best_max_iter", best_model.max_iter)
    
    evaluate_and_log_model("sklearn", "Logistic Regression", best_model, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Logistic Regression logged with Accuracy: 0.5296333333333333, Precision: 0.6085676481718, Recall: 0.5296333333333333, F1: 0.4739281755383046
🏃 View run Logistic Regression at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/1f78e5ece43145dfa5e3aca2fca24526
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


### XGBoost Classifier

In [81]:
with mlflow.start_run(run_name="XGBoost Classifier"):
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3]
    }
    xgb = XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False)
    
    grid_search = GridSearchCV(xgb, param_grid, scoring=make_scorer(accuracy_score), cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_learning_rate", best_model.learning_rate)
    
    evaluate_and_log_model("xgboost", "XGBoost Classifier", best_model, X_test, y_test)

2025/08/02 14:33:09 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 248.43it/s]


Model XGBoost Classifier logged with Accuracy: 0.7717333333333334, Precision: 0.7712228163212964, Recall: 0.7717333333333334, F1: 0.771321317952736
🏃 View run XGBoost Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/92b59800347c422bb681fd7d8c1957a4
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


## Registrando Modelo

In [82]:
#View run XGBoost Classifier - melhor modelo: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/92b59800347c422bb681fd7d8c1957a4

run_id = '92b59800347c422bb681fd7d8c1957a4'

mlflow.register_model(
    model_uri=f"runs:/{run_id}/model",
    name="credit-scoring-model"
)


Successfully registered model 'credit-scoring-model'.
2025/08/02 14:40:59 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit-scoring-model, version 1
Created version '1' of model 'credit-scoring-model'.


<ModelVersion: aliases=[], creation_timestamp=1754156459126, current_stage='None', description='', last_updated_timestamp=1754156459126, name='credit-scoring-model', run_id='92b59800347c422bb681fd7d8c1957a4', run_link='', source='mlflow-artifacts:/a13a12126d67478588e03dcde0482475/92b59800347c422bb681fd7d8c1957a4/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>