# Desenvolvimento do modelo para prediçao de laptop 

Exploracao inicial de diferentes tipos de modelos

In [12]:
#%pip install dagshub
#%pip install catboost
#%pip install xgboost
#%pip install lightgbm

In [1]:
import pandas as pd
import mlflow
import dagshub
from dagshub.data_engine import datasources
from sklearn.model_selection import train_test_split
import mlflow.sklearn
import mlflow.catboost
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

from mlflow.models import infer_signature

# Obtendo dados do dataset

In [2]:
ds = datasources.get('rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring', 'processed')

In [3]:
ds.all().dataframe

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,credit-score-processed.csv,103597627,https://dagshub.com/api/v1/repos/rrmoreira/fia...,text/plain,14528357


In [4]:
res = ds.head()

for dp in res:
    dataset_url = dp.download_url

In [5]:
dataset_url

'https://dagshub.com/api/v1/repos/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring/raw/main/data/processed/credit-score-processed.csv'

In [6]:
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,delay_from_due_date,num_of_delayed_payment,changed_credit_limit,num_credit_inquiries,...,amount_invested_monthly,monthly_balance,credit_score,credit_mix_bad,credit_mix_good,credit_mix_none,credit_mix_standard,payment_of_min_amount_no,payment_of_min_amount_none,payment_of_min_amount_yes
0,19114.12,1824.843333,3,4,3,4,3,7,11.27,4.0,...,80.415295,312.494089,0,0,0,1,0,1,0,0
1,19114.12,0.0,3,4,3,4,0,0,11.27,4.0,...,118.280222,284.629162,0,0,1,0,0,1,0,0
2,19114.12,0.0,3,4,3,4,3,7,0.0,4.0,...,81.699521,331.209863,0,0,1,0,0,1,0,0
3,19114.12,0.0,3,4,3,4,5,4,6.27,4.0,...,199.458074,223.45131,0,0,1,0,0,1,0,0
4,19114.12,1824.843333,3,4,3,4,6,0,11.27,4.0,...,41.420153,341.489231,0,0,1,0,0,1,0,0


In [7]:
dagshub.init(repo_owner="rrmoreira", repo_name="fiap-ds-mlops-9dtsr-credit-scoring", mlflow=True)

In [8]:
mlflow.autolog()

2025/08/02 11:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/08/02 11:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2025/08/02 11:38:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [9]:
features = list(df.columns)
features.remove('credit_score') #target variable

In [11]:
features

['annual_income',
 'monthly_inhand_salary',
 'num_bank_accounts',
 'num_credit_card',
 'interest_rate',
 'num_of_loan',
 'delay_from_due_date',
 'num_of_delayed_payment',
 'changed_credit_limit',
 'num_credit_inquiries',
 'outstanding_debt',
 'credit_utilization_ratio',
 'total_emi_per_month',
 'amount_invested_monthly',
 'monthly_balance',
 'credit_mix_bad',
 'credit_mix_good',
 'credit_mix_none',
 'credit_mix_standard',
 'payment_of_min_amount_no',
 'payment_of_min_amount_none',
 'payment_of_min_amount_yes']

In [14]:
X = df[features]

In [15]:
len(features)

22

In [16]:
y = df['credit_score']
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    1
99996    1
99997    1
99998    2
99999    1
Name: credit_score, Length: 100000, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=42)

In [18]:


def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
   predictions = model.predict(X_test)
   proba = None
   if hasattr(model, "predict_proba"):
      proba = model.predict_proba(X_test)[:, 1] if len(model.classes_) == 2 else None

   accuracy = accuracy_score(y_test, predictions)
   precision = precision_score(y_test, predictions, average='weighted')
   recall = recall_score(y_test, predictions, average='weighted')
   f1 = f1_score(y_test, predictions, average='weighted')
   mlflow.log_metric("Accuracy", accuracy)
   mlflow.log_metric("Precision", precision)
   mlflow.log_metric("Recall", recall)
   mlflow.log_metric("F1", f1)
   if proba is not None:
      auc = roc_auc_score(y_test, proba)
      mlflow.log_metric("ROC_AUC", auc)

   signature = infer_signature(X_test, predictions)
   if kind == "catboost":
      mlflow.catboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   elif kind == "xgboost":
      mlflow.xgboost.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   elif kind == "lightgbm":
      mlflow.lightgbm.log_model(model, model_name, signature=signature, input_example=X_test[:5])
   else:
      mlflow.sklearn.log_model(model, model_name, signature=signature, input_example=X_test[:5])

   print(f"Model {model_name} logged with Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}")

### Experimento com RandomForest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier

with mlflow.start_run(run_name="RandomForest Classifier"):
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 5]
    }
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=3, scoring=make_scorer(accuracy_score, greater_is_better=False))
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "RandomForest Classifier", best_model, X_test, y_test)

2025/08/02 09:33:21 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.


🏃 View run thoughtful-ray-350 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/197018acfb82412ca56f96feaa3c7c7b
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run lyrical-hog-241 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/71c70dde49a94dc19defeb74639fded4
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run nimble-rook-134 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/ec716d56059443f0899c86fc2a710883
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run gregarious-pug-761 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/d0bcc8945d684300aceb91488e230d6a
🧪 View experiment at: https://dagshu

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 722.76it/s] 


Model RandomForest Classifier logged with Accuracy: 0.6408, Precision: 0.6351175057980016, Recall: 0.6408, F1: 0.6343637918502489
🏃 View run RandomForest Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/0f903ef5843b4a748abbb9176d25bc50
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


In [18]:

from sklearn.model_selection import RandomizedSearchCV

with mlflow.start_run(run_name="RandomForest Classifier"):
    param_grid = {
        'n_estimators': [200, 300, 500],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False]
    }

    rf = RandomForestClassifier(random_state=42)
    rfc_search = RandomizedSearchCV(rf, param_distributions=param_grid, n_iter=15,
              cv=3, scoring=make_scorer(accuracy_score, greater_is_better=False), n_jobs=-1, random_state=42)
    rfc_search.fit(X_train, y_train)
    best_model = rfc_search.best_estimator_
    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "RandomForest Classifier", best_model, X_test, y_test)

2025/08/02 10:08:48 INFO mlflow.sklearn.utils: Logging the 5 best runs, 10 runs will be omitted.


🏃 View run incongruous-jay-681 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/67e48ca145414d4f994488ebc6585fe8
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run tasteful-mouse-145 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/6811372c9a62464ab00f3cb4e6fa414d
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run rare-ram-420 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/01635a7137824657a0620c8262aad0c9
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run redolent-pig-511 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/d223e9e8f1c5441cac15f656ff292174
🧪 View experiment at: https://dagshub

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 374.10it/s]


Model RandomForest Classifier logged with Accuracy: 0.7084666666666667, Precision: 0.7137136630777763, Recall: 0.7084666666666667, F1: 0.7095821262993685
🏃 View run RandomForest Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/ec9f1a42a94a4838ba9d58b339fba2a1
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


🏃 View run magnificent-wolf-685 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/9882af5b40aa451bace7b3c372b5fcd6
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run efficient-midge-141 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/a73da85a4b824d3bbb0465283cbac45f
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run fearless-snipe-283 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/3d2cb807ed684c9a90cf5d2b3fa14133
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0
🏃 View run vaunted-ape-32 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/07a670a69bff4a7a832d287236954d7e
🧪 View experiment at: https://d

### Experimento com Decision Tree Classifier

In [19]:
from sklearn.tree import DecisionTreeClassifier


with mlflow.start_run(run_name="DecisionTree_Classifier"):
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10]
    }
    tree = DecisionTreeClassifier(random_state=42)
    
    grid_search = GridSearchCV(tree, param_grid, scoring=make_scorer(accuracy_score, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)
    evaluate_and_log_model("sklearn", "Decision Tree Classifier", best_model, X_test, y_test)


2025/08/02 10:36:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 426.72it/s]


Model Decision Tree Classifier logged with Accuracy: 0.6467333333333334, Precision: 0.6896280841575652, Recall: 0.6467333333333334, F1: 0.6515327649968193
🏃 View run DecisionTree_Classifier at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/96b0d75f21f8408a8b04bc459829601f
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


### Experimento com Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

with mlflow.start_run(run_name="Logistic Regression2"):
    param_grid = {
        'penalty': ['l1', 'l2', 'elasticnet', None],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'saga'],
        'max_iter': [100, 200, 300]
    }
    logreg = LogisticRegression(random_state=42, multi_class='auto')
    grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring=make_scorer(accuracy_score))
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    
    mlflow.log_param("best_penalty", best_model.penalty)
    mlflow.log_param("best_C", best_model.C)
    mlflow.log_param("best_solver", best_model.solver)
    mlflow.log_param("best_max_iter", best_model.max_iter)
    
    evaluate_and_log_model("sklearn", "Logistic Regression", best_model, X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model Logistic Regression logged with Accuracy: 0.5402666666666667, Precision: 0.4951589895224626, Recall: 0.5402666666666667, F1: 0.4756676801607092
🏃 View run Logistic Regression2 at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/4e5f3e8e85c049028a25582d05dd3a20
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


### Experimento com XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score

with mlflow.start_run(run_name="XGBoost Classifier"):
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3]
    }
    xgb = XGBClassifier(random_state=42, verbosity=0, use_label_encoder=False)
    
    grid_search = GridSearchCV(xgb, param_grid, scoring=make_scorer(accuracy_score), cv=5)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_

    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_learning_rate", best_model.learning_rate)
    
    evaluate_and_log_model("xgboost", "XGBoost Classifier", best_model, X_test, y_test)

2025/08/02 00:27:56 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 254.04it/s]


Model XGBoost Regressor logged with MSE: 0.3781723380088806, MAE: 0.4526499807834625, R2: 0.3511192202568054, MAPE: 624687893708800.0
🏃 View run XGBoost Regressor at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0/runs/92d2325a1b24489bb4ab593fefd4c792
🧪 View experiment at: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-credit-scoring.mlflow/#/experiments/0


## Model Registry

In [21]:
#View run XGBoost Regressor - melhor modelo: https://dagshub.com/rrmoreira/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/3039d0e0e9374c9b8e5b21c1d4a93839

run_id = '3039d0e0e9374c9b8e5b21c1d4a93839'

mlflow.register_model(
    model_uri=f"runs:/{run_id}/model",
    name="laptop-pricing-model-brl"
)


Successfully registered model 'laptop-pricing-model-brl'.
2025/07/30 23:00:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: laptop-pricing-model-brl, version 1
Created version '1' of model 'laptop-pricing-model-brl'.


<ModelVersion: aliases=[], creation_timestamp=1753927227474, current_stage='None', description='', last_updated_timestamp=1753927227474, name='laptop-pricing-model-brl', run_id='3039d0e0e9374c9b8e5b21c1d4a93839', run_link='', source='mlflow-artifacts:/a0cd437434354e3a90f60b591ac0abf7/3039d0e0e9374c9b8e5b21c1d4a93839/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>