In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Function to preprocess data
def preprocess_data(data, categorical_columns=None):
    """Label-encode the categorical columns of ``data`` without mutating it.

    Encoding is applied to a copy of the input, so the caller's raw frame is
    preserved and calling this function again on the same raw frame gives the
    same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``categorical_columns``.
    categorical_columns : iterable of str, optional
        Columns to encode. Defaults to ``"profissao"``, ``"mix_credito"`` and
        ``"comportamento_pagamento"``.

    Returns
    -------
    tuple
        ``(encoded, label_encoders)``: ``encoded`` is a copy of ``data`` whose
        listed columns hold the output of ``LabelEncoder.fit_transform``;
        ``label_encoders`` maps each column name to the fitted encoder, so the
        same mapping can be applied to other frames later.

    Raises
    ------
    KeyError
        If any listed column is missing from ``data``.
    """
    if categorical_columns is None:
        categorical_columns = ["profissao", "mix_credito", "comportamento_pagamento"]
    # Materialize once so a generator argument can be iterated twice below.
    columns = list(categorical_columns)

    # Fail before doing any work rather than halfway through the loop.
    missing = [col for col in columns if col not in data.columns]
    if missing:
        raise KeyError(f"Missing categorical columns: {missing}")

    encoded = data.copy()
    label_encoders = {}
    for col in columns:
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        label_encoders[col] = le

    return encoded, label_encoders

# Function to train models
def train_models(x_train, y_train, random_state=42):
    """Fit a random forest and a k-nearest-neighbours classifier.

    Parameters
    ----------
    x_train :
        Training features, passed unchanged to each estimator's ``fit``.
    y_train :
        Training labels, passed unchanged to each estimator's ``fit``.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier`` so repeated runs build
        the same forest. Pass ``None`` for an unseeded forest.

    Returns
    -------
    dict
        Fitted estimators keyed by ``"RandomForest"`` and ``"KNN"``.
    """
    models = {
        # Seeded: an unseeded forest gives different scores on every run.
        "RandomForest": RandomForestClassifier(random_state=random_state),
        "KNN": KNeighborsClassifier(),
    }

    for model in models.values():
        model.fit(x_train, y_train)

    return models

# Function to evaluate models
def evaluate_models(models, x_test, y_test):
    """Print the test-set accuracy of every model in ``models``.

    ``models`` maps a display name to a fitted estimator. Each estimator
    predicts on ``x_test``, the predictions are scored against ``y_test`` with
    ``accuracy_score``, and one line per model is printed with the score
    formatted to four decimal places. Nothing is returned.
    """
    for name in models:
        accuracy = accuracy_score(y_test, models[name].predict(x_test))
        print(f"{name} Accuracy: {accuracy:.4f}")


In [2]:

# Load dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists. Consider a path relative to a
# configurable data directory.
data = pd.read_csv("/home/rahima/JP/clientes.csv")
print("Initial Data Info:")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line after the summary.
data.info()


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id_cliente                100000 non-null  int64  
 1   mes                       100000 non-null  int64  
 2   idade                     100000 non-null  float64
 3   profissao                 100000 non-null  object 
 4   salario_anual             100000 non-null  float64
 5   num_contas                100000 non-null  float64
 6   num_cartoes               100000 non-null  float64
 7   juros_emprestimo          100000 non-null  float64
 8   num_emprestimos           100000 non-null  float64
 9   dias_atraso               100000 non-null  float64
 10  num_pagamentos_atrasados  100000 non-null  float64
 11  num_verificacoes_credito  100000 non-null  float64
 12  mix_credito               100000 non-null  object 
 13  divida_total              

In [3]:

# Preprocessing
# Encode a copy under a new name so the raw `data` frame stays intact. If
# `data` were rebound to the encoded frame, re-running this cell would fit the
# encoders on integers and the transform of the new clients below would fail.
data_encoded, encoders = preprocess_data(data.copy())
x = data_encoded.drop(columns=["score_credito", "id_cliente"])
y = data_encoded["score_credito"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Train models
models = train_models(x_train, y_train)

# Evaluate models
evaluate_models(models, x_test, y_test)

# Load new clients for prediction
new_clients = pd.read_csv("novos_clientes.csv")
for col, encoder in encoders.items():
    # Reuse the encoders fitted on the training data so categories map to the
    # same integer codes the models were trained on.
    new_clients[col] = encoder.transform(new_clients[col])

# Feed the model exactly the training features, in training order. This
# ignores extra columns in the new file (e.g. an id column) and raises a clear
# KeyError if a required feature is missing.
new_clients_features = new_clients[x_train.columns]

# RandomForest is used for prediction; it scored higher than KNN in the
# evaluation output recorded with this notebook.
best_model = models["RandomForest"]
predictions = best_model.predict(new_clients_features)
print("Predictions for new clients:", predictions)


RandomForest Accuracy: 0.8242
KNN Accuracy: 0.7369
Predictions for new clients: ['Poor' 'Poor' 'Standard']
