# Importar os módulos necessários

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
#necessario por causa do metodo de avaliaçao
from sklearn.model_selection import KFold
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

# Definir as Métricas para Avaliação dos Modelos

In [5]:
#metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def custom_regression_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return mse, mae, r2


In [6]:
#metricas classificação
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [7]:
#Metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

REGRESSION_METRICS = {
    "MSE": make_scorer(mean_squared_error),
    "MAE": make_scorer(mean_absolute_error),
    "R2": make_scorer(r2_score)
}


In [8]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

# Ler o Conjunto de Dados

In [9]:
d = pd.read_csv("Xcenario1.csv")
d

Unnamed: 0,id_body_type,id_modelos,id_marcas,ano,id_city,id_insurance,seats,engine_capacity,id_fuel_type,kms_driven,max_power,id_owner_type,resale_price_Lakh
0,0.364112,3.086522,1.230837,2019.0,1.096102,0.256633,5.0,1199.0,0.180334,30910.0,83.8,0.150465,5.66
1,0.598869,2.403638,0.976238,2018.0,1.281577,1.306401,5.0,1199.0,0.180334,48089.0,88.7,1.365236,6.64
2,0.578053,2.583846,0.976238,2015.0,0.922710,0.256633,5.0,1497.0,0.180334,51000.0,117.3,0.620328,5.65
3,0.598869,3.650793,1.230837,2021.0,0.922710,0.256633,7.0,1956.0,0.514919,30000.0,167.6,0.150465,23.00
4,0.364112,2.747703,0.541440,2019.0,1.170307,1.306401,5.0,1197.0,0.180334,61113.0,83.1,0.150465,6.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13420,1.347597,3.650793,1.745897,2021.0,1.027199,0.256633,7.0,2199.0,0.514919,80000.0,197.2,0.620328,26.50
13421,0.364112,2.951823,0.677665,2017.0,1.223199,0.256633,5.0,1197.0,0.180334,17923.0,81.8,0.150465,5.87
13422,0.578053,3.282816,1.693345,2017.0,1.027199,0.445408,5.0,1498.0,0.514919,63389.0,108.6,0.620328,7.43
13423,0.598869,2.747703,0.541440,2017.0,0.868320,0.445408,5.0,1248.0,0.514919,40000.0,88.5,0.620328,9.45


In [10]:
X, y = d.drop("resale_price_Lakh", axis=1), d["resale_price_Lakh"]

# Definir o Método de Validação Cruzada

In [27]:
#Classificação
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)


# Implementação dos Algoritmos de Machine Learning

### Árvores de Decisão

In [12]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continue com a validação cruzada e a avaliação de métricas
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T





Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,0.014146,0.003095,22.569007,2.671775,0.73939


In [28]:


# Criar um modelo de regressão de árvore de decisão
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)

# Estratégia de validação cruzada (KFold) para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continuar com a validação cruzada e avaliação de métricas
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T



Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.017097,0.007059,,,,,,,


### Redes Neuronais

In [17]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import HistGradientBoostingRegressor


# Criar um imputador
imputer = SimpleImputer(strategy='mean')  # Você pode escolher 'mean', 'median', 'most_frequent', ou outra estratégia

# Aplicar a imputação aos dados de entrada X
X_imputed = imputer.fit_transform(X)

# Criar uma Rede Neural para regressão
nn_regressor = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas para MLPRegressor
scores_nn = cross_validate(nn_regressor, X_imputed, y, cv=kf, scoring=REGRESSION_METRICS)
nn_scores = pd.DataFrame(scores_nn)
print("MLPRegressor Scores:")
print(pd.DataFrame(nn_scores.mean()))

# Ou usar o HistGradientBoostingRegressor que lida nativamente com valores ausentes
hgbr = HistGradientBoostingRegressor(random_state=1234)

# Realizar a validação cruzada e avaliar as métricas para HistGradientBoostingRegressor
scores_hgbr = cross_validate(hgbr, X, y, cv=kf, scoring=REGRESSION_METRICS)
hgbr_scores = pd.DataFrame(scores_hgbr)
print("\nHistGradientBoostingRegressor Scores:")
print(pd.DataFrame(hgbr_scores.mean()))


MLPRegressor Scores:
                    0
fit_time     2.059301
score_time   0.002567
test_MSE    52.939941
test_MAE     4.293197
test_R2      0.367783

HistGradientBoostingRegressor Scores:
                   0
fit_time    0.599200
score_time  0.004521
test_MSE    7.542727
test_MAE    1.126403
test_R2     0.914607


In [19]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy='mean')  # Você pode escolher 'mean', 'median', 'most_frequent', ou outra estratégia

# Aplicar a imputação aos dados de entrada X
X_imputed = imputer.fit_transform(X)

# Criar uma Rede Neural para regressão
nn_regressor = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_nn = cross_validate(nn_regressor, X_imputed, y, cv=kf, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
print(pd.DataFrame(nn_scores.mean()).T)


   fit_time  score_time  test_accuracy  test_precision  test_recall  test_f1  \
0  2.028102    0.006901            NaN             NaN          NaN      NaN   

   test_AUC  test_specificity  test_kappa  
0       NaN               NaN         NaN  


### Naive Bayes

In [21]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # Você pode escolher 'mean', 'median', 'most_frequent', ou outra estratégia

# Aplicar a imputação aos dados de entrada X
X_imputed = imputer.fit_transform(X)

# Criar um regressor RandomForest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_rf = cross_validate(rf_regressor, X_imputed, y, cv=kf, scoring=REGRESSION_METRICS)
rf_scores = pd.DataFrame(scores_rf)
print(pd.DataFrame(rf_scores.mean()).T)



   fit_time  score_time  test_MSE  test_MAE   test_R2
0   8.28385    0.043082  6.595092  1.022149  0.924971


### Support Vector Machine

In [26]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer


# Criar um imputador
imputer = SimpleImputer(strategy='mean')  # Você pode escolher 'mean', 'median', 'most_frequent', ou outra estratégia

# Aplicar a imputação aos dados de entrada X
X_imputed = imputer.fit_transform(X)

# Criar um regressor SVM
svm_regressor = SVR()

# Realizar a validação cruzada e avaliar as métricas
scores_svm = cross_validate(svm_regressor, X_imputed, y, cv=splitter, scoring=REGRESSION_METRICS)
svm_scores = pd.DataFrame(scores_svm)
print(pd.DataFrame(svm_scores.mean()).T)



ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.