# Importar os módulos necessários

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
#necessario por causa do metodo de avaliaçao
from sklearn.model_selection import KFold
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

# Definir as Métricas para Avaliação dos Modelos

In [3]:
#metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def custom_regression_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return mse, mae, r2


In [4]:
#metricas classificação
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [5]:
#Metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

REGRESSION_METRICS = {
    "MSE": make_scorer(mean_squared_error),
    "MAE": make_scorer(mean_absolute_error),
    "R2": make_scorer(r2_score)
}


In [6]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

# Ler o Conjunto de Dados

In [7]:
d = pd.read_csv("Xcenario4.csv")
d

Unnamed: 0,id_transmissao,id_body_type,id_full_name,ano,id_city,id_insurance,seats,engine_capacity,mileage,id_fuel_type,kms_driven,max_power,id_owner_type,resale_price_Lakh
0,0.143778,0.375361,3.237418,2019,1.092066,0.260896,5.0,1199.0,23.84,0.186603,30910.0,83.8,0.152217,5.66
1,0.143778,0.595820,2.885236,2018,1.290475,1.317034,5.0,1199.0,17.50,0.186603,48089.0,88.7,1.357319,6.64
2,0.143778,0.561640,3.061327,2015,0.915716,0.260896,5.0,1497.0,17.40,0.186603,51000.0,117.3,0.617152,5.65
3,0.549778,0.595820,3.663387,2021,0.915716,0.260896,7.0,1956.0,14.08,0.500424,30000.0,167.6,0.152217,23.00
4,0.549778,0.375361,3.441538,2019,1.172025,1.317034,5.0,1197.0,21.40,0.186603,61113.0,83.1,0.152217,6.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13815,0.549778,1.360191,3.663387,2021,1.033298,0.260896,7.0,2199.0,14.11,0.500424,80000.0,197.2,0.617152,26.50
13816,0.143778,0.375361,3.362357,2017,1.221954,0.260896,5.0,1197.0,18.60,0.186603,17923.0,81.8,0.152217,5.87
13817,0.143778,0.561640,3.441538,2017,1.033298,0.436701,5.0,1498.0,21.13,0.500424,63389.0,108.6,0.617152,7.43
13818,0.143778,0.595820,2.778780,2017,0.869364,0.436701,5.0,1248.0,24.30,0.500424,40000.0,88.5,0.617152,9.45


In [8]:
X, y = d.drop("resale_price_Lakh", axis=1), d["resale_price_Lakh"]

# Definir o Método de Validação Cruzada

In [9]:
#Classificação
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)


# Implementação dos Algoritmos de Machine Learning

### Árvores de Decisão

In [10]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Definir as métricas de regressão que deseja avaliar
REGRESSION_METRICS = {
    'neg_mean_squared_error': 'neg_mean_squared_error',
    'neg_mean_absolute_error': 'neg_mean_absolute_error',
    'r2': 'r2'
}

# Definir o regressor e o método de validação cruzada
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
dt_scores = {k: -v.mean() for k, v in scores.items()}  # Negar os valores para tornar os scores positivos
pd.DataFrame(dt_scores, index=[0])

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2
0,-0.032705,-0.003287,29.974181,2.948132,-0.753871


In [11]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continue com a validação cruzada e a avaliação de métricas
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T





Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2
0,0.027259,0.002236,-29.974181,-2.948132,0.753871


In [12]:


# Criar um modelo de regressão de árvore de decisão
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)

# Estratégia de validação cruzada (KFold) para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continuar com a validação cruzada e avaliação de métricas
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T



Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.023129,0.015721,,,,,,,


### Redes Neuronais

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Criar uma Rede Neural para regressão
gb_regressor = HistGradientBoostingRegressor(random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_gb = cross_validate(gb_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
gb_scores = pd.DataFrame(scores_gb)
pd.DataFrame(gb_scores.mean()).T


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2
0,0.95919,0.014407,-11.787531,-1.313764,0.904817


In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate

# Criar uma Rede Neural para regressão
gb_regressor = HistGradientBoostingRegressor(max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_gb = cross_validate(gb_regressor, X, y, cv=kf, scoring=METRICS)
gb_scores = pd.DataFrame(scores_gb)
pd.DataFrame(gb_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.247111,0.01914,,,,,,,


In [15]:
# ISTO É NECESSÁRIO?

#nn = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=20, random_state=1234)
#scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS)
#nn_scores = pd.DataFrame(scores_nn)
#pd.DataFrame(nn_scores.mean()).T

### Naive Bayes

In [16]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error



# Criar um regressor RandomForest
gb_regressor = HistGradientBoostingRegressor(max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_gb = cross_validate(gb_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
gb_scores = pd.DataFrame(scores_gb)
pd.DataFrame(gb_scores.mean()).T


Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error,test_r2
0,0.27152,0.007287,-16.263286,-1.767299,0.868923


In [17]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error



# Criar um regressor RandomForest
gb_regressor = HistGradientBoostingRegressor(max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_gb = cross_validate(gb_regressor, X, y, cv=kf, scoring=METRICS)
gb_scores = pd.DataFrame(scores_gb)
pd.DataFrame(gb_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.247628,0.017246,,,,,,,


In [18]:
# É NECESSÁRIO?

#nb = GaussianNB()
#scores_nb = cross_validate(nb, X, y, cv=splitter, scoring=METRICS)
#nb_scores = pd.DataFrame(scores_nb)
#pd.DataFrame(nb_scores.mean()).T

### Support Vector Machine

In [19]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer


# Criar um imputador
imputer = SimpleImputer(strategy='mean')  # Você pode escolher 'mean', 'median', 'most_frequent', ou outra estratégia

# Aplicar a imputação aos dados de entrada X
X_imputed = imputer.fit_transform(X)

# Criar um regressor SVM
svm_regressor = SVR()

# Realizar a validação cruzada e avaliar as métricas
scores_svm = cross_validate(svm_regressor, X_imputed, y, cv=splitter, scoring=REGRESSION_METRICS)
svm_scores = pd.DataFrame(scores_svm)
print(pd.DataFrame(svm_scores.mean()).T)


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Criar um regressor RandomForest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_rf = cross_validate(rf_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
rf_scores = pd.DataFrame(scores_rf)
pd.DataFrame(rf_scores.mean()).T


Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,5.658228,0.038372,10.881947,1.376584,0.876171
