# Importar os módulos necessários

In [34]:
import warnings
warnings.filterwarnings("ignore")

In [35]:
import pandas as pd
#necessario por causa do metodo de avaliaçao
from sklearn.model_selection import KFold
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

# Definir as Métricas para Avaliação dos Modelos

In [36]:
#metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def custom_regression_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    return mse, mae, r2


In [37]:
#metricas classificação
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [38]:
#Metricas regressao
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

REGRESSION_METRICS = {
    "MSE": make_scorer(mean_squared_error),
    "MAE": make_scorer(mean_absolute_error),
    "R2": make_scorer(r2_score)
}


In [39]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

# Ler o Conjunto de Dados

In [40]:
d = pd.read_csv("PREV1.csv")
d

Unnamed: 0,id_marcas,ano,engine_capacity,id_insurance,id_transmissao,kms_driven,id_owner_type,id_fuel_type,max_power,seats,mileage,id_body_type,id_city,resale_price_Lakh
0,1.230837,2019.0,1199.0,0.256633,0.131359,30910.0,0.150465,0.180334,83.8,5.0,23.84,0.364112,1.096102,5.66
1,0.976238,2018.0,1199.0,1.306401,0.131359,48089.0,1.365236,0.180334,88.7,5.0,17.50,0.598869,1.281577,6.64
2,0.976238,2015.0,1497.0,0.256633,0.131359,51000.0,0.620328,0.180334,117.3,5.0,17.40,0.578053,0.922710,5.65
3,1.230837,2021.0,1956.0,0.256633,0.583102,30000.0,0.150465,0.514919,167.6,7.0,14.08,0.598869,0.922710,23.00
4,0.541440,2019.0,1197.0,1.306401,0.583102,61113.0,0.150465,0.180334,83.1,5.0,21.40,0.364112,1.170307,6.87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13414,1.745897,2021.0,2199.0,0.256633,0.583102,80000.0,0.620328,0.514919,197.2,7.0,14.11,1.347597,1.027199,26.50
13415,0.677665,2017.0,1197.0,0.256633,0.131359,17923.0,0.150465,0.180334,81.8,5.0,18.60,0.364112,1.223199,5.87
13416,1.693345,2017.0,1498.0,0.445408,0.131359,63389.0,0.620328,0.514919,108.6,5.0,21.13,0.578053,1.027199,7.43
13417,0.541440,2017.0,1248.0,0.445408,0.131359,40000.0,0.620328,0.514919,88.5,5.0,24.30,0.598869,0.868320,9.45


In [41]:
X, y = d.drop("resale_price_Lakh", axis=1), d["resale_price_Lakh"]

In [42]:
# Contar os valores nulos (NaN) em cada coluna
contagem_nan = X.isna().sum()

# Exibir a contagem de valores nulos em cada coluna
print(contagem_nan)

id_marcas          0
ano                0
engine_capacity    0
id_insurance       0
id_transmissao     0
kms_driven         0
id_owner_type      0
id_fuel_type       0
max_power          0
seats              0
mileage            0
id_body_type       0
id_city            0
dtype: int64


# Definir o Método de Validação Cruzada

In [43]:
#Classificação
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)


# Implementação dos Algoritmos de Machine Learning

### Árvores de Decisão

In [44]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



# Definir o regressor e o método de validação cruzada
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
dt_scores = {k: -v.mean() for k, v in scores.items()}  # Negar os valores para tornar os scores positivos
pd.DataFrame(dt_scores, index=[0])



Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,-0.020777,-0.002891,-22.654367,-2.67855,-0.738622


In [45]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate

# Utilize KFold para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continue com a validação cruzada e a avaliação de métricas
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T





Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,0.023376,0.00291,22.654367,2.67855,0.738622


In [46]:


# Criar um modelo de regressão de árvore de decisão
dt_regressor = DecisionTreeRegressor(max_depth=3, random_state=1234)

# Estratégia de validação cruzada (KFold) para um problema de regressão
kf = KFold(n_splits=10, shuffle=True, random_state=1234)

# Continuar com a validação cruzada e avaliação de métricas
scores = cross_validate(dt_regressor, X, y, cv=kf, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T



Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.017976,0.003923,,,,,,,


### Redes Neuronais

In [47]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate

# Criar uma Rede Neural para regressão
nn_regressor = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_nn = cross_validate(nn_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T


Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,1.407877,0.0,144.479446,8.323198,-0.746817


In [48]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_validate

# Criar uma Rede Neural para regressão
nn_regressor = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=20, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_nn = cross_validate(nn_regressor, X, y, cv=kf, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,1.549035,0.009984,,,,,,,


### Naive Bayes

In [49]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error



# Criar um regressor RandomForest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_rf = cross_validate(rf_regressor, X, y, cv=kf, scoring=REGRESSION_METRICS)
rf_scores = pd.DataFrame(scores_rf)
pd.DataFrame(rf_scores.mean()).T


Unnamed: 0,fit_time,score_time,test_MSE,test_MAE,test_R2
0,11.677517,0.044695,6.628182,0.996551,0.925521


In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error



# Criar um regressor RandomForest
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=1234)

# Realizar a validação cruzada e avaliar as métricas
scores_rf = cross_validate(rf_regressor, X, y, cv=kf, scoring=METRICS)
rf_scores = pd.DataFrame(scores_rf)
pd.DataFrame(rf_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,11.600246,0.045532,,,,,,,


### Support Vector Machine