In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time

### Importar tabela de dados e remover zeros e dados ausentes

In [None]:
# Load the modelling table; '...' in the CSV is parsed as a missing value.
tabela = pd.read_csv('../dados_coletados/tabela_modelagem.csv', na_values = '...')
# Drop rows whose yield is the '-' placeholder. The explicit .copy() makes the
# column assignment below write to an independent frame instead of a slice of
# the original (avoids SettingWithCopyWarning / ambiguous writes).
tabela = tabela[tabela['Rendimento'] != '-'].copy()
tabela['Rendimento'] = tabela['Rendimento'].astype(float)
# Drop rows with missing yield; copy again because later cells add columns.
tabela = tabela[tabela['Rendimento'].notnull()].copy()
tabela.head()

### Adicionar um atributo de classe para os valores de rendimento de produção de soja

In [None]:
print(np.quantile(tabela['Rendimento'], 0.3333))
print(np.quantile(tabela['Rendimento'], 0.6666))

In [None]:
# Label every row with a yield class using the 0.3333 / 0.6666 quantiles:
# below the first cut -> 'baixo', above the second -> 'alto', and the closed
# interval between both cuts -> 'medio'.
rendimento = tabela['Rendimento']
q1 = np.quantile(rendimento, 0.3333)
q2 = np.quantile(rendimento, 0.6666)

abaixo_q1 = rendimento < q1
acima_q2 = rendimento > q2
entre_cortes = rendimento.between(q1, q2)  # inclusive on both ends

tabela['Classe'] = None
tabela.loc[abaixo_q1, 'Classe'] = 'baixo'
tabela.loc[entre_cortes, 'Classe'] = 'medio'
tabela.loc[acima_q2, 'Classe'] = 'alto'

In [None]:
tabela['Classe'].hist()

In [None]:
len(tabela[tabela['Classe'] == 'baixo'])

In [None]:
seed = 5744 #numero decidido aleatóriamente (ver scripts anteriores)

In [None]:
# Balance the three classes by undersampling 'medio' and 'alto' down to the
# size of 'baixo', which is kept whole.
baixo = tabela.loc[tabela['Classe'] == 'baixo']
# Derived from the data instead of the former hard-coded 6107
# (presumably the 'baixo' count printed by the previous cell - confirm).
n_por_classe = len(baixo)

medio = tabela.loc[tabela['Classe'] == 'medio'].sample(n = n_por_classe,
                                                       replace = False,
                                                       random_state = seed)
alto = tabela.loc[tabela['Classe'] == 'alto'].sample(n = n_por_classe,
                                                   replace = False,
                                                   random_state = seed)

tabela_classes = pd.concat([baixo, medio, alto])
tabela_classes

### Dividir dados em teste e treino

In [None]:
analisar_classes = tabela_classes.sample(frac = 1, random_state = seed)
analisar_classes.head()

In [None]:
analisar_classes.info()

In [None]:
# Positional selection: columns 11..35 become the features and column 36 the target.
# NOTE(review): presumably column 36 is 'Classe' (the column added above) - confirm
# against the output of analisar_classes.info(), since positions shift if the CSV changes.
X = analisar_classes.iloc[:,11:36]
y = analisar_classes.iloc[:,36]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)

In [None]:
print('N treino:', len(X_train))
print('N teste:', len(X_test))

### Padronizar os atributos

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Fit on the training split only; the same fitted scaler is applied to the test split.
scaler = sc.fit(X_train)
# NOTE: X_train / X_test are overwritten with their scaled versions. Re-running this
# cell without re-running the split would re-fit on already-scaled data and overwrite
# the scaler file saved below.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Persist the fitted scaler.
joblib.dump(scaler, '../resultados/scaler_class.sav')

### Definir método para validação cruzada dos modelos de classificação

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix 

def classifier_cv(sample_method, estimator, X_train, y_train, parameters, cv = 5, n_iter = 10):
    """Tune ``estimator`` by cross-validated search and display the best result.

    Parameters
    ----------
    sample_method : str
        Either 'RandomizedSearchCV' or 'GridSearchCV'.
    estimator : estimator object
        Passed as ``estimator`` to the chosen search class.
    X_train, y_train : array-like
        Data passed to the search object's ``fit``.
    parameters : dict
        Used as ``param_distributions`` (randomized) or ``param_grid`` (grid).
    cv : int, default 5
        Passed as ``cv`` to the search class.
    n_iter : int, default 10
        Number of sampled candidates; only used by 'RandomizedSearchCV'.

    Returns
    -------
    The fitted search object (its ``best_params_`` and ``best_score_`` are
    also displayed).

    Raises
    ------
    ValueError
        If ``sample_method`` is not one of the two supported names.
    """
    # Validate first: previously an invalid name only printed a message and
    # then crashed with an UnboundLocalError on `tunning`.
    if sample_method not in ('RandomizedSearchCV', 'GridSearchCV'):
        raise ValueError('Método de amostragem invalido! Escolha entre "RandomizedSearchCV" ou "GridSearchCV"')

    if sample_method == 'RandomizedSearchCV':
        tunning = RandomizedSearchCV(
            estimator = estimator,
            param_distributions = parameters,
            n_iter = n_iter,
            cv = cv,
            scoring = 'accuracy',
            verbose = 3,
            random_state = seed,  # notebook-level seed: same candidates on every run
            n_jobs = -1)
    else:
        tunning = GridSearchCV(
            estimator = estimator,
            param_grid = parameters,
            cv = cv,
            scoring = 'accuracy',
            verbose = 3,
            n_jobs = -1)

    model = tunning.fit(X_train, y_train)
    display(model.best_params_)
    display(model.best_score_)
    return model

def resultados(y_pred):
    """Print the accuracy and plot confusion matrices for test-set predictions.

    ``y_pred`` is compared against the notebook-level ``y_test``. Two figures
    are drawn: the 3x3 confusion matrix over the classes, and one 2x2
    one-vs-rest matrix per class.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels ('alto', 'medio' or 'baixo'), in the same order
        as ``y_test``.
    """
    labels = ['alto', 'medio', 'baixo']
    print('Acurácia:', accuracy_score(y_test, y_pred))
    mc = confusion_matrix(y_test, y_pred, labels = labels)
    mm = multilabel_confusion_matrix(y_test, y_pred, labels = labels)

    # Overall confusion matrix (rows: observed, columns: predicted).
    ax = plt.subplot()
    sns.heatmap(mc, annot = True, fmt = 'g', ax = ax)
    ax.set_xlabel('Predito')
    ax.set_ylabel('Observado')
    ax.set_title('Matriz de confusão')
    ax.xaxis.set_ticklabels(labels)
    ax.yaxis.set_ticklabels(labels)

    # Each per-class matrix is laid out as [[TN, FP], [FN, TP]], so index 0 is
    # the negative class and index 1 the positive one. The tick labels were
    # previously swapped.
    binario = ['negativo', 'positivo']
    fig, eixos = plt.subplots(len(labels), figsize = (4, 10))
    for classe, nome in enumerate(labels):
        sns.heatmap(mm[classe], annot = True, fmt = 'g', ax = eixos[classe])
        eixos[classe].set_xlabel('Predito')
        eixos[classe].set_ylabel('Observado')
        eixos[classe].set_title(nome.capitalize())
        eixos[classe].xaxis.set_ticklabels(binario)
        eixos[classe].yaxis.set_ticklabels(binario)
    fig.tight_layout()
    plt.show()


### Máquina de Vetores de Suporte

In [None]:
from sklearn.svm import SVC

#### Modelo sem ajuste de hiperparâmetros

In [None]:
classifier = SVC(kernel = 'rbf')
classifier.get_params()

In [None]:
start = time.time()
classifier.fit(X_train, y_train)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
resultados(y_pred)

#### Seleção de hiperparâmetros

In [None]:
# Candidate values for the SVC search. Despite the name `random_grid`, the next
# cell evaluates this dictionary exhaustively with 'GridSearchCV'.
C = [0.001 ,0.01, 0.1, 1, 10, 100] # Cost: trade-off between the minimum separation margin and the error
gamma = [0.001, 0.01, 0.1, 1, 10, 100] # Curvature of the vectors (RBF kernel coefficient)
kernel = ['rbf']

random_grid = {
    'C': C,
    'gamma': gamma,
    'kernel': kernel}

In [None]:
start = time.time()
classifier_cv('GridSearchCV', SVC(), X_train, y_train, random_grid, cv = 3)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

##### Modelo final

In [None]:
SVC_classifier = SVC(
    kernel = 'rbf',
    C = 10,
    gamma = 0.1)
SVC_classifier.fit(X_train, y_train)

In [None]:
SVC_y_pred = SVC_classifier.predict(X_test)

In [None]:
resultados(SVC_y_pred)

In [None]:
SVC_savefile = '../resultados/SVC.sav'
joblib.dump(SVC_classifier, SVC_savefile)

#### Modelo salvo

In [None]:
SVC_model = joblib.load('../resultados/SVC.sav')

In [None]:
SVC_y_pred = SVC_model.predict(X_test)

In [None]:
resultados(SVC_y_pred)

### Árvore de Decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

#### Modelo sem ajuste de hiperparâmetros

In [None]:
classifier = DecisionTreeClassifier(random_state = seed)
classifier.get_params()

In [None]:
start = time.time()
classifier.fit(X_train, y_train)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
resultados(y_pred)

#### Seleção de hiperparâmetros

In [None]:
# Candidate values sampled by the randomized search for the decision tree.
splitter = ['best', 'random'] # Split points drawn at random ('random') or all candidates evaluated ('best')
max_features = [None, 2, 5, 10, 15] # Maximum number of features considered at each split
max_depth = [None, 10, 100, 300, 600, 1000] # Maximum depth of the tree
min_samples_split = [2, 10, 50, 100] # Minimum number of samples required to create a split
min_samples_leaf = [1, 5, 20, 50] # Minimum number of samples in each leaf node
max_leaf_nodes = [None, 10, 100, 300, 600, 1000] # Maximum number of leaf nodes (limits max_depth)

random_grid = {
    'splitter': splitter,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features,
    'max_leaf_nodes': max_leaf_nodes}

In [None]:
start = time.time()
# Seed the estimator as well as the search: the grid includes splitter = 'random'
# and feature subsampling, so an unseeded tree makes the CV scores (and the
# selected hyperparameters) change between runs. The final model below is seeded too.
classifier_cv('RandomizedSearchCV', DecisionTreeClassifier(random_state = seed), X_train, y_train, random_grid, cv = 5, n_iter = 500)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

##### Modelo final

In [None]:
DT_Classifier = DecisionTreeClassifier(
    random_state = seed,
    max_features = None,
    max_depth = 600,
    min_samples_split = 10,
    min_samples_leaf = 1,
    max_leaf_nodes = 300,
    splitter = 'best')
DT_Classifier.fit(X_train, y_train)

In [None]:
DT_y_pred = DT_Classifier.predict(X_test)

In [None]:
resultados(DT_y_pred)

In [None]:
DT_savefile = '../resultados/DT_class.sav'
joblib.dump(DT_Classifier, DT_savefile)

#### Modelo salvo

In [None]:
DT_model = joblib.load('../resultados/DT_class.sav')

In [None]:
DT_y_pred = DT_model.predict(X_test)

In [None]:
resultados(DT_y_pred)

### Floresta Aleatória


In [None]:
from sklearn.ensemble import RandomForestClassifier

#### Modelo sem ajuste de hiperparâmetros

In [None]:
classifier = RandomForestClassifier(random_state = seed)
classifier.get_params()

In [None]:
start = time.time()
classifier.fit(X_train, y_train)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

In [None]:
y_pred = classifier.predict(X_test)
resultados(y_pred)

#### Seleção de hiperparâmetros

In [None]:
# Candidate values sampled by the randomized search for the random forest.
n_estimators = [10, 100, 300, 1000] # Number of decision trees
# 'auto' was dropped: for RandomForestClassifier it was only an alias of 'sqrt'
# (a duplicated candidate) and recent scikit-learn releases reject it.
max_features = ['sqrt', 'log2', 0.2, 0.5, 0.8] # Number (or fraction) of features considered at each split
max_depth = [None, 10, 100, 300, 600, 1000] # Maximum depth of each tree
bootstrap = [True, False] # Whether each tree is built from a bootstrap sample (with replacement)
min_samples_split = [2, 10, 50, 100] # Minimum number of samples required to create a split
min_samples_leaf = [1, 5, 20, 50] # Minimum number of samples in each leaf node
# NOTE(review): max_leaf_nodes is defined but not part of random_grid below, so it
# is never searched - confirm whether that is intentional.
max_leaf_nodes = [None, 10, 100, 300, 600, 1000] # Maximum number of leaf nodes (limits max_depth)

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
start = time.time()
# Seed the forest itself: bootstrap and feature subsampling are random, so an
# unseeded estimator gives different CV scores on every run. The final model
# below is seeded too.
classifier_cv('RandomizedSearchCV', RandomForestClassifier(random_state = seed), X_train, y_train, random_grid, cv = 3, n_iter = 100)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

##### Modelo final

In [None]:
RF_classifier = RandomForestClassifier(
    n_estimators = 1000,
    max_features = 0.2,
    max_depth = 600,
    min_samples_split = 10,
    min_samples_leaf = 1,
    bootstrap = False,
    random_state = seed)
RF_classifier.fit(X_train, y_train)

In [None]:
RF_y_pred = RF_classifier.predict(X_test)

In [None]:
resultados(RF_y_pred)

In [None]:
RF_savefile = '../resultados/rf_class.sav'
joblib.dump(RF_classifier, RF_savefile)

#### Modelo salvo

In [None]:
RF_load = joblib.load('../resultados/rf_class.sav')

In [None]:
RF_y_pred = RF_load.predict(X_test)

In [None]:
resultados(RF_y_pred)

### Rede Neural

In [None]:
#pip install keras-tuner

In [None]:
import tensorflow as tf
from tqdm.keras import TqdmCallback

In [None]:
import keras_tuner as kt

#### Seleção de hiperparâmetros

In [None]:
def build_model(hp):
    """Build and compile a dense softmax classifier for the Keras Tuner search.

    Tuned hyperparameters:
      * ``num_layers``: number of hidden layers, from 3 to 12 in steps of 3.
      * ``units_{i}``: width of hidden layer ``i``, from 32 to 128 in steps of 32.

    The input width is read from the notebook-level ``X_train``; the output
    layer has 3 units (one per class) with softmax activation.

    Parameters
    ----------
    hp : keras_tuner.HyperParameters
        Hyperparameter container supplied by the tuner.

    Returns
    -------
    The compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential()

    # Input layer. `shape` must be a tuple, not a bare int. X_train is only
    # read here, so no `global` declaration is needed.
    model.add(tf.keras.layers.Input(shape = (X_train.shape[1],)))

    # Hidden layers: both the depth and each layer's width are searched.
    n_camadas = hp.Int('num_layers', 3, 12, 3)
    for i in range(n_camadas):
        model.add(
            tf.keras.layers.Dense(
                units = hp.Int(f"units_{i}", min_value = 32, max_value = 128, step = 32),
                activation = 'relu'
                )
            )

    # Output layer: one probability per class.
    model.add(tf.keras.layers.Dense(3, activation = 'softmax'))

    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['categorical_accuracy']
    )

    return model

In [None]:
build_model(kt.HyperParameters())

In [None]:
tuner = kt.RandomSearch(
    hypermodel = build_model,
    objective = 'val_loss',
    max_trials = 200,
    executions_per_trial = 1,
    overwrite = True,
    directory = '../resultados',
    project_name = 'kt_class'
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map the string class labels to integer codes, then one-hot encode them for
# the categorical cross-entropy loss.
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_Y = encoder.transform(y_train)
# tf.keras.utils.to_categorical replaces keras.utils.np_utils.to_categorical:
# the np_utils module no longer exists in current Keras releases.
y_train_encoded = tf.keras.utils.to_categorical(encoded_Y)
y_train_encoded

In [None]:
tuner.search(X_train, y_train_encoded,
             epochs = 20,
             batch_size = 1465,
             validation_split = 0.1,
             verbose = 3)

#### Carregar resultados da seleção de modelos

In [None]:
a_tuner = kt.RandomSearch(
    hypermodel = build_model,
    objective = 'val_loss',
    max_trials = 200,
    executions_per_trial = 1,
    overwrite = False,
    directory = '../resultados',
    project_name = "kt_class"
)

In [None]:
a_tuner.results_summary()

In [None]:
best = a_tuner.get_best_models(num_models = 1)
best = best[0]

In [None]:
# `(25)` is just the int 25, not a tuple. build() expects a shape that includes
# the batch dimension, and the feature count is taken from the data rather than hard-coded.
best.build(input_shape = (None, X_train.shape[1]))

In [None]:
best.summary()

#### Treinar o modelo

In [None]:
# Stop training when the validation loss has not improved for 10 epochs.
stop = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 10)

# Save the full model after every epoch; the file name carries the epoch
# number and that epoch's validation loss.
pasta = '../resultados/Rede_neural_class'
arquivo = pasta + '/nn_model-{epoch:02d}-{val_loss:.2f}.hdf5'
cp = tf.keras.callbacks.ModelCheckpoint(
    arquivo,
    monitor = 'val_loss',
    verbose = 0,
    save_best_only = False,
    save_weights_only = False,
    # The argument is `save_freq`, not `save_frequency`. 'epoch' keeps the
    # per-epoch saves that the file name template (it needs val_loss) relies on.
    save_freq = 'epoch')

In [None]:
NN_model = best.fit(X_train, y_train_encoded,
                    batch_size = 1465,
                    epochs = 100,
                    validation_split = 0.2,
                    verbose = 1,
                    callbacks=[TqdmCallback(verbose = 1), cp, stop])

In [None]:
hist = pd.DataFrame(NN_model.history)
hist['epoch'] = NN_model.epoch
hist

In [None]:
def plot_history(hist):
    """Plot the training and validation curves of a Keras training history.

    Draws two figures against ``hist['epoch']``: accuracy
    ('categorical_accuracy' / 'val_categorical_accuracy') and loss
    ('loss' / 'val_loss').

    Parameters
    ----------
    hist : pandas.DataFrame
        Must contain the columns 'epoch', 'categorical_accuracy',
        'val_categorical_accuracy', 'loss' and 'val_loss'.
    """
    # (y-axis label, training column, validation column) for each figure.
    paineis = [
        ('Acurácia', 'categorical_accuracy', 'val_categorical_accuracy'),
        ('Loss', 'loss', 'val_loss'),
    ]
    for rotulo_y, col_treino, col_validacao in paineis:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(rotulo_y)
        plt.plot(hist['epoch'], hist[col_treino], label = 'Treinamento')
        plt.plot(hist['epoch'], hist[col_validacao], label = 'Validação')
        plt.legend()

In [None]:
plot_history(hist)

#### Modelo salvo

In [None]:
load = tf.keras.models.load_model(
    '../resultados/Rede_neural_class/nn_model-05-0.74.hdf5',
    compile = False)

In [None]:
load.summary()

In [None]:
NN_y_pred = load.predict(X_test)
NN_y_pred

In [None]:
NN_y_pred_labels = np.argmax(NN_y_pred, axis = 1)
NN_y_pred_labels = pd.Series(NN_y_pred_labels)
NN_y_pred_labels

In [None]:
# Decode the predicted class indices with the encoder that produced the
# training targets, instead of a hand-written index -> label mapping. This
# also avoids writing strings into an integer Series, and the cell can be
# re-run safely because it starts again from the raw probabilities.
NN_y_pred_labels = pd.Series(encoder.inverse_transform(np.argmax(NN_y_pred, axis = 1)))
NN_y_pred_labels

In [None]:
resultados(NN_y_pred_labels)

### Gradient Boosting (GradientBoostingClassifier do scikit-learn)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#### Modelo sem ajuste de hiperparâmetros

In [None]:
# Seeded like the decision tree and random forest baselines, so the baseline
# score is reproducible.
classifier = GradientBoostingClassifier(random_state = seed)

In [None]:
classifier.get_params()

In [None]:
start = time.time()
classifier.fit(X_train, y_train)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
resultados(y_pred)

#### Seleção de hiperparâmetros

In [None]:
# Candidate values for the gradient boosting grid search.
n_estimators = [10, 50, 100] # Number of boosting stages (trees)
# 'sqrt' replaces 'auto': for GradientBoostingClassifier 'auto' meant
# sqrt(n_features), and recent scikit-learn releases reject it.
max_features = ['sqrt'] # Number of features considered at each split
max_depth = [None, 10, 100] # Maximum depth of each tree
max_leaf_nodes = [None] # Maximum number of leaf nodes (limits max_depth)
min_samples_split = [2, 50] # Minimum number of samples required to create a split
min_samples_leaf = [1, 50] # Minimum number of samples in each leaf node
learning_rate = [0.1]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_leaf_nodes': max_leaf_nodes,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'learning_rate': learning_rate}

In [None]:
start = time.time()
# Seed the estimator: with max_features below the total number of features the
# split search is randomized, so an unseeded model makes the CV scores vary
# between runs. The final model below is seeded too.
classifier_cv('GridSearchCV', GradientBoostingClassifier(random_state = seed), X_train, y_train, random_grid, cv = 3)
end = time.time()
print('Finalizado em ', round(end - start, 1), ' s')

##### Modelo final

In [None]:
# Final gradient boosting model with the selected hyperparameters.
XGB_classifier = GradientBoostingClassifier(
    n_estimators = 100,
    # 'sqrt' is what 'auto' meant for GradientBoostingClassifier; recent
    # scikit-learn releases no longer accept 'auto'.
    max_features = 'sqrt',
    max_depth = None,
    max_leaf_nodes = None,
    min_samples_split = 50,
    min_samples_leaf = 1,
    learning_rate = 0.1,
    verbose = 1,
    random_state = seed
)

In [None]:
XGB_classifier.fit(X_train, y_train)

In [None]:
XGB_y_pred = XGB_classifier.predict(X_test)

In [None]:
resultados(XGB_y_pred)

In [None]:
XGB_savefile = '../resultados/XGB_class.sav'
joblib.dump(XGB_classifier, XGB_savefile)

#### Modelo salvo

In [None]:
XGB_load = joblib.load('../resultados/XGB_class.sav')

In [None]:
XGB_y_pred = XGB_load.predict(X_test)

In [None]:
resultados(XGB_y_pred)

### Comparação dos modelos

In [None]:
SVC_acc = accuracy_score(y_test, SVC_y_pred)
DT_acc = accuracy_score(y_test, DT_y_pred)
RF_acc = accuracy_score(y_test, RF_y_pred)
NN_acc = accuracy_score(y_test, NN_y_pred_labels)
XGB_acc = accuracy_score(y_test, XGB_y_pred)

In [None]:
modelos_loss = {
    'Modelo': ['SVC', 'DT', 'RF', 'NN', 'XGB'],
    'acc': [SVC_acc, DT_acc, RF_acc, NN_acc, XGB_acc]
    }

In [None]:
loss_tabela = pd.DataFrame(modelos_loss)
loss_tabela

In [None]:
order = loss_tabela.sort_values('acc', ascending = False, inplace = False)
order

In [None]:
# Bar chart of test accuracy per model, with bars from lowest to highest
# accuracy (the reverse of the descending `order` frame), saved as a PNG.
modelos_crescente = order['Modelo'].to_numpy()[::-1]
sns.barplot(data = loss_tabela,
            x = 'Modelo', y = 'acc',
            order = modelos_crescente,
            saturation = 0.5)
plt.savefig('../resultados/acc_modelos.png', dpi = 300, bbox_inches = 'tight')