# <center>(7570) SIST. DE PROG. NO CONVENCIONAL DE ROBOTS</center>
### <center>Trabajo Práctico 02: Redes Neuronales y Random Forest</center>
### <center>Marco Luis Fleres, Padrón 93174</center>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Cargamos el dataset:

In [None]:
DATAFILE="/kaggle/input/mushroom-classification/mushrooms.csv"
DATASET_SIZE=8124
np.set_printoptions(precision=3, suppress=True)
N_CLASSES=1 # Edible/Poisonous
N_INPUTS=22 # CSV columns used for classification
SHUFFLE_SEED=0 # Fixed seed so the shuffled row order is reproducible

# Read every CSV row as a batch of one, in file order and for a single pass.
# make_csv_dataset's own shuffling is disabled on purpose: it reshuffles on
# every new iterator, so any later take()/skip() split would select different
# (overlapping) rows each time the subsets are iterated.
full_dataset = tf.data.experimental.make_csv_dataset(
    DATAFILE, batch_size=1, label_name="class", shuffle=False, num_epochs=1
)
# Shuffle exactly once: with reshuffle_each_iteration=False every iteration
# yields the rows in the same order, which keeps take()/skip() subsets disjoint.
full_dataset = full_dataset.shuffle(
    DATASET_SIZE, seed=SHUFFLE_SEED, reshuffle_each_iteration=False
)
# Repeat indefinitely, as make_csv_dataset does by default; the shuffled order
# is identical on every repetition.
full_dataset = full_dataset.repeat()

# There is a single "edible/poisonous" class, so the label is encoded as a
# scalar 0 ("e") / 1 (anything else). It is written as tensor ops instead of a
# Python conditional because the lambda receives tensors. Features are encoded later.
full_dataset = full_dataset.map(
    lambda features, label: (
        features,
        tf.cast(tf.not_equal(tf.reshape(label, []), "e"), tf.int32),
    )
)

# Show one example to check what the pipeline produces.
for features, label in full_dataset.take(1):
    print("Label:", label.numpy())
    tf.print(features)

Separamos los datos en los conjuntos de entrenamiento y prueba.

In [None]:
# Two thirds of the rows are used for training; every remaining row is held
# out for testing, so no row of the dataset is discarded.
TRAIN_SIZE = DATASET_SIZE * 2 // 3
TEST_SIZE = DATASET_SIZE - TRAIN_SIZE

train_data = full_dataset.take(TRAIN_SIZE)
test_data = full_dataset.skip(TRAIN_SIZE).take(TEST_SIZE)

# Sanity check: print the number of elements in each subset.
print(tf.data.experimental.cardinality(train_data).numpy())
print(tf.data.experimental.cardinality(test_data).numpy())

Construimos el modelo del perceptrón multicapa. Empezamos por codificar los valores de las columnas como números. Ya que construiremos un modelo de keras, usamos feature_columns para codificar las columnas:

In [None]:
from tensorflow import feature_column

# Category codes accepted for each input column; one entry per CSV feature.
VOCABULARY = {
    "cap-shape": ["b", "c", "x", "f", "k", "s"],
    "cap-surface": ["f", "g", "y", "s"],
    "cap-color": ["n", "b", "c", "g", "r", "p", "u", "e", "w", "y"],
    "bruises": ["t", "f"],
    "odor": ["a", "l", "c", "y", "f", "m", "n", "p", "s"],
    "gill-attachment": ["a", "d", "f", "n"],
    "gill-spacing": ["c", "w", "d"],
    "gill-size": ["b", "n"],
    "gill-color": ["k", "n", "b", "h", "g", "r", "o", "p", "u", "e", "w", "y"],
    "stalk-shape": ["e", "t"],
    "stalk-root": ["b", "c", "u", "e", "z", "r", "?"],
    "stalk-surface-above-ring": ["f", "y", "k", "s"],
    "stalk-surface-below-ring": ["f", "y", "k", "s"],
    "stalk-color-above-ring": ["n", "b", "c", "g", "o", "p", "e", "w", "y"],
    "stalk-color-below-ring": ["n", "b", "c", "g", "o", "p", "e", "w", "y"],
    "veil-type": ["p", "u"],
    "veil-color": ["n", "o", "w", "y"],
    "ring-number": ["n", "o", "t"],
    "ring-type": ["c", "e", "f", "l", "n", "p", "s", "z"],
    "spore-print-color": ["k", "n", "b", "h", "r", "o", "u", "w", "y"],
    "population": ["a", "c", "n", "s", "v", "y"],
    "habitat": ["g", "l", "m", "p", "u", "w", "d"],
}

# One indicator column per categorical feature, built in dictionary order.
feature_columns = [
    feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(column_name, codes)
    )
    for column_name, codes in VOCABULARY.items()
]

# Keras layer that turns a dict of raw features into the encoded dense input.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)


Observamos el preprocesado:

In [None]:
# Take a single training example and show it before and after encoding.
for features, label in train_data.take(1):
    encoded_features = feature_layer(features)
    print("Label:", label.numpy())
    print("Parametros:")
    tf.print(features)
    print("Parametros codificados:", encoded_features)

In [None]:
from tensorflow.keras import layers
# The optimizer comes from tf.keras as well: classes from the standalone
# `keras` package should not be mixed into a tf.keras model.
from tensorflow.keras.optimizers import SGD

# Multilayer perceptron: encoded categorical features -> two hidden ReLU
# layers of 20 units -> one linear output unit.
# Layer names must be unique within a Sequential model.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(20, activation='relu', kernel_initializer='he_normal', name="layer1"),
  layers.Dense(20, activation='relu', kernel_initializer='he_normal', name="layer2"),
  layers.Dense(1, name="output")
])

model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='mse', metrics=['accuracy'])

# The dataset is batched here, so `batch_size` must not also be passed to
# fit(): Keras rejects that argument when the input is a tf.data.Dataset.
model.fit(train_data.batch(100), epochs=10, verbose=1)

Probamos el Predictor con algunos datos:

In [None]:
# Compare the true label with the predicted class for a few test examples.
for features, label in test_data.take(20):
    # Sequential.predict_classes is deprecated and removed in newer TensorFlow
    # releases; for a single output unit it is equivalent to thresholding the
    # prediction at 0.5.
    predicted_class = (model.predict(features) > 0.5).astype("int32")[0][0]
    print("Label/Prediction:", label.numpy(), predicted_class)

> Obtenemos el Error Cuadrático Medio, la precisión, recall y f1 del modelo con los datos de prueba, tanto con el puntaje provisto por Keras como el evaluado con sklearn.

In [None]:
from math import sqrt

# Score the trained network on the test subset, feeding one example per batch.
loss, accuracy = model.evaluate(test_data.batch(1))
print(f"MSE: {loss:.3f}")
print(f"Accuracy: {accuracy:.3f}")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# (true label, predicted class) pairs for every example of the test subset.
predictionPairs = []

# Predict in batches so the whole test subset is scored, not just a handful of
# samples. A prediction above 0.5 counts as class 1, which is what the
# deprecated Sequential.predict_classes did for a single output unit.
for features, labels in test_data.batch(256):
    predicted_classes = (model.predict(features) > 0.5).astype("int32").ravel()
    predictionPairs.extend(zip(labels.numpy().ravel(), predicted_classes))

y_true, y_pred = zip(*predictionPairs)

# With the default pos_label=1, precision/recall/f1 treat label 1 as positive.
print({
    "accuracy": accuracy_score(y_true, y_pred),
    "precision": precision_score(y_true, y_pred),
    "recall": recall_score(y_true, y_pred),
    "f1": f1_score(y_true, y_pred)
})

---

# Random Forest

In [None]:
# Load the same CSV file into a pandas DataFrame; the Random Forest cells
# below work from this DataFrame instead of the tf.data pipeline.
dataPd = pd.read_csv(DATAFILE)
# Preview the first rows of the DataFrame.
dataPd.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Target column on one side, every other column as predictors on the other.
Y = dataPd['class']
X = dataPd.drop(columns=['class'])

# The Random Forest needs numeric inputs, so the categorical features are
# one-hot encoded and converted to a dense array.
encoder = OneHotEncoder()
X = encoder.fit_transform(X).toarray()

# Hold out 33% of the rows for testing; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=0,
)

Parámetros del Random Forest:

- Cantidad de árboles (n_estimators): 50


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 50 bootstrapped trees. The seed is fixed so repeated runs give the
# same model, in line with the seeded train/test split that feeds it.
RF_model = RandomForestClassifier(bootstrap=True, n_estimators=50, random_state=0)
RF_model.fit(train_X, train_y)

In [None]:
# precision_score is imported here as well so this cell does not depend on an
# import made by an earlier, unrelated cell.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True labels and forest predictions for the held-out rows.
real = test_y.tolist()
RF_predictions = list(RF_model.predict(test_X))

# NOTE(review): the positive class here is "e" (edible), whereas the neural
# network metrics treat label 1 (non-"e") as positive; keep that in mind when
# comparing precision/recall/f1 between the two models.
print({
    "accuracy": accuracy_score(real, RF_predictions),
    "precision": precision_score(real, RF_predictions, pos_label="e"),
    "recall": recall_score(real, RF_predictions, pos_label="e"),
    "f1": f1_score(real, RF_predictions, pos_label="e")
})

# Conclusiones

Tanto la red neuronal como el RandomForest tuvieron una precisión muy alta. Es posible que se haya dado overfitting, aunque los RandomForest son menos susceptibles a esto. Harían falta más datos para comprobarlo. La Red Neuronal, si bien converge rápidamente, no es tan rápida de entrenar como el RandomForest y requiere más hiperparámetros, por lo que, a igualdad de resultados, preferiremos el RandomForest.