# Ciencia de datos de clasificación

Importar librerías

In [199]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, classification_report,roc_auc_score, roc_curve)
from imblearn.under_sampling import RandomUnderSampler
import mlflow
import mlflow.tensorflow

Importar datos limpios

In [225]:
df=pd.read_csv("listings_limpio.csv")

# Alistamiento de datos

Visualizar dataframe

In [226]:
df.drop(columns=['id'], inplace=True)
df.shape

(17664, 39)

Definir el target

In [227]:
df['Target']= (df['review_scores_rating']>=4.8).astype(int)
df['Target'].value_counts()


Target
0    11341
1     6323
Name: count, dtype: int64

Borrar la variable que se usó para definir el target

In [228]:
cols_review= [c for c in df.columns if "review_scores" in c]
cols_review

['review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [229]:
df.drop(columns=cols_review, inplace=True)

In [None]:
df.drop(columns='neighbourhood_cleansed',inplace=True)

In [None]:
vars_a_quitar = [
    'bathrooms',
    'bedrooms',
    'beds',
    ]
df.drop(columns=vars_a_quitar,inplace=True)

### Ordenar feats

In [232]:
cols_string = df.select_dtypes(include=['object', 'string']).columns.tolist()

In [233]:
cols_bool = df.select_dtypes(include=['bool']).columns.tolist()
df[cols_bool] = df[cols_bool].astype(int)

In [234]:
cols_num = df.select_dtypes(include=['number']).columns.tolist()
cols_num.remove('Target')
cols_bin= [
    c for c in cols_num
    if df[c].dropna().nunique() == 2 and df[c].dropna().isin([0, 1]).all()
]
cols_num=[c for c in cols_num if c not in cols_bin]

In [235]:
feats = cols_num+cols_bin+cols_string
print(len(feats),feats)

32 ['host_response_rate', 'host_acceptance_rate', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price', 'minimum_nights', 'maximum_nights', 'availability_365', 'number_of_reviews', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 'reviews_per_month', 'Wifi', 'Air_conditioning', 'Kitchen_and_dining', 'Washer_dryer', 'TV', 'Safe', 'Refrigerator', 'Smoke_alarm_home_safety', 'Essentials', 'Services', 'property_Entire_Place', 'property_Hotel_Room', 'property_Other', 'property_Private_Room', 'property_Shared_Room', 'neighbourhood_cleansed']


In [236]:
df=df[feats+['Target']]
df.head(5)

Unnamed: 0,host_response_rate,host_acceptance_rate,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,minimum_nights,...,Smoke_alarm_home_safety,Essentials,Services,property_Entire_Place,property_Hotel_Room,property_Other,property_Private_Room,property_Shared_Room,neighbourhood_cleansed,Target
0,100,100,40.41476,-3.70418,2,1.0,1.0,2.0,157.0,5,...,0,0,0,1,0,0,0,0,Sol,0
1,100,100,40.42247,-3.70577,2,1.0,1.0,3.0,143.0,5,...,0,0,0,1,0,0,0,0,Universidad,0
2,100,99,40.41884,-3.69655,4,1.5,1.0,2.0,65.0,1,...,1,1,0,0,0,0,1,0,Justicia,0
3,100,100,40.42252,-3.7025,2,1.0,1.0,1.0,116.0,30,...,0,1,1,1,0,0,0,0,Universidad,0
4,100,100,40.42252,-3.7025,2,1.0,1.0,2.0,79.0,30,...,0,1,1,1,0,0,0,0,Universidad,0


### Separar en train y test

In [237]:
train = df.sample(frac=0.8, random_state=100)
x_train=train.drop(columns=df.columns[-1])
x_train.head()
y_train=train['Target']

In [238]:
test = df.drop(train.index)
x_test=test.drop(columns=df.columns[-1])
x_test.head()
y_test=test['Target']

In [239]:
val = train.sample(frac=0.2, random_state=100)
train = train.drop(val.index)

In [240]:
print(train.shape)
print(val.shape)
print(test.shape)

(11305, 33)
(2826, 33)
(3533, 33)


### Balancear train

In [217]:
rus = RandomUnderSampler(random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)
print(y_train.value_counts())
train = pd.concat([x_train, y_train], axis=1) 

Target
0    5071
1    5071
Name: count, dtype: int64


### Procesar variables de entrada

In [241]:
def dataframe_to_dataset(dataframe, shuffle=True):
    dataframe = dataframe.copy()
    labels = dataframe.pop("Target")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), reshuffle_each_iteration=True)
    return ds

In [242]:
batch_size = 32

train_ds = dataframe_to_dataset(train, shuffle=True).batch(batch_size)
val_ds   = dataframe_to_dataset(val, shuffle=False).batch(batch_size)
test_ds  = dataframe_to_dataset(test, shuffle=False).batch(batch_size)

In [243]:
def encode_numerical_feature(feature, name, dataset):
    normalizer = keras.layers.Normalization()
    feat_ds = dataset.map(lambda x, y: x[name])
    feat_ds = feat_ds.map(lambda x: tf.expand_dims(x, -1))
    normalizer.adapt(feat_ds)
    return normalizer(feature)

def encode_categorical_feature(feature, name, dataset, is_string=False):
    lookup_class = keras.layers.StringLookup if is_string else keras.layers.IntegerLookup
    lookup = lookup_class(output_mode="binary")  
    feat_ds = dataset.map(lambda x, y: x[name])
    feat_ds = feat_ds.map(lambda x: tf.expand_dims(x, -1))
    lookup.adapt(feat_ds)
    return lookup(feature)

In [244]:
inputs = []
for i in cols_num:
  inputs.append(keras.Input(shape=(1,), name=i))

for i in cols_bin:
  inputs.append(keras.Input(shape=(1,), name=i, dtype="int64"))

for i in cols_string:
  inputs.append(keras.Input(shape=(1,), name=i, dtype="string"))

In [245]:
feats_encoded=[]
len_feats=len(feats_encoded)
for i,feat in enumerate(cols_num):
  feats_encoded.append(
      encode_numerical_feature(inputs[len_feats+i], feat, train_ds)
  )

len_feats=len(feats_encoded)

for i,feat in enumerate(cols_bin):
  feats_encoded.append(
      encode_categorical_feature(inputs[len_feats+i], feat, train_ds, False)
  )

len_feats=len(feats_encoded)

for i,feat in enumerate(cols_string):
  feats_encoded.append(
      encode_categorical_feature(inputs[len_feats+i], feat, train_ds, True)
  )

all_feats = keras.layers.concatenate(feats_encoded)
len_feats

31

## Red neuronal de clasificación

### Correr con MLFLow

In [250]:
# Construir modelo
capa1= keras.layers.Dense(32, activation='relu')(all_feats)
capa2 = keras.layers.Dense(32, activation='relu')(capa1)
capa3 = keras.layers.Dense(32, activation='relu')(capa2)
model_layers = keras.layers.Dense(1, activation='tanh')(capa3)
model = keras.Model(inputs, model_layers)

#Definir optimizador y compilar
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[
        keras.metrics.BinaryAccuracy(name='accuracy',threshold=0.4),
        keras.metrics.AUC(name='auc'),
        keras.metrics.Precision(name='precision',thresholds=0.4),
        keras.metrics.Recall(name='recall',thresholds=0.4)
    ]
)

import os
os.environ["KERAS_BACKEND"] = "tensorflow"

#Definir experimento y ruta
mlflow.set_tracking_uri("file:///C:/mlruns_proy2")
mlflow.set_experiment("ACTD - CLASIFICACIÓN")

with mlflow.start_run(run_name="RUN 11"):
    #Definir parámetros
    mlflow.log_params({
        "modelo": "DNN",
        "activacion_ocultas":"TANH",
        "n_capas_oculttas": 3,
        "n_neuronas": 32,
        "learning_rate": 1e-3,
        "epochs_max": 50,
        "var_eliminadas": 'camas, baños, cuartos ',
        'threshold': 0.4
    })

    #Entrenar modelo
    history = model.fit(
        train_ds,
        epochs=50,
        validation_data=val_ds
    )

    #Resultados
    test_results = model.evaluate(test_ds, verbose=0)
    test_loss = test_results[0]
    test_acc  = test_results[1]
    test_auc  = test_results[2]
    test_pre = test_results[3]
    test_rec = test_results[4]

    # Métricas de última época en train/val (del history)
    train_loss_final = history.history["loss"][-1]
    val_loss_final   = history.history["val_loss"][-1]
    train_auc_final  = history.history["auc"][-1]
    val_auc_final    = history.history["val_auc"][-1]

    #Guardar las métricas
    mlflow.log_metrics({
        "train_loss_final": train_loss_final,
        "val_loss_final":   val_loss_final,
        "train_auc_final":  train_auc_final,
        "val_auc_final":    val_auc_final,
        "test_loss":        test_loss,
        "test_accuracy":    test_acc,
        "test_auc":         test_auc,
        "test_precision":   test_pre,
        "test_recall":      test_rec
    })

    #Registrar history
    for epoch in range(len(history.history["loss"])):
        mlflow.log_metric("loss",      history.history["loss"][epoch],      step=epoch)
        mlflow.log_metric("val_loss",  history.history["val_loss"][epoch],  step=epoch)

    #Guardar modelo
    mlflow.tensorflow.log_model(model, "model")

    mlflow.end_run(status='FINISHED')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




INFO:tensorflow:Assets written to: C:\Users\adri_\AppData\Local\Temp\tmp6w8cl9j2\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\adri_\AppData\Local\Temp\tmp6w8cl9j2\model\data\model\assets


### Mejor modelo