<p style="text-align:center">
    <img src="https://cdn.discordapp.com/attachments/1253438405740593182/1253780484190896288/compasso-uol.png?ex=66771991&is=6675c811&hm=e78b900dc1e9184169baaad060a92928217fca81d29bf853ac13dd22f59cc1c6&" width="200" alt="Compass UOL logo"  />
</p>

<h1 style="text-align: center;">Hotel Reservations with Machine Learning</h1>

<p style="text-align:center">
    <img src="https://cdn.discordapp.com/attachments/1253438405740593182/1253781078343286844/hotel-reservation-services-1000x1000.jpg?ex=66771a1f&is=6675c89f&hm=dd35bacdc6982b2c03a227df96fafe8d9f2c3a1f71b80c36880a7cd1fe274c46&" width="400" alt="Hotel reservation services illustration"  />
</p>

### Importando as Bibliotecas

In [None]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker import Session
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
from dotenv import dotenv_values, load_dotenv
import os
import tarfile

### Carregando variáveis de ambiente e configurando a sessão

In [None]:

# Load environment variables from a .env file
load_dotenv()
bucket = os.getenv('bucket')
# Without this check a missing variable surfaces as an opaque
# "can only concatenate str (not NoneType)" TypeError on the print below.
if not bucket:
    raise RuntimeError(
        "Environment variable 'bucket' is not set; define it in the .env file "
        "or in the process environment before running this notebook."
    )
print('Using bucket ' + bucket)

# Session setup: one boto3 session shared by the S3 client, the SageMaker
# client and the SageMaker SDK session
boto3_session = boto3.Session()
s3_client = boto3_session.client('s3')
sm_boto3 = boto3_session.client('sagemaker')
session = sagemaker.Session(boto_session=boto3_session)

### Configurando os buckets, paths e fazendo a preparação de dados

In [None]:
# Sub-folder names for the model and the datasets
subpasta_modelo, subpasta_dataset = 'modelo', 'datasets'

# Role read from the environment (handed to the SageMaker estimator further
# down); printed so a missing value is visible right away
role = os.environ.get('role')
print(role)

# Load the raw reservations dataset from the working directory
dataset = pd.read_csv('Hotel Reservations.csv')

# Maps an average room price to its price-band label
def categorize_price(price):
    """Return the price band for ``price``.

    1 when ``price <= 85``, 2 when ``price < 115`` (and above 85),
    3 for everything else.
    """
    if price <= 85:
        return 1
    return 2 if price < 115 else 3

# Derive the price-band label, then drop the raw price and the booking ID
dataset['label_avg_price_per_room'] = dataset['avg_price_per_room'].apply(categorize_price)
dataset = dataset.drop(columns=['avg_price_per_room', 'Booking_ID'])

# Columns with object dtype are the ones that get a numeric encoding
categorical_columns = dataset.select_dtypes(include=['object']).columns

# Fit one LabelEncoder per categorical column and keep every fitted encoder
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    dataset[col] = encoder.fit_transform(dataset[col])
    label_encoders[col] = encoder

# Feature matrix and target vector as float32 arrays
X = np.array(dataset.drop(columns=['label_avg_price_per_room'])).astype('float32')
y = np.array(dataset['label_avg_price_per_room']).astype('float32')

# Save the transformed dataset to a new CSV file
dataset.to_csv('Hotel_Reservations_Modified.csv', index=False)

### Criação das variáveis de treinamento e teste do modelo

In [None]:
# Target column and the feature columns used as model input.
# The target is left out of `attributes` so the feature frames built from it
# do not carry the answer; the label is appended again as the last column
# when the train/test frames are assembled.
label = 'label_avg_price_per_room'
attributes = [column for column in dataset.columns if column != label]

In [None]:
# Select the attribute columns and the label column from the dataset
X_data, y_data = dataset[attributes], dataset[label]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X_data, y_data, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Build the train and test frames: the attribute columns, with the label column
# set from the corresponding split target
trainX = pd.DataFrame(X_train, columns=attributes).assign(**{label: y_train})
testX = pd.DataFrame(X_test, columns=attributes).assign(**{label: y_test})

In [None]:
# Write the train and test frames to local CSV files (no index column)
for frame, csv_name in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(csv_name, index=False)

### Enviando os dados para S3

In [None]:
# Upload both CSV files to S3 so the SageMaker training job can read them;
# each call returns the S3 location of the uploaded file
upload_prefix = 'sagemaker'

trainpath = session.upload_data(path='train-V-1.csv', bucket=bucket, key_prefix=upload_prefix)
testpath = session.upload_data(path='test-V-1.csv', bucket=bucket, key_prefix=upload_prefix)

### Criação do script, contendo o modelo e os parâmetros para o treinamento

In [None]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    """Load and return the model stored as ``model.joblib`` inside ``model_dir``."""
    model_file = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_file)
    
if __name__ == "__main__":
    # Training entry point: parse arguments, read the train/test CSVs, fit a
    # RandomForestClassifier, persist it as model.joblib and print test metrics.

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client arrive as command-line arguments.
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--random_state', type=int, default=0)

    # Data, model, and output directories; defaults come from the SM_* environment variables.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()

    # Fail early with a readable message when a directory is neither passed on
    # the command line nor provided through its environment variable; otherwise
    # os.path.join(None, ...) raises a TypeError further down.
    for required in ("model_dir", "train", "test"):
        if getattr(args, required) is None:
            parser.error(
                f"missing --{required.replace('_', '-')} (or its SM_* environment variable)"
            )

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)
    print("[INFO] Reading data")
    print()

    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The label is taken to be the LAST column of the training CSV;
    # every other column is used as a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    # Report the split actually present in the files instead of a hard-coded
    # percentage that drifts from whatever split produced the CSVs.
    total_rows = len(train_df) + len(test_df)
    train_pct = 100 * len(train_df) / total_rows if total_rows else 0
    test_pct = 100 * len(test_df) / total_rows if total_rows else 0

    print("Data Shape: ")
    print()
    print(f"---- SHAPE OF TRAINING DATA ({train_pct:.0f}%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print(f"---- SHAPE OF TESTING DATA ({test_pct:.0f}%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

### Configurando os parâmetros para o Estimador

In [None]:
# SKLearn estimator class from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

print(role)

# Hyperparameters forwarded to the training script (script.py)
rf_hyperparameters = {
    "n_estimators": 100,  # number of trees in the RandomForest classifier
    "random_state": 0,    # random seed, for reproducibility
}

# Estimator configuration: one ml.m5.large instance running script.py, on spot
# capacity; max_run / max_wait bound the job -- NOTE(review): both are in
# seconds per the SageMaker SDK documentation, confirm before tuning.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="1.2-1",
    base_job_name="RF-custom-sklearn",
    hyperparameters=rf_hyperparameters,
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600,
)

### Executando o treinamento do modelo

In [None]:
# Launch the training job with the train/test channels. wait=True and logs=True
# are passed, so -- per the SageMaker SDK documentation -- the call is expected to
# block and stream the job logs until the job ends.
training_inputs = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(training_inputs, wait=True, logs=True)

# Look up the S3 location of the model artifact produced by the job just run
job_name = sklearn_estimator.latest_training_job.name
job_description = sm_boto3.describe_training_job(TrainingJobName=job_name)
artifact = job_description['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)

### Salvando a saída do treinamento, convertendo em joblib e enviando para o S3

In [None]:
# Locate model.tar.gz from the artifact URI reported by the training job run
# above, instead of a hand-edited bucket/key pair that goes stale after every
# new job.
if not artifact.startswith('s3://'):
    raise ValueError(f'URI de artefato inesperada: {artifact}')
artifact_bucket, _, s3_key = artifact[len('s3://'):].partition('/')

# Destination bucket for the unpacked model; defaults to the bucket that holds
# the training artifact when the environment variable is not set.
bucket_name = os.getenv('bucket_name') or artifact_bucket
local_tar_path = 'model.tar.gz'
local_model_path = 'model.joblib'
model_s3_key = f'{subpasta_modelo}/model.joblib'  # full key of the model in S3

s3 = boto3.client('s3')

s3.download_file(artifact_bucket, s3_key, local_tar_path)
print(f'{local_tar_path} baixado')

# NOTE(review): extractall() trusts the archive contents; acceptable here only
# because the archive is produced by our own training job.
with tarfile.open(local_tar_path, 'r:gz') as tar:
    tar.extractall()
    extracted_files = tar.getnames()
    print(f'Arquivo {local_tar_path} extraído para {extracted_files}')

# First .joblib member of the archive, if any
extracted_model_path = next(
    (file_name for file_name in extracted_files if file_name.endswith('.joblib')),
    None,
)

# Only a model extracted in THIS run may be uploaded; a leftover local
# model.joblib from an earlier run must not be pushed by mistake.
model_ready = False
if extracted_model_path and os.path.exists(extracted_model_path):
    if os.path.abspath(extracted_model_path) != os.path.abspath(local_model_path):
        # os.replace overwrites an existing destination on every platform,
        # so re-running the cell does not fail.
        os.replace(extracted_model_path, local_model_path)
    model_ready = True
else:
    print('Arquivo model.joblib não encontrado após a extração.')

# Upload the result to S3
if model_ready:
    s3.upload_file(local_model_path, bucket_name, model_s3_key)
    print(f'Arquivo model.joblib carregado para s3://{bucket_name}/{model_s3_key}')
else:
    print('Arquivo model.joblib não encontrado localmente para fazer upload.')