## 0. Development Setting

In [1]:
!python -V

Python 3.9.16


In [2]:
import pandas as pd
import pymysql
import boto3

from smart_open import open as s_open
from easydict import EasyDict
from dotenv import load_dotenv

from datetime import datetime
import time
import os

load_dotenv()

True

In [3]:
settings = EasyDict()

settings.AWS_ACCESS_KEY_ID = os.getenv('settings.AWS_ACCESS_KEY_ID')
settings.AWS_SECRET_ACCESS_KEY=os.getenv("settings.AWS_SECRET_ACCESS_KEY")
settings.AWS_REGION_NAME=os.getenv("settings.AWS_REGION_NAME")
settings.AWS_ACCOUNT_ID=os.getenv("settings.AWS_ACCOUNT_ID")

settings.DB_HOST=os.getenv("settings.DB_HOST")
settings.DB_USER=os.getenv("settings.DB_USER")
settings.DB_PASSWORD=os.getenv("settings.DB_PASSWORD")
settings.DB_NAME=os.getenv("settings.DB_NAME")
settings.DB_PORT=os.getenv("settings.DB_PORT")

settings.AWS_BUCKET_NAME="genia-bucket"
settings.AWS_ATHENA_OUTPUT_LOCATION="athena/quries"
settings.AWS_ATHENA_DATABASE="mini_db"

In [4]:
db = pymysql.connect(
    host=settings.DB_HOST,
    user=settings.DB_USER,
    passwd=settings.DB_PASSWORD,
    db=settings.DB_NAME,
    port=int(settings.DB_PORT),
)

cursor = db.cursor()
cursor

<pymysql.cursors.Cursor at 0x7f65b5e73b80>

In [5]:
class Boto3Client(object):
    aws_access_key_id = settings.AWS_ACCESS_KEY_ID
    aws_secret_access_key = settings.AWS_SECRET_ACCESS_KEY
    region_name = settings.AWS_REGION_NAME
    bucket_name = settings.AWS_BUCKET_NAME
    athena_database=settings.AWS_ATHENA_DATABASE
    athena_output_location=settings.AWS_ATHENA_OUTPUT_LOCATION
    
    service_name = None
    
    @classmethod
    def get_client(cls):
        options = dict(
            aws_access_key_id=Boto3Client.aws_access_key_id,
            aws_secret_access_key=Boto3Client.aws_secret_access_key,
            region_name=Boto3Client.region_name,
        )
        return boto3.client(cls.service_name, **options)

In [6]:
class S3Client(Boto3Client):
    service_name = "s3"
    
    @staticmethod
    def get_s3_df(file_name: str):
        
        clnt = S3Client.get_client()
        obj = clnt.get_object(
                Bucket=S3Client.bucket_name,
                Key=file_name
            )
        return pd.read_csv(obj["Body"])
    
    @staticmethod
    def get_s3_parquet_df(file_name: str):
        
        clnt = S3Client.get_client()
        obj = clnt.get_object(
                Bucket=S3Client.bucket_name,
                Key=file_name
            )
        return pd.read_parquet(io.BytesIO(obj["Body"].read()), engine="pyarrow")
    
    @staticmethod
    def upload_s3_df(df: pd.DataFrame, file_name: str):
        try:
            clnt = S3Client.get_client()
            file_name = f"s3://{S3Client.bucket_name}/{file_name}"
            with s_open(file_name, "wb", transport_params=dict(client=clnt)) as out_file:
                df.to_parquet(out_file, engine="pyarrow", compression="gzip", index=False)
            return True
        except Exception as e:
            print("Error occured: ", str(e))
            return False

In [7]:
df = S3Client.get_s3_df("raw/abalone.csv")
df

Unnamed: 0,Type,LongestShell,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [8]:
class AthenaClient(Boto3Client):
    
    service_name = "athena"
    
    output_location = f"s3://{Boto3Client.bucket_name}/{Boto3Client.athena_output_location}"
    
    @staticmethod
    def get_athena_query_exec_id(sql: str):
        
        clnt = AthenaClient.get_client()
        response = clnt.start_query_execution(
            QueryString=sql,
            QueryExecutionContext={"Database": AthenaClient.athena_database},
            ResultConfiguration={"OutputLocation": AthenaClient.output_location},
        )
        # response 내의 StatusCode == 200 확인
        return response["QueryExecutionId"]
    
    @staticmethod
    def collect_query_result(query_exec_id: str):
        
        clnt = AthenaClient.get_client()
        
        WAIT = ["QUEUED", "RUNNING"]
        SUCCESS = ["SUCCEEDED"]
        FAILED = ["FAILED", "CANCELLED"]
        
        while True:
            try:
                result = clnt.get_query_execution(QueryExecutionId=query_exec_id)
                status = result["QueryExecution"]["Status"]["State"]

                if status in SUCCESS:
                    query_result_path = f"{AthenaClient.output_location}/{query_exec_id}.csv"
                    query_result_path = query_result_path.replace(f"s3://{AthenaClient.bucket_name}/", "")
                    return S3Client.get_s3_df(query_result_path)

                if status in FAILED:
                    print(f"FAILED!!! -> {status}")
                    break

                if status in WAIT:
                    print(f"Still Running... -> {status}")
                    time.sleep(0.5)
                    continue

                print(f"unexpected status... -> {status}")
                break

            except Exception as e:
                print(str(e))
                break
        return False
    
    @staticmethod
    def get_athena_sql(sql: str):
        query_exec_id = AthenaClient.get_athena_query_exec_id(sample_sql)
        return AthenaClient.collect_query_result(query_exec_id)

## 1. Load Dataset (Amazon Athena)

In [9]:
sample_sql = 'SELECT * FROM "mini_db"."teacher-hotel"'
result = AthenaClient.get_athena_sql(sample_sql)

result

Still Running... -> QUEUED
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING
Still Running... -> RUNNING


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,yyyy,mm
0,Resort Hotel,1,1,2016,November,45,1,0,1,1,...,,0,Transient,75.0,0,0,No-Show,2016-11-01,2016,11
1,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
2,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-03-11,2016,11
3,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
4,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,City Hotel,0,11,2015,November,49,30,1,2,1,...,,0,Transient,55.0,0,0,Check-Out,2015-12-03,2015,11
114461,City Hotel,0,6,2015,November,49,29,2,2,2,...,,0,Transient,84.0,1,1,Check-Out,2015-12-03,2015,11
114462,City Hotel,0,68,2015,November,49,30,1,3,1,...,,0,Transient,60.0,0,1,Check-Out,2015-12-04,2015,11
114463,City Hotel,0,68,2015,November,49,30,1,3,2,...,,0,Transient,60.0,0,1,Check-Out,2015-12-04,2015,11


In [10]:
result.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'yyyy', 'mm'],
      dtype='object')

column name | 컬럼 이름
---:|:---
hotel | 호텔이름
is_canceled | 취소 여부
lead_time | 예약한 날짜와 호텔에 도착한 날짜 사이의 경과일
arrival_date_year | 도착 연도
arrival_date_month | 도착 달
arrival_date_week_number | 도착 주
arrival_date_day_of_month | 도착 일
stays_in_weekend_nights | 주말 숙박 일수
stays_in_week_nights | 평일 숙박 일수
adults | 성인 몇명
children | 청소년 몇명
babies | 아기 몇명
meal | 식사 예약 종류
country 
market_segment | 마켓 구분
distribution_channel | 예약 채널
is_repeated_guest | 재방문 고객 여부
previous_cancellations | 지난 취소 예약 수
previous_bookings_not_canceled | 지난 취소하지 않은 예약 수 
reserved_room_type | 예약한 방 타입
assigned_room_type | 받은 방 타입
booking_changes | 예약 후 예약 변경 / 수정 횟수
deposit_type | 보증금 타입
agent | 예약 에이전트 (ID로 대체)
company | 예약 회사명 (ID로 대체)
days_in_waiting_list | 예약 확정 전까지 대기 명단에 있었던 기간
customer_type | 고객 유형
adr | 평균 일일 숙박비
required_car_parking_spaces | 요구 주차 공간 갯수
total_of_special_requests special_requests | 갯수
reservation_status | 예약 상태
reservation_status_date | 마지막 예약 상태가 설정된 일자
yyyy | (새로 추가됨)
mm | (새로 추가됨)

In [11]:
df = result.copy()
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,yyyy,mm
0,Resort Hotel,1,1,2016,November,45,1,0,1,1,...,,0,Transient,75.0,0,0,No-Show,2016-11-01,2016,11
1,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
2,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-03-11,2016,11
3,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
4,Resort Hotel,1,378,2016,November,45,1,0,2,2,...,,0,Transient,46.0,0,0,Canceled,2016-10-18,2016,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,City Hotel,0,11,2015,November,49,30,1,2,1,...,,0,Transient,55.0,0,0,Check-Out,2015-12-03,2015,11
114461,City Hotel,0,6,2015,November,49,29,2,2,2,...,,0,Transient,84.0,1,1,Check-Out,2015-12-03,2015,11
114462,City Hotel,0,68,2015,November,49,30,1,3,1,...,,0,Transient,60.0,0,1,Check-Out,2015-12-04,2015,11
114463,City Hotel,0,68,2015,November,49,30,1,3,2,...,,0,Transient,60.0,0,1,Check-Out,2015-12-04,2015,11


In [12]:
df.is_repeated_guest.unique() # 재방문 여부

array([0, 1])

In [13]:
df.previous_cancellations.unique() # 취소 예약 수

array([ 0,  1,  2,  4,  5, 25, 14, 19,  3, 24, 21, 11, 13,  6, 26])

In [14]:
df.previous_bookings_not_canceled.unique()

array([ 0,  2,  5,  4,  8,  3,  6,  1, 22,  7,  9, 21, 12, 13, 19, 11, 10,
       18, 20, 14, 27, 28, 16, 15, 17, 50, 51, 52, 53, 41, 42, 30, 26, 37,
       38, 39, 40, 24, 25, 68, 69, 70, 71, 29, 23, 43, 44, 45, 46, 31, 32,
       47, 48, 49, 58, 59, 60, 61, 35, 36, 33, 34, 62, 63, 64, 65, 66, 67,
       54, 55, 56, 57])

In [15]:
df.reserved_room_type.unique()

array(['E', 'A', 'D', 'F', 'C', 'G', 'H', 'P', 'L', 'B'], dtype=object)

In [16]:
df.days_in_waiting_list.unique()

array([  0,   2, 116, 122,  38,   1,  11,  60,  34,  50, 107,  43, 185,
        93, 109,   6,  13,   8,   3,  10,  40,  91,  69,  57,  99, 111,
        79,  98, 147,  41,  21,  48,  44,   5,  96,  65,  33,  77,  80,
        71,  32,  14,  27,  49, 379,  70,  35, 178, 330, 223,  23,  20,
       101,   4,  28,  22,  37, 105,  17,  25, 113,   9,  18,  19,  56,
        83,  16, 165,  61,  39,  47, 150, 125,  75,  85, 142, 174,  24,
       162, 391,  68, 117,  76,  62, 193,  63,  15,  45,  52,  55, 236,
       259,  12,  54,  59, 108,  81,  92,  74, 224,  31, 187, 176, 207,
       215, 160, 120,  30,  58,  89,  53,  46,  72,  42,   7,  84, 100,
       121,  26,  73, 154,  64,  36, 175, 183,  97,  87, 167])

In [17]:
df_last = df[['is_canceled', # 취소여부
              'mm', # 숙박 달
              'lead_time', # 예약 경과일
              'deposit_type', # 보증금 종류
              'customer_type', # 손님 종류
              'is_repeated_guest', # 재방문 여부
              'market_segment',
              'distribution_channel',
              'previous_cancellations', # 이전 취소 횟수
              'previous_bookings_not_canceled', # 취소하지 않은 횟수
              'reserved_room_type', # 예약한 방 타입
              'booking_changes', # 예약 변경 횟수
              'total_of_special_requests', # 특별 요청 수
              'adr' # 평균 숙박 가격 (하루당)
             ]]
df_last

Unnamed: 0,is_canceled,mm,lead_time,deposit_type,customer_type,is_repeated_guest,market_segment,distribution_channel,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,total_of_special_requests,adr
0,1,11,1,No Deposit,Transient,0,Online TA,TA/TO,0,0,E,0,0,75.0
1,1,11,378,Non Refund,Transient,0,Groups,TA/TO,0,0,A,0,0,46.0
2,1,11,378,Non Refund,Transient,0,Groups,TA/TO,0,0,A,0,0,46.0
3,1,11,378,Non Refund,Transient,0,Groups,TA/TO,0,0,A,0,0,46.0
4,1,11,378,Non Refund,Transient,0,Groups,TA/TO,0,0,A,0,0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114460,0,11,11,No Deposit,Transient,0,Offline TA/TO,TA/TO,0,0,A,0,0,55.0
114461,0,11,6,No Deposit,Transient,0,Direct,Direct,0,0,A,1,1,84.0
114462,0,11,68,No Deposit,Transient,0,Offline TA/TO,TA/TO,0,0,D,1,1,60.0
114463,0,11,68,No Deposit,Transient,0,Offline TA/TO,TA/TO,0,0,D,1,1,60.0


## 2. Preprocessing

In [18]:
import numpy as np
np.sum(df_last.isnull())

is_canceled                       0
mm                                0
lead_time                         0
deposit_type                      0
customer_type                     0
is_repeated_guest                 0
market_segment                    0
distribution_channel              0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
total_of_special_requests         0
adr                               0
dtype: int64

In [19]:
numerical_columns = ["lead_time", "previous_cancellations", "previous_bookings_not_canceled", 
                     "booking_changes", "total_of_special_requests", "adr"]

categorical_columns = ["mm", "deposit_type", "customer_type", "is_repeated_guest", "market_segment", 
                       "distribution_channel", "reserved_room_type"]

y_column = "is_canceled"

numerical_df = df_last[numerical_columns]
cate_df = df_last[categorical_columns]
y_data = df_last[y_column]

In [20]:
numerical_df.head()

Unnamed: 0,lead_time,previous_cancellations,previous_bookings_not_canceled,booking_changes,total_of_special_requests,adr
0,1,0,0,0,0,75.0
1,378,0,0,0,0,46.0
2,378,0,0,0,0,46.0
3,378,0,0,0,0,46.0
4,378,0,0,0,0,46.0


In [21]:
cate_df_encoded = pd.get_dummies(cate_df)
cate_df_encoded.head()

Unnamed: 0,mm,is_repeated_guest,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Contract,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,market_segment_Aviation,...,reserved_room_type_A,reserved_room_type_B,reserved_room_type_C,reserved_room_type_D,reserved_room_type_E,reserved_room_type_F,reserved_room_type_G,reserved_room_type_H,reserved_room_type_L,reserved_room_type_P
0,11,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,11,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
2,11,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,11,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,11,0,0,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [22]:
cate_df_encoded.columns

Index(['mm', 'is_repeated_guest', 'deposit_type_No Deposit',
       'deposit_type_Non Refund', 'deposit_type_Refundable',
       'customer_type_Contract', 'customer_type_Group',
       'customer_type_Transient', 'customer_type_Transient-Party',
       'market_segment_Aviation', 'market_segment_Complementary',
       'market_segment_Corporate', 'market_segment_Direct',
       'market_segment_Groups', 'market_segment_Offline TA/TO',
       'market_segment_Online TA', 'market_segment_Undefined',
       'distribution_channel_Corporate', 'distribution_channel_Direct',
       'distribution_channel_GDS', 'distribution_channel_TA/TO',
       'distribution_channel_Undefined', 'reserved_room_type_A',
       'reserved_room_type_B', 'reserved_room_type_C', 'reserved_room_type_D',
       'reserved_room_type_E', 'reserved_room_type_F', 'reserved_room_type_G',
       'reserved_room_type_H', 'reserved_room_type_L', 'reserved_room_type_P'],
      dtype='object')

In [23]:
encoded_df = pd.concat([numerical_df, cate_df_encoded, y_data], axis=1)
len(encoded_df)

114465

In [24]:
X_data = encoded_df.drop("is_canceled", axis=1)
y_data = encoded_df["is_canceled"]

X_data.shape, y_data.shape

((114465, 38), (114465,))

In [25]:
from sklearn.preprocessing import StandardScaler

ss_sc = StandardScaler()
X_data_scaled = ss_sc.fit_transform(X_data)
X_data_scaled.shape

(114465, 38)

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_data_scaled,
    y_data,
    stratify=y_data,
    test_size=0.25,
    random_state=42,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((85848, 38), (28617, 38), (85848,), (28617,))

## 3. DeepLearning Model Architecture

In [27]:
import tensorflow as tf

2023-02-21 07:02:55.957147: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 07:02:56.099647: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-21 07:02:56.099678: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-21 07:02:57.035446: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [28]:
input_layer = tf.keras.Input(shape=X_train.shape[1:], name="InputLayer")

y = tf.keras.layers.Dense(64, activation="relu")(input_layer)
y = tf.keras.layers.Dropout(0.3)(y)

y = tf.keras.layers.Dense(16, activation="relu")(y)
y = tf.keras.layers.Dropout(0.2)(y)

y = tf.keras.layers.Dense(16, activation="relu")(y)
y = tf.keras.layers.Dropout(0.2)(y)

y = tf.keras.layers.Dense(4, activation="relu")(y)
y = tf.keras.layers.Dropout(0.1)(y)

output_layer = tf.keras.layers.Dense(1, activation="sigmoid", name="OutputLayer")(y)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer, name="FuncModel")
model.summary()

Model: "FuncModel"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 InputLayer (InputLayer)     [(None, 38)]              0         
                                                                 
 dense (Dense)               (None, 64)                2496      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_2 (Dense)             (None, 16)                272       
                                                                 
 dropout_2 (Dropout)         (None, 16)                0 

2023-02-21 07:03:01.738898: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-21 07:03:01.738941: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-21 07:03:01.738968: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (6be65369156f): /proc/driver/nvidia/version does not exist
2023-02-21 07:03:01.739213: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 4. Train Model

In [29]:
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    mode="max",
    patience=50,
    restore_best_weights=True,
    verbose=1,
)


history = model.fit(
    X_train, 
    y_train,
    epochs=30,
    validation_split=0.25,
    batch_size=16,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
model.evaluate(X_train, y_train)



[0.44239023327827454, 0.81395024061203]

In [31]:
model.evaluate(X_test, y_test)



[0.4350586533546448, 0.816647469997406]

## 5. Evaluate Model & Hyperparameter Tuning

In [32]:
sample_sql = 'SELECT * FROM "mini_db"."test-test_hotel"'
result = AthenaClient.get_athena_sql(sample_sql)

result

Still Running... -> QUEUED
Still Running... -> RUNNING


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,1,122,2017,August,31,1,0,1,2,...,No Deposit,240.0,,0,Transient,209.00,0,0,Canceled,2017-04-03
1,Resort Hotel,1,78,2017,August,31,1,0,3,2,...,No Deposit,240.0,,0,Transient,230.00,0,1,Canceled,2017-05-16
2,Resort Hotel,1,67,2017,August,31,1,0,4,1,...,No Deposit,314.0,,0,Transient,188.60,0,1,Canceled,2017-05-28
3,Resort Hotel,1,69,2017,August,31,1,0,4,2,...,No Deposit,242.0,,0,Transient,230.00,0,1,Canceled,2017-07-11
4,Resort Hotel,1,198,2017,August,31,1,0,4,2,...,No Deposit,240.0,,0,Transient,157.31,0,0,Canceled,2017-01-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,,0,Transient,96.14,0,0,Check-Out,2017-09-06
4921,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,,0,Transient,225.43,0,2,Check-Out,2017-09-07
4922,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,,0,Transient,157.71,0,4,Check-Out,2017-09-07
4923,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,,0,Transient,104.40,0,0,Check-Out,2017-09-07


In [33]:
df_test = result[['is_canceled', # 취소여부
              'arrival_date_month', # 숙박 달
              'lead_time', # 예약 경과일
              'deposit_type', # 보증금 종류
              'customer_type', # 손님 종류
              'is_repeated_guest', # 재방문 여부
              'market_segment',
              'distribution_channel',
              'previous_cancellations', # 이전 취소 횟수
              'previous_bookings_not_canceled', # 취소하지 않은 횟수
              'reserved_room_type', # 예약한 방 타입
              'booking_changes', # 예약 변경 횟수
              'total_of_special_requests', # 특별 요청 수
              'adr' # 평균 숙박 가격 (하루당)
             ]]
df_test

Unnamed: 0,is_canceled,arrival_date_month,lead_time,deposit_type,customer_type,is_repeated_guest,market_segment,distribution_channel,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,total_of_special_requests,adr
0,1,August,122,No Deposit,Transient,0,Online TA,TA/TO,0,0,D,0,0,209.00
1,1,August,78,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,230.00
2,1,August,67,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,188.60
3,1,August,69,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,230.00
4,1,August,198,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,0,157.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,0,August,23,No Deposit,Transient,0,Offline TA/TO,TA/TO,0,0,A,0,0,96.14
4921,0,August,102,No Deposit,Transient,0,Online TA,TA/TO,0,0,E,0,2,225.43
4922,0,August,34,No Deposit,Transient,0,Online TA,TA/TO,0,0,D,0,4,157.71
4923,0,August,109,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,0,104.40


In [34]:
df_test.arrival_date_month.unique()

array(['August'], dtype=object)

In [35]:
df_test['arrival_date_month'] = df_test['arrival_date_month'].apply(lambda x: 8 if x == 'August' else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['arrival_date_month'] = df_test['arrival_date_month'].apply(lambda x: 8 if x == 'August' else x)


In [36]:
df_test.rename(columns = {'arrival_date_month' : 'mm'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.rename(columns = {'arrival_date_month' : 'mm'}, inplace = True)


In [37]:
df_test

Unnamed: 0,is_canceled,mm,lead_time,deposit_type,customer_type,is_repeated_guest,market_segment,distribution_channel,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,total_of_special_requests,adr
0,1,8,122,No Deposit,Transient,0,Online TA,TA/TO,0,0,D,0,0,209.00
1,1,8,78,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,230.00
2,1,8,67,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,188.60
3,1,8,69,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,1,230.00
4,1,8,198,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,0,157.31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,0,8,23,No Deposit,Transient,0,Offline TA/TO,TA/TO,0,0,A,0,0,96.14
4921,0,8,102,No Deposit,Transient,0,Online TA,TA/TO,0,0,E,0,2,225.43
4922,0,8,34,No Deposit,Transient,0,Online TA,TA/TO,0,0,D,0,4,157.71
4923,0,8,109,No Deposit,Transient,0,Online TA,TA/TO,0,0,A,0,0,104.40


In [38]:
numerical_columns = ["lead_time", "previous_cancellations", "previous_bookings_not_canceled", 
                     "booking_changes", "total_of_special_requests", "adr"]
categorical_columns = ["mm", "deposit_type", "customer_type", "is_repeated_guest", "market_segment", 
                       "distribution_channel", "reserved_room_type"]
y_column = "is_canceled"

numerical_df = df_test[numerical_columns]
cate_df = df_test[categorical_columns]
y_data = df_test[y_column]

cate_df_encoded = pd.get_dummies(cate_df)

In [39]:
cate_df_encoded["market_segment_Undefined"] = 0
cate_df_encoded["distribution_channel_Undefined"] = 0
cate_df_encoded["reserved_room_type_L"] = 0

In [40]:
encoded_df = pd.concat([numerical_df, cate_df_encoded, y_data], axis=1)

X_data = encoded_df.drop("is_canceled", axis=1)
y_data = encoded_df["is_canceled"]

In [41]:
X_data.shape, y_data.shape

((4925, 38), (4925,))

In [42]:
from sklearn.preprocessing import StandardScaler

ss_sc = StandardScaler()
X_data_scaled = ss_sc.fit_transform(X_data)
X_data_scaled.shape

(4925, 38)

In [43]:
model.evaluate(X_data_scaled, y_data)



[0.8141738176345825, 0.7132995128631592]

In [46]:
y_prob = model.predict(X_data_scaled, verbose=0) 
predicted = y_prob.argmax(axis=-1)

In [44]:
predicted_classes = np.argmax(model.predict(X_data_scaled), axis=1)



In [45]:
set(predicted_classes)

{0}

In [47]:
set(predicted)

{0}

In [48]:
predicted_classes

array([0, 0, 0, ..., 0, 0, 0])

In [49]:
predicted

array([0, 0, 0, ..., 0, 0, 0])

In [50]:
y_data

0       1
1       1
2       1
3       1
4       1
       ..
4920    0
4921    0
4922    0
4923    0
4924    0
Name: is_canceled, Length: 4925, dtype: int64

In [51]:
y_df = y_data.to_frame()
y_df

Unnamed: 0,is_canceled
0,1
1,1
2,1
3,1
4,1
...,...
4920,0
4921,0
4922,0
4923,0


In [54]:
y_df["predicted"] = predicted
y_df

Unnamed: 0,is_canceled,predicted
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
4920,0,0
4921,0,0
4922,0,0
4923,0,0


In [62]:
S3Client.upload_s3_df(y_df, file_name="inference/student2/test-hotel2.parquet")

True

In [63]:
import io
df = S3Client.get_s3_parquet_df("inference/student2/test-hotel2.parquet")
df

Unnamed: 0,is_canceled,predicted
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
4920,0,0
4921,0,0
4922,0,0
4923,0,0
