In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 1000)

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [2]:
from data_preprocessing.dataset_specification import DatasetSpecification
from data_preprocessing.data_preprocessing_toolkit import DataPreprocessingToolkit
from data_preprocessing.people_identifier import PeopleIdentifier

# Load original data

In [3]:
data_path = os.path.join("data", "hotel_data")

original_data = pd.read_csv(os.path.join(data_path, "hotel_data_original.csv"), index_col=0)

original_data = original_data.replace({"\\N": ""})
original_data = original_data.fillna("")

numeric_columns = ["n_people", "n_children_1", "n_children_2", "n_children_3",
                   "discount", "accommodation_price", "meal_price", "service_price",
                   "paid"]

for column in numeric_columns:
    original_data.loc[:, column] = pd.to_numeric(original_data.loc[:, column], errors="coerce")
    

# fix date_to    
original_data["date_to"] = pd.to_datetime(original_data["date_to"]) + pd.Timedelta(days = 1)

# add column with time difference
original_data["length_of_stay"] = (original_data["date_to"] - pd.to_datetime(original_data["date_from"])).dt.days

# add column book_to_arrival
original_data["book_to_arrival"] = (original_data["date_to"] - pd.to_datetime(original_data["booking_date"])).dt.days

# add column weekend_stay
#original_data["weekend_stay"] = ((pd.to_datetime(original_data["date_from"])).dt.weekday + 1) % 7 + (original_data["length_of_stay"] - pd.Timedelta(days = 1)).dt.days > 5
original_data["weekend_stay"] = ((pd.to_datetime(original_data["date_from"])).dt.weekday + 1) % 7 + (original_data["length_of_stay"] - 1) > 5


# add column night_price
sum_accomodation = original_data.groupby(['group_id'])['accommodation_price'].sum()


#original_data = add_nrooms(original_data)

#print(n_rooms)
original_data["n_rooms"] = original_data.groupby(['group_id'])['room_id'].count()[1]
original_data["sum_accomodation"] = sum_accomodation[1]
#original_data = original_data.groupby(['group_id'])
#original_data["night_price"] = original_data["accommodation_price"] / (original_data["length_of_stay"].dt.days * n_rooms)

#original_data = pd.unique(original_data["group_id"])

# add column n_people
original_data["n_people"] = original_data["n_people"] + original_data["n_children_1"] + original_data["n_children_2"] + original_data["n_children_3"]

# filter out_long_stays
original_data = original_data.loc[original_data["length_of_stay"] <= 21]

# filter out non-company_clients
original_data = original_data.loc[original_data["is_company"] == 0]

# filter out low_prices
original_data = original_data.loc[original_data["accommodation_price"] > 50]

###original_data["weekend_stay"] = ((pd.to_datetime(original_data["date_from"])).dt.weekday + 1) % 7 + (original_data["length_of_stay"] - 1 )
# - pd.Timedelta(days = 1)).dt.days > 5

#np.where(pd.to_datetime(original_data["date_from"]).weekday + (original_data["length_of_stay"] - 1) > 5)

#pd.to_datetime(original_data["date_from"]).weekday + (original_data["length_of_stay"] - 1) > 5

print(original_data.shape)
print(original_data.iloc[[2150]]['book_to_arrival'])
print(original_data.iloc[[2151]]['book_to_arrival'])
print(original_data.iloc[[2152]]['book_to_arrival'])

original_data = original_data.astype(
        {
            "date_from": np.datetime64,
            "date_to": np.datetime64,
            "booking_time": np.datetime64,
            "booking_date": np.datetime64,
            "n_people": np.int64,
            "n_children_1": np.int64,
            "n_children_2": np.int64,
            "n_children_3": np.int64,
            "discount": np.float64,
            "accommodation_price": np.float64,
            "meal_price": np.float64,
            "service_price": np.float64,
            "paid": np.float64,
        }
    )

display(original_data.head(15))




(15657, 29)
2641    4
Name: book_to_arrival, dtype: int64
2642    21
Name: book_to_arrival, dtype: int64
2643    31
Name: book_to_arrival, dtype: int64


Unnamed: 0,reservation_id,group_id,room_id,room_group_id,date_from,date_to,booking_date,booking_time,n_people,n_children_1,n_children_2,n_children_3,discount,accommodation_price,meal_price,service_price,paid,rate_plan,client_id,client_name,email,phone,is_company,reservation_status,length_of_stay,book_to_arrival,weekend_stay,n_rooms,sum_accomodation
1,16075,,118,118,2018-02-10,2018-02-13,2017-08-17,2017-08-17 15:01:00,5,0,0,0,,992.29,0.0,0.0,1.0,Standard,54117,ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d80...,,318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b...,0,1,3,180,True,2,3175.2
2,16076,,270,270,2018-02-28,2018-03-03,2017-08-17,2017-08-17 15:08:00,4,0,0,0,0.0,693.4,0.0,0.0,693.4,Standard,54118,4db36724fc28085e053a3003dce55368ee207cce37d355...,f9c0564c66d6a830c4964a30ac261038dd7cf762b0641c...,cb550ba6d303bf230379073bcbdd55c37229eab3f173dc...,0,2,3,198,False,2,3175.2
3,16635,,294,294,2018-02-14,2018-02-16,2017-08-29,2017-08-29 13:58:00,2,0,0,0,,366.8,0.0,0.0,1.0,Standard,54790,e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4...,f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6...,1c56315c10c9d8153ca7820648900befbd9109fb6cfb81...,0,1,2,171,False,2,3175.2
4,16964,,183,183,2018-02-03,2018-02-10,2017-09-04,2017-09-04 15:52:00,4,0,0,0,,1064.6,0.0,0.0,1.0,Standard,55177,5380adccf08ea3000791aad3ccc478e3b6a8de440910aa...,6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f8...,3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df4...,0,1,7,159,True,2,3175.2
5,17173,,64,64,2018-01-29,2018-02-03,2017-09-07,2017-09-07 13:21:00,2,0,0,0,,713.0,0.0,0.0,1.0,Standard,55412,4aebfe125cf6c059588792b9fb871afe282a8806299dfe...,0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12...,ea16c664798581a9d93a3128d772b8b89e05743edbbfae...,0,1,5,149,False,2,3175.2
6,17308,,111,111,2018-03-28,2018-04-01,2017-09-11,2017-09-11 10:31:00,5,0,0,0,0.0,800.0,0.0,0.0,800.0,Standard,55560,1f4b60816f6efcb45dfa67da7a6adab42d4a05b90a9278...,6163ca5013b2bc940219a59d0e30ec401ecd01bb498e03...,516b31d7892e1b5f4b6078ea0fc4c63a06bb9ceceb885d...,0,4,4,202,True,2,3175.2
7,120165,,162,162,2018-11-16,2018-11-18,2018-02-19,2018-02-19 17:44:00,5,0,0,0,0.0,402.0,0.0,0.0,402.0,Standard,63419,d47bcb623e5031df97cd9faf472e28d9fe40f1386bbd92...,5213ac7a6db98631330ac74a241ffdf840e1857481a0b5...,6416a3bc7ea31b09ae63628a143b160d7976978cdbd298...,0,4,2,272,True,2,3175.2
8,120183,,45,45,2018-08-16,2018-08-19,2018-02-19,2018-02-19 17:44:00,1,0,0,0,0.0,660.0,0.0,0.0,660.0,Standard,61777,f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74be...,,,0,4,3,181,True,2,3175.2
9,120184,,64,64,2018-08-17,2018-08-19,2018-02-19,2018-02-19 17:44:00,1,0,0,0,0.0,320.0,0.0,0.0,320.0,Standard,61778,364e80d6c0608116ff8808b339ff25dc2e3f8f2211ba38...,,,0,4,2,181,True,2,3175.2
10,120185,,126,126,2018-08-16,2018-08-19,2018-02-19,2018-02-19 17:44:00,1,0,0,0,0.0,720.0,0.0,0.0,720.0,Standard,61777,f02a11d6b6f5bd1bf1a003a655d1c28df9362c7f2f74be...,,,0,4,3,181,True,2,3175.2


# Preprocess the data

- Identify users by client_id, name hash, phone hash, email hash.
- Fix date_to - originally it points to the last full day of stay, not the departure date.
- Add length of stay.
- Add book to arrival.
- Add number of rooms (important for group reservations).
- Add indicator for stays encompasing a weekend.
- Add night price.
- Fix book to arrival to be not smaller than 0.
- Filter out companies as recommendations for such clients should work differently.
- Aggregate group reservations into single interactions.

<span style="color:red"><font size="4">**Task:**</font></span><br> 
In the file data_preprocessing/data_preprocessing_toolkit write code for the following methods which work on the DataFrame with hotel data:
  - <strike>add_length_of_stay - Adds length_of_stay column which is the difference between date_from and date_to (in days), i.e. the number of nights the customer stayed at the hotel.</strike> - uuuuuuuuuwwwwwwwwwwwu
  - <strike>add_book_to_arrival - Adds book_to_arrival column which is the difference between date_from and booking_date (in days).</strike>
  - <strike>add_weekend_stay - Adds weekend_stay column with 'True'/'False' strings indicating if the interval date_from to date_to contains any weekend days (defined as Friday and Saturday). </strike>
  - add_night_price - Adds night_price column with the average price per one night per room - calculated as accomodation_price divided by length_of_stay and by n_rooms (there can be many rooms in group reservations - 'n_rooms' column).
  - <strike>sum_npeople - Sums n_people, n_children_1, n_children_2, n_children_3 and sets the result to the n_people column.</strike>
  - <strike>filter_out_company_clients - Filters out company clients is_company=0.</strike>
  - <strike>filter_out_long_stays - Leaves only stays with length_of_stay less or equal to 21.</strike>
  - filter_out_low_prices - Leaves only stays with accommodation price bigger than 50. Smaller prices are considered not reliable and likely a mistake of the hotel staff.
  - aggregate_group_reservations - Aggregates every group reservation into one reservation with aggregated data (for self.sum_columns a sum is taken, for self.mean_columns a mean, for self.mode_columns a mode, for self.first_columns the first value). This one is the most challenging - see instructions in the py file (remember that with the %load_ext autoreload %autoreload 2 options in the first cell of this notebook you don't have to restart the notebook - the changes in the py file will be immediately used in the notebook whenever a method from the py file is invoked).
  
You have to pass all assertions in the below cell.

In [4]:
original_data = pd.read_csv(os.path.join(data_path, "hotel_data_original.csv"), index_col=0)

original_data = original_data.replace({"\\N": ""})
original_data = original_data.fillna("")

numeric_columns = ["n_people", "n_children_1", "n_children_2", "n_children_3",
                   "discount", "accommodation_price", "meal_price", "service_price",
                   "paid"]

for column in numeric_columns:
    original_data.loc[:, column] = pd.to_numeric(original_data.loc[:, column], errors="coerce")
    
original_data = original_data.astype(
        {
            "date_from": np.datetime64,
            "date_to": np.datetime64,
            "booking_time": np.datetime64,
            "booking_date": np.datetime64,
            "n_people": np.int64,
            "n_children_1": np.int64,
            "n_children_2": np.int64,
            "n_children_3": np.int64,
            "discount": np.float64,
            "accommodation_price": np.float64,
            "meal_price": np.float64,
            "service_price": np.float64,
            "paid": np.float64,
        }
    )

preprocessed_data = original_data.copy()

print(preprocessed_data.shape)

dataset_specification = DatasetSpecification()
dp_toolkit = DataPreprocessingToolkit()

id_column_names = dataset_specification.get_id_columns()

people_identifier = PeopleIdentifier()
preprocessed_data = people_identifier.add_pid(preprocessed_data, id_column_names, "user_id")

preprocessed_data = dp_toolkit.filter_out_company_clients(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.filter_out_low_prices(preprocessed_data)  # Code this method

preprocessed_data = dp_toolkit.fix_date_to(preprocessed_data)
preprocessed_data = dp_toolkit.add_length_of_stay(preprocessed_data)  # Code this method

preprocessed_data = dp_toolkit.filter_out_long_stays(preprocessed_data)  # Code this method

preprocessed_data = dp_toolkit.add_book_to_arrival(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.add_weekend_stay(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.add_nrooms(preprocessed_data)
preprocessed_data = dp_toolkit.sum_npeople(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.clip_book_to_arrival(preprocessed_data)

preprocessed_data = dp_toolkit.aggregate_group_reservations(preprocessed_data)  # Code this method

preprocessed_data = dp_toolkit.add_night_price(preprocessed_data)  # Code this method

preprocessed_data = preprocessed_data.reset_index(drop=True)

display(preprocessed_data)


assert len(preprocessed_data.loc[preprocessed_data["is_company"] == 1]) == 0
assert len(preprocessed_data.loc[preprocessed_data["length_of_stay"] > 21]) == 0
assert len(preprocessed_data.loc[preprocessed_data["accommodation_price"] <= 50]) == 0



display(preprocessed_data.iloc[2150:2152]['book_to_arrival'])

assert preprocessed_data.iloc[1]['length_of_stay'] == 3
assert preprocessed_data.iloc[2]['length_of_stay'] == 2
assert preprocessed_data.iloc[3]['length_of_stay'] == 7


assert preprocessed_data.iloc[2150]['book_to_arrival'] == 11
assert preprocessed_data.iloc[2151]['book_to_arrival'] == 28
assert preprocessed_data.iloc[2152]['book_to_arrival'] == 12

assert preprocessed_data.iloc[2150]['weekend_stay'] == 'False'
assert preprocessed_data.iloc[2151]['weekend_stay'] == 'True'
assert preprocessed_data.iloc[2152]['weekend_stay'] == 'False'

assert preprocessed_data.iloc[3650]['n_people'] == 2
assert preprocessed_data.iloc[3651]['n_people'] == 4
assert preprocessed_data.iloc[3652]['n_people'] == 1

assert preprocessed_data.iloc[0]['night_price'] == 330.76
assert preprocessed_data.iloc[1]['night_price'] == 231.13
assert preprocessed_data.iloc[2]['night_price'] == 183.40

# Assertions for group reservations

assert preprocessed_data.iloc[15258]['rate_plan'] == 'Nonref'
assert preprocessed_data.iloc[15259]['rate_plan'] == 'Standard'
assert preprocessed_data.iloc[15260]['rate_plan'] == 'Standard'
assert preprocessed_data.iloc[15258]['accommodation_price'] == 1397.06
assert preprocessed_data.iloc[15261]['accommodation_price'] == 2953.10
assert preprocessed_data.iloc[15264]['accommodation_price'] == 1738.80
assert preprocessed_data.iloc[15258]['n_people'] == 6
assert preprocessed_data.iloc[15259]['n_people'] == 7
assert preprocessed_data.iloc[15260]['n_people'] == 11
assert preprocessed_data.iloc[15265]['night_price'] == 206.67
assert preprocessed_data.iloc[15266]['night_price'] == 138.92
assert preprocessed_data.iloc[15267]['night_price'] == 119.07
assert preprocessed_data.iloc[15261]['room_group_id'] == 483
assert preprocessed_data.iloc[15266]['room_group_id'] == 78
assert preprocessed_data.iloc[15267]['room_group_id'] == 463

display(preprocessed_data.head(15))

(17250, 24)


Unnamed: 0,n_people,n_children_1,n_children_2,n_children_3,accommodation_price,meal_price,service_price,paid,n_rooms,discount,room_id,room_group_id,date_from,date_to,booking_date,rate_plan,length_of_stay,book_to_arrival,weekend_stay,user_id,client_id,client_name,email,phone,is_company,night_price
0,5.0,0.0,0.0,0.0,992.29,0.0,0.0,1.00,1.0,,118,118,2018-02-10,2018-02-13,2017-08-17,Standard,3,180,True,1,54117,ca83ddae9b7d15212b5391c815a689b8acfd8ef31d0d80...,,318faec979ecaf8adaee0c8e5d7531a67f309b7247d30b...,0,330.763333
1,4.0,0.0,0.0,0.0,693.40,0.0,0.0,693.40,1.0,0.0,270,270,2018-02-28,2018-03-03,2017-08-17,Standard,3,198,False,2,54118,4db36724fc28085e053a3003dce55368ee207cce37d355...,f9c0564c66d6a830c4964a30ac261038dd7cf762b0641c...,cb550ba6d303bf230379073bcbdd55c37229eab3f173dc...,0,231.133333
2,2.0,0.0,0.0,0.0,366.80,0.0,0.0,1.00,1.0,,294,294,2018-02-14,2018-02-16,2017-08-29,Standard,2,171,False,3,54790,e41ecdb28a96d0b3e294aea6e854d8dc39a1d61bb3dfe4...,f6a8c77530865b7e437eb746c3564c4cbdc522c10d35f6...,1c56315c10c9d8153ca7820648900befbd9109fb6cfb81...,0,183.400000
3,4.0,0.0,0.0,0.0,1064.60,0.0,0.0,1.00,1.0,,183,183,2018-02-03,2018-02-10,2017-09-04,Standard,7,159,True,4,55177,5380adccf08ea3000791aad3ccc478e3b6a8de440910aa...,6d08a7230580a09f1fde268bb7c1a5d74a55bdcc9183f8...,3aff5ce689580e51de899de8ec75e8a8eaa470e4e99df4...,0,152.085714
4,2.0,0.0,0.0,0.0,713.00,0.0,0.0,1.00,1.0,,64,64,2018-01-29,2018-02-03,2017-09-07,Standard,5,149,False,5,55412,4aebfe125cf6c059588792b9fb871afe282a8806299dfe...,0d6aafda88cc3d5844da8c60ca9d1b6682f1ce1a4dfe12...,ea16c664798581a9d93a3128d772b8b89e05743edbbfae...,0,142.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15263,11.0,0.0,0.0,0.0,1090.26,0.0,260.0,1350.26,2.0,0.0,52,52,2020-09-25,2020-09-28,2020-09-19,Standard,3,9,True,14315,102628,453b30b1e7836532d452c8b04c989a39d76ba67e91294f...,463cff695b717457239b98e26fef0ff9af5c126b472bb0...,b29f52274f21de6cc14b4407b096138c807f266ecf4453...,0,181.710000
15264,2.0,0.0,0.0,0.0,1738.80,0.0,260.0,1998.80,2.0,0.0,56,56,2020-09-30,2020-10-04,2020-09-28,Standard,4,6,True,14364,102764,38e96b9a1e9b2578e57a55b24880b0596353b860408e66...,2cb7b2d43ecd3334d6f411b4fafdda4c846d6bf2e420c6...,e58e36487adb56a2c5e91266b3973c98e9cc0bff7fb2c6...,0,217.350000
15265,6.0,0.0,0.0,0.0,1240.00,0.0,0.0,1240.00,2.0,0.0,468,480,2020-10-02,2020-10-05,2020-09-30,Standard,3,5,True,14384,102835,6eb4b06563a019d5214d0d1f3da299181eb32069abb0a7...,ba8b814423332f6b895b3bd78a86698346889bf95fd8c5...,57d1049d7771b169e3ffbe22b784997cabc39d0d280bc6...,0,206.666667
15266,10.0,0.0,0.0,0.0,555.66,0.0,260.0,815.66,2.0,0.0,470,460,2020-10-23,2020-10-25,2020-10-21,Standard,2,4,True,14454,103037,4dfe9c3e2732ae84b06080d6fd9572a469a9bf205c19b6...,8f28a2a1a521014d5214ff3c6f5e94c0b6bd4c6250e3b4...,cb8f365314c22d234daa793a7023988f6993d6fb7cf55f...,0,138.915000


2150    13
2151    38
Name: book_to_arrival, dtype: int64

AssertionError: 

## Bucket important features to reduce the offer space size

Without this step every pair (user_id, item_id) would have at most a single interaction. The base item space has around $2^{25} \sim 3.3 \text{mln}$ elements. Therefore, values for selected features are aggregated into buckets:

```python
column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}
```

Explanation:
  - term - the term of the arrival date,
  - length_of_stay_bucket - aggregated length of stay,
  - rate_plan - rate plan which distinguishes if a given booking was refundable or nonrefundable (in reality rate plans are much more complex, they define prices for all rooms for every date, they include features like free breakfast, wine in the room etc.),
  - room_segment - for every room its average price is calculated, then every room assigned to an appropriate price range, which is a proxy for room quality,
  - n_people_bucket - aggregated number of people in a reservation,
  - weekend_stay - indicates if the stay encompassed a weekend.

The buckets are chosen based on expert knowledge of people working in the hotel industry for many years. Alternatively, clustering techniques could be used, but on a relatively small dataset expert methods are typically better.

The above aggregations reduce the number of possible items to $8 * 4 * 2 * 5 * 4 * 2 = 2560$.

### The recommenders will be trained and evaluated on such aggregated data. To get a proper offer for a user one would have to decode those buckets into specific values, but this is a much easier task and can be achieved based on simple rules.

<span style="color:red"><font size="4">**Task:**</font></span><br> 
In the file data_preprocessing/data_preprocessing_toolkit write code for the map_night_price_to_room_segment_buckets method. You must calculate the average of night prices for every **room_group_id** and map those prices to buckets (you can apply the map_value_to_bucket method which is available in the data_preprocessing_toolkit, the buckets are available under self.room_segment_buckets). The new column should be named 'room_segment'. You have to pass all assertions.

In [107]:
preprocessed_data = dp_toolkit.map_dates_to_terms(preprocessed_data)
preprocessed_data = dp_toolkit.map_lengths_of_stay_to_nights_buckets(preprocessed_data)
preprocessed_data = dp_toolkit.map_night_prices_to_room_segment_buckets(preprocessed_data)  # Code this method
preprocessed_data = dp_toolkit.map_npeople_to_npeople_buckets(preprocessed_data)

assert preprocessed_data.iloc[4]['room_segment'] == '[0-160]'
assert preprocessed_data.iloc[1]['room_segment'] == '[160-260]'
assert preprocessed_data.iloc[0]['room_segment'] == '[260-360]'
assert preprocessed_data.iloc[2820]['room_segment'] == '[360-500]'

preprocessed_data = dp_toolkit.map_item_to_item_id(preprocessed_data)

preprocessed_data.to_csv(os.path.join(data_path, "hotel_data_preprocessed.csv"))

display(preprocessed_data.head(15))




AssertionError: 

# Base statistics

In [None]:
print("Number of users: {}".format(len(preprocessed_data['user_id'].unique())))
print()
print("Number of items: {}".format(len(preprocessed_data['item_id'].unique())))
print()
print("Number of interactions: {}".format(len(preprocessed_data)))
print()

n_user = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('item_id').count().sort_values(by='user_id', ascending=False)
n_user = n_user.rename(columns={'user_id': 'n_users'})
display(n_user.head(10))

n_item = preprocessed_data.loc[:, ['user_id', 'item_id']].groupby('user_id').count().sort_values(by='item_id', ascending=False)
n_item = n_item.rename(columns={'item_id': 'n_items'})
display(n_item.head(10))

# Prepare the dataset for recommenders

One could consider many features describing each interaction but from the business perspective term, length_of_stay_bucket, rate_plan, room_segment, n_people_bucket, weekend_stay are the most important.

In [None]:
item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

interactions_df = preprocessed_data.loc[:, ['user_id', 'item_id'] + item_features]

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

interactions_df['term'] = pd.Categorical(
    interactions_df['term'], categories=column_values_dict['term'])
interactions_df['length_of_stay_bucket'] = pd.Categorical(
    interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])
interactions_df['rate_plan'] = pd.Categorical(
    interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])
interactions_df['room_segment'] = pd.Categorical(
    interactions_df['room_segment'], categories=column_values_dict['room_segment'])
interactions_df['n_people_bucket'] = pd.Categorical(
    interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])
interactions_df['weekend_stay'] = pd.Categorical(
    interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])

interactions_df.to_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"))

display(interactions_df.head(15))