# Expedia Personalised Hotel Searches
VU Data Mining Techniques 2024 | Assignment 2 | Group 17

## Setup & Data Loading

In [1]:
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Locations of the training and test CSV files (relative paths).
train_path = 'data/training_set_VU_DM.csv'
test_path = 'data/test_set_VU_DM.csv'

In [3]:
# Load the training set into `df`; later cells read from and add columns to this frame.
df = pd.read_csv(train_path)
# Show the first rows as a quick check of the load.
df.head()

## EDA
###### ToDo Jaime

In [None]:
# Column-name templates for the competitor features; "X" is a placeholder
# for the competitor number.
competitive_metrics = ['compX_rate', 'compX_inv', "compX_rate_percent_diff"]
# One inner list per template, holding the concrete column names for competitors 1-8.
full_competitive_metrics = [
    [template.replace("X", str(competitor)) for competitor in range(1, 9)]
    for template in competitive_metrics
]

In [None]:
# Total number of rows in the training frame.
df.shape[0]

In [None]:
# Number of rows with at least one non-missing competitor price-difference value.
has_any_percent_diff = df[full_competitive_metrics[2]].notna().any(axis=1)
int(has_any_percent_diff.sum())

Too many NaN values in the `compX_rate_percent_diff` columns.

In [None]:
# Number of rows with at least one non-missing competitor availability value.
has_any_inv = df[full_competitive_metrics[1]].notna().any(axis=1)
int(has_any_inv.sum())

The `compX_inv` features are not relevant enough.

In [None]:
# Row-wise sum of the eight competitor rate columns (missing values are skipped by sum()).
comp_rate_cols = full_competitive_metrics[0]
df['combined_comp_rate'] = df[comp_rate_cols].sum(axis=1)

In [None]:
# Rows in which every competitor rate is missing carry no information, so their
# combined rate is marked as missing.
all_rates_missing = df[full_competitive_metrics[0]].isna().all(axis=1)
comp_rate_nan_index = df.index[all_rates_missing]
df.loc[comp_rate_nan_index, 'combined_comp_rate'] = np.nan

In [None]:
# Number of distinct searches, total clicks and total bookings, one per line.
n_searches = df['srch_id'].nunique(dropna=False)
n_clicks = df['click_bool'].sum()
n_bookings = df['booking_bool'].sum()
print(n_searches, n_clicks, n_bookings, sep='\n')

On average there are more clicks than searches and fewer bookings than searches, which makes sense.

In [None]:
def plot_vars_dist(df):
    """Plot one histogram per column of ``df``, stacked vertically.

    The ``srch_id`` and ``date_time`` columns are skipped. Each histogram is
    annotated with the column's mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose remaining columns support ``mean()``/``std()`` and can be
        histogrammed (presumably all numeric -- TODO confirm for frames other
        than the raw training set).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when no
        column is left to plot.
    """
    # The search identifier and the timestamp are skipped rather than plotted.
    variables = [var for var in df if var not in ['srch_id', 'date_time']]
    if not variables:
        # plt.subplots() rejects nrows=0, so there is nothing to draw.
        return

    # squeeze=False keeps `axes` a 2-D array even for a single variable;
    # without it a bare Axes object (which has no .flatten()) is returned.
    fig, axes = plt.subplots(nrows=len(variables), ncols=1,
                             figsize=(10, 6 * len(variables)), squeeze=False)

    for ax, var in zip(axes.flatten(), variables):
        var_data = df[var]
        mean = var_data.mean()
        std = var_data.std()
        # At most 30 bins, fewer when the column has fewer distinct values,
        # but never fewer than 1 (nunique() is 0 for an all-NaN column).
        bins = max(1, min(30, int(var_data.nunique())))

        # Missing values are dropped explicitly so the histogram range is
        # derived from real observations only.
        ax.hist(var_data.dropna(), bins=bins, alpha=0.75, color='blue', edgecolor='black', label=f'{var} Scores')
        ax.set_title(f'Distribution of {var} values')
        ax.set_xlabel(f'{var.capitalize()} Score')
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.75)
        legend_label = f"Mean: {mean:.2f}, Std: {std:.2f}"
        # The explicit label list replaces the `label=` passed to hist() above.
        ax.legend([f"{var.capitalize()} Scores\n{legend_label}"], loc='upper right', title='Statistics', frameon=True)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the per-column histograms for the training frame.
plot_vars_dist(df)

In [None]:
# Frame used for the correlation analysis: everything except the search id and timestamp.
df_corr = df.drop(columns=['srch_id', 'date_time'])

In [None]:
# Pairwise correlations between the columns of df_corr.
df_corr.corr()

In [None]:
# Summary of the frame's columns and their dtypes.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Number of missing values per column.
df.isna().sum()

## Data Preprocessing
###### ToDo Ryan

### Datatype Conversion & Grouping Features

In [5]:
# Parse the search timestamps into datetime values.
df['date_time'] = pd.to_datetime(df['date_time'])

# Feature groups, named after the dtype they are converted to below.
boolean_features = ['prop_brand_bool', 'promotion_flag', 'srch_saturday_night_bool', 'random_bool', 'click_bool', 'booking_bool']
# 'srch_id', 'prop_id' and 'srch_destination_id' are not included because they are real identifiers rather than categorical features
categorical_features = ['site_id', 'visitor_location_country_id', 'prop_country_id']

for target_dtype, feature_group in (('bool', boolean_features), ('category', categorical_features)):
    for col in feature_group:
        df[col] = df[col].astype(target_dtype)

# Every column that is not boolean, categorical, the timestamp or an identifier counts as numerical.
non_numerical = set(boolean_features) | set(categorical_features) | {'date_time', 'srch_id', 'prop_id', 'srch_destination_id'}
numerical_features = [col for col in df.columns if col not in non_numerical]

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 54 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   srch_id                      int64         
 1   date_time                    datetime64[ns]
 2   site_id                      category      
 3   visitor_location_country_id  category      
 4   visitor_hist_starrating      float64       
 5   visitor_hist_adr_usd         float64       
 6   prop_country_id              category      
 7   prop_id                      int64         
 8   prop_starrating              int64         
 9   prop_review_score            float64       
 10  prop_brand_bool              bool          
 11  prop_location_score1         float64       
 12  prop_location_score2         float64       
 13  prop_log_historical_price    float64       
 14  position                     int64         
 15  price_usd                    float64       
 16  

### Imputing Missing Values for Numerical Features

In [6]:
print('Missing values replaced with 0:')
print('------------------------------')
for feature in numerical_features:
    # Report how many values are missing before they are overwritten.
    n_missing = df[feature].isna().sum()
    print(f'{feature} - {n_missing}')
    # Replace the column with its zero-filled version.
    df[feature] = df[feature].fillna(0)

Missing values replaced with 0:
visitor_hist_starrating - 4706481
visitor_hist_adr_usd - 4705359
prop_starrating - 0
prop_review_score - 7364
prop_location_score1 - 0
prop_location_score2 - 1090348
prop_log_historical_price - 0
position - 0
price_usd - 0
srch_length_of_stay - 0
srch_booking_window - 0
srch_adults_count - 0
srch_children_count - 0
srch_room_count - 0
srch_query_affinity_score - 4640941
orig_destination_distance - 1607782
comp1_rate - 4838417
comp1_inv - 4828788
comp1_rate_percent_diff - 4863908
comp2_rate - 2933675
comp2_inv - 2828078
comp2_rate_percent_diff - 4402109
comp3_rate - 3424059
comp3_inv - 3307357
comp3_rate_percent_diff - 4485550
comp4_rate - 4650969
comp4_inv - 4614684
comp4_rate_percent_diff - 4827261
comp5_rate - 2735974
comp5_inv - 2598327
comp5_rate_percent_diff - 4117248
comp6_rate - 4718190
comp6_inv - 4697371
comp6_rate_percent_diff - 4862173
comp7_rate - 4642999
comp7_inv - 4601925
comp7_rate_percent_diff - 4819832
comp8_rate - 3041693
comp8_inv - 2

Zero was chosen as the imputation value for the numerical features because it intuitively denotes a missing record. Furthermore, for the competitor data that takes the values +1, 0 or -1, imputing 0 for missing data amounts to assuming no advantage for either Expedia or the competitor.

### Dropping Features with Many Missing Values
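No longer applied: missing values are imputed with 0 instead (see the previous section), so no columns are dropped. The original code is kept below, commented out, for reference.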

In [None]:
# # create a list of columns to drop where more than 50% of the data is missing
# columns_to_drop = df.columns[df.isnull().mean() > 0.5].tolist()
# pprint(f'Columns that are dropped due to excessive missing values: {columns_to_drop}')
# df.drop(columns=columns_to_drop, inplace=True)

### Normalising/Standardising Numerical Features

Normalisation (scaling data between 0 and 1) or standardisation (shifting the distribution to have a mean of zero and a standard deviation of one) can be beneficial for algorithms that are sensitive to the scale of input data (like SVM or KNN).

In [None]:
# Example
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Example: standardising 'price_usd' -- fit the scaler on the column, then apply it
price_column = df[['price_usd']]
scaler = StandardScaler().fit(price_column)
df['price_usd'] = scaler.transform(price_column)

# Or for normalization
# scaler = MinMaxScaler()
# df['price_usd'] = scaler.fit_transform(df[['price_usd']])

### Encoding Categorical Variables

In [None]:
# Example for 1-hot encoding a categorical feature
df = pd.get_dummies(df, columns=['prop_country_id'], drop_first=True)

### Feature Engineering

Ideas for this:
1. Time Features: Break down the date_time column into year, month, day, weekday, and hour components. Time could affect booking patterns.
2. Interaction Features: Create features that represent interactions between the customer’s historical preferences and property attributes, like the difference between the user’s average star rating and the property’s star rating.
3. Textual and Categorical Embeddings: If there are textual descriptions available or high-cardinality categorical variables, consider using embeddings or hashing techniques to reduce their dimensionality.

In [None]:
# Create time-based features
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
df['day'] = df['date_time'].dt.day
df['weekday'] = df['date_time'].dt.weekday
df['hour'] = df['date_time'].dt.hour

# Interaction feature: difference between user's average star rating and property's star rating
df['starrating_diff'] = df['visitor_hist_starrating'] - df['prop_starrating']

# Re-check the dataset
df.head()