# Load Dataset and Libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv('../data/hotel_bookings.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

# Handle Missing Values

In [4]:
df['children'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['children'].fillna(0, inplace=True)


In [5]:
df['country'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['country'].fillna('Unknown', inplace=True)


In [6]:
df['agent']=df['agent'].fillna('Unknown').astype(str)

In [7]:
df['company']=df['company'].fillna('Unknown').astype(str)

In [8]:
df.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                         

# Convert Data Types

In [9]:
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'], dayfirst=True, errors='coerce')

  df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'], dayfirst=True, errors='coerce')


In [10]:
month_mapping = {month: index for index, month in enumerate([
    "January", "February", "March", "April", "May", "June", 
    "July", "August", "September", "October", "November", "December"], start=1)}
df['arrival_date_month'] = df['arrival_date_month'].map(month_mapping)

# Remove Duplicate Rows (If Any)

In [11]:
df.duplicated().sum()

np.int64(31994)

In [12]:
df.drop_duplicates(inplace=True)
print(f"Duplicates remaining: {df.duplicated().sum()}")

Duplicates remaining: 0


Removing Invalid Entries

In [13]:
mask=(df[['children','adults','babies']]==0).all(axis=1)

In [14]:
df=df.drop(df[mask].index)

# Convert Country Codes During Data Cleaning

In [15]:
import pycountry

# Function to convert ISO country codes to full names
def convert_country_code(code):
    try:
        return pycountry.countries.get(alpha_3=code).name
    except:
        return "Unknown"  # Replace missing or invalid codes

# Apply conversion during data cleaning
df['country'] = df['country'].apply(convert_country_code)

# Verify conversion
print("Country codes converted to full names!")
print(df[['country']].head())


Country codes converted to full names!
          country
0        Portugal
1        Portugal
2  United Kingdom
3  United Kingdom
4  United Kingdom


# Save the cleaned data

In [16]:
df.to_csv('../data/hotel_bookings_cleaned.csv',index=False)