In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# <font color = blue> _Workation Price Prediction_</font>
****
**_Overview_**
***
The new covid-era has provided a new way of living the work-life balance. We have seen a lot of different websites providing packages to work from different locations. From Kashmir to Kanyakumari, from Gujarat to Assam we have collected packages in and around India. It becomes really difficult to find the best place with all the amenities such as high-speed internet, a comfortable stay as well as within the budget. To solve the real-world problem of finding the best deals for a calm and enjoying workation trip. Workation is the best way to work at a remote location with a recreational and rejuvenating vacation for the team.

In this competition, one has to use the knowledge of machine learning, deep learning, and model building to predict the price per person for the next workation trip. The data has more than 18,000 rows of different packages with details like start location, hotel type, cost per person, destination, itinerary, and many more.

## _Load the Libraries_

In [None]:
#arrays and dataframes operations 
import os
import math
from pprint import pprint
import statistics
import numpy as np
import pandas as pd
from datetime import date
#visualization imports
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
#consistent plot sizes
from pylab import rcParams
rcParams['figure.figsize'] = 12,5
rcParams['axes.labelsize'] = 12
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12
#handle unwanted warnings
import warnings
warnings.filterwarnings(action='ignore',category=DeprecationWarning)
warnings.filterwarnings(action='ignore',category=FutureWarning)
#display all the columns: None is the documented "no limit" value, whereas
#False is stored as 0, which pandas documents as terminal-width auto-detection
pd.options.display.max_columns = None
#import label encoder
from sklearn.preprocessing import LabelEncoder
#import transformer
from sklearn.preprocessing import PowerTransformer
#import the modeling libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
#linear models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
#ensemble techniqques
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
#xgboost
from xgboost import XGBRegressor
#pipeline
from sklearn.pipeline import Pipeline
#model evaluation
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
#dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import LocallyLinearEmbedding

#helper function
#create a feature with the number of occurences

def num_sightseeing(dataset,col='Sightseeing Places Covered'):

    '''Count the '|' separators in every value of a pipe-delimited text column.

    Despite its name the helper is generic: it is used for the sightseeing,
    destination, airline and hotel columns alike.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column to inspect.
    col : str
        Name of a string column whose entries look like 'a|b|c'.

    Returns
    -------
    list of int
        One entry per row, in row order: -1 when the value is exactly
        'Not Available', otherwise the number of '|' characters (that is,
        the number of listed items minus one).
    '''

    # Iterate over the values themselves rather than dataset[col][i]: integer
    # label lookup only works when the index happens to be 0..n-1.
    return [
        -1 if value == 'Not Available' else value.count('|')
        for value in dataset[col]
    ]

## _Load the data and Basic Sanity Checks_

In [None]:
#load as pandas dataframe
train = pd.read_csv('/kaggle/input/work-vacation-price-prediction-dataset/Train.csv',delimiter=',',engine='python')
test =  pd.read_csv('/kaggle/input/work-vacation-price-prediction-dataset/Test.csv',delimiter=',',engine='python')

In [None]:
#check the few top rows
train.head()

In [None]:
test.head(3)

In [None]:
#check the info .. 
train.info()

<b> _There are 21000 entries in the train dataset with 13 features and 1 target variable, which is the price of the travel package. Flight Stops and Meals are integers and the price is a float, whereas all the rest of the features are of object/string type, including the Travel Date. There are no null/NaN values in the dataset. However, there are rows where a feature is recorded as 'Not Available'. These are as good as NaNs_ </b>

**_There is no feature which states the end of the travel. The duration of the vacation can be derived from the Itinerary which mentions the number of nights_**

In [None]:
test.info()

In [None]:
#check for number of rows where we have Not Available string
if train['Airline'].str.contains('Not Available').any():
    print ("Not Available is present")

In [None]:
#count how often 'Not Available' shows up in each text column of the train dataset
cat_features = list(train.select_dtypes(include='object').columns)
na_counts = {name: train[name].str.contains('Not Available').sum() for name in cat_features}
for name, hits in na_counts.items():
    if hits > 0:
        print ("In {} there are {} 'Not Available'".format(name,hits))

In [None]:
#same frequency check on the test dataset, using the text columns found in train
na_counts_test = {name: test[name].str.contains('Not Available').sum() for name in cat_features}
for name, hits in na_counts_test.items():
    if hits > 0:
        print ("In {} there are {} 'Not Available'".format(name,hits))

In [None]:
#check for duplicates in the train data
train.duplicated().sum()

In [None]:
#check the number of unique id's
len(train['Uniq Id'].unique())

**_There are features which have a lot of occurrences which mentions 'Not Available'.  Secondly, there are no duplicate rows in the dataset. Third there are as many number of unique id's as is the number of observations_**

In [None]:
#basic stats of the numerical features
train.describe()

<font color = blue>_The minimum number of meals is 2 and max is 5 with 3 as the median value. Flight stops on the other hand is 0 as min and 2 as max with median of 1_</font>

In [None]:
#check the skew of the target variable 
train['Per Person Price'].skew()

_The per person price median is 17765 and mean is 20059. The max value is 171062 rupees and the minimum is 791 rupees. The data at first glance appears to be right skewed_

## _Exploratory Data Analysis_

### _Explore the numerical features_

In [None]:
#explore the price per person 
plt.hist(train['Per Person Price'],bins=30,histtype='stepfilled',color='green',alpha=0.3)
plt.title('Histogram of Per Person Price')
plt.xlabel('Per Person Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
#check for the normality of the target variable using Shapiro Wilk test
from scipy.stats import shapiro
data = train['Per Person Price']
#silence warnings for this one call only (scipy may warn about the p-value on
#large samples) instead of switching every warning off for the rest of the notebook
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    stat, p = shapiro(data)
print('stat=%.3f, p=%.3f' % (stat, p))
#p above the 5% level: normality cannot be rejected
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

**_The target variable is not Gaussian. Before building the prediction model, the target variable can be normalized so that the loss function can be optimized_**

In [None]:
#create a list of numerical features
num_features = train.select_dtypes(exclude='object').columns.tolist()
num_features

In [None]:
#explore the Flight stops
#pass the column by keyword: a positional Series is interpreted as `data`
#by recent seaborn releases and no longer yields one bar per category
sns.countplot(x='Flight Stops',data=train)
plt.title('Countplot of Flight Stops')
plt.xlabel('Number of Flight Stops')
plt.show()

_Packages with direct flights are the ones travellers have picked most often_

In [None]:
#explore the number of meals included in the package
#pass the column by keyword: a positional Series is interpreted as `data`
#by recent seaborn releases and no longer yields one bar per category
sns.countplot(x='Meals',data=train)
plt.title('Countplot for Meals')
plt.xlabel('Number of Meals')
plt.show()

_There are more travellers having 3 meals per day as part of the package_

In [None]:
#check the correlation between the numerical features
#restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer drops
#text columns silently inside corr() and raises instead
train.select_dtypes(include=['number','bool']).corr()['Per Person Price']

**_There is negative correlation between the price per person and the number of flight stops. On the other hand there is positive correlation between the price per person and the number of meals selected as part of the package. This is expected as the non direct flights are usually cheaper than the direct ones_**

In [None]:
#check the correlation b/n meals and flight stops
train[['Meals','Flight Stops']].corr()

_This is expected as the number of meals per day will reduce with more flight stops. Only those meals are counted which are offered in the hotel stay or offered during the sight seeing. This also summarises the fact there is high multicollinearity in the dataset due to these two features._

In [None]:
#check the travel package for which the price per person is the lowest
train[train['Per Person Price'] == train['Per Person Price'].min()]

**_The start city is New Delhi and the destination is Wayanad, in the far south of India. The airline information is not available; perhaps the airline booking option was not selected by the customer. The package name is young and free. While the possibility of driving or taking a train to this location cannot be ruled out, a fair assumption is that the flight would be booked at a later date by the traveller._**

In [None]:
#check the travel package for which the price per person is very high
train[train['Per Person Price'] == train['Per Person Price'].max()]

**_This one has 10N booked and is a trip to the south east Asian countries. The airlines are all international fliers. The package type is also Deluxe which might also have a bearing on the price.The sight seeing places is not available. Another important consideration is the travel date which is in the month of May. The start city is New Delhi. Most of the cities in India have school summer vacation starting from the month of May all the way till the end of June. In case of the southern states in India the vacation starts in April till end of May. Some of the colder states however have winter vacation instead of summer vacation usually in Dec and Jan._** 

In [None]:
#price per person vs number of meals, coloured by the number of flight stops
sns.stripplot(x='Meals',y='Per Person Price',hue='Flight Stops',data=train)
plt.title('Price per person vs the #Meals categorised by Flight Stops')
plt.xlabel('Number of Meals')
plt.show()

### _Explore the categorical features_

In [None]:
#list of categorical features
cat_features

In [None]:
#number of cancellation policies / rules
train['Cancellation Rules'].nunique()

**_The cancellation rules are as per the travel company policy. The dataset belongs to a single travel company. Cancellation policy do affect the price per person as the travel company would like to offer the flexi policy for a premium. This is also true in case of booking the flight tickets and the travel agent would book a flexi fare tour with the flights depending on the cancellation policy opted by their customers_**

In [None]:
#check frequency of various cancellation rules
#'Count' is a helper column of ones: summing it per group gives the number of rows.
#It stays on `train` after this cell and is dropped again before modelling.
train['Count'] = 1
pd.pivot_table(data=train,index='Cancellation Rules',values='Count',aggfunc='sum')

In [None]:
#frequency of each package type
train['Package Type'].value_counts()

In [None]:
#check the mean per person price for different package type 
package_price = pd.pivot_table(data=train,values='Per Person Price',index='Package Type',aggfunc='mean')
package_price.sort_values(by='Per Person Price',ascending=False)

In [None]:
#price per person vs package type and categorised by start city
sns.stripplot(x='Package Type',y='Per Person Price',hue='Start City',data=train)
plt.title('Price per person vs the Package Type')
plt.show()

<font color = blue> **_The mean price is highest for Premium and lowest for Budget. However, it is important to note that there is a mix of international and domestic travel packages in the dataset. This could be one of the features to add to the dataset and could be an important factor in deciding the final per person price. The more expensive travel packages start from New Delhi._** </font>

In [None]:
#look at the data again
train.tail(2)

In [None]:
#unique number of packages
train['Package Name'].nunique()

In [None]:
#check the most frequent / booked packages
train['Package Name'].value_counts().sort_values(ascending=False)

In [None]:
#check the start city -- > determine whether all travels start from India or elsewhere
train['Start City'].value_counts()

<font color=blue> **_All the travel starts from New Delhi and Mumbai_**</font>

In [None]:
#unique destinations / combination of destinations managed by the travel company
train['Destination'].nunique()

In [None]:
#frequency of the destinations 
train['Destination'].value_counts()

<font color = blue> **_Goa, Shimla, Manali, Munnar are the favorite destinations. The less frequent destinations are the international destinations, which can be very expensive depending on the number of places included_**</font>

In [None]:
#average per person price when destination is Goa alone .. 
train[train['Destination']=='Goa'].groupby('Destination')['Per Person Price'].apply(np.mean)

In [None]:
#hotel details
train['Hotel Details'].nunique()

In [None]:
#top 5 hotel details .. 
train['Hotel Details'].value_counts()[:5]

_Goa is the most favored destination and hence we can see that Goa hotels are where the max stay is booked_

In [None]:
#check all the hotel details when the destination is in Goa
train[train['Destination']=='Goa']['Hotel Details'].value_counts()

<font color = blue> **_All the hotel details here show the ratings of the hotel as well. Novotel Goa and Resort occurs twice due to the variation in its overall rating. This could be explained by bookings in this hotel having been made in two separate clusters of time, with the average rating changing between these two time clusters._**</font>

In [None]:
#check a few of the travels where the hotel detail is not available 
train[train['Hotel Details'] ==  'Not Available'][:3]

In [None]:
train[train['Hotel Details'] ==  'Not Available'].loc[13]['Package Name']

_There are specific travels which do not require an air ticket as in this case. Volvo is commonly referred as the bus type used for the travel which is generally air conditioned, comfortable and considered premium over the rest._

In [None]:
#start city and no airline booking to the destinations .. 
no_airline = train[train['Airline']=='Not Available']
no_airline['Start City'].value_counts().plot(kind='bar')
plt.title('Start City without Airline Booking')
plt.show()

In [None]:
train['Itinerary'].value_counts()

<font color = blue> **_There are 966 unique itineraries. The same destinations repeat in various combinations, which adds to the number of unique values. By the way, 3 nights in Goa seems to be the favorite._** </font>

In [None]:
#convert the date column to a datetime object
#NOTE(review): no explicit format / dayfirst is given, so pandas infers the layout;
#confirm the raw strings are unambiguous (day-first vs month-first) for both files
train['Travel Date'] = pd.to_datetime(train['Travel Date'])
test['Travel Date'] = pd.to_datetime(test['Travel Date'])

In [None]:
#create a new featute based on travel month 
train['Travel Month'] = train['Travel Date'].dt.month_name()

In [None]:
test['Travel Month'] = test['Travel Date'].dt.month_name()

In [None]:
#preferred month to Goa 
train[train['Destination']=='Goa']['Travel Month'].value_counts().plot(kind='bar',color='green',alpha=0.4)
plt.title('Month wise booking to destination Goa')
plt.xlabel('Month')
plt.ylabel('Number of Travels')
plt.grid(False)
plt.show()

_December is the least as it becomes very expensive during this month in Goa due to new year celebration. Off seasons like May, July and Sep are the top 3 most visited_

In [None]:
#price per person vs travel month, with the points split (dodged) by start city
plt.figure(figsize=(15,5))
sns.stripplot(x='Travel Month',y='Per Person Price',data=train,alpha=0.4,hue='Start City',dodge=True)
plt.title('Price per person vs the Travel Month')
plt.show()

In [None]:
#price per person vs travel month and destination is goa
plt.figure(figsize=(15,5))
sns.stripplot(x='Travel Month',y='Per Person Price',data=train[train['Destination']=='Goa'],alpha=0.6)
plt.title('Price per person vs the Travel Month for Destination Goa')
plt.show()

In [None]:
#check whether Destination and Places Covered are identical series
train['Destination'].equals(train['Places Covered'])


_The Destination and Place Covered feature have identical information_

In [None]:
#drop the Places covered from the train and test dataset
train.drop('Places Covered',axis=1,inplace=True)
test.drop('Places Covered',axis=1,inplace=True)

## _Feature Extraction_

In [None]:
train.head(1)

In [None]:
#create a new featute based on travel weekday
train['Travel Start Day'] = train['Travel Date'].dt.weekday
test['Travel Start Day'] = test['Travel Date'].dt.weekday

In [None]:
#for modeling the integer representation of the month will be more appropriate .. 
train['Travel Month'] = train['Travel Date'].dt.month
test['Travel Month'] = test['Travel Date'].dt.month

In [None]:
#create a new featute based on travel year
train['Travel Year'] = train['Travel Date'].dt.year
test['Travel Year'] = test['Travel Date'].dt.year

In [None]:
#conver to float -- better for neural network models
train['Travel Start Day'] = train['Travel Start Day'].astype('float')
test['Travel Start Day'] = test['Travel Start Day'].astype('float')

train['Travel Month'] = train['Travel Month'].astype('float')
test['Travel Month'] = test['Travel Month'].astype('float')

train['Meals'] = train['Meals'].astype('float')
test['Meals'] = test['Meals'].astype('float')

train['Flight Stops'] = train['Flight Stops'].astype('float')
test['Flight Stops'] = test['Flight Stops'].astype('float')

In [None]:
#one hot encoding for the Start City feature
#drop_first=True removes the first category's column to avoid a redundant dummy
#NOTE(review): train and test are encoded in separate calls; if a category occurs
#in only one of them the resulting dummy columns will differ - verify before modelling
start_city = pd.get_dummies(train['Start City'],drop_first=True)
train = pd.concat([train,start_city],axis=1)
train.drop('Start City',axis=1,inplace=True)
train.head(1)

In [None]:
#one hot encoding for the Start City feature in test dataset
start_city = pd.get_dummies(test['Start City'],drop_first=True)
test = pd.concat([test,start_city],axis=1)
test.drop('Start City',axis=1,inplace=True)
test.head(1)

In [None]:
#one hot encoding for the package type in the train dataset
package_type = pd.get_dummies(train['Package Type'],drop_first=True)
train = pd.concat([train,package_type],axis=1)
train.drop('Package Type',axis=1,inplace=True)

#one hot encoding of the package type in the test dataset
package_type = pd.get_dummies(test['Package Type'],drop_first=True)
test = pd.concat([test,package_type],axis=1)
test.drop('Package Type',axis=1,inplace=True)

In [None]:
#drop the uniq id from the feature set
train.drop('Uniq Id',axis=1,inplace=True)
test.drop('Uniq Id',axis=1,inplace=True)

In [None]:
#drop the dummy count column created earlier
train.drop('Count',axis=1,inplace=True)

In [None]:
#view the new dataframe 
train.head(2)

In [None]:
#convert travel year to object and one hot encode , there are only two years 2021 and 2022
train['Travel Year'] = train['Travel Year'].astype(str)
test['Travel Year'] = test['Travel Year'].astype(str)

travel_year = pd.get_dummies(train['Travel Year'],drop_first=True)
train = pd.concat([train,travel_year],axis=1)
train.drop('Travel Year',axis=1,inplace=True)

travel_year = pd.get_dummies(test['Travel Year'],drop_first=True)
test = pd.concat([test,travel_year],axis=1)
test.drop('Travel Year',axis=1,inplace=True)

### _Feature extraction_
#### _Destination feature_
***
_The places in the Destination feature can be converted to separate columns and then label encoded_

In [None]:
# new data frame with split value columns on the destination feature
new = train['Destination'].str.split('|',expand = True) 
new.head(3)

In [None]:
#check the info 
new.info()

<font color=blue> _The first 3 columns contains the maximum information of the key places a person would visit. The number of rows where the number of places is 10 is only 6. May be retaining upto the 4th place column would be more justified_</font>

In [None]:
#name the split columns place_1 .. place_N for whatever width N the split produced,
#instead of assuming there are exactly 11 columns
new.columns = ['place_{}'.format(i) for i in range(1, new.shape[1] + 1)]
new.head(2)

In [None]:
#keep only the first four place columns, however many columns the split produced
new = new.iloc[:, :4]
new.head(2)


In [None]:
#append to the train column .. 
train = pd.concat([train,new],axis=1)

In [None]:
new = test['Destination'].str.split('|',expand = True) 
new.info()

In [None]:
#repeat the steps on the test dataset
#reindex keeps the first four split columns (adding empty ones if the split is
#narrower), so the code does not depend on the split having exactly 11 columns
place_cols = ['place_1','place_2','place_3','place_4']
new = test['Destination'].str.split('|',expand = True).reindex(columns=range(len(place_cols)))
new.columns = place_cols
#append to the test column ..
test = pd.concat([test,new],axis=1)

In [None]:
#add a new feature: the number of '|' separators in Destination, i.e. the number
#of places covered during the trip minus one (-1 when the value is 'Not Available').
#num_sightseeing is the generic helper defined in the library cell at the top.
train['num_destination'] = num_sightseeing(dataset=train,col='Destination')
test['num_destination'] =  num_sightseeing(dataset=test,col='Destination')

In [None]:
#drop the redundant Destination column from the train and test dataset
train.drop('Destination',axis=1,inplace=True)
test.drop('Destination',axis=1,inplace=True)

In [None]:
#replace the missing (None/NaN) entries in place_* with the 'Not Available' string
#assign the result back: calling fillna(inplace=True) on train[col] acts on a
#temporary selection and is a no-op under pandas Copy-on-Write
places = ['place_1','place_2','place_3','place_4']
train[places] = train[places].fillna('Not Available')
test[places] = test[places].fillna('Not Available')

In [None]:
train.head(3)

In [None]:
test.head(2)

#### _Airline Feature_

In [None]:
#create separate columns for the first four airlines listed in each package
airlines = ['airline_1','airline_2','airline_3','airline_4']

#reindex keeps the first four split columns (adding empty ones if the split is
#narrower), so the code no longer assumes 11 columns in train and 7 in test
airline = train['Airline'].str.split('|',expand = True).reindex(columns=range(len(airlines)))
airline.columns = airlines
#append to the train column ..
train = pd.concat([train,airline],axis=1)

airline = test['Airline'].str.split('|',expand = True).reindex(columns=range(len(airlines)))
airline.columns = airlines
#append to the test column ..
test = pd.concat([test,airline],axis=1)

#new feature = number of '|' separators in Airline, i.e. number of airlines
#booked minus one (-1 when the value is 'Not Available')
train['num_airlines'] = num_sightseeing(dataset=train,col='Airline')
test['num_airlines'] =  num_sightseeing(dataset=test,col='Airline')

#drop the redundant airline column
train = train.drop('Airline',axis=1)
test = test.drop('Airline',axis=1)

#replace the missing (None/NaN) entries in airline_* with the 'Not Available' string;
#assign back instead of a chained inplace fill, which is a no-op under Copy-on-Write
train[airlines] = train[airlines].fillna('Not Available')
test[airlines] = test[airlines].fillna('Not Available')


In [None]:
#drop the Travel Date column from both the train and test dataset 
train.drop('Travel Date',axis=1,inplace=True)
test.drop('Travel Date',axis=1,inplace=True)

In [None]:
train.head()

#### _Sight Seeing Places_

In [None]:
#create the new feature -- > number of sight seeing places
train['num_sightseeing'] = num_sightseeing(train)
test['num_sightseeing'] =  num_sightseeing(test)

In [None]:
train.head(2)

In [None]:
#drop the original Sightseeing column from both the train and test dataset 
train.drop('Sightseeing Places Covered',axis=1,inplace=True)
test.drop('Sightseeing Places Covered',axis=1,inplace=True)

In [None]:
train.info()

In [None]:
#check the new dataframe
train.head(2)

In [None]:
#lets drop the cancellation rules in the first iteration of the modeling
train.drop('Cancellation Rules',axis=1,inplace=True)
test.drop('Cancellation Rules',axis=1,inplace=True)

_Package Name information is well reflected in the destination feature. Hence this could be a redundant or very similar feature. For simplicity lets drop this column as well_

In [None]:
#drop the package name
train.drop('Package Name',axis=1,inplace=True)
test.drop('Package Name',axis=1,inplace=True)

<b> _Now we have two very important features from which features can be extracted. From the itinerary, the number of nights will be an important feature that can be derived. From the hotel details, the number of hotels and average rating of the hotels would also be an important feature. Good rating of hotels yields higher booking numbers and amount_ </b>

#### _Itinerary_

In [None]:
#total nights of the trip = sum of every integer that appears in the itinerary text
import re

#vectorised map instead of train['num_nights'][i] = ... in a loop: chained
#assignment is slow, warns, and does not write through under Copy-on-Write
train['num_nights'] = train['Itinerary'].map(
    lambda text: sum(int(n) for n in re.findall(r'\d+', text))
)

In [None]:
#same feature for the test dataset: sum of every integer found in the itinerary text
#(map replaces the row loop with chained assignment test['num_nights'][j] = ...)
test['num_nights'] = test['Itinerary'].map(
    lambda text: sum(int(n) for n in re.findall(r'\d+', text))
)

In [None]:
#drop the itinerary column from the train and test set
train.drop('Itinerary',axis=1,inplace=True)
test.drop('Itinerary',axis=1,inplace=True)

#### _Hotel Details_
- Number of Hotels Booked
- Average Rating of the Hotels selected


In [None]:
train.head(2)

In [None]:
#create the new feature -- > number of hotels booked during the trip
train['num_hotels'] = num_sightseeing(dataset=train,col='Hotel Details')
test['num_hotels'] =  num_sightseeing(dataset=test,col='Hotel Details')

In [None]:
train.head(1)

In [None]:
def _mean_hotel_rating(details):
    '''Return the mean of all numbers found in a hotel-details string, or NaN if none.

    Numbers are read with their decimal part (for example 4.1 stays 4.1). The
    previous pattern r'\d+' with int() split such a value into 4 and 1.
    NOTE(review): assumes ratings are the only numbers in the text - digits that
    are part of a hotel name would be averaged in as well; confirm against the data.
    '''
    ratings = [float(r) for r in re.findall(r'\d+(?:\.\d+)?', details)]
    # 'Not Available' (or any text without numbers) has no rating -> NaN
    return float(np.mean(ratings)) if ratings else np.nan

#average rating of the hotels booked for the trip
train['avg_rating'] = train['Hotel Details'].map(_mean_hotel_rating)
test['avg_rating'] = test['Hotel Details'].map(_mean_hotel_rating)

In [None]:
train['avg_rating'].median()

In [None]:
train.head(2)

In [None]:
test.head(2)

In [None]:
#fill the NaN values in avg_rating (rows without any hotel rating) with 0
#assign the result back: fillna(inplace=True) on a column selection is a
#chained operation and does not update the frame under Copy-on-Write
train['avg_rating'] = train['avg_rating'].fillna(0)
test['avg_rating'] = test['avg_rating'].fillna(0)

In [None]:
#drop the Hotel Details
train.drop('Hotel Details',axis=1,inplace=True)
test.drop('Hotel Details',axis=1,inplace=True)

In [None]:
train.head(2)

In [None]:
#label encode the place_* / airline_* columns (LabelEncoder is imported in the library cell)
#the encoder is re-fitted per column on the train and test values together, so both
#frames share one label -> integer mapping and test never holds an unseen label
encode_features=['place_1','place_2','place_3','place_4','airline_1','airline_2','airline_3','airline_4']
encoder = LabelEncoder()
for col in encode_features:
    encoder.fit(pd.concat([train[col], test[col]]))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

In [None]:
train.head()

In [None]:
test.head()

In [None]:
train.info()

In [None]:
#convert num_nights to an integer dtype in both datasets
#(the second line used to repeat the train cast, leaving test untouched)
train['num_nights'] = train['num_nights'].astype(int)
test['num_nights'] = test['num_nights'].astype(int)

In [None]:
train.info()

In [None]:
#check the correlation with the numerical features
#restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer drops
#non-numeric columns silently inside corr()
train.select_dtypes(include=['number','bool']).corr()['Per Person Price'].sort_values(ascending=False)

In [None]:
#construct the heatmap of pairwise correlations between the numeric/bool columns
#(explicit selection because pandas >= 2.0 corr() does not drop other dtypes itself)
sns.heatmap(train.select_dtypes(include=['number','bool']).corr(),cmap='viridis')
plt.show()

## _Transformation, Dimensionality Reduction and Modeling_

In [None]:
#split the dataset into train and test set
seed = 21
X = train.drop('Per Person Price',axis=1)
y = train['Per Person Price']

X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.1,random_state=seed)
X_train.shape, X_test.shape

In [None]:
#tune the random forest: 3 forest sizes x 3 depths, each scored with 10-fold CV
#on the negated mean squared log error (higher, i.e. closer to 0, is better)
from sklearn.model_selection import GridSearchCV  #already imported in the library cell
rf_reg = RandomForestRegressor(random_state=seed)
params = {'n_estimators':[100,300,500],'max_depth':[14,21,25]}
grid = GridSearchCV(estimator=rf_reg,param_grid=params,cv=10,scoring='neg_mean_squared_log_error')
grid_fit = grid.fit(X_train,y_train)

In [None]:
grid_fit.best_estimator_

In [None]:
grid_fit.best_score_

In [None]:
#evaluate the tuned forest on the hold-out split
rf_reg = grid_fit.best_estimator_
#GridSearchCV refits the best estimator by default, so this fit repeats that work
rf_reg.fit(X_train,y_train)
#despite the variable name these are predictions for the hold-out set X_test
rf_reg_train_pred = rf_reg.predict(X_test)
msle = mean_squared_log_error(y_test,rf_reg_train_pred)
#report the root of the mean squared log error
print('RMSLE: %.3f'%(np.sqrt(msle)))

In [None]:
#fit the model on the entire training dataset before the final predictions
rf_reg.fit(X,y)
#select the test columns in the training order: train and test were engineered
#separately, so this guards against a different column order and fails with a
#clear KeyError if a training feature is missing from test
final_pred = rf_reg.predict(test[X.columns])

In [None]:
#create a submission df based on the submission format .. 
submission = pd.DataFrame(data=final_pred,columns=['Per Person Price'])

In [None]:
#create the submission file -- >  yields 0.19474 (top30% on the Public Leaderboard)
submission.to_csv('rf_submission.csv')

In [None]:
plt.hist(submission['Per Person Price'],bins=30,color='green',alpha=0.3)
plt.title('Predicted Price')
plt.show()

## _Deep Neural Network_

In [None]:
#scale the data before feeding to the dense neural network
#the scaler is fitted on the training split only and then applied to the hold-out split
#NOTE: X_train / X_test are overwritten with the scaled arrays, so this cell is not
#idempotent - re-run the train_test_split cell before running it again
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Deep Neural Network for Per Person Price Prediction
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

In [None]:
#define the early stopping callback: stop after 50 epochs without improvement in
#val_loss and roll back to the weights of the best epoch
callback = EarlyStopping(monitor='val_loss',patience=50,restore_best_weights=True)

#define the model: five hidden ReLU layers of decreasing width (500 -> 25),
#each followed by 30% dropout, and a single linear output unit for the price
model = Sequential()
#add the layers 
model.add(Dense(500,input_dim=X_train.shape[1],activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(300,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(150,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(75,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(25,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1))
#compile the model: optimise the mean squared logarithmic error, also track plain MSE
model.compile(optimizer='adam',loss='mean_squared_logarithmic_error',metrics=['mse'])
#print summary
model.summary()
#fit the model
#NOTE(review): validation_data is the same hold-out split (X_test, y_test) that is
#used for evaluation, so early stopping is tuned on it - consider a separate validation split
history = model.fit(X_train,y_train,epochs=500,callbacks=[callback],verbose=0,batch_size=32,validation_data=(X_test,y_test))

In [None]:
loss = pd.DataFrame(history.history)
loss.head(3)

In [None]:
#prediction on the test data ..
#scale into a new variable instead of overwriting `test`: the cell stays re-runnable
#and the unscaled frame remains available to the random-forest cell; the columns are
#selected in the training order the scaler was fitted on
test_scaled = scaler.transform(test[X.columns])
predictions = model.predict(test_scaled)

In [None]:
nn_pred = pd.DataFrame(predictions,columns=['Per Person Price'])
nn_pred.to_csv('nn_submission.csv')

In [None]:
plt.hist(nn_pred['Per Person Price'],bins=30,color='green',alpha=0.3)
plt.title('Neural Network Predicted Price')
plt.show()

In [None]:
plt.plot(loss[['loss','val_loss']])
plt.show()