# Introduction

#### This project is about predicting the workation trip price for a person.

#### The workation trip price will be based on various factors such as destination location, hotels and their amenities, and so on.

### Import Necessary Packages

In [None]:
import pandas as pd #for data manipulation
import numpy as np #for mathematical operations
import seaborn as sns #for visualization
from plotnine import * #for visualization (based on ggplot library)
import warnings
warnings.filterwarnings('ignore') #ignore warning message

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

### Import Train and Test Dataset

In [None]:
# Load the train and test CSVs into DataFrames.
# NOTE(review): the relative '../input/...' paths presumably match a Kaggle kernel layout — confirm before running elsewhere.
train=pd.read_csv('../input/workation-price-prediction-challengemachinehack/Train.csv')
test=pd.read_csv('../input/workation-price-prediction-challengemachinehack/Test.csv')

### Let's look at the structure of the train and test datasets

In [None]:
train.info()

#### In the train dataset there are 15 columns (including the target column) and 21000 data entries.
#### The target column is **Per Person Price**

In [None]:
test.info()

#### In test dataset there are 14 columns and 9000 data entries.

### Let's view the sample data from train and test dataset

In [None]:
train.head(2)

In [None]:
test.head(2)

### Let's check if there is any missing value in train and test dataset.

In [None]:
train.isnull().sum()

#### The above summary shows that there are no missing values in any column.

In [None]:
test.isnull().sum()

#### The above summary shows that there are no missing values in any column.

### Let's perform exploratory data analysis and see how the workation destination price changes based on various factors.

### let's see target column(**Per Person Price**) distribution.

#### Let's create a function to find the interquartile range.

In [None]:
def iqr(x) -> float:
    """Return the interquartile range (Q3 - Q1) of ``x``.

    Both quartiles come from a single ``numpy.percentile`` call using the
    ``'midpoint'`` rule: when a quartile falls between two data points,
    the average of those two points is used.

    Parameters
    ----------
    x : array-like of int or float
        Numerical values (list, NumPy array, pandas Series, ...).

    Returns
    -------
    float
        The 75th percentile minus the 25th percentile.
    """
    try:
        # ``method`` replaced the deprecated ``interpolation`` keyword in NumPy 1.22.
        q1_x, q3_x = np.percentile(x, [25, 75], method='midpoint')
    except TypeError:
        # Older NumPy releases only know the previous keyword name.
        q1_x, q3_x = np.percentile(x, [25, 75], interpolation='midpoint')
    return q3_x - q1_x

### Let's create a function to find the optimal bin width by using the **Freedman-Diaconis Rule**

In [None]:
def bin_w(x) -> float:
    """Return the histogram bin width given by the Freedman-Diaconis rule.

    The width is ``2 * IQR(x) / n ** (1/3)``, where ``IQR`` is computed by
    the ``iqr`` function and ``n`` is the number of observations.

    Parameters
    ----------
    x : array-like of int or float
        Numerical values (list, NumPy array, pandas Series, ...).

    Returns
    -------
    float
        The bin width. It is 0 when the IQR of ``x`` is 0, which callers
        should not pass on as a histogram bin width.
    """
    # len() works for plain lists as well as arrays/Series, unlike x.shape[0].
    n_obs = len(x)
    return (2 * iqr(x)) / np.power(n_obs, 1/3)

In [None]:
# Histogram of the target, binned with the Freedman-Diaconis width.
price_bw = bin_w(train['Per Person Price'])

price_hist_layer = geom_histogram(
    aes(x='Per Person Price'),
    fill='green',
    color='yellow',
    binwidth=price_bw,
)
price_hist_theme = theme(
    figure_size=(8, 8),
    axis_ticks=element_blank(),
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + price_hist_layer
    + labs(y='', title='Per Person Price Distribution')
    + theme_seaborn(style='ticks')
    + price_hist_theme
)

In [None]:
# Single boxplot of the target; x is the constant '0' so all rows share one box.
price_box_layer = geom_boxplot(
    aes(x='0', y='Per Person Price'),
    fill='green',
    color='red',
)
price_box_theme = theme(
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_text_x=element_blank(),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + price_box_layer
    + labs(x='', y='', title='Per Person Price Boxplot')
    + theme_seaborn(style='ticks')
    + price_box_theme
)

In [None]:
print("The Median is:",train['Per Person Price'].median())

In [None]:
import statistics as st

In [None]:
print("The Mode is:",st.mode(train['Per Person Price']))

In [None]:
train['Per Person Price'].describe()

#### The histogram above shows that the per person price distribution is right-skewed (the mode is less than the median).
#### The boxplot shows that there are some outliers above the third quartile (Q3).
#### The mean is 20059.

### Let's see how many workation trip packages are available.

In [None]:
train['Package Name'].nunique()

#### There are 2204 unique workation trip packages. Let's compare the various package names across package types by using word clouds.

In [None]:
train['Package Type'].value_counts()

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords=set(STOPWORDS)

In [None]:
# Draw one word cloud of package names per package type.
package_types = train['Package Type'].unique()
n_cols = 2
# Keep the original 3-row layout, but grow it (ceiling division) when there
# are more package types than a 3x2 grid can hold, so plt.subplot never
# receives an out-of-range index.
n_rows = max(3, -(-len(package_types) // n_cols))

fig = plt.figure(figsize=(14, 14))
for c, x in enumerate(package_types, start=1):
    # Concatenate every package name of this type into one text blob.
    names_text = train[train['Package Type'] == x]['Package Name'].to_string()
    wc = WordCloud(background_color="white", max_words=100, stopwords=stopwords,
                   max_font_size=40, random_state=42).generate(names_text)

    plt.subplot(n_rows, n_cols, c)
    plt.imshow(wc)
    plt.title(x, fontsize='24', fontweight='20')
    plt.axis("off")
plt.show()

#### The above wordcloud shows various package names under different types of workation trip package.

### Let's analyse the relationship between price and package types.

In [None]:
# Violin plot of price per package type with a narrow boxplot overlaid.
type_violin_layer = geom_violin(
    aes(x='Package Type', y='Per Person Price', fill='Package Type')
)
type_box_layer = geom_boxplot(
    aes(x='Package Type', y='Per Person Price'),
    fill=None,
    width=0.3,
)
type_violin_theme = theme(
    figure_size=(8, 5),
    legend_position='none',
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + type_violin_layer
    + type_box_layer
    + labs(title='Package Type and Per Person Price')
    + theme_seaborn(style='ticks')
    + type_violin_theme
)

In [None]:
train.groupby(['Package Type'],as_index=False).agg({'Per Person Price':['min','median','mean','max']})

#### The above violin plot shows that the **per person price** differs significantly across **package types**.
#### The difference between the minimum per person prices of the luxury and deluxe packages is very small.

### Let's calculate the number of nights stayed for each package.

#### The **Itinerary** column has the destination location name and its number of night stay information.

#### Let's collect those numbers and find a total staying night.

In [None]:
import re #regular expression

In [None]:
#create function to extract the numbers from the 'Itinerary' column and return their sum.
def sum_n(x):
    """Return the sum of all integers found in ``x``.

    Every run of digits in the text is read as one number, so digits that are
    not night counts would be included in the total as well.

    Parameters
    ----------
    x : object
        An itinerary string. Any non-string value (``None``, ``NaN``, ...)
        is treated as "no information".

    Returns
    -------
    int
        Sum of the numbers found; 0 when ``x`` is not a string or contains
        no digits.
    """
    # The previous guard ``if not None`` was always true, so missing values
    # reached re.findall and raised TypeError.
    if not isinstance(x, str):
        return 0
    return sum(int(number) for number in re.findall(r'\d+', x))

In [None]:
#create a new column to store the total days of stay 
train["total_days_stay"] = train['Itinerary'].map(sum_n) #using map function to map the created sum_n function 

### Let's see package type-wise total days of stay.

In [None]:
# Count how many packages exist for each (package type, total nights) pair.
stay_counts = (
    train
    .groupby(['Package Type', 'total_days_stay'])
    .agg({'total_days_stay': ['count']})
)
stay_counts.columns = ['total']

# Sort by type then count (both descending), drop empty combinations and make
# the night count categorical so it is plotted as discrete bars.
tot_nights = (
    stay_counts
    .sort_values(["Package Type", "total"], ascending=False)
    .reset_index()
    .loc[lambda counts: counts['total'] > 0]
    .astype({'total_days_stay': 'category'})
)


In [None]:
# Bar chart of package counts per total-nights value, one facet per package type.
stay_bar_layer = geom_bar(
    aes(x='total_days_stay', y='total', fill='total_days_stay'),
    stat='identity',
)
stay_bar_theme = theme(
    figure_size=(12, 10),
    legend_position='none',
    subplots_adjust={'hspace': 0.4, 'wspace': 0.4},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(tot_nights)
    + stay_bar_layer
    + facet_wrap('Package Type', scales='free', ncol=2)
    + labs(title='Package Type wise Staying Day Categories')
    + theme_seaborn(style='ticks')
    + stay_bar_theme
)

#### The above chart shows that the budget workation trip package type consists mostly of three-day stay packages.
#### The standard, premium, luxury, and deluxe workation trip package types consist mostly of 3 to 6-day stay packages.

### Let's compare the price distribution between the package type and total days of stay.

In [None]:
# Price boxplots per total-nights value, one facet per package type.
# The night count is cast to category so each value gets its own box.
stay_as_category = train.astype({'total_days_stay': 'category'})
stay_box_layer = geom_boxplot(
    aes(x='total_days_stay', y='Per Person Price', fill='total_days_stay')
)
stay_box_theme = theme(
    figure_size=(10, 10),
    legend_position='none',
    subplots_adjust={'hspace': 0.4, 'wspace': 0.4},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(stay_as_category)
    + stay_box_layer
    + facet_wrap('Package Type', scales='free', ncol=2)
    + labs(title=' Per Person Price Distribution between Package Type and Total Days of Stay')
    + theme_seaborn(style='ticks')
    + stay_box_theme
)

In [None]:
#train[train['Package Type'].isin(['Premium','Standard'])].groupby(['Package Type','total_days_stay']).agg({'Per Person Price':['min','median','max']}).reset_index()

train.groupby(['Package Type','total_days_stay'],as_index=False).agg({'Per Person Price':['min','median','max']})

#### The chart explains that the per person workation trip price will increase based on the total days of stay.

#### The premium and Luxury package's 10-night stay trip's per person price is cheaper than the budget package's maximum price.


### Let's see which destination location is most popular based on categories of total days of stay.

In [None]:
# Draw one word cloud of covered places per total-nights value.
stay_values = train['total_days_stay'].sort_values().unique()
n_cols = 3
# Keep the original 5-row layout, but grow it (ceiling division) when there
# are more distinct night counts than a 5x3 grid can hold, so plt.subplot
# never receives an out-of-range index.
n_rows = max(5, -(-len(stay_values) // n_cols))

fig = plt.figure(figsize=(20, 20))
for c, x in enumerate(stay_values, start=1):
    # Concatenate the places of every package with this many nights.
    places_text = train[train['total_days_stay'] == x]['Places Covered'].to_string()
    wc = WordCloud(background_color="white", max_words=1000, stopwords=stopwords,
                   max_font_size=40, random_state=42).generate(places_text)
    plt.subplot(n_rows, n_cols, c)
    plt.imshow(wc)
    plt.title(label="{}_Night_Stay".format(x), fontsize='26', fontweight='20')
    plt.axis("off")
plt.show()

#### The above word clouds show the most popular destinations for each total number of nights stayed.

### Let's see how many destinations locations will be covered by each type of packages.

#### Let's create a function to count the number of destination places.

In [None]:
#function to count the number of locations in each package
def count_dest(x):
    """Return how many ``|``-separated destinations ``x`` contains.

    Parameters
    ----------
    x : object
        Destination names joined with ``|``. Any non-string value
        (``None``, ``NaN``, ...) is treated as "no destinations".

    Returns
    -------
    int
        Number of ``|``-separated entries; 0 when ``x`` is not a string or
        is empty/blank.
    """
    # The previous guard ``if not None`` was always true, so missing values
    # reached ``x.split`` and raised AttributeError.
    if not isinstance(x, str) or not x.strip():
        return 0
    return len(x.split("|"))

In [None]:
train['num_of_places']=train['Places Covered'].map(count_dest) #map count_dest function to column

In [None]:
# Bar chart of how many packages cover each number of destinations.
# Bars are ordered by frequency through the discrete x-scale limits.
places_as_category = train.astype({'num_of_places': 'category'})
places_by_frequency = train['num_of_places'].value_counts().index
places_bar_theme = theme(
    figure_size=(8, 5),
    legend_position='none',
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(places_as_category)
    + geom_bar(aes(x='num_of_places', fill='num_of_places'))
    + scale_x_discrete(limits=places_by_frequency)
    + labs(title='Number of Destination Places of Workation Trip')
    + theme_seaborn(style='ticks')
    + places_bar_theme
)

In [None]:
train['num_of_places'].value_counts().sort_values(ascending=False)

#### The above chart explains that most of the workation trips will have 1 to 4 destination locations.

### Let's see the number of locations count by package type.

In [None]:
# Count packages per (package type, number of destinations) pair.
# The aggregation is passed as a list: the previous set literal {'count'} is
# unordered and not one of the documented ``func`` forms of GroupBy.agg.
num_place_package = (
    train[['Package Type', 'num_of_places']]
    .groupby(['Package Type', 'num_of_places'])['num_of_places']
    .agg(['count'])
    .reset_index()
    .sort_values(by=['Package Type', 'count'], ascending=False)
    # Categorical so the destination count is plotted as discrete bars.
    .astype({'num_of_places': 'category'})
)

In [None]:
# Bar chart of package counts per number of destinations, faceted by package type.
places_type_bar_layer = geom_bar(
    aes(x='num_of_places', y='count', fill='num_of_places'),
    stat='identity',
)
places_type_bar_theme = theme(
    figure_size=(12, 8),
    legend_position='none',
    subplots_adjust={'hspace': 0.4, 'wspace': 0.4},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(num_place_package)
    + places_type_bar_layer
    + facet_wrap('Package Type', scales='free', ncol=2)
    + labs(title='Package Type and its Number of Destination Location')
    + theme_seaborn(style='ticks')
    + places_type_bar_theme
)

#### The above chart explains that the budget workation trip package has mostly one destination location.
#### The deluxe, luxury, premium, and standard workation trip package have mostly 1 to 3 destination locations.

### Let's see whether the per person price changes with the number of destinations

In [None]:
# Price boxplots per number of destinations, one facet per package type.
# The destination count is cast to category so each value gets its own box.
places_category_frame = train.astype({'num_of_places': 'category'})
places_box_layer = geom_boxplot(
    aes(x='num_of_places', y='Per Person Price', fill='num_of_places')
)
places_box_theme = theme(
    figure_size=(10, 10),
    legend_position='none',
    subplots_adjust={'hspace': 0.4, 'wspace': 0.4},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=14, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(places_category_frame)
    + places_box_layer
    + facet_wrap('Package Type', scales='free', ncol=2)
    + labs(title=' Per Person Price Distribution between Package Type and Number of Destinations')
    + theme_seaborn(style='ticks')
    + places_box_theme
)

In [None]:
(train[['Package Type','num_of_places','Per Person Price']].groupby(['Package Type','num_of_places']).
 agg({'Per Person Price':['min','mean','median','max']}).reset_index())

#### The above boxplot explains the various price ranges based on the number of destinations.
#### For the standard package with 11 destinations, the minimum and maximum prices are in a close range.

### Let's see the most popular workation trip locations for each number of destinations.

In [None]:
# Draw one word cloud of covered places per number-of-destinations value.
place_counts = train['num_of_places'].sort_values().unique()
n_cols = 2
# Keep the original 5-row layout, but grow it (ceiling division) when there
# are more distinct destination counts than a 5x2 grid can hold, so
# plt.subplot never receives an out-of-range index.
n_rows = max(5, -(-len(place_counts) // n_cols))

fig = plt.figure(figsize=(20, 20))
for c, x in enumerate(place_counts, start=1):
    # Concatenate the places of every package with this many destinations.
    places_text = train[train['num_of_places'] == x]['Places Covered'].to_string()
    wc = WordCloud(background_color="white", max_words=1000, stopwords=stopwords,
                   max_font_size=40, random_state=42).generate(places_text)
    plt.subplot(n_rows, n_cols, c)
    plt.imshow(wc)
    plt.title(label="{}_Location_trip".format(x), fontsize='26', fontweight='20')
    plt.axis("off")
plt.show()

#### The above word clouds show the popular workation trip locations for each number of destinations.

### Let's see how per person price is changing over time.

In [None]:
# Parse the travel date strings into datetimes; errors='coerce' turns unparseable values into NaT instead of raising.
# NOTE(review): no explicit format/dayfirst is given, so ambiguous day/month strings rely on pandas' inference — confirm against the raw 'Travel Date' values.
train['Travel Date']=pd.to_datetime(train['Travel Date'], errors='coerce')

In [None]:
# Line plot of per person price against travel date (labels as "Mon,YYYY").
price_line_layer = geom_line(
    aes(x='Travel Date', y='Per Person Price'),
    color='green',
)
price_line_theme = theme(
    figure_size=(15, 8),
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=13, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + price_line_layer
    + scale_x_datetime(date_labels="%b,%Y")
    + labs(title='Per Person Price Changes over Time')
    + theme_seaborn(style='ticks')
    + price_line_theme
)

### Let's break down the date into month, year, quarter, and so on.

In [None]:
# Break the travel date into calendar features, one column per statement so
# each derivation is easy to read and change.
travel_date = train['Travel Date']
train['day'] = travel_date.dt.day
train['day_label'] = travel_date.dt.day_name()
train['day_number'] = travel_date.dt.dayofweek  # Monday=0 ... Sunday=6
train['month_number'] = travel_date.dt.month
train['month_label'] = travel_date.dt.strftime('%b')
train['year_quarter'] = travel_date.dt.quarter
# Series.dt.week was deprecated in pandas 1.1 and removed in pandas 2.0;
# dt.isocalendar().week is the supported replacement (ISO 8601 week number,
# returned with the nullable UInt32 dtype).
train['week_of_year'] = travel_date.dt.isocalendar().week
train['year'] = travel_date.dt.year

In [None]:
train.head(2)

### Let's see how the per person price changes over time for each package type.

In [None]:
# Price over time, one stacked facet (single column) per package type.
type_line_layer = geom_line(
    aes(x='Travel Date', y='Per Person Price'),
    color='green',
)
type_line_theme = theme(
    figure_size=(10, 10),
    subplots_adjust={'hspace': 0.8, 'wspace': 0.4},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=10, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + type_line_layer
    + scale_x_datetime(date_labels="%b,%Y")
    + facet_wrap('Package Type', scales='free', ncol=1)
    + labs(title='Package Type Wise Per Person Price Changes over Time')
    + theme_seaborn(style='ticks')
    + type_line_theme
)

#### The above line plot explains how price changes over time based on the package type.

### Let's see how per person price changes over time by various category days of stay.

In [None]:
# Price over time, one facet per total-nights value (month-only tick labels).
stay_line_layer = geom_line(
    aes(x='Travel Date', y='Per Person Price'),
    color='green',
)
stay_line_theme = theme(
    figure_size=(15, 12),
    subplots_adjust={'hspace': 1, 'wspace': 0.2},
    plot_title=element_text(style='normal', size=16, weight='bold'),
    axis_text=element_text(style='normal', size=10, weight='bold'),
    axis_ticks=element_blank(),
    axis_title=element_text(style='normal', size=14, weight='bold'),
    strip_text=element_text(style='normal', size=14, weight='bold'),
)

(
    ggplot(train)
    + stay_line_layer
    + scale_x_datetime(date_labels="%b")
    + facet_wrap('total_days_stay', scales='free', ncol=2)
    + labs(title='Per Person Price Changes over Time by Various Category Days of Stay')
    + theme_seaborn(style='ticks')
    + stay_line_theme
)

### let's see how per person price changes over time by various category of destination location count.

In [None]:
# Price over time, one facet per number-of-destinations value.
# The plot title previously read "Destinatio"; the typo is fixed here.
# NOTE(review): tick labels use "%m,%y" while the other time plots use
# "%b,%Y" or "%b" — left unchanged, confirm whether that is intentional.
(
    ggplot(train)
    + geom_line(aes(x='Travel Date', y='Per Person Price'), color='green')
    + scale_x_datetime(date_labels="%m,%y")
    + facet_wrap('num_of_places', scales='free', ncol=2)
    + labs(title='Per Person Price Changes over Time by Various Category of Destination Location Count')
    + theme_seaborn(style='ticks')
    + theme(
        figure_size=(15, 12),
        subplots_adjust={'hspace': 1, 'wspace': 0.2},
        plot_title=element_text(style='normal', size=16, weight='bold'),
        axis_text=element_text(style='normal', size=10, weight='bold'),
        axis_ticks=element_blank(),
        axis_title=element_text(style='normal', size=14, weight='bold'),
        strip_text=element_text(style='normal', size=14, weight='bold'),
    )
)

### Let's use calplot and see how  per person price changes over time.

In [None]:
!pip install calplot

In [None]:
import calplot

In [None]:
price = pd.Series(train['Per Person Price'].values, index=train['Travel Date'])

In [None]:
cal_plot=calplot.calplot(price,edgecolor="red",cmap='twilight',linewidth=5,
                         yearlabel_kws = {"fontsize":"large"},
                        figsize=(40,30))