In [1]:
library(tidyverse)
library(ggplot2)

Registered S3 method overwritten by 'rvest':
  method            from
  read_xml.response xml2
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1       v purrr   0.3.2  
v tibble  2.1.1       v dplyr   0.8.0.1
v tidyr   0.8.3       v stringr 1.4.0  
v readr   1.3.1       v forcats 0.4.0  
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [2]:
# load dataset
data <- read.csv("Downloads/Data_train.csv")
head(data)

"cannot open file 'Downloads/Data_train.csv': No such file or directory"

ERROR: Error in file(file, "rt"): cannot open the connection


In [None]:
# glimpse of data
str(data)

## Data Cleaning

In [None]:
# Remove duplicate rows
data <- data %>% distinct()
str(data)

In [None]:
# Convert date of journey to days of week only
data1 <- strftime(data$Date_of_Journey, "%A")

#Days of week
data2 <- data %>% 
    mutate(Days_of_week = data1)
head(data2)


In [None]:
#Count values for each Airlines
unique(data2$Airline)
airline <- data2 %>% group_by(Airline) %>% count()
airline

In [None]:
# Drop Airlines with less than 15 observations
data3 <- data2 %>%
  filter(Airline %in% c("IndiGo", "Air India", "Jet Airways", "SpiceJet", "Multiple carriers", "GoAir", "Vistara", "Air Asia"))
head(data3)

In [None]:
# Drop the following additional columns:
# Route and Additional_info
data3  <- data2  %>% 
    select(-c(5,10))
head(data3)

In [None]:
# split date_of_Journey column to Day, Month and Year
data3 = separate(data3, "Date_of_Journey", c("Day", "Month", "Year"), sep = "/")
head(data3)


In [None]:
# Drop Day and Year columns
data3  <- data3  %>% 
    select(-c(2,4))
head(data3)

In [None]:
# Select observations only with source from Banglore, Delhi and Kolkata
data4 <- data3 %>% 
    filter(Source %in% c("Banglore", "Delhi", "Kolkata"))
str(data4)

In [None]:
# Check unique values in Source
unique(data4$Source)

In [None]:
# Check unique values in Destination
unique(data4$Destination)

### We noticed New Delhi and Delhi are same city in India. So, we are going to combine them as New Delhi

In [None]:
destination <- replace(data4$Destination, data4$Destination =="Delhi","New Delhi")


In [None]:
data5 <- data4 %>% 
    mutate(Destination = destination)

In [None]:
unique(data5$Destination)

In [None]:
# Convert Dep_time(from 20:00 to 2000)
data5 <- separate(data5, Dep_Time, into = c("a","b"), sep = ":")
data5 <- unite(data5, a, b, col = "Dep_Time", sep = "")
head(data5)

In [None]:
# Change data type of Dep_time column to numeric
class(data5$Dep_Time)

In [None]:
data5$Dep_Time <- as.numeric(as.character(data5$Dep_Time))

In [None]:
class(data5$Dep_Time)

In [None]:
# convert Dep_time into four classes such as morning, evening, ......
data5 <- data5 %>%
  mutate(Departure = 0)%>%
  mutate(Departure = if_else(Dep_Time>=0000 & Dep_Time<0600, 1,Departure)) %>% 
  mutate(Departure = if_else(Dep_Time>=0600 & Dep_Time<1000, 2,Departure)) %>%
  mutate(Departure = if_else(Dep_Time>=1000 & Dep_Time<1800, 3,Departure)) %>%
  mutate(Departure = if_else(Dep_Time>=1800 & Dep_Time<2359, 4,Departure))
head(data5)

In [None]:
# Convert duration of hours and minutes to only minutes
data6 <- separate(data5, Duration, into = c("h","m"), sep = " ")
data6 <- separate(data6, h, into = c("a"), sep = "h")
data6 <- separate(data6, m, into = c("b"), sep = "m")
head(data6)

In [None]:
data6$a <- as.numeric(as.character(data6$a))
data6$b <- as.numeric(as.character(data6$b))
data6$a <- (data6$a*60)
head(data6)


In [None]:
data6$b = data6$b %>% replace_na(0)
data6 <- data6 %>% 
    mutate(Duration = data6$a + data6$b) %>% 
    select(-c(5,7,8)) # drop a, b, and Dep_Time columns
head(data6)


In [None]:
unique(data6$Total_Stops)

In [None]:
# filter data by total_stops values
data7 <- data6 %>% filter(Total_Stops %in% c("non-stop", "3 stops", "2 stops", "1 stop"))
str(data7)

In [None]:
unique(data7$Total_Stops)

In [None]:
data7$Total_Stops <- as.factor(data7$Total_Stops)

In [None]:
unique(data7$Source)

In [None]:
# Replace Delhi to New Delhi in source Column
# destination <- replace(data4$Destination, data4$Destination =="Delhi","New Delhi")
# data7$Destination  <- as.factor(data7$Destination)
#data7$Source  <- as.character(data7$Source)
# data7$Source  <- as.factor(data7$Source)
# data7$Days_of_week  <- as.factor(data7$Days_of_week)
#source <-replace(data7$Source, data7$Source == "Delhi", "New Delhi")
str(data7)

In [None]:
data8 <- data7 %>% 
    mutate(Source = source)
str(data8)

In [None]:
unique(data8$Source)

In [None]:
head(data8)

In [None]:
# Drop Arrival_time Column
data8  <- data8  %>% 
    select(-5)

In [None]:
head(data8)

## Data Visualizations

In [None]:
Plot_hist_Price <- ggplot(data8, aes(Price))+
  geom_histogram(fill = 'blue')
Plot_hist_Price 
# Data for Price is rightly skewed and needs to be transformed.

In [None]:
Plot_hist_logPrice <- ggplot(data8, aes(log(Price)))+
  geom_histogram(fill = "blue")
Plot_hist_logPrice
# Data looks good after the log transformation, so we will use price as log transformed.

In [None]:
Plot_AP <- ggplot(data8, aes(Airline,Price, color = Departure)) +
  geom_point()
Plot_AP

In [None]:
Plot_DP <- ggplot(data8, aes(Duration,Price, color = as.factor(Airline)))+
  geom_point()+
  facet_wrap(~Total_Stops)
Plot_DP

In [None]:
Plot_TP <- ggplot(data8, aes(Total_Stops,Price))+
  geom_point()
Plot_TP

In [None]:
#KEEP
Plot_TPbox <- ggplot(data8, aes(Total_Stops,Price, color = Total_Stops))+
  geom_boxplot()
Plot_TPbox

In [None]:
Plot_DAYP <- ggplot(data8, aes(Days_of_week,Price, color = Days_of_week)) +
  geom_boxplot()
Plot_DAYP

In [None]:
Plot_DeP <- ggplot(data8, aes(Departure,Price, color = as.factor(Departure)))+
  geom_boxplot()
Plot_DeP

In [None]:
#KEEP
Avg_week_price <- data8 %>% group_by(Days_of_week) %>% summarise(mean_price = mean(Price))
ggplot(Avg_week_price, aes(x = Days_of_week, y = mean_price, fill = Days_of_week)) + geom_bar(stat = "identity")+
  theme(axis.text.x = element_text(angle=90))

In [None]:
min_price <- data8 %>% group_by(Days_of_week) %>% summarise(minP = min(Price))
ggplot(aes(x = Days_of_week, y = minP), data = min_price) + geom_bar(stat = "identity")

In [None]:
#KEEP
max_price <- data8 %>% group_by(Days_of_week) %>% summarise(maxP = max(Price))
ggplot(aes(x = Days_of_week, y = maxP, fill = Days_of_week), data = max_price) + geom_bar(stat = "identity")+
  theme(axis.text.x = element_text(angle=90))

In [None]:
Avg_month_price <- data8 %>% group_by(Month) %>% summarise(meanP = mean(Price))
ggplot(aes(x = Month, y = meanP, fill = Month), data = Avg_month_price) + geom_bar(stat = "identity")

In [None]:
max_month_price <- data8 %>% group_by(Month) %>% summarise(maxP = max(Price))
ggplot(aes(x = Month, y = maxP, fill = Month), data = max_month_price) + geom_bar(stat = "identity")

In [None]:
min_month_price <- data8 %>% group_by(Month) %>% summarise(minP = min(Price))
ggplot(aes(x = Month, y = minP), data = min_month_price) + geom_bar(stat = "identity")

In [None]:
Plot_DesP <- ggplot(data8, aes(Destination,Price, color = as.factor(Destination)))+
  geom_boxplot()
Plot_DesP

In [None]:
Dest_Price <- data8 %>% group_by(Destination) %>% summarise(Average_Price = mean(Price))
Plot_DesP_bar <- ggplot(data8, aes(Destination,Price, fill = as.factor(Destination)))+
  geom_bar(stat = "identity")
Plot_DesP_bar

In [None]:
#TOTAL STOPS
Plot_TS <- ggplot(data8, aes(Total_Stops, fill = Total_Stops))+
  geom_bar()
Plot_TS

In [None]:
# DEAPRTURE
Plot_DePbar <- ggplot(data8, aes(Departure, fill = as.factor(Departure)))+
  geom_bar()
Plot_DePbar

In [None]:
#DURATION

Plot_hist_DUR <- ggplot(data8, aes(Duration))+
  geom_histogram()
Plot_hist_DUR

In [None]:
Plot_hist_logDUR <- ggplot(data8, aes(log(Duration)))+
  geom_histogram()
Plot_hist_logDUR

In [None]:
data8 %>% count(Month)

In [None]:
Plot_DAYP_bar <- ggplot(data8, aes(Days_of_week)) +
  geom_bar() 
Plot_DAYP_bar

In [None]:
data8 %>% count(Days_of_week)


In [None]:
data9 <- data8 %>% 
  select(-c(Source)) # dropped source 

In [None]:
head(data9)

In [None]:
str(data9)

In [None]:
data9$Departure  <- as.factor(data9$Departure) # change class as factor

In [None]:
data9$Month  <- as.factor(data9$Month) # change type to factor

In [None]:
str(data9)

In [None]:
# install.packages("caTools")
library(caTools)


In [None]:
#use caTools function to split, SplitRatio for 70%:30% splitting
set.seed( 123)
df= sample.split(data9,SplitRatio = 0.3)

In [None]:
#subsetting into Train data
data9_test =subset(data9,df==TRUE)
#subsetting into Test data
data9_train =subset(data9,df==FALSE)

In [None]:
str(data9_train)

In [None]:
str(data9_test)

In [None]:
sum(is.na(data9)) # no na values in dataframe

## Linear Regression Model

In [None]:
# Linear REgression Model 
LR <- lm(Price~.,data9_train)

In [None]:
summary(LR)

In [None]:
# Linear REg model with log of price and log od Duration
LR2 <- lm(log(Price)~ Airline + Month + Destination + Total_Stops + Days_of_week + 
          as.factor(Departure) + log(Duration),data9_train)

In [None]:
summary(LR2)

In [None]:
# Linear REgression with only log(Price)----This is our preferred model of simple Multiple linear regression
LR3 <- lm(log(Price)~.,data9_train)
summary(LR3)

In [None]:
#First prediction on train data using LR3
predictLR3 <-  predict(LR3)

In [None]:
data10 <- data9_train %>% 
    mutate(Predicted_Price = predictLR3)

In [None]:
data11 <- data10 %>% 
    mutate(Predicted_Price_original = exp(Predicted_Price)) 



In [None]:
#install.packages("Metrics")
library(Metrics)

In [None]:
head(data11)

In [None]:
# calculate RMSE at original scale
rmse(data11$Price, data11$Predicted_Price_original)

In [None]:
#First prediction on test data

predict_test_LR3 <-  predict(LR3, data9_test)

data12 <- data9_test %>% 
    mutate(Predicted_test_Price = predict_test_LR3)

data13 <- data12 %>%
    mutate(Predicted_Price_original = exp(Predicted_test_Price)) 

rmse(data13$Price, data13$Predicted_Price_original)

## Decision Tree Model

In [None]:
# install.packages("rpart")
library(rpart)

In [None]:
#Decision Tree
Decision_Tree_Train <- rpart(Price~.,data9_train, method = "anova")


In [None]:
summary(Decision_Tree_Train)

In [None]:
# Model Evaluation in train set
Predict <- predict(Decision_Tree_Train)

data9_train_tree <- data9_train %>% 
    mutate(Predicted_Price = Predict)

rmse(data9_train_tree$Price, data9_train_tree$Predicted_Price)

In [None]:
# Prediction on Test data
Predict <- predict(Decision_Tree_Train, data9_test)

data9_test_tree <- data9_test %>% 
    mutate(Predicted_Price = Predict)

rmse(data9_test_tree$Price,data9_test_tree$Predicted_Price)

In [None]:
#Decision Tree with log(Price)
DT_log <- rpart(log(Price)~.,data9_train, method = "anova")

In [None]:
summary(DT_log)

In [None]:
# Model Evaluation in train set
Predict <- predict(DT_log)

DT_log_train <- data9_train %>% 
    mutate(Predicted_Price = exp(Predict))

rmse(DT_log_train$Price, DT_log_train$Predicted_Price)

In [None]:
# Prediction on Test data
Predict <- predict(DT_log, data9_test)

DT_log_test <- data9_test %>% 
    mutate(Predicted_Price = exp(Predict))

rmse(DT_log_test$Price,DT_log_test$Predicted_Price)

## Random Forests Model

In [None]:
library(randomForest)

In [None]:
# RANDOM FOREST TRAIN
RF <- randomForest(Price~.,data9_train, importance=TRUE)


In [None]:
summary(RF)

In [None]:
# Model Evaluation on Train set
Predict_forest_train <- predict(RF)

RF_train <- data9_train %>% mutate(Predicted_Price = Predict_forest_train)

rmse(RF_train$Price, RF_train$Predicted_Price)

In [None]:
# RANDOM FOREST TEST
Predict_forest_test <- predict(RF, data9_test)

RF_test<- data9_test %>% 
    mutate(Predicted_Price = Predict_forest_test)

rmse(RF_test$Price, RF_test$Predicted_Price)

In [None]:
#RAndomForests model with log(Price)
RF_log <- randomForest(log(Price)~.,data9_train, importance=TRUE)

In [None]:
summary(RF_log)

In [None]:
# Model Evaluation on Train set
RF_log_train_predict <- predict(RF_log)

RF_log_train <- data9_train %>% mutate(Predicted_Price = exp(RF_log_train_predict))

rmse(RF_log_train$Price, RF_log_train$Predicted_Price)

In [None]:
# RANDOM FOREST TEST
RF_log_test_predict <- predict(RF_log, data9_test)

RF_log_test<- data9_test %>% 
    mutate(Predicted_Price = exp(RF_log_test_predict))

rmse(RF_log_test$Price, RF_log_test$Predicted_Price)

## Gradiant Boosting Model

In [None]:
#install.packages('gbm')
library(gbm)

In [None]:
GB_model=gbm(Price ~ . ,data = data9_train,distribution = "gaussian",n.trees = 10000,
                  shrinkage = 0.01, interaction.depth = 4)


In [None]:
summary(GB_model) #Summary gives a table of Variable Importance and a plot of Variable Importance

In [None]:
# Model Evaluation on Train set
Predict_forest_train <- predict(RF)

RF_train <- data9_train %>% mutate(Predicted_Price = Predict_forest_train)

rmse(RF_train$Price, RF_train$Predicted_Price)

### Prediction on Test Set

In [None]:
GB_test_predict=predict(GB_model, n.trees = GB_model$n.trees, data9_test)

GB_test<- data9_test %>% 
    mutate(Predicted_Price = GB_test_predict)

rmse(GB_test$Price, GB_test$Predicted_Price)

### Prediction on Train Set

In [None]:
GB_train_predict=predict(GB_model, data9_train)

GB_train<- data9_train %>% 
    mutate(Predicted_Price = GB_train_predict)

rmse(GB_train$Price, GB_train$Predicted_Price)

In [None]:
Dest_Price <- data8 %>% group_by(Destination,Airline) %>% summarise(Average_Price = mean(Price))
Plot_DesP_line <- ggplot(Dest_Price, aes(y = Average_Price, x = Airline, group = Destination))+
    geom_line(aes(color = Destination))+
    geom_point(aes(color = Destination))+
    theme(axis.text.x = element_text(angle=90))
Plot_DesP_line

In [None]:
head(Dest_Price)

In [None]:
str(data9)

In [None]:
unique(data9$Airline)

In [None]:
data9$Airline  <- as.character(data9$Airline)

In [None]:
str(data9)

In [None]:
data9$Airline  <- as.factor(data9$Airline)

In [None]:
str(data9)

In [None]:
Dest_Duration <- data8 %>% group_by(Destination,Airline) %>% summarise(Average_Duration = mean(Duration))
Plot_DesD_line <- ggplot(Dest_Duration, aes(y = Average_Duration, x = Airline, group = Destination))+
    geom_line(aes(color = Destination))+
    geom_point(aes(color = Destination))+
    theme(axis.text.x = element_text(angle=90))
Plot_DesD_line

In [None]:

Plot_DesA_line <- ggplot(Dest_Duration, aes(y = Average_Duration, x = Destination, group = Airline))+
    geom_line(aes(color = Airline))+
    geom_point(aes(color = Airline))
Plot_DesA_line

In [None]:
# Gradiant Boosting using log(Price)
# install.packages("xgboost")
library(xgboost) #for fitting the xgboost model


In [None]:
library(caret)   #for general data preparation and model fitting

In [None]:
#make this example reproducible
set.seed(0)

#split into training (80%) and testing set (20%)
parts = createDataPartition(data9$Price, p = .7, list = F)
train = data9[parts, ]
test = data9[-parts, ]

In [None]:
#define predictor and response variables in training set
train_x = data.matrix(train[, -5])
train_y = train[,5]

#define predictor and response variables in testing set
test_x = data.matrix(test[, -5])
test_y = test[, 5]

#define final training and testing sets
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)

In [None]:
head(train)

In [None]:
head(test_y)

In [None]:
head(train_x)

In [None]:
#define watchlist
watchlist = list(train=xgb_train, test=xgb_test)

#fit XGBoost model and display training and testing data at each round
model = xgb.train(data = xgb_train, max.depth = 3, watchlist=watchlist, nrounds = 1000)