In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
train_data = pd.read_excel("Data/train_data.xlsx")
test_data = pd.read_excel("Data/test_data.xlsx")

In [3]:
df = pd.concat([train_data, test_data], ignore_index=True)

In [4]:
df

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897.0
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662.0
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882.0
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218.0
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302.0
...,...,...,...,...,...,...,...,...,...,...,...
13349,Air India,6/06/2019,Kolkata,Banglore,CCU → DEL → BLR,20:30,20:25 07 Jun,23h 55m,1 stop,No info,
13350,IndiGo,27/03/2019,Kolkata,Banglore,CCU → BLR,14:20,16:55,2h 35m,non-stop,No info,
13351,Jet Airways,6/03/2019,Delhi,Cochin,DEL → BOM → COK,21:50,04:25 07 Mar,6h 35m,1 stop,No info,
13352,Air India,6/03/2019,Delhi,Cochin,DEL → BOM → COK,04:00,19:15,15h 15m,1 stop,No info,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13354 entries, 0 to 13353
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Airline          13354 non-null  object 
 1   Date_of_Journey  13354 non-null  object 
 2   Source           13354 non-null  object 
 3   Destination      13354 non-null  object 
 4   Route            13353 non-null  object 
 5   Dep_Time         13354 non-null  object 
 6   Arrival_Time     13354 non-null  object 
 7   Duration         13354 non-null  object 
 8   Total_Stops      13353 non-null  object 
 9   Additional_Info  13354 non-null  object 
 10  Price            10683 non-null  float64
dtypes: float64(1), object(10)
memory usage: 1.1+ MB


In [6]:
df.isnull().sum()

Airline               0
Date_of_Journey       0
Source                0
Destination           0
Route                 1
Dep_Time              0
Arrival_Time          0
Duration              0
Total_Stops           1
Additional_Info       0
Price              2671
dtype: int64

# Feature Engineering

In [7]:
# dropping the useless columns

df.drop("Additional_Info", axis = 1, inplace = True)
df.drop("Route", axis = 1, inplace = True)

In [8]:
# dropping the row which contains null value

df.drop(9039, inplace = True)

In [9]:
df["Date"] = df["Date_of_Journey"].str.split("/").str[0]
df["Date"] = df["Date"].astype(int)

df["Month"] = df["Date_of_Journey"].str.split("/").str[1]
df["Month"] = df["Month"].astype(int)

df["Year"] = df["Date_of_Journey"].str.split("/").str[2]
df["Year"] = df["Year"].astype(int)

df.drop("Date_of_Journey", axis = 1, inplace = True)

In [10]:
df["Arrival_Time"] = df["Arrival_Time"].apply(lambda time : time.split(" ")[0])

df["Arrival_Hour"] = df["Arrival_Time"].apply(lambda time : time.split(":")[0])
df["Arrival_Hour"] = df["Arrival_Hour"].astype(int)

df["Arrival_Min"] = df["Arrival_Time"].apply(lambda time : time.split(":")[1])
df["Arrival_Min"] = df["Arrival_Min"].astype(int)

df.drop("Arrival_Time", axis = 1, inplace = True)

In [11]:
df["Departure_Hour"] = df["Dep_Time"].apply(lambda time : time.split(":")[0])
df["Departure_Hour"] = df["Departure_Hour"].astype(int)

df["Departure_Min"] = df["Dep_Time"].apply(lambda time : time.split(":")[1])
df["Departure_Min"] = df["Departure_Min"].astype(int)

df.drop("Dep_Time", axis = 1, inplace = True)

In [12]:
df["Total_Stops"] = df["Total_Stops"].replace({"1 stop": 1, "non-stop": 0, "2 stops": 2, "3 stops": 3, "4 stops": 4})

In [13]:
df["Duration_Hours"] = df["Duration"].str.split(" ").str[0].str[:-1]
df["Duration_Hours"] = df["Duration_Hours"].astype(int)

df["Duration_Minutes"] = df["Duration"].str.split(" ").str[1].str[:-1]
df["Duration_Minutes"].fillna(0, inplace = True)
df["Duration_Minutes"] = df["Duration_Minutes"].astype(int)

df.drop("Duration", axis = 1, inplace = True)

In [14]:
df["Duration_Minutes"] = (df["Duration_Hours"] * 60) + df["Duration_Minutes"]
df.drop("Duration_Hours", axis = 1, inplace = True)

In [15]:
le = LabelEncoder()

In [16]:
df["Airline"] = le.fit_transform(df["Airline"])
df["Source"] = le.fit_transform(df["Source"])
df["Destination"] = le.fit_transform(df["Destination"])

In [17]:
df

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month,Year,Arrival_Hour,Arrival_Min,Departure_Hour,Departure_Min,Duration_Minutes
0,3,0,5,0,3897.0,24,3,2019,1,10,22,20,170
1,1,3,0,2,7662.0,1,5,2019,13,15,5,50,445
2,4,2,1,2,13882.0,9,6,2019,4,25,9,25,1140
3,3,3,0,1,6218.0,12,5,2019,23,30,18,5,325
4,3,0,5,1,13302.0,1,3,2019,21,35,16,50,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,1,3,0,1,,6,6,2019,20,25,20,30,1435
13350,3,3,0,0,,27,3,2019,16,55,14,20,155
13351,4,2,1,1,,6,3,2019,4,25,21,50,395
13352,1,2,1,1,,6,3,2019,19,15,4,0,915


In [18]:
test_data = df[df["Price"].isnull()]
test_data.drop("Price", axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop("Price", axis = 1, inplace = True)


In [19]:
train_data = df[~df["Price"].isnull()]

In [20]:
test_data

Unnamed: 0,Airline,Source,Destination,Total_Stops,Date,Month,Year,Arrival_Hour,Arrival_Min,Departure_Hour,Departure_Min,Duration_Minutes
10683,4,2,1,1,6,6,2019,4,25,17,30,655
10684,3,3,0,1,12,5,2019,10,20,6,20,240
10685,4,2,1,1,21,5,2019,19,0,19,15,1425
10686,6,2,1,1,21,5,2019,21,0,8,0,780
10687,0,0,2,0,24,6,2019,2,45,23,55,170
...,...,...,...,...,...,...,...,...,...,...,...,...
13349,1,3,0,1,6,6,2019,20,25,20,30,1435
13350,3,3,0,0,27,3,2019,16,55,14,20,155
13351,4,2,1,1,6,3,2019,4,25,21,50,395
13352,1,2,1,1,6,3,2019,19,15,4,0,915


In [21]:
train_data

Unnamed: 0,Airline,Source,Destination,Total_Stops,Price,Date,Month,Year,Arrival_Hour,Arrival_Min,Departure_Hour,Departure_Min,Duration_Minutes
0,3,0,5,0,3897.0,24,3,2019,1,10,22,20,170
1,1,3,0,2,7662.0,1,5,2019,13,15,5,50,445
2,4,2,1,2,13882.0,9,6,2019,4,25,9,25,1140
3,3,3,0,1,6218.0,12,5,2019,23,30,18,5,325
4,3,0,5,1,13302.0,1,3,2019,21,35,16,50,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10678,0,3,0,0,4107.0,9,4,2019,22,25,19,55,150
10679,1,3,0,0,4145.0,27,4,2019,23,20,20,45,155
10680,4,0,2,0,7229.0,27,4,2019,11,20,8,20,180
10681,10,0,5,0,12648.0,1,3,2019,14,10,11,30,160


In [40]:
X = train_data.drop("Price", axis = 1)
y = train_data[["Price"]]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [47]:
xgb_model = XGBRegressor(random_state = 42)

search_space = {
    "n_estimators": [100, 200, 500],
    "max_depth": [3, 6, 9],
    "gamma": [0.01, 0.1],
    "learning_rate": [0.001, 0.01, 0.1, 1]
}

gs = GridSearchCV(
    estimator = xgb_model,
    param_grid = search_space,
    scoring = ["r2", "neg_root_mean_squared_error"],
    refit = "r2",
    cv = 5,
    verbose = 4
)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-4062.248) r2: (test=0.102) total time=   0.0s
[CV 2/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-4357.725) r2: (test=0.094) total time=   0.0s
[CV 3/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-4690.845) r2: (test=0.084) total time=   0.0s
[CV 4/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-4670.629) r2: (test=0.086) total time=   0.0s
[CV 5/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=100; neg_root_mean_squared_error: (test=-4099.638) r2: (test=0.100) total time=   0.0s
[CV 1/5] END gamma=0.01, learning_rate=0.001, max_depth=3, n_estimators=200; neg_root_mean_squared_error: (test=-3861.848) r2: (test=0.188) tot

In [50]:
gs.best_params_

{'gamma': 0.01, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}

In [51]:
gs.best_score_

0.8283963002497167