## 1. Importing Libraries

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.2-py3-none-manylinux2014_x86_64.whl (4.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m165.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.2


In [4]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.8.3-py2.py3-none-any.whl.metadata (9.9 kB)
Downloading feature_engine-1.8.3-py2.py3-none-any.whl (378 kB)
Installing collected packages: feature-engine
Successfully installed feature-engine-1.8.3


In [17]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import(
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder,
    StandardScaler
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
    RareLabelEncoder,
    MeanEncoder,
    CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Display Settings

In [7]:
pd.set_option("display.max_columns", None)

In [8]:
sklearn.set_config(transform_output="pandas")

In [9]:
warnings.filterwarnings("ignore")

## 3. Reading Datasets

In [10]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,20:55:00,12:35:00,940,1.0,In-flight meal not included,12898
1,Jet Airways,2019-06-12,Kolkata,Banglore,18:55:00,16:20:00,1285,1.0,No Info,13044
2,Air India,2019-05-18,Delhi,Cochin,09:45:00,09:25:00,1420,2.0,No Info,10975
3,Indigo,2019-06-03,Mumbai,Hyderabad,21:20:00,22:50:00,90,0.0,No Info,2227
4,Jet Airways,2019-04-01,Mumbai,Hyderabad,02:55:00,04:20:00,85,0.0,No Info,5678
...,...,...,...,...,...,...,...,...,...,...
6689,Spicejet,2019-06-09,Kolkata,Banglore,11:35:00,18:50:00,435,1.0,No Info,8479
6690,Multiple Carriers,2019-05-09,Delhi,Cochin,10:00:00,01:30:00,930,1.0,No Info,15078
6691,Air India,2019-05-18,Delhi,Cochin,12:00:00,07:40:00,1180,2.0,No Info,8603
6692,Air Asia,2019-05-18,Delhi,Cochin,07:55:00,13:25:00,330,1.0,No Info,8759


In [11]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-05-27,Delhi,Cochin,09:00:00,19:00:00,600,1.0,In-flight meal not included,10675
1,Jet Airways,2019-05-24,Kolkata,Banglore,18:55:00,10:05:00,910,1.0,In-flight meal not included,8586
2,Jet Airways,2019-03-18,Banglore,Delhi,21:25:00,09:30:00,725,1.0,No Info,13555
3,Spicejet,2019-06-27,Chennai,Kolkata,17:45:00,20:05:00,140,0.0,No check-in baggage included,3543
4,Air Asia,2019-05-15,Kolkata,Banglore,07:35:00,19:25:00,710,1.0,No Info,5192
...,...,...,...,...,...,...,...,...,...,...
1669,Vistara,2019-05-06,Kolkata,Banglore,07:10:00,22:40:00,930,1.0,No Info,8452
1670,Indigo,2019-04-03,Delhi,Cochin,21:05:00,00:20:00,195,0.0,No Info,5021
1671,Air India,2019-03-01,Banglore,Delhi,17:00:00,19:45:00,165,0.0,No Info,25913
1672,Air India,2019-06-18,Mumbai,Hyderabad,06:20:00,07:40:00,80,0.0,No Info,3100


In [12]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-06,Banglore,Delhi,08:00:00,08:15:00,1455,1.0,No Info,17996
1,Spicejet,2019-06-06,Kolkata,Banglore,22:20:00,00:40:00,140,0.0,No Info,3873
2,Indigo,2019-03-18,Kolkata,Banglore,05:30:00,08:20:00,170,0.0,No Info,4462
3,Jet Airways,2019-03-24,Mumbai,Hyderabad,15:50:00,17:20:00,90,0.0,In-flight meal not included,2228
4,Spicejet,2019-04-27,Banglore,Delhi,09:30:00,12:20:00,170,0.0,No Info,4991
...,...,...,...,...,...,...,...,...,...,...
2088,Jet Airways,2019-05-27,Delhi,Cochin,19:15:00,12:35:00,1040,1.0,In-flight meal not included,12898
2089,Jet Airways,2019-05-27,Delhi,Cochin,02:15:00,19:00:00,1005,1.0,In-flight meal not included,12898
2090,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,04:25:00,1570,1.0,In-flight meal not included,11627
2091,Multiple Carriers,2019-06-06,Delhi,Cochin,15:15:00,01:30:00,615,1.0,No Info,6795


## 4. Preprocessing Operations

In [13]:
# airline
air_transformer=Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1,replace_with="Others",n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]


doj_transformer = Pipeline(steps=[
    ("dt",DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
loc_transformer= Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour","minute"])),
    ("scaler",MinMaxScaler())
])


def part_of_day(X):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col : pd.to_datetime(X.loc[:,col]).dt.hour
        for col in columns
    })

    return(
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:,col].between(4,12, inclusive="left"),
                 X_temp.loc[:,col].between(12,16, inclusive="left"),
                 X_temp.loc[:,col].between(16,20, inclusive="left")],
                ["morning","afternoon","evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

    
time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])


time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
def dur_cat(X):
    return(
        X
        .assign(duration_cat=np.select([X.duration.lt(180),
                                       X.duration.between(180,420,inclusive="left")],
                                       ["short","medium"],
                                       default="long"))
        .drop(columns="duration")
    )

    
def is_over(X):
	return (
		X
        .assign(duration_over_1000 = X.duration.ge(1000).astype(int))
		.drop(columns="duration")
	)

    
dur_pipe1 = Pipeline(steps=[
    ("cat",FunctionTransformer(func=dur_cat)),
    ("encoder",OrdinalEncoder(categories=[["short","medium","long"]]))
])

dur_union = FeatureUnion(transformer_list=[
    ("part1",dur_pipe1),
    ("part2",FunctionTransformer(func=is_over)),
    ("scaler",StandardScaler())
])

dur_transformer = Pipeline(steps=[
    ("outlier", Winsorizer(capping_method="iqr",fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", dur_union)
])

# total_stops
def is_direct(X):
    return (
        X
        .assign(is_direct_flight= X.total_stops.eq(0).astype(int))
    )

total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("direct", FunctionTransformer(func= is_direct))
])

# additional_info
info_pipe1= Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="other")),
    ("encoder", OneHotEncoder(sparse_output=False,handle_unknown="ignore"))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
    ("part1", info_pipe1),
    ("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
    ("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
    ("air", air_transformer, ["airline"]),
    ("doj", doj_transformer, ["date_of_journey"]),
    ("loc", loc_transformer, ["source", "destination"]),
    ("time", time_transformer, ["dep_time","arrival_time"]),
    ("dur", dur_transformer, ["duration"]),
    ("stops", total_stops_transformer, ["total_stops"]),
    ("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
	estimator=estimator,
	scoring="r2",
	threshold=0.1
) 

#preprocessor
preprocessor = Pipeline(steps=[
	("ct", column_transformer),
	("selector", selector)
])

In [14]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

0,1,2
,steps,"[('ct', ...), ('selector', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('air', ...), ('doj', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Others'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,variables,
,features_to_extract,"['month', 'week', ...]"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,True
,utc,
,format,'mixed'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'Other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'
,smoothing,0.0

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True

0,1,2
,variables,
,features_to_extract,"['hour', 'minute']"
,drop_original,True
,missing_values,'raise'
,dayfirst,False
,yearfirst,False
,utc,
,format,

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,func,<function par...x7f453db38e50>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,encoding_method,'count'
,variables,
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,1.5
,add_indicators,False
,variables,
,missing_values,'raise'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,func,<function dur...x7f453db38ca0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,categories,"[['short', 'medium', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function is_...x7f453db38ee0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function is_...x7f453db38f70>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformer_list,"[('part1', ...), ('part2', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,tol,0.1
,n_categories,2
,max_n_categories,
,replace_with,'other'
,variables,
,missing_values,'raise'
,ignore_format,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function hav...x7f453db39000>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,estimator,RandomForestR...ndom_state=42)
,scoring,'r2'
,cv,3
,groups,
,threshold,0.1
,variables,
,confirm_variables,False

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Others,doj__date_of_journey_week,doj__date_of_journey_day_of_year,loc__source,loc__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,1.0,0.0,0.764706,0.737288,1.040187,1.040187,2.0,0,0.609935,1.0,0
1,0.0,1.0,0.0,0.882353,0.872881,-0.190314,-0.190314,2.0,1,1.301752,1.0,0
2,0.0,0.0,0.0,0.647059,0.661017,1.040187,1.040187,2.0,1,1.572463,2.0,0
3,1.0,0.0,0.0,0.823529,0.796610,-1.915733,-1.915733,0.0,0,-1.094542,0.0,1
4,0.0,1.0,0.0,0.294118,0.262712,-1.915733,-1.915733,0.0,0,-1.104568,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6689,0.0,0.0,1.0,0.823529,0.847458,-0.190314,-0.190314,2.0,0,-0.402725,1.0,0
6690,0.0,0.0,0.0,0.588235,0.584746,1.040187,1.040187,2.0,0,0.589882,1.0,0
6691,0.0,0.0,0.0,0.647059,0.661017,1.040187,1.040187,2.0,1,1.091199,2.0,0
6692,0.0,0.0,1.0,0.647059,0.661017,1.040187,1.040187,1.0,0,-0.613278,1.0,0


## 5. Preprocess Data and Upload to Bucket

In [15]:
BUCKET_NAME = "flights-prices-bucket"

DATA_PREFIX = "data"

In [20]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [18]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y= data.price.copy()

    # transformation
    X_pre = pre.transform(X)

    #exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [27]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [28]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [29]:
export_and_upload_bucket(train, "train", preprocessor)

In [30]:
export_and_upload_bucket(val, "val", preprocessor)

In [31]:
export_and_upload_bucket(test, "test", preprocessor)

## 6. Model and Hyperparameter Tuning Set-up

In [34]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [35]:
output_path = f"s3://{BUCKET_NAME}/model/output"

In [45]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

In [46]:
model.set_hyperparameters(
    objective="reg:linear",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

In [47]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [48]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 7. Data Channels

In [49]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [50]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7f4ff0b55180>

In [51]:
val_data_channel = get_data_channel("val")

In [52]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 8. Train and Tune The Model

In [53]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


...........................!


## 9. Model Evaluation

In [16]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7f45424595a0>

In [18]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [21]:
evaluate_model("train")

0.5913313031196594

In [22]:
evaluate_model("val")

0.6039263010025024

In [23]:
evaluate_model("test")

0.6286282539367676