## 1. Import Libraries

In [1]:
!pip install xgboost




In [2]:
!pip install feature-engine



In [5]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/xdg-ubuntu/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sourabh/.config/sagemaker/config.yaml


## 2. Display Settings

In [6]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [7]:
# Ask scikit-learn transformers to return pandas DataFrames; the custom
# functions in the preprocessing section access columns by name.
sklearn.set_config(transform_output="pandas")

In [8]:
# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [10]:
# Read the training split. The data directory can be overridden through the
# FLIGHTS_DATA_DIR environment variable so the notebook is not tied to one
# machine's home directory; the original location stays the default.
train = pd.read_csv(
    os.path.join(
        os.environ.get("FLIGHTS_DATA_DIR", "/home/sourabh/Flights-Sagemaker-Project/data"),
        "train.csv",
    )
)
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-06,Delhi,Cochin,07:30:00,21:00:00,810,1.0,No Info,7191
1,Indigo,2019-04-06,Banglore,Delhi,23:30:00,02:20:00,170,0.0,No Info,4591
2,Jet Airways,2019-05-21,Kolkata,Banglore,14:05:00,10:05:00,1200,1.0,No Info,14388
3,Multiple Carriers,2019-05-15,Delhi,Cochin,15:00:00,01:30:00,630,1.0,No Info,13727
4,Multiple Carriers,2019-06-15,Delhi,Cochin,13:00:00,19:15:00,375,1.0,No Info,16108
...,...,...,...,...,...,...,...,...,...,...
635,Jet Airways,2019-06-27,Delhi,Cochin,19:10:00,12:35:00,1045,2.0,No Info,12819
636,Jet Airways,2019-03-01,Banglore,New Delhi,08:55:00,16:10:00,435,1.0,No Info,26890
637,Jet Airways,2019-05-15,Kolkata,Banglore,18:55:00,09:20:00,865,1.0,In-flight meal not included,9663
638,Indigo,2019-06-03,Banglore,Delhi,08:30:00,11:20:00,170,0.0,No Info,4823


In [12]:
# Read the validation split. The data directory can be overridden through the
# FLIGHTS_DATA_DIR environment variable so the notebook is not tied to one
# machine's home directory; the original location stays the default.
val = pd.read_csv(
    os.path.join(
        os.environ.get("FLIGHTS_DATA_DIR", "/home/sourabh/Flights-Sagemaker-Project/data"),
        "val.csv",
    )
)
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-03-27,Delhi,Cochin,18:15:00,04:25:00,610,1.0,In-flight meal not included,6540
1,Spicejet,2019-04-01,Mumbai,Hyderabad,05:45:00,07:15:00,90,0.0,No check-in baggage included,1965
2,Jet Airways,2019-06-27,Delhi,Cochin,13:25:00,12:35:00,1390,2.0,In-flight meal not included,11150
3,Multiple Carriers,2019-05-15,Delhi,Cochin,13:05:00,22:30:00,565,1.0,No Info,9646
4,Goair,2019-06-06,Kolkata,Banglore,23:30:00,07:45:00,495,1.0,No Info,6686
...,...,...,...,...,...,...,...,...,...,...
155,Spicejet,2019-03-15,Kolkata,Banglore,09:00:00,11:30:00,150,0.0,No check-in baggage included,3815
156,Jet Airways,2019-06-06,Kolkata,Banglore,17:00:00,09:45:00,1005,1.0,In-flight meal not included,10539
157,Jet Airways,2019-03-06,Banglore,New Delhi,07:00:00,13:15:00,375,1.0,No Info,19225
158,Jet Airways,2019-05-01,Kolkata,Banglore,17:00:00,18:00:00,1500,1.0,No Info,11467


In [13]:
# Read the test split. The data directory can be overridden through the
# FLIGHTS_DATA_DIR environment variable so the notebook is not tied to one
# machine's home directory; the original location stays the default.
test = pd.read_csv(
    os.path.join(
        os.environ.get("FLIGHTS_DATA_DIR", "/home/sourabh/Flights-Sagemaker-Project/data"),
        "test.csv",
    )
)
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Jet Airways,2019-06-03,Delhi,Cochin,02:15:00,12:35:00,620,1.0,No Info,14714
1,Jet Airways,2019-06-24,Mumbai,Hyderabad,19:35:00,21:05:00,90,0.0,No Info,5678
2,Air India,2019-03-27,Delhi,Cochin,06:10:00,08:15:00,1565,1.0,No Info,6681
3,Air India,2019-05-15,Delhi,Cochin,07:05:00,09:25:00,1580,2.0,No Info,10975
4,Air India,2019-03-15,Banglore,New Delhi,05:50:00,16:20:00,630,2.0,No Info,7664
...,...,...,...,...,...,...,...,...,...,...
195,Jet Airways,2019-04-09,Banglore,Delhi,06:00:00,08:45:00,165,0.0,No Info,7229
196,Vistara,2019-05-12,Kolkata,Banglore,07:10:00,18:50:00,700,1.0,No Info,8820
197,Multiple Carriers,2019-05-18,Delhi,Cochin,09:45:00,16:10:00,385,1.0,No Info,9646
198,Indigo,2019-05-24,Banglore,Delhi,08:30:00,11:20:00,170,0.0,No Info,4823


## 4. Preprocessing Operations

In [14]:
# airline: fill missing values with the most frequent airline, group
# infrequent airlines under "Other", then one-hot encode the result.
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# date_of_journey: extract the calendar features listed below from the date,
# then rescale them with MinMaxScaler.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination: group infrequent cities under "Other", mean-encode
# them (the encoder is target-based, so `y` must be supplied at fit time),
# then apply a power transform.
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 membership flag.

    For each input column ``col`` the result holds ``{col}_is_north``, which
    is 1 where the value is one of the cities in ``north_cities`` and 0
    otherwise. The original columns are not part of the result.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    original_cols = X.columns.to_list()

    flags = {}
    for name in original_cols:
        flags[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)

    with_flags = X.assign(**flags)
    return with_flags.drop(columns=original_cols)

# Concatenate the encoded source/destination columns (part1) with the
# is_north flags (part2).
location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
# Numeric view: hour and minute of each time column, rescaled with MinMaxScaler.
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Replace every time column of ``X`` with a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour.
    The hour is then labelled using left-inclusive ranges:

    * ``[morning, noon)`` -> "morning"
    * ``[noon, eve)``     -> "afternoon"
    * ``[eve, night)``    -> "evening"
    * anything else       -> "night"

    The result contains one ``{col}_part_of_day`` column per input column;
    the input columns themselves are dropped.
    """
    time_cols = X.columns.to_list()

    # Hour of day for every time column.
    hours = X.assign(**{
        name: pd.to_datetime(X[name]).dt.hour
        for name in time_cols
    })

    labels = {}
    for name in time_cols:
        hour = hours[name]
        conditions = [
            hour.between(morning, noon, inclusive="left"),
            hour.between(noon, eve, inclusive="left"),
            hour.between(eve, night, inclusive="left"),
        ]
        labels[f"{name}_part_of_day"] = np.select(
            conditions,
            ["morning", "afternoon", "evening"],
            default="night",
        )

    return hours.assign(**labels).drop(columns=time_cols)

# Categorical view: label each time with its part of the day, encode the
# labels with CountFrequencyEncoder, then rescale with MinMaxScaler.
time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

# Concatenate the numeric view (part1) and the part-of-day view (part2).
time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """RBF similarity between numeric columns and their own fitted percentiles.

    Parameters
    ----------
    variables : list of str, optional
        Columns to transform. When empty or ``None``, every numeric column of
        the frame passed to ``fit`` is used.
    percentiles : list of float
        Quantiles (between 0 and 1) used as reference points.
    gamma : float
        ``gamma`` argument forwarded to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list of str
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to a ``(len(percentiles), 1)`` array holding its
        fitted quantile values.
    """

    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        # Constructor arguments are stored untouched; anything learned from
        # data lives in trailing-underscore attributes set by fit().
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        """Resolve the columns to use and store their quantile values."""
        # Keep the constructor parameter intact so that re-fitting on another
        # frame re-detects the numeric columns instead of reusing stale ones.
        if self.variables:
            self.variables_ = list(self.variables)
        else:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self


    def transform(self, X):
        """Return one ``{col}_rbf_{percentile}`` column per column and percentile."""
        objects = []
        for col in self.variables_:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                # Keep the caller's row labels: without them the result gets a
                # fresh RangeIndex and rows are misaligned when this output is
                # concatenated with frames that kept the original index.
                index=X.index
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    """Replace ``duration`` with a ``duration_cat`` label column.

    Values below ``short`` are "short", values in ``[short, med)`` are
    "medium" and everything else is "long".
    """
    minutes = X.duration
    conditions = [
        minutes.lt(short),
        minutes.between(short, med, inclusive="left"),
    ]
    labels = np.select(conditions, ["short", "medium"], default="long")

    labelled = X.assign(duration_cat=labels)
    return labelled.drop(columns="duration")

def is_over(X, value=1000):
    """Replace ``duration`` with a 0/1 flag named ``duration_over_{value}``.

    The flag is 1 where ``duration`` is greater than or equal to ``value``.
    """
    flag_name = f"duration_over_{value}"
    reaches_value = X.duration.ge(value).astype(int)

    flagged = X.assign(**{flag_name: reaches_value})
    return flagged.drop(columns="duration")

# part1: RBF similarity to the duration percentiles, followed by a power transform.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# part2: ordinal code for the duration category, ordered short < medium < long.
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# All duration views side by side: the two pipelines above, the is_over flag
# (part3) and the standardised duration itself (part4).
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

# duration: impute, cap outliers, then build the feature union.
# The imputer must run before the Winsorizer: with its default
# missing_values="raise" the Winsorizer errors out on NaNs, so an imputer
# placed after it could never fill anything. On data without missing values
# the result is unchanged (the imputer is a no-op there).
duration_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    """Add ``is_direct_flight``: 1 where ``total_stops`` equals 0, else 0.

    The ``total_stops`` column itself is kept in the result.
    """
    direct_flag = (X["total_stops"] == 0).astype(int)
    return X.assign(is_direct_flight=direct_flag)


# total_stops: fill missing values with the most frequent stop count, then
# add the is_direct_flight flag next to the original column.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # The step previously had an empty name, which made it awkward to address
    # through named_steps / set_params and unreadable in the pipeline repr.
    ("direct", FunctionTransformer(func=is_direct))
])

# additional_info
# Group infrequent additional_info values under "Other", then one-hot encode.
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    """Overwrite ``additional_info`` with 1 where it differs from "No Info", else 0."""
    has_info = (X["additional_info"] != "No Info").astype(int)
    return X.assign(additional_info=has_info)

# Union of the one-hot encoded categories (part1) and the binary
# "has any info" flag (part2).
info_union = FeatureUnion(
    transformer_list=[
        ("part1", info_pipe1),
        ("part2", FunctionTransformer(func=have_info)),
    ]
)

# Replace missing values with the literal "unknown" before building both views.
info_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
        ("union", info_union),
    ]
)

# column transformer
# Route every raw column to its dedicated transformer; columns that are not
# listed are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ("air", air_transformer, ["airline"]),
        ("doj", doj_transformer, ["date_of_journey"]),
        ("location", location_transformer, ["source", "destination"]),
        ("time", time_transformer, ["dep_time", "arrival_time"]),
        ("dur", duration_transformer, ["duration"]),
        ("stops", total_stops_transformer, ["total_stops"]),
        ("info", info_transformer, ["additional_info"]),
    ],
    remainder="passthrough",
)

# feature selector
# Each feature is scored on its own with a small random forest; the selector
# is configured with R² scoring and a threshold of 0.1.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
    estimator=estimator,
    scoring="r2",
    threshold=0.1,
)

# preprocessor
# Full preprocessing: column-wise transformations followed by feature
# selection, fitted on the training split with price as the target.
preprocessor = Pipeline(
    steps=[
        ("ct", column_transformer),
        ("selector", selector),
    ]
)
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy(),
)


In [15]:
# Preview the features the fitted preprocessor produces for the training split.
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,1.0,0.0,0.0,0.588235,0.559322,-0.346533,2.0,0,0.379026,1.0,0
1,1.0,0.0,0.0,0.294118,0.305085,2.656451,0.0,0,-0.918962,0.0,1
2,0.0,1.0,0.0,0.705882,0.686441,-0.346533,2.0,1,1.169988,1.0,0
3,0.0,0.0,0.0,0.647059,0.635593,-0.346533,2.0,0,0.013967,1.0,0
4,0.0,0.0,0.0,0.882353,0.898305,-0.346533,1.0,0,-0.503200,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
635,0.0,1.0,0.0,1.000000,1.000000,-0.346533,2.0,1,0.855631,2.0,0
636,0.0,1.0,0.0,0.000000,0.000000,-0.346533,2.0,0,-0.381514,1.0,0
637,0.0,1.0,0.0,0.647059,0.635593,-0.346533,2.0,0,0.490572,1.0,0
638,1.0,0.0,0.0,0.823529,0.796610,2.656451,0.0,0,-0.918962,0.0,1


## 4. Preprocess Data and Upload to Bucket

In [16]:
# S3 bucket that receives the preprocessed CSV files and the model output.
BUCKET_NAME = "sagemaker-flights-bucket"

# Key prefix under which the preprocessed splits are stored in the bucket.
DATA_PREFIX = "data"

In [17]:
def get_file_name(name):
    """Return the local CSV file name used for the preprocessed split ``name``."""
    return "{}-pre.csv".format(name)

In [18]:
def export_data(data, name, pre):
    """Transform one split with a fitted preprocessor and write it to a local CSV.

    Parameters
    ----------
    data : DataFrame containing the features and a ``price`` column.
    name : split name; the output file is ``get_file_name(name)``.
    pre : fitted object exposing ``transform``.

    The written file has ``price`` as its first column, followed by the
    transformed features, with neither a header row nor an index column.
    """
    # Separate the target from the features.
    target = data.price.copy()
    features = pre.transform(data.drop(columns="price"))

    # Target first, then the transformed features (joined on the index).
    combined = target.to_frame().join(features)
    combined.to_csv(get_file_name(name), index=False, header=False)

In [19]:
def upload_to_bucket(name):
    """Upload the locally exported CSV for ``name`` to the project bucket.

    The local file ``get_file_name(name)`` is stored in ``BUCKET_NAME`` under
    the key ``{DATA_PREFIX}/{name}/{name}.csv``.
    """
    file_name = get_file_name(name)

    # S3 keys always use "/" as the separator. os.path.join follows the local
    # OS convention (a backslash on Windows) and would produce a key that the
    # "s3://.../{DATA_PREFIX}/{name}" channel path cannot find.
    object_key = f"{DATA_PREFIX}/{name}/{name}.csv"

    (
        boto3
        .Session()
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(object_key)
        .upload_file(file_name)
    )


In [20]:
def export_and_upload_bucket(data, name, pre):
    """Write the preprocessed split ``name`` to a local CSV, then upload it to S3.

    ``data`` and ``pre`` are forwarded to ``export_data``; ``name`` selects
    both the local file name and the S3 key.
    """
    export_data(data, name, pre)
    upload_to_bucket(name)

In [21]:
# Preprocess the training split, write it locally and upload it to S3.
export_and_upload_bucket(train, "train", preprocessor)

NoCredentialsError: Unable to locate credentials

In [22]:
# Preprocess the validation split, write it locally and upload it to S3.
export_and_upload_bucket(val, "val", preprocessor)

NoCredentialsError: Unable to locate credentials

In [28]:
# Preprocess the test split, write it locally and upload it to S3.
export_and_upload_bucket(test, "test", preprocessor)

NoCredentialsError: Unable to locate credentials

## 5. Model and Hyperparameter Tuning Set-up


In [29]:
# Create the SageMaker session and keep its AWS region, which is needed to
# look up the XGBoost container image.
session = sagemaker.Session()
region_name = session.boto_region_name

ValueError: Must setup local AWS configuration with a region supported by SageMaker.

In [30]:
# S3 location handed to the estimator as its output path.
output_path = f"s3://{BUCKET_NAME}/model/output"

In [31]:
# Estimator for the SageMaker XGBoost image (version tag "1.2-1") in the
# session's region, running on a single spot instance.
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path=output_path,
    use_spot_instances=True,
    # NOTE(review): max_run / max_wait look like time limits in seconds —
    # confirm they leave enough room for training plus spot waiting time.
    max_run=300,
    max_wait=600,
    sagemaker_session=session
)

NameError: name 'region_name' is not defined

In [32]:
# Baseline XGBoost hyperparameters for the estimator.
model.set_hyperparameters(
    # "reg:linear" is the deprecated alias of the squared-error regression
    # objective; use the current name for the same objective.
    objective="reg:squarederror",
    num_round=10,
    eta=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    alpha=0.1
)

NameError: name 'model' is not defined

In [33]:
# Ranges the tuner searches over for eta, alpha and max_depth.
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

In [34]:
# Bayesian search that minimises the validation RMSE over the ranges above.
# NOTE(review): max_jobs / max_parallel_jobs are not set, so the SDK defaults
# apply — confirm they allow enough trials for the search to be meaningful.
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)


NameError: name 'model' is not defined

## 6. Data Channels


In [35]:
def get_data_channel(name):
    """Build the CSV input channel for the split stored under ``name``.

    The channel points at ``s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}``.
    """
    s3_uri = "s3://{}/{}/{}".format(BUCKET_NAME, DATA_PREFIX, name)
    return TrainingInput(s3_data=s3_uri, content_type="csv")

In [36]:
# Input channel pointing at the preprocessed training split in S3.
train_data_channel = get_data_channel("train")
train_data_channel


<sagemaker.inputs.TrainingInput at 0x7b854c3f0430>

In [37]:
# Input channel pointing at the preprocessed validation split in S3.
val_data_channel = get_data_channel("val")

In [38]:
# Channel name -> input mapping handed to the tuner when fitting.
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [39]:
# Launch the hyperparameter tuning job on the train/validation channels.
tuner.fit(data_channels)

NameError: name 'tuner' is not defined

## 8. Model Evaluation

In [40]:
# Load the trained model from a local file named "xgboost-model".
# NOTE(review): nothing in this notebook downloads or extracts that file —
# presumably it comes from the tuning job's model artifact; confirm and add
# the missing step so the notebook runs top to bottom.
# NOTE(review): pickle.load executes arbitrary code contained in the file;
# only load artifacts produced by your own training jobs.
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

FileNotFoundError: [Errno 2] No such file or directory: 'xgboost-model'

In [41]:
def evaluate_model(name):
    """Return the R² score of ``best_model`` on the exported split ``name``.

    Reads the local file ``get_file_name(name)``, whose first column is the
    target and whose remaining columns are the preprocessed features.
    """
    file_name = get_file_name(name)

    # The exported files are written with header=False, so the first line is
    # data. Reading with the default header would silently drop that row and
    # turn its values into column names.
    data = pd.read_csv(file_name, header=None)

    # Column 0 is the target; everything after it is a feature.
    X = xgb.DMatrix(data.iloc[:, 1:])
    y = data.iloc[:, 0].copy()

    pred = best_model.predict(X)

    return r2_score(y, pred)


In [42]:
# R² on the training split.
evaluate_model("train")


NameError: name 'best_model' is not defined

In [43]:
# R² on the validation split.
evaluate_model("val")

NameError: name 'best_model' is not defined

In [44]:
# R² on the held-out test split.
evaluate_model("test")

NameError: name 'best_model' is not defined