In [41]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-2.1.0-py3-none-manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.0


In [7]:
!pip install feature_engine



## 1. Import Libraries

In [46]:
import pandas as pd

import numpy as np

import boto3

import xgboost as xgb

import os

import pickle

import warnings

import sklearn
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import   Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import(
    OneHotEncoder,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    OrdinalEncoder,
    StandardScaler,

) 


from feature_engine.encoding import (
    RareLabelEncoder, 
    MeanEncoder, 
    CountFrequencyEncoder

)
from feature_engine.datetime import DatetimeFeatures
from feature_engine.outliers import Winsorizer
from feature_engine.selection import SelectBySingleFeaturePerformance


import warnings

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

## 2. Configurations

In [9]:
pd.set_option("display.max_columns", None)

In [10]:
sklearn.set_config(transform_output="pandas")

In [11]:
warnings.filterwarnings("ignore")

## 3. Upload Datasets 

In [12]:
train = pd.read_csv('train.csv')
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Multiple Carriers,2019-03-27,delhi,cochin,09:00:00,21:00:00,720,1.0,No info,12537
1,Indigo,2019-06-03,delhi,cochin,10:35:00,01:30:00,895,1.0,No info,5883
2,Spicejet,2019-04-03,kolkata,banglore,15:05:00,20:20:00,315,1.0,No info,4649
3,Air India,2019-03-06,mumbai,hyderabad,05:05:00,16:55:00,710,2.0,No info,16697
4,Jet Airways,2019-06-21,banglore,delhi,18:55:00,22:00:00,185,0.0,in-flight meal not included,7754
...,...,...,...,...,...,...,...,...,...,...
3195,Multiple Carriers,2019-03-21,delhi,cochin,09:00:00,15:30:00,390,1.0,No info,8307
3196,Air India,2019-04-09,delhi,cochin,14:05:00,17:55:00,230,0.0,No info,6724
3197,Jet Airways,2019-05-01,kolkata,banglore,09:35:00,23:35:00,840,1.0,No info,13067
3198,Jet Airways,2019-03-15,banglore,new delhi,21:25:00,05:05:00,460,1.0,No info,27210


In [13]:
test =pd.read_csv('test.csv')
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Spicejet,2019-06-06,mumbai,hyderabad,22:45:00,00:15:00,90,0.0,No info,2017
1,Spicejet,2019-03-21,kolkata,banglore,09:00:00,11:30:00,150,0.0,no check-in baggage included,3815
2,Indigo,2019-04-27,banglore,delhi,20:00:00,22:50:00,170,0.0,No info,4823
3,Spicejet,2019-06-03,banglore,delhi,21:10:00,00:05:00,175,0.0,No info,3311
4,Air India,2019-05-27,delhi,cochin,10:55:00,19:15:00,500,2.0,No info,13223
...,...,...,...,...,...,...,...,...,...,...
995,Air Asia,2019-03-24,banglore,new delhi,05:50:00,08:40:00,170,0.0,No info,4284
996,Jet Airways,2019-05-18,delhi,cochin,09:40:00,12:35:00,1615,2.0,No info,15129
997,Indigo,2019-05-18,banglore,delhi,08:30:00,11:20:00,170,0.0,No info,4823
998,Air India,2019-06-12,kolkata,banglore,20:30:00,13:45:00,1035,2.0,No info,12224


In [14]:
val =pd.read_csv('val.csv')
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-03,kolkata,banglore,11:30:00,14:05:00,155,0.0,No info,5224
1,Jet Airways,2019-06-01,delhi,cochin,02:15:00,12:35:00,620,1.0,No info,14714
2,Jet Airways,2019-06-06,kolkata,banglore,21:10:00,08:15:00,665,1.0,in-flight meal not included,10844
3,Jet Airways,2019-03-06,banglore,new delhi,20:35:00,10:25:00,830,1.0,No info,17261
4,Jet Airways,2019-03-03,delhi,cochin,08:00:00,18:50:00,650,1.0,No info,17024
...,...,...,...,...,...,...,...,...,...,...
795,Air Asia,2019-05-15,delhi,cochin,16:45:00,07:10:00,865,1.0,No info,6752
796,Multiple Carriers,2019-06-27,delhi,cochin,07:00:00,19:00:00,720,1.0,No info,12192
797,Air India,2019-06-06,kolkata,banglore,12:00:00,18:30:00,1830,2.0,No info,9626
798,Indigo,2019-03-01,banglore,new delhi,18:25:00,21:20:00,175,0.0,No info,12649


## 4. Preprocessing

In [16]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [18]:
preprocessor.fit(
    train.drop(columns = 'price'),
    train.price.copy()
)

In [19]:
preprocessor.transform(train.drop(columns = 'price'))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.0,0.235294,0.220339,1.058972,1.056852,-0.352914,2.0,0,0.170006,1.0,0
1,1.0,0.0,0.0,0.823529,0.796610,1.058972,1.056852,-0.352914,2.0,0,0.521544,1.0,0
2,0.0,0.0,1.0,0.294118,0.279661,-0.184239,-0.195717,-0.352914,1.0,0,-0.643553,1.0,0
3,0.0,0.0,0.0,0.058824,0.042373,-1.887646,-0.815121,-0.352914,2.0,0,0.149918,2.0,0
4,0.0,1.0,0.0,0.941176,0.949153,-0.911914,-1.822694,-0.352914,1.0,0,-0.904696,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3195,0.0,0.0,0.0,0.176471,0.169492,1.058972,1.056852,-0.352914,1.0,0,-0.492894,1.0,0
3196,0.0,0.0,0.0,0.352941,0.330508,1.058972,1.056852,-0.352914,1.0,0,-0.814300,0.0,1
3197,0.0,1.0,0.0,0.529412,0.516949,-0.184239,-0.195717,-0.352914,2.0,0,0.411061,1.0,0
3198,0.0,1.0,0.0,0.117647,0.118644,-0.911914,-0.815121,-0.352914,2.0,0,-0.352279,1.0,0


## 5. Preprocess and Upload to Bucket

In [20]:
BUCKET_NAME = "flightprediction-bucket"
DATA_PREFIX = "data"

In [21]:
def get_file_name(name):
    return f"{name}-pre.csv"

In [22]:
def export_data(data, name, pre):
    #spilt data into dep and indep variables/features.
    X = data.drop(columns="price")
    y = data.price.copy()
    X_pre = pre.transform(X)
    # format required by aws, price column should come first and header and index should be false.
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)
    )

In [None]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    (
          boto3
        .resource("s3")
        .Bucket(BUCKET_NAME)
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))
        .upload_file(file_name)
    )

In [23]:
def export_and_upload_bucket(data, name, pre):
    export_data(data, name, pre)
    upload_to_bucket(name)

In [None]:
export_and_upload_bucket(train,"train",preprocessor)

In [None]:
export_and_upload_bucket(test,"test",preprocessor)

In [None]:
export_and_upload_bucket(val,"val",preprocessor)

## 5. Model and Hyperparameter Tuning Set-up

In [24]:
session = sagemaker.Session()
region_name = session.boto_region_name

In [25]:
output_path = f"s3://{BUCKET_NAME}/model/output"

##### 1) intializing Amazon sagemaker Estimator object, which is used to specify training job.
##### 2) imageuri: Specifies the Docker container image URI for the XGBoost algorithm in Amazon SageMaker. The sagemaker.image_uris.retrieve function retrieves the appropriate image URI based on the algorithm (xgboost), AWS region (region_name), and version (1.2-1).
##### 3) role: Specifies the IAM role that SageMaker will assume to perform tasks like reading training data from Amazon S3.
##### 4) instance_count: Number of instances to use for training. Here, it's set to 1 (instance_count=1), indicating a single instance will be used.
##### 5) instance_type: Specifies the type of Amazon SageMaker instance to use for training.
##### 6) volume_size: Size of the EBS volume attached to the training instance(s), in GB.
##### 7) output_path: The S3 location where model artifacts and output will be stored after training.
##### 8) use_spot_instances: Specifies whether to use Amazon EC2 Spot Instances for training, which can reduce costs by taking advantage of unused capacity.
##### 9) max_run: Maximum time in seconds that a training job is allowed to run.
##### 10) max_wait: Maximum time in seconds to wait for Spot Instances to become available.
##### 11) sagemaker_session: The session object that manages interactions with SageMaker services.


In [26]:

model = Estimator(
    image_uri = sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"),
    role = sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    volume_size=5,
    output_path = output_path,
    use_spot_instances = True,
    max_run = 300,
    max_wait = 600,
    sagemaker_session = session   
)

-) set_hyperparameters: Sets hyperparameters for the XGBoost training algorithm.

-) objective: Specifies the learning task and the corresponding learning objective. "reg:linear" indicates regression with linear regression.

-)num_round: Number of boosting rounds (iterations) for training.

-)eta: Learning rate (also known as shrinkage or step size) to prevent overfitting.

-)max_depth: Maximum depth of a tree. Higher values can lead to overfitting.

-)subsample: Fraction of samples used to train each tree. Helps prevent overfitting by training on subsets of data.

-) colsample_bytree: Fraction of features (columns) used to train each tree. Helps prevent overfitting by training on subsets of features.

-)alpha: L1 regularization term on weights. A higher value leads to more regularization.


In [29]:
model.set_hyperparameters(
    objective = "reg:linear",
    num_round = 10,
    eta = 0.1, 
    max_depth = 5,
    subsample = 0.8,
    colsample_bytree = 0.8,
    alpha=0.1
)

eta, alpha, max_depth: These are hyperparameters of the XGBoost algorithm that control its learning process.

Parameter Types:

ContinuousParameter(min_value, max_value):
Represents a hyperparameter that can take on any value within a specified continuous range.

IntegerParameter(min_value, max_value):
Represents a hyperparameter that can take on integer values within a specified range.


In [30]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2),
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}


HyperparameterTuner: This is an Amazon SageMaker utility for automating the process of hyperparameter tuning:

Specifies the SageMaker Estimator object (model in this case) that defines the training job configuration.

objective_metric_name: Defines the metric to optimize during tuning. Here, "validation:rmse" suggests minimizing the Root Mean Square Error (RMSE) on a validation dataset.

hyperparameter_ranges: Specifies the dictionary of hyperparameter ranges (hyperparameter_ranges defined earlier).

strategy: Specifies the strategy for searching the hyperparameter space. "Bayesian" uses Bayesian optimization, which intelligently selects the next set of hyperparameters based on past results to efficiently find optimal values.

objective_type: Specifies whether to minimize or maximize the objective_metric_name. "Minimize" indicates that lower values of RMSE are better.



In [31]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",
    objective_type="Minimize"
)

## 7. Data Channel

In [33]:
def get_data_channel(name):
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")
    

In [34]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7fa8e956a770>

In [35]:
val_data_channel = get_data_channel("val")
val_data_channel

<sagemaker.inputs.TrainingInput at 0x7fa8e956a200>

In [36]:
data_channels ={
    "train": train_data_channel,
    "validation":val_data_channel
}

## 8. Model Training and Tuning

In [37]:
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.................................!


## 9. Model Evaluation

In [42]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7fa8e95b08b0>

In [62]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    #xgb.DMATRIX because xgboost expects data in this format.
    X= xgb.DMatrix(data.iloc[:,1:]) 
    y= data.iloc[:,0]
    
    pred = best_model.predict(X)
    return r2_score(y, pred)

In [63]:
evaluate_model("train")

-0.6200721829046618

In [64]:
evaluate_model("val")

-0.7190789605302215

In [65]:
evaluate_model("test")

-0.7020341946550956