## 1. Import Libraries

In [1]:
!pip install xgboost



In [2]:
!pip install feature-engine



!pip install boto3                   #popular library for interacting with Amazon Web Services (AWS).

!pip install sagemaker              #Here we need to use this if we run this code inside sagemaker then no need to use


In [3]:
!pip install boto3                  #SDK for communication between python and sagemaker 
!pip install sagemaker            



In [4]:
import os

import boto3

import pickle

import warnings

import numpy as np

import pandas as pd

import xgboost as xgb

import sklearn
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import (
    IntegerParameter,
    ContinuousParameter,
    HyperparameterTuner
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\samee\AppData\Local\sagemaker\sagemaker\config.yaml


## 2. Display Settings

In [5]:
pd.set_option("display.max_columns", None)

In [6]:
sklearn.set_config(transform_output="pandas")

In [7]:
warnings.filterwarnings("ignore")

## 3. Read Datasets

In [8]:
train = pd.read_csv("train.csv")
train

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-15,Delhi,Cochin,17:05:00,22:15:00,310,1.0,No Info,6027
1,Multiple Carriers,2019-05-15,Delhi,Cochin,07:10:00,22:30:00,920,1.0,No Info,6513
2,Air India,2019-06-18,Delhi,Cochin,18:05:00,21:10:00,185,0.0,No Info,5201
3,Multiple Carriers,2019-05-27,Delhi,Cochin,12:50:00,21:00:00,490,1.0,No Info,7005
4,Multiple Carriers,2019-03-03,Delhi,Cochin,16:00:00,01:35:00,575,1.0,No Info,21226
...,...,...,...,...,...,...,...,...,...,...
635,Jet Airways,2019-03-12,Banglore,New Delhi,18:55:00,16:10:00,1275,1.0,In-flight meal not included,11087
636,Indigo,2019-05-12,Banglore,Delhi,07:10:00,10:05:00,175,0.0,No Info,4823
637,Multiple Carriers,2019-03-09,Delhi,Cochin,06:50:00,19:15:00,745,1.0,In-flight meal not included,14817
638,Air India,2019-03-01,Banglore,New Delhi,05:50:00,16:20:00,630,2.0,Change airports,15119


In [9]:
val = pd.read_csv("val.csv")
val

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Indigo,2019-05-21,Chennai,Kolkata,19:35:00,21:55:00,140,0.0,No Info,3597
1,Air Asia,2019-04-15,Kolkata,Banglore,10:20:00,12:55:00,155,0.0,No Info,4409
2,Jet Airways,2019-04-18,Delhi,Cochin,11:00:00,14:15:00,195,0.0,In-flight meal not included,5443
3,Indigo,2019-03-09,Banglore,New Delhi,16:55:00,21:50:00,295,1.0,No Info,4259
4,Jet Airways,2019-05-18,Delhi,Cochin,19:10:00,12:35:00,1045,2.0,No Info,15129
...,...,...,...,...,...,...,...,...,...,...
155,Air India,2019-05-15,Delhi,Cochin,13:05:00,09:25:00,1220,2.0,No Info,10975
156,Indigo,2019-05-21,Delhi,Cochin,10:35:00,01:30:00,895,1.0,No Info,6794
157,Spicejet,2019-05-18,Mumbai,Hyderabad,22:45:00,00:15:00,90,0.0,No check-in baggage included,1965
158,Indigo,2019-06-18,Banglore,Delhi,16:55:00,19:55:00,180,0.0,No Info,4823


In [10]:
test = pd.read_csv("test.csv")
test

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
0,Air India,2019-06-21,Chennai,Kolkata,11:40:00,13:55:00,135,0.0,No Info,4667
1,Air India,2019-05-12,Kolkata,Banglore,09:25:00,18:30:00,545,1.0,No Info,8576
2,Air India,2019-06-06,Kolkata,Banglore,14:30:00,12:30:00,1320,2.0,No Info,15894
3,Jet Airways,2019-03-24,Banglore,New Delhi,19:55:00,22:35:00,160,0.0,No Info,7229
4,Indigo,2019-06-06,Banglore,Delhi,00:25:00,03:15:00,170,0.0,No Info,3943
...,...,...,...,...,...,...,...,...,...,...
195,Jet Airways,2019-06-09,Delhi,Cochin,18:25:00,12:35:00,1090,2.0,No Info,14237
196,Jet Airways,2019-03-18,Banglore,New Delhi,11:40:00,05:05:00,1045,1.0,In-flight meal not included,11087
197,Jet Airways,2019-05-12,Kolkata,Banglore,21:10:00,04:40:00,450,1.0,In-flight meal not included,8586
198,Jet Airways,2019-06-06,Delhi,Cochin,05:30:00,04:25:00,1375,2.0,In-flight meal not included,10368


## 4. Preprocessing Operations

In [11]:
# airline
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

#doj
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    columns = X.columns.to_list()
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    return (
        X
        .assign(**{
            f"{col}_is_north": X.loc[:, col].isin(north_cities).astype(int)
            for col in columns
        })
        .drop(columns=columns)
    )

location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    columns = X.columns.to_list()
    X_temp = X.assign(**{
        col: pd.to_datetime(X.loc[:, col]).dt.hour
        for col in columns
    })

    return (
        X_temp
        .assign(**{
            f"{col}_part_of_day": np.select(
                [X_temp.loc[:, col].between(morning, noon, inclusive="left"),
                 X_temp.loc[:, col].between(noon, eve, inclusive="left"),
                 X_temp.loc[:, col].between(eve, night, inclusive="left")],
                ["morning", "afternoon", "evening"],
                default="night"
            )
            for col in columns
        })
        .drop(columns=columns)
    )

time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        if not self.variables:
            self.variables = X.select_dtypes(include="number").columns.to_list()

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables
        }

        return self


    def transform(self, X):
        objects = []
        for col in self.variables:
            columns = [f"{col}_rbf_{int(percentile * 100)}" for percentile in self.percentiles]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns
            )
            objects.append(obj)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    return (
        X
        .assign(duration_cat=np.select([X.duration.lt(short),
                                    X.duration.between(short, med, inclusive="left")],
                                    ["short", "medium"],
                                    default="long"))
        .drop(columns="duration")
    )

def is_over(X, value=1000):
    return (
        X
        .assign(**{
            f"duration_over_{value}": X.duration.ge(value).astype(int)
        })
        .drop(columns="duration")
    )

duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    return X.assign(is_direct_flight=X.total_stops.eq(0).astype(int))


total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("", FunctionTransformer(func=is_direct))
])

# additional_info
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    return X.assign(additional_info=X.additional_info.ne("No Info").astype(int))

info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [12]:
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [13]:
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_month,doj__date_of_journey_week,location__source,location__destination,dur__duration_rbf_25,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.666667,0.647059,0.963946,0.960350,-0.368298,1.0,0,-0.684948,1.0,0
1,0.0,0.0,0.666667,0.647059,0.963946,0.960350,-0.368298,2.0,0,0.536380,1.0,0
2,0.0,0.0,1.000000,0.941176,0.963946,0.960350,-0.364777,1.0,0,-0.935220,0.0,1
3,0.0,0.0,0.666667,0.764706,0.963946,0.960350,-0.368298,2.0,0,-0.324556,1.0,0
4,0.0,0.0,0.000000,0.000000,0.963946,0.960350,-0.368298,2.0,0,-0.154371,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
635,1.0,0.0,0.000000,0.117647,-1.230536,-0.854212,-0.368298,2.0,1,1.247152,1.0,0
636,0.0,0.0,0.666667,0.588235,-1.230536,-1.830057,3.067125,0.0,0,-0.955241,0.0,1
637,0.0,0.0,0.000000,0.058824,0.963946,0.960350,-0.368298,2.0,0,0.185999,1.0,0
638,0.0,0.0,0.000000,0.000000,-1.230536,-0.854212,-0.368298,2.0,0,-0.044251,2.0,0


## 4. Preprocess Data and Upload to Bucket

In [14]:
BUCKET_NAME = "sagemaker-flights-bucket"

DATA_PREFIX = "data"

In [15]:
def get_file_name(name):
    return f"{name}-pre.csv"

**Exporting data** refers to saving a dataset or DataFrame into an external file format for use outside of the Python environment.

In [16]:
def export_data(data, name, pre):
    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data.price.copy()
    
    # transformation
    X_pre = pre.transform(X)
    
    # exporting
    file_name = get_file_name(name)
    (
        y
        .to_frame()
        .join(X_pre)
        .to_csv(file_name, index=False, header=False)  # we Want to data in csv file format this is sagemaker format
    )

In [17]:
def upload_to_bucket(name):
    file_name = get_file_name(name)
    
    (
        boto3
        .Session()  # We are working with this environment so called session
        .resource("s3")    #here we want s3 
        .Bucket(BUCKET_NAME)    #Here give bucket
        .Object(os.path.join(DATA_PREFIX, f"{name}/{name}.csv"))  #Inside bucket where we want to save our file , first we will create folder Data Prefix
        #Then file name we will give
        .upload_file(file_name)   #File name we want to upload
    )

In [20]:
#Combination of both methods here

In [18]:
def export_and_upload_bucket(data, name, pre): #Here in input we need data, give name of data and preprocessor
    export_data(data, name, pre)  #Here we are exporting our preprocessed data
    upload_to_bucket(name)  #Here we are uploading that preprocessed data inside bucket

In [19]:
export_and_upload_bucket(train, "train", preprocessor)

NoCredentialsError: Unable to locate credentials

In [26]:
export_and_upload_bucket(val, "val", preprocessor)

In [27]:
export_and_upload_bucket(test, "test", preprocessor)

## 5. Model and Hyperparameter Tuning Set-up

In [30]:
session = sagemaker.Session()  # key class used to interact with SageMaker resources.
region_name = session.boto_region_name #retrieves the AWS region associated with the current session. 

In [31]:
output_path = f"s3://{BUCKET_NAME}/model/output"  #To save train model we use this

In [32]:
model = Estimator(
    image_uri=sagemaker.image_uris.retrieve("xgboost", region_name, "1.2-1"), #This retrieves the Docker image URI for the specified framework (xgboost), the region (region_name), and version (1.2-1).
    role=sagemaker.get_execution_role(), #Gets the AWS Identity and Access Management (IAM) execution role associated with SageMaker.Grants permissions for SageMaker to access AWS services such as S3 (for data storage) and CloudWatch (for logging).
    instance_count=1, #Specifies the number of EC2 instances to use for training.Determines the scale of computation resources.
    instance_type="ml.m4.xlarge", #Specifies the type of EC2 instance to use for training.
    volume_size=5,  #Allocates 5 GB of storage to the training instance in S#.
    output_path=output_path,  #Specifies the S3 path where SageMaker will save the model artifacts after training.
    use_spot_instances=True, #Configures SageMaker to use Spot Instances, which are lower-cost, spare EC2 instances.
    max_run=300,  #Sets the maximum runtime for the training job in seconds. Here, it's 300 seconds
    max_wait=600, #Sets the maximum wait time in seconds for the training job when using Spot Instances.
    sagemaker_session=session  #Provides the session object, which manages interactions with SageMaker.
)

In [33]:
model.set_hyperparameters(
    objective="reg:linear", #Consider using "reg:squarederror"
    num_round=10, #Indicates the number of boosting rounds (iterations).
    eta=0.1, #Learning rate that scales the weight updates during training
    max_depth=5,  #Limits the depth of trees
    subsample=0.8, #Fraction of training samples randomly selected for each boosting round. 
    colsample_bytree=0.8, #Fraction of features (columns) randomly chosen for each tree. 
    alpha=0.1 #L1 regularization term on weights. 
)

In [34]:
hyperparameter_ranges = {
    "eta": ContinuousParameter(0.05, 0.2), #Specifies that the learning rate (eta) can take continuous values between 0.05 and 0.2.
    "alpha": ContinuousParameter(0, 1),
    "max_depth": IntegerParameter(3, 5)
}

The **HyperparameterTuner** is a class in AWS SageMaker used to automate the process of finding the best hyperparameter configuration for a machine learning model.

In [35]:
tuner = HyperparameterTuner(
    estimator=model,
    objective_metric_name="validation:rmse",  # root mean square error on the validation set.
    hyperparameter_ranges=hyperparameter_ranges,
    strategy="Bayesian",  # Uses Bayesian optimization to explore the parameter space efficiently.
    objective_type="Minimize"  #Reduces the metric value (e.g., RMSE, loss).
)

## 6. Data Channels

In [36]:
def get_data_channel(name):  #Here we are telling location of dataset
    bucket_path = f"s3://{BUCKET_NAME}/{DATA_PREFIX}/{name}"
    return TrainingInput(bucket_path, content_type="csv")

In [37]:
train_data_channel = get_data_channel("train")
train_data_channel

<sagemaker.inputs.TrainingInput at 0x7fa7b7b29510>

In [38]:
val_data_channel = get_data_channel("val")

In [39]:
data_channels = {
    "train": train_data_channel,
    "validation": val_data_channel
}

## 7. Train and Tune the Model

In [40]:
tuner.fit(data_channels)  #Here we are giving data channels for fit data

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


......................................!


In [1]:
#tuner.best_estimator().deploy()

## 8. Model Evaluation

In [44]:
with open("xgboost-model", "rb") as f:
    best_model = pickle.load(f)
    
best_model

<xgboost.core.Booster at 0x7fa7b7870610>

In [49]:
def evaluate_model(name):
    file_name = get_file_name(name)
    data = pd.read_csv(file_name)
    
    X = xgb.DMatrix(data.iloc[:, 1:]) # is using XGBoost's DMatrix object to handle data input for the XGBoost algorithm.
    y = data.iloc[:, 0].copy()
    
    pred = best_model.predict(X)
    
    return r2_score(y, pred)

In [50]:
evaluate_model("train")

-0.5000379618028745

In [51]:
evaluate_model("val")

-0.4516233294836658

In [52]:
evaluate_model("test")

-0.43416313174551946