In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

pd.options.mode.chained_assignment = None

In [2]:
train_url = "https://s3-ap-southeast-1.amazonaws.com/mettl-arq/questions/codelysis/machine-learning/fare-prediction/train.csv"
test_url = "https://s3-ap-southeast-1.amazonaws.com/mettl-arq/questions/codelysis/machine-learning/fare-prediction/test.csv"
train = pd.read_csv(train_url)
y_train = train['Fare']
test = pd.read_csv(test_url)

In [3]:
def preprocess(df):
    df["Booking Date"] = pd.to_datetime(df["Booking Date"])
    df['Flight Date'] = pd.to_datetime(df["Flight Date"])
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'])
    df['Age'] = round((df['Booking Date'] - df['Date of Birth']).dt.days / 365.25)
    df['Title'] = df.Name.map(lambda name : name.split(' ')[0].strip('.'))
    df['Name'] = df.Name.map(lambda name : ' '.join(name.split(' ')[1:]))
    df['Day of Week of Flight'] =  pd.to_datetime(df['Flight Date']).dt.day_of_week
    df['Month of Flight'] = pd.to_datetime(df['Flight Date']).dt.month
    df['Flight Time'] = df['Flight Time'].map(lambda time : int(''.join(time.split(':'))))
    df['Gender'] = df['Name'].map(lambda name : 'Male' if name[0] == 'M' else 'Female')
    df['Day of Week of Booking'] =  pd.to_datetime(df['Booking Date']).dt.day_of_week
    df['Month of Booking'] = pd.to_datetime(df['Booking Date']).dt.month
    df['Booking interval'] = (df['Flight Date'] - df['Booking Date']).dt.days
    df['Route'] = pd.Series(zip(df["From"],df['To']))
    df.drop(['Date of Birth', 'Flight Date', 'Booking Date'], axis = 1 ,inplace = True)

    df = df[["Title", "Gender", "Age", "Class", "Route","Flight Time",
           "Booking interval","Month of Booking", "Day of Week of Booking" ,
           "Month of Flight", "Day of Week of Flight"]]
    cols = df.select_dtypes(['object']).columns
    df[cols] = df[cols].apply(lambda col: pd.factorize(col)[0])
    return df

In [4]:
X_train = preprocess(train)
X_test = preprocess(test)
regressor = XGBRegressor(n_estimators = 100,learning_rate = 0.4)

In [5]:
cross_val_score(regressor,X_train, y_train, cv = 3, scoring = 'r2')

array([0.93209189, 0.94006211, 0.93631967])