In [None]:
# Import all the libraries
import numpy as np
import pandas as pd
from sklearn import *
import seaborn as sns
from xgboost import *
from sklearn.model_selection import train_test_split

In [None]:
# Read the train and test CSV files into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/cab-fare-prediction-dataset/TRAIN.csv",
        "../input/cab-fare-prediction-dataset/TEST.csv",
    )
)

In [None]:
# Stack train on top of test so the feature engineering below is applied to both;
# ignore_index gives the combined frame a fresh 0..n-1 index
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first five rows of the combined frame
df.head(n=5)

In [None]:
# Remove the "index" column that came with the CSV files
df = df.drop(columns=["index"])

In [None]:
# Count missing values in every column
df.isna().sum()

In [None]:
# Parse time_stamp (interpreted as milliseconds since the epoch via unit='ms') into datetimes
df = df.assign(time_stamp=pd.to_datetime(df['time_stamp'], unit='ms'))

In [None]:
# Checking the datetime column: summary statistics of the parsed timestamps
# NOTE(review): the datetime_is_numeric keyword only exists in pandas 1.1-1.x and was
# removed in pandas 2.0 — confirm the pandas version this notebook is run with
df['time_stamp'].describe(datetime_is_numeric=True)

In [None]:
def hour_convert(time):
    """Return the hour of day of a timestamp, e.g. 6 for 06:30."""
    time_of_day = time.time()
    return time_of_day.hour

In [None]:
# Replace every timestamp by its hour of day (see hour_convert)
df['time_stamp'] = [hour_convert(stamp) for stamp in df['time_stamp']]

In [None]:
def interval_convert(time):
    """Map an hour of day to one of four named parts of the day.

    0-6 -> 'midnight', 7-12 -> 'morning', 13-19 -> 'mid-day';
    every other value -> 'Night'.
    """
    # (first hour, last hour, label) with both bounds inclusive
    day_parts = (
        (0, 6, 'midnight'),
        (7, 12, 'morning'),
        (13, 19, 'mid-day'),
    )
    for first_hour, last_hour, label in day_parts:
        if first_hour <= time <= last_hour:
            return label
    return 'Night'

In [None]:
# Bucket every hour into its part-of-day label (see interval_convert)
df['time_stamp'] = [interval_convert(hour) for hour in df['time_stamp']]

In [None]:
# Expand the part-of-day labels in time_stamp into one indicator column per label
df = pd.get_dummies(data=df, columns=['time_stamp'])

In [None]:
# Single shared LabelEncoder instance; each later fit_transform call re-fits it,
# so it only remembers the classes of the most recently encoded column
label_encoder = preprocessing.LabelEncoder()

In [None]:
# List the distinct cab providers present in the data
df['cab_provider'].unique()

In [None]:
# Encode the provider name as an integer code: Lyft -> 0, Uber -> 1
df['cab_provider'] = df['cab_provider'].replace(to_replace={"Lyft": 0, "Uber": 1})

In [None]:
# How often each surge multiplier occurs, broken down by provider
df.groupby(by='cab_provider')['surge_multiplier'].value_counts()

In [None]:
# Total fare and total distance for every (cab_provider, cab_type) combination
df.groupby(by=['cab_provider', 'cab_type']).agg(
    {
        'fare': ['sum'],
        'distance': ['sum'],
    }
)

In [None]:
# Fare total divided by distance total for each (cab_provider, cab_type) group
# NOTE(review): sum() skips NaN, so rows without a fare (presumably the test rows — confirm)
# add to the distance total but not to the fare total
(
    df.groupby(['cab_provider', 'cab_type'])
    .agg({'fare': ['sum'], 'distance': ['sum']})
    .pipe(lambda totals: totals[['fare']].values / totals[['distance']].values)
)

In [None]:
# New USD/KM feature: every cab_type is replaced by a hard-coded rate
df['USD/KM'] = df['cab_type'].replace(
    {
        'Lux': 6.6,
        'Lux Black': 8.4,
        'Lux Black XL': 11.9,
        'Lyft': 3.5,
        'Lyft XL': 5.6,
        'Shared': 2.2,
        'Black': 7.5,
        'Black SUV': 11.1,
        'UberPool': 3.2,
        'UberX': 3.5,
        'UberXL': 5.7,
        'WAV': 3.6,
    }
)

In [None]:
def usd_km_convert(price):
    """Bucket a USD/KM rate into one of three price classes.

    price <= 4        -> 'Budget_class'
    4 < price <= 7.5  -> 'Mid_class'
    anything else     -> 'High_class' (this includes NaN, which fails both comparisons)
    """
    # The budget bucket has no lower bound: with the former `2.2 <= price` guard,
    # a rate cheaper than 2.2 fell through to 'High_class'.
    if price <= 4:
        return 'Budget_class'
    elif price <= 7.5:
        return 'Mid_class'
    else:
        return 'High_class'

In [None]:
# Replace every numeric rate by its price class (see usd_km_convert)
df['USD/KM'] = [usd_km_convert(rate) for rate in df['USD/KM']]

In [None]:
# Expand the USD/KM price classes into one indicator column per class
df = pd.get_dummies(data=df, columns=['USD/KM'])

In [None]:
# Expand cab_type into one indicator column per cab type
df = pd.get_dummies(data=df, columns=['cab_type'])

In [None]:
# List the distinct values of the source column
df['source'].unique()

In [None]:
# Expand source into one indicator column per distinct value
df = pd.get_dummies(data=df, columns=['source'])

In [None]:
# Inspect the raw distance column before it is binned
df['distance']

In [None]:
# Cut distance into 4 equal-width bins; the column now holds the interval each value falls in
df['distance'] = pd.cut(x=df['distance'], bins=4)

In [None]:
# After dividing, label encode the distance bins: each interval is replaced by an integer code.
# This re-fits the shared label_encoder on the distance column.
df['distance']= label_encoder.fit_transform(df['distance'])

In [None]:
# Expand destination into one indicator column per distinct value
df = pd.get_dummies(data=df, columns=['destination'])

In [None]:
# Frequency of each surge multiplier value
df.surge_multiplier.value_counts()

In [None]:
# Label-encoding the surge_multiplier column (a LabelEncoder, not one-hot encoding):
# every distinct multiplier is replaced by an integer class index, and the shared
# label_encoder is re-fitted on this column
df['surge_multiplier'] = label_encoder.fit_transform(df['surge_multiplier'])

In [None]:
def multi(num):
    """Return 0 when num equals 1.0 (no multiplier applied), otherwise 1."""
    return 0 if num == 1.0 else 1

In [None]:
# Flag the rows where a surge multiplier other than 1.0 was applied.
# surge_multiplier has already been label-encoded above, so it now holds class indices
# (0, 1, 2, ...) instead of the real multipliers; comparing those indices with 1.0
# would test the wrong class. label_encoder was last fitted on this column, so
# inverse_transform recovers the original multiplier values for the check.
original_surge = label_encoder.inverse_transform(df['surge_multiplier'])
df['multiplier_applied'] = list(map(multi, original_surge))

In [None]:
# Split the combined frame back into its train and test parts.
# The boundary is len(train) rather than a hard-coded row count: df was built as
# concat([train, test]) with a reset index, so its first len(train) rows are the train rows.
n_train = len(train)
train_df = df.iloc[:n_train]
# The test part must not carry the target column
test_df = df.iloc[n_train:].drop('fare', axis=1)

In [None]:
# Features: every column of the training frame except the target
X = train_df.drop(columns=['fare'])
# Target: fare divided by the mean training fare (predictions are multiplied back later)
mean_fare = np.array(train_df['fare'].mean())
y = train_df['fare'] / mean_fare

In [None]:
# Hold out 25% of the training rows for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Build the XGBoost regressor; hyperparameters are listed one per line, grouped by role
# NOTE(review): both n_jobs and nthread are set, and silent is passed alongside verbosity —
# confirm which of these the installed xgboost version actually honours
xgb_model = XGBRegressor(
    # booster and tree construction
    booster='gbtree',
    tree_method='exact',
    n_estimators=220,
    num_parallel_tree=5,
    max_depth=4,
    max_delta_step=1,
    min_child_weight=1.7817,
    gamma=0.05,
    # learning
    base_score=0.4,
    learning_rate=0.999,
    scale_pos_weight=1,
    # row / column sampling
    subsample=0.5213,
    colsample_bytree=0.4603,
    colsample_bylevel=1,
    colsample_bynode=1,
    # regularisation
    reg_alpha=0.364,
    reg_lambda=0.671,
    # constraints
    interaction_constraints='',
    monotone_constraints='()',
    # runtime, reproducibility and reporting
    gpu_id=0,
    n_jobs=3,
    nthread=-1,
    random_state=8,
    importance_type='gain',
    silent=True,
    validate_parameters=1,
    verbosity=0,
)

In [None]:
# Fit on the training split only, then predict on the held-out split and report its
# mean squared error. Fitting on X_test/y_test (as before) scored the model on the very
# rows it was trained on and left the 75% training split unused.
xgb_model.fit(X_train.values, y_train.values)
pred = xgb_model.predict(X_test.values)
metrics.mean_squared_error(y_test, pred)

In [None]:
# Predict on the real test rows and undo the target scaling by multiplying
# with the mean training fare
Prediction = (
    np.array(train_df['fare'].mean())
    * xgb_model.predict(test_df.values)
)

In [None]:
# Creating an empty DataFrame that will hold the submission
submission = pd.DataFrame()

In [None]:
# Store the predicted fares in the 'fare' column of the submission frame
submission = submission.assign(fare=Prediction)

In [None]:
# Display the submission frame
# NOTE(review): no cell in view writes it to disk (e.g. via to_csv) — confirm whether that happens elsewhere
submission