In [1]:
# to manipulate data
import pandas as pd

# processing and manipulating data in array form
import numpy as np

# for data processing and storage
import ast
import pickle

# function to generate random numbers
import random



In [3]:
# Load the preprocessing objects and the model that were serialized with pickle.
# SECURITY NOTE: pickle.load can execute arbitrary code during deserialization,
# so these .pkl files must only be loaded when they come from a trusted source.
with open('model_scaler.pkl', 'rb') as file_1:
    model_scaler = pickle.load(file_1)

with open('model_encoder.pkl', 'rb') as file_2:
    model_encoder = pickle.load(file_2)

with open('huber_model.pkl', 'rb') as file_3:
    huber_model = pickle.load(file_3)

# Read the column-name lists, which are stored as plain text. The encoding is
# stated explicitly so the result does not depend on the platform's default
# locale encoding. At this point both variables still hold raw strings.
with open('list_num_columns.txt', 'r', encoding='utf-8') as file_4:
    list_num_columns = file_4.read()

with open('list_cat_columns.txt', 'r', encoding='utf-8') as file_5:
    list_cat_columns = file_5.read()

In [4]:
# Convert the text read from the column-list files into Python objects.
# ast.literal_eval only evaluates Python literals, so it is a safe way to
# parse the stored list representation.
#
# The isinstance guards make this cell idempotent: after the first run the
# variables no longer hold strings, and calling ast.literal_eval on the
# already-parsed object would raise ValueError when the cell is re-run.
if isinstance(list_num_columns, str):
    list_num_columns = ast.literal_eval(list_num_columns)
if isinstance(list_cat_columns, str):
    list_cat_columns = ast.literal_eval(list_cat_columns)

# Display the parsed numerical column names as the cell output.
list_num_columns

['distance', 'surge_multiplier']

In [5]:
# Seed shared by Python's `random` module and NumPy's legacy global generator
seed_value = 42

# Number of synthetic rows to generate
n = 30000

# Candidate values for the 'name' and 'surge_multiplier' columns
name_options = [
    'Shared', 'Lux', 'Lyft', 'Lux Black XL', 'Lyft XL', 'Lux Black',
    'UberXL', 'Black', 'UberX', 'WAV', 'Black SUV', 'UberPool',
]
surge_options = [1.0, 1.25, 2.5, 2.0, 1.75, 1.5, 3.0]

# Draw 'name' first and 'surge_multiplier' second from the seeded `random`
# generator; both draws share one stream, so their order fixes the values.
random.seed(seed_value)
names = random.choices(name_options, k=n)
surges = random.choices(surge_options, k=n)

# Draw 'distance' from NumPy's seeded generator via uniform(0.5, 5.0),
# rounded to two decimals.
np.random.seed(seed_value)
distances = np.random.uniform(low=0.5, high=5.0, size=n).round(2)

# Assemble the inference DataFrame (column order: name, distance, surge_multiplier)
inference_columns = {
    'name': names,
    'distance': distances,
    'surge_multiplier': surges,
}
data_inf1 = pd.DataFrame(inference_columns)


In [7]:
# Display the generated inference DataFrame; as the last expression in the
# cell it is rendered as the cell's output.
data_inf1

Unnamed: 0,name,distance,surge_multiplier
0,Black,2.19,2.50
1,Shared,4.78,3.00
2,Lux Black XL,3.79,1.00
3,Lyft,3.19,1.50
4,UberX,1.20,1.75
...,...,...,...
29995,Lyft XL,4.79,3.00
29996,Lux Black,3.87,1.00
29997,Lux Black XL,0.59,2.50
29998,UberX,2.30,2.50


In [8]:
# Separate the inference frame into its numerical and categorical feature
# subsets, using the column-name lists loaded earlier.
data_inf_num = data_inf1.loc[:, list_num_columns]
data_inf_cat = data_inf1.loc[:, list_cat_columns]

# Show which columns ended up in the categorical subset.
data_inf_cat.columns

Index(['name'], dtype='object')

In [9]:
# Scale the numerical columns with the scaler object loaded from
# 'model_scaler.pkl'. Only transform() is called here, so nothing is fitted on
# the inference data.
# NOTE(review): presumably a scaler fitted in the training notebook - confirm.
data_inf_num_scaled = model_scaler.transform(data_inf_num)
# Display the scaled values as the cell output.
data_inf_num_scaled

array([[0.27678571, 0.75      ],
       [0.60714286, 1.        ],
       [0.48086735, 0.        ],
       ...,
       [0.07270408, 0.75      ],
       [0.29081633, 0.75      ],
       [0.20918367, 0.5       ]])

In [10]:
# Encode the categorical columns with the encoder object loaded from
# 'model_encoder.pkl', then call .toarray() on the result to obtain an array.
# NOTE(review): presumably the encoder returns a sparse matrix, which is why
# .toarray() is needed - confirm against the training notebook.
data_inf_cat_encoded = model_encoder.transform(data_inf_cat).toarray()
# Display the encoded values as the cell output.
data_inf_cat_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Build the final feature matrix: scaled numerical features first, followed
# by the encoded categorical features, joined column-wise into one array.
feature_parts = (data_inf_num_scaled, data_inf_cat_encoded)
data_inf_final = np.hstack(feature_parts)

# Display the combined feature matrix as the cell output.
data_inf_final

array([[0.27678571, 0.75      , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.60714286, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.48086735, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.07270408, 0.75      , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.29081633, 0.75      , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.20918367, 0.5       , 1.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
# Run inference: pass the combined feature matrix to the model loaded from
# 'huber_model.pkl' and keep the returned predictions.
y_pred_inf = huber_model.predict(data_inf_final)
# Display the predictions as the cell output.
y_pred_inf

array([45.06842426, 45.71510069, 34.63854806, ..., 51.93527782,
       34.89255754, 45.31447708])

In [13]:
# Wrap the predictions in a single-column DataFrame so they can be displayed
# as a table and joined with the input data.
y_pred_inf_df = pd.DataFrame({'Price - Prediction': y_pred_inf})

# Display the prediction table as the cell output.
y_pred_inf_df

Unnamed: 0,Price - Prediction
0,45.068424
1,45.715101
2,34.638548
3,19.900091
4,19.764347
...,...
29995,54.120893
29996,25.934269
29997,51.935278
29998,34.892558


In [14]:
# Place the generated inputs and their predicted prices side by side
# (column-wise); the result is only displayed, not stored.
frames_side_by_side = [data_inf1, y_pred_inf_df]
pd.concat(frames_side_by_side, axis='columns')

Unnamed: 0,name,distance,surge_multiplier,Price - Prediction
0,Black,2.19,2.50,45.068424
1,Shared,4.78,3.00,45.715101
2,Lux Black XL,3.79,1.00,34.638548
3,Lyft,3.19,1.50,19.900091
4,UberX,1.20,1.75,19.764347
...,...,...,...,...
29995,Lyft XL,4.79,3.00,54.120893
29996,Lux Black,3.87,1.00,25.934269
29997,Lux Black XL,0.59,2.50,51.935278
29998,UberX,2.30,2.50,34.892558


In [15]:
# Read the model's intercept_ and coef_ attributes and print them.
intercept, slope = huber_model.intercept_, huber_model.coef_

# Print each value next to its label.
for label, value in (('Intercept : ', intercept), ('Slope : ', slope)):
    print(label, value)

Intercept :  14.884001848461182
Slope :  [ 18.81555129  33.30206215   9.84353802  -3.24258416   1.8104879
  10.70676198 -10.91724496  -5.51289837 -13.89469088  -3.30620586
 -11.29168761 -10.43986043  -4.94648967 -10.43480724]
