#### tip_amount prediction

- For tip amount prediction we evaluate two models: Linear Regression and Lasso.
- Because this is a new .ipynb file, we reload the data and redo the train/test split.
- We load the trained models from their pickle files and compute the test-set R² for each.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer




# Read the raw yellow-taxi trips from disk
taxi_raw = pd.read_csv('yellow_taxi_data.csv')

# Invalid values in the test data can make the pipeline fail or give
# unrealistic predictions, so drop the bad rows exactly as training.ipynb did.
num_cols = [
    'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tolls_amount',
    'improvement_surcharge', 'congestion_surcharge', 'airport_fee',
]

# A row is kept when passenger_count is not 0 and every numeric column is
# either missing (NaN) or non-negative.
rows_to_keep = taxi_raw['passenger_count'] != 0
for col in num_cols:
    rows_to_keep &= taxi_raw[col].isna() | (taxi_raw[col] >= 0)
taxi_data = taxi_raw[rows_to_keep]

# Seeded 80/20 split of the cleaned rows
X_train_tip, X_test_tip = train_test_split(taxi_data, random_state=0, test_size=0.2)

# Separate the tip_amount target from the remaining columns
X_train_tip_features, y_train_tip = (
    X_train_tip.drop(columns='tip_amount'),
    X_train_tip['tip_amount'].copy(),
)
X_test_features_tip, y_test_tip = (
    X_test_tip.drop(columns='tip_amount'),
    X_test_tip['tip_amount'].copy(),
)
    

- `Date_time_total_encoder`, the transformer that adds the new features, must be defined in this notebook before the models are loaded.
- Without this class definition, loading the saved models raises an error.

In [2]:
class Date_time_total_encoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives pickup-time and pre-tip total features.

    ``transform`` adds ``pre_tip_total_amount``, ``pickup_time_of_day`` and
    ``pickup_day_of_week`` and removes the two raw datetime columns.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        ``X`` must contain ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``
        and the seven charge columns summed below. The input frame itself is
        not modified.
        """
        out = X.copy()

        # Parse the pickup timestamp once; unparseable values become NaT.
        pickup_ts = pd.to_datetime(out['tpep_pickup_datetime'], errors='coerce')

        # Everything charged before the tip, added left to right.
        charge_columns = [
            'fare_amount', 'extra', 'mta_tax', 'tolls_amount',
            'improvement_surcharge', 'congestion_surcharge', 'airport_fee',
        ]
        pre_tip_total = out[charge_columns[0]]
        for name in charge_columns[1:]:
            pre_tip_total = pre_tip_total + out[name]
        out['pre_tip_total_amount'] = pre_tip_total

        # Half-open hour buckets: [0, 12), [12, 17), [17, 19), [19, 24).
        out['pickup_time_of_day'] = pd.cut(
            pickup_ts.dt.hour.astype(float),
            bins=[0, 12, 17, 19, 24],
            labels=['Morning', 'Afternoon', 'Evening', 'Night'],
            right=False,
        )

        # Day name of the pickup date (e.g. 'Monday').
        out['pickup_day_of_week'] = pickup_ts.dt.day_name()

        # The raw datetime columns are no longer needed.
        return out.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])

In [3]:


# Load trained Linear Regression pipeline for tip_amount.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
# NOTE(review): per the markdown above, Date_time_total_encoder must already be
# defined or loading fails — keep the class-definition cell before this one.
linear_model = joblib.load('tip_linear_model.pkl')
# Load trained Lasso Regression pipeline for tip_amount (same caveats apply)
lasso_model = joblib.load('tip_linear_lasso_model.pkl')

In [4]:
# Predict tip_amount for the held-out test features using Linear Regression
y_pred_linear = linear_model.predict(X_test_features_tip)
# Predict tip_amount for the same test features using Lasso Regression
y_pred_lasso = lasso_model.predict(X_test_features_tip)

In [5]:

from sklearn.metrics import r2_score
import numpy as np

def r2_ci(y_true, y_pred, n_bootstrap=1000, alpha=0.05, random_state=0):
    """Return R² together with a percentile-bootstrap confidence interval.

    The observations are resampled with replacement ``n_bootstrap`` times,
    R² is recomputed on every resample, and the interval is read off the
    ``alpha / 2`` and ``1 - alpha / 2`` percentiles of those scores.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values, in matching order.
    n_bootstrap : int, default 1000
        Number of bootstrap resamples.
    alpha : float, default 0.05
        Significance level; the default gives a 95% interval.
    random_state : int, numpy Generator or None, default 0
        Seed for the resampling so that re-running the notebook prints the
        same interval. Pass ``None`` for a fresh, unseeded draw.

    Returns
    -------
    (r2, ci_lower, ci_upper) : tuple of float

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1 or ``n_bootstrap`` < 1.

    Notes
    -----
    Relies on ``r2_score`` (imported from ``sklearn.metrics`` in this
    notebook) being in scope.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    # Plain arrays so that positional fancy-indexing works even when the
    # inputs are pandas Series with a shuffled index.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    r2 = r2_score(y_true, y_pred)

    # A local generator keeps the result reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(random_state)
    n = len(y_true)
    bootstrapped_r2 = np.empty(n_bootstrap)
    for b in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)  # sample row positions with replacement
        bootstrapped_r2[b] = r2_score(y_true[idx], y_pred[idx])

    ci_lower, ci_upper = np.percentile(
        bootstrapped_r2, [100 * alpha / 2, 100 * (1 - alpha / 2)]
    )

    return r2, ci_lower, ci_upper


# Test-set R² and bootstrap 95% CI for the tip_amount Linear Regression model
r2_lin, ci_low_lin, ci_up_lin = r2_ci(y_test_tip, y_pred_linear)
print(f"Linear Regression R² = {r2_lin:.3f}, 95% CI = [{ci_low_lin:.3f}, {ci_up_lin:.3f}]")

# Same metric for the tip_amount Lasso model (label typo "lineear" corrected)
r2_lasso, ci_low_lasso, ci_up_lasso = r2_ci(y_test_tip, y_pred_lasso)
print(f"Linear Regression with Lasso R² = {r2_lasso:.3f}, 95% CI = [{ci_low_lasso:.3f}, {ci_up_lasso:.3f}]")


Linear Regression R² = 0.596, 95% CI = [0.529, 0.648]
lineear Regression with lasso R² = 0.622, 95% CI = [0.548, 0.685]


In [6]:
# Preview: first ten test rows, actual tip next to both models' predictions
(
    y_test_tip.iloc[:10]
    .to_frame('y_true_tip')
    .assign(
        y_pred_linear_tip=y_pred_linear[:10],
        y_pred_lasso_tip=y_pred_lasso[:10],
    )
)


Unnamed: 0,y_true_tip,y_pred_linear_tip,y_pred_lasso_tip
2725,16.11,11.674796,11.321434
1811,3.4,3.382731,3.031659
1706,0.0,-1.864907,-1.657701
9677,0.0,2.986335,5.38667
5711,4.38,4.188563,3.665486
2036,5.12,4.482241,4.505437
5924,2.24,2.624881,2.764284
4875,0.0,-2.674313,-3.103006
3788,3.92,3.203212,3.586097
2983,5.32,4.744582,4.779868


#### Fare amount

In [7]:
# Step 1: seeded 80/20 split of the cleaned trips (same seed as the tip split)
X_train_fare, X_test_fare = train_test_split(taxi_data, random_state=0, test_size=0.2)

# Step 2: separate the fare_amount target from the remaining columns
X_train_fare_features, y_train_fare = (
    X_train_fare.drop(columns='fare_amount'),
    X_train_fare['fare_amount'].copy(),
)
X_test_fare_feature, y_test_fare = (
    X_test_fare.drop(columns='fare_amount'),
    X_test_fare['fare_amount'].copy(),
)

In [8]:
class Date_time_total_fare_encoder(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives pickup-time features for the fare models.

    ``transform`` adds ``pickup_time_of_day`` and ``pickup_day_of_week`` and
    removes the two raw datetime columns.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        ``X`` must contain ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``. The input frame itself is not modified.
        """
        out = X.copy()

        # Parse the pickup timestamp once; unparseable values become NaT.
        pickup_ts = pd.to_datetime(out['tpep_pickup_datetime'], errors='coerce')

        # Half-open hour buckets: [0, 12), [12, 17), [17, 19), [19, 24).
        out['pickup_time_of_day'] = pd.cut(
            pickup_ts.dt.hour.astype(float),
            bins=[0, 12, 17, 19, 24],
            labels=['Morning', 'Afternoon', 'Evening', 'Night'],
            right=False,
        )

        # Day name of the pickup date (e.g. 'Monday').
        out['pickup_day_of_week'] = pickup_ts.dt.day_name()

        # The raw datetime columns are no longer needed.
        return out.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])


In [9]:

# Load trained Linear Regression pipeline for fare_amount.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
# NOTE(review): presumably these pipelines reference Date_time_total_fare_encoder,
# so its definition cell must run first — confirm against the training notebook.
fare_linear_model = joblib.load('fare_linear_model.pkl')
# Load trained Lasso Regression pipeline for fare_amount (same caveats apply)
fare_lasso_model = joblib.load('fare_linear_lasso_model.pkl')

In [10]:
# Predict fare_amount for the held-out test features using Linear Regression
fare_y_pred_linear = fare_linear_model.predict(X_test_fare_feature)
# Predict fare_amount for the same test features using Lasso Regression
fare_y_pred_lasso = fare_lasso_model.predict(X_test_fare_feature)

In [11]:

from sklearn.metrics import r2_score
import numpy as np

def r2_ci(y_true, y_pred, n_bootstrap=1000, alpha=0.05, random_state=0):
    """Return R² together with a percentile-bootstrap confidence interval.

    The observations are resampled with replacement ``n_bootstrap`` times,
    R² is recomputed on every resample, and the interval is read off the
    ``alpha / 2`` and ``1 - alpha / 2`` percentiles of those scores.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values, in matching order.
    n_bootstrap : int, default 1000
        Number of bootstrap resamples.
    alpha : float, default 0.05
        Significance level; the default gives a 95% interval.
    random_state : int, numpy Generator or None, default 0
        Seed for the resampling so that re-running the notebook prints the
        same interval. Pass ``None`` for a fresh, unseeded draw.

    Returns
    -------
    (r2, ci_lower, ci_upper) : tuple of float

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly between 0 and 1 or ``n_bootstrap`` < 1.

    Notes
    -----
    Relies on ``r2_score`` (imported from ``sklearn.metrics`` in this
    notebook) being in scope.
    """
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    # Plain arrays so that positional fancy-indexing works even when the
    # inputs are pandas Series with a shuffled index.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    r2 = r2_score(y_true, y_pred)

    # A local generator keeps the result reproducible without touching
    # NumPy's global random state.
    rng = np.random.default_rng(random_state)
    n = len(y_true)
    bootstrapped_r2 = np.empty(n_bootstrap)
    for b in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)  # sample row positions with replacement
        bootstrapped_r2[b] = r2_score(y_true[idx], y_pred[idx])

    ci_lower, ci_upper = np.percentile(
        bootstrapped_r2, [100 * alpha / 2, 100 * (1 - alpha / 2)]
    )

    return r2, ci_lower, ci_upper


# Test-set R² and bootstrap 95% CI for the fare_amount Linear Regression model
# (reuses, and therefore overwrites, the r2_lin / ci_* names from the tip section)
r2_lin, ci_low_lin, ci_up_lin = r2_ci(y_test_fare, fare_y_pred_linear)
print(f"Linear Regression R² = {r2_lin:.3f}, 95% CI = [{ci_low_lin:.3f}, {ci_up_lin:.3f}]")

# Same metric for the fare_amount Lasso model (label typo "lineear" corrected)
r2_lasso, ci_low_lasso, ci_up_lasso = r2_ci(y_test_fare, fare_y_pred_lasso)
print(f"Linear Regression with Lasso R² = {r2_lasso:.3f}, 95% CI = [{ci_low_lasso:.3f}, {ci_up_lasso:.3f}]")


Linear Regression R² = 0.887, 95% CI = [0.851, 0.916]
lineear Regression with lasso R² = 0.900, 95% CI = [0.865, 0.928]


In [12]:
# Preview: first ten test rows, actual fare next to both models' predictions
(
    y_test_fare.iloc[:10]
    .to_frame('y_true_fare')
    .assign(
        y_pred_linear_fare=fare_y_pred_linear[:10],
        y_pred_lasso_fare=fare_y_pred_lasso[:10],
    )
)


Unnamed: 0,y_true_fare,y_pred_linear_fare,y_pred_lasso_fare
2725,70.0,71.433588,70.964266
1811,12.1,10.744725,10.817641
1706,7.9,8.994463,9.743692
9677,55.5,71.701989,75.935238
5711,13.5,13.78994,13.50577
2036,19.1,14.917273,14.757618
5924,7.2,10.624475,10.627819
4875,8.6,12.552123,14.712778
3788,15.6,13.846179,13.967881
2983,22.6,19.92666,19.909202
