### Import Nessary library

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
import gcsfs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

###  Load dataset 

In [15]:
#loading CSV file in the GCS bucket
gcs_path = 'gs://cab_bucket/cab-gcp-vertex-pipelines1/data/Final_Chicago_Train.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(gcs_path)

# Display the DataFrame
df.head()


Unnamed: 0,unique_key,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,pickup_community_area,dropoff_community_area,...,company,pickup_latitude,pickup_longitude,pickup_location,dropoff_latitude,dropoff_longitude,dropoff_location,trip_start_timestamp_day,trip_start_timestamp_month,trip_start_timestamp_hour
0,08ce1d75a8a36df6f00823d64033c5042b79ec8b,b7eb3d8ccfb18e383435f51a87024fc0d3503a6fd90a66...,2019-01-14 18:45:00+00:00,2019-01-14 19:00:00+00:00,900,0.0,17031330000.0,17031080000.0,33.0,8.0,...,Taxi Affiliation Services,41.85935,-87.617358,POINT (-87.6173580061 41.859349715),41.893216,-87.637844,POINT (-87.6378442095 41.8932163595),14,1,18
1,9eb190a4d5e5facee2b4ef5b0d0ed0f08932113c,d1cb0b38e64d922ac1b61791a9cc03cd203fee1cfbef86...,2019-03-07 05:45:00+00:00,2019-03-07 05:45:00+00:00,3,0.02,,,,,...,Patriot Taxi Dba Peace Taxi Associat,,,,,,,7,3,5
2,cce2d0ec80c9f7cdcb2c7a68b95de6c0d93dfa3e,ecfb6f2cdce5d4c4e80218f58070ae719060ee47e648f4...,2019-01-17 21:30:00+00:00,2019-01-17 22:00:00+00:00,1200,14.7,,,76.0,24.0,...,Top Cab Affiliation,41.980264,-87.913625,POINT (-87.913624596 41.9802643146),41.901207,-87.676356,POINT (-87.6763559892 41.90120699410001),17,1,21
3,7d7d3218828b6f3cf18358447a13f4c174b2837a,ff8391eff75559d6fd22b704d4b2c69422ca8dc8ed0ac4...,2019-03-13 07:00:00+00:00,2019-03-13 07:15:00+00:00,519,1.29,17031080000.0,17031840000.0,8.0,32.0,...,Sun Taxi,41.891972,-87.612945,POINT (-87.6129454143 41.8919715078),41.880994,-87.632746,POINT (-87.6327464887 41.8809944707),13,3,7
4,37950728f711fe741f2dd9576633750093bb8a72,bed9183af3fecf2a370b57b32766defe663eb2f990a744...,2019-02-13 07:30:00+00:00,2019-02-13 07:45:00+00:00,1140,4.1,17031080000.0,,8.0,,...,Medallion Leasin,41.905858,-87.630865,POINT (-87.6308650266 41.9058577688),,,,13,2,7


In [17]:
# Convert the 'trip_start_timestamp' to datetime if it's not already.
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])

# Extract hour, day, and month from the timestamp
df['trip_start_hour'] = df['trip_start_timestamp'].dt.hour
df['trip_start_day'] = df['trip_start_timestamp'].dt.day
df['trip_start_month'] = df['trip_start_timestamp'].dt.month

In [18]:
columns_to_drop = ['unique_key', 'taxi_id', 'trip_start_timestamp', 
                   'trip_end_timestamp', 'dropoff_location', 'pickup_location']
# Dropping unnecessary columns 
df= df.drop(columns=columns_to_drop)

### Data Preprocessing 

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.impute import SimpleImputer

def one_hot_encode(values, num_categories):
    """One-hot encode the values."""
    categories = sorted(set(values.dropna()))  # Handle NaN by dropping
    one_hot_encoded = []
    for v in values:
        encoding = [1 if v == category else 0 for category in categories]
        one_hot_encoded.append(encoding)
    return pd.DataFrame(one_hot_encoded, columns=[f"{values.name}_{c}" for c in categories], index=values.index)

def preprocess_data(df):
    numerical_features = ['trip_miles', 'trip_seconds', 'tips', 'tolls', 'extras', 'trip_total']
    bucket_features = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
    categorical_numerical_features = [
        'trip_start_hour', 'trip_start_day', 'trip_start_month',
        'pickup_census_tract', 'dropoff_census_tract', 'pickup_community_area',
        'dropoff_community_area'
    ]
    categorical_string_features = ['payment_type', 'company']
    
    # Handling missing values and scaling numerical features
    for feature in numerical_features:
        df[feature] = SimpleImputer(strategy='mean').fit_transform(df[[feature]])
        df[feature] = StandardScaler().fit_transform(df[[feature]])
    
    # Bucketizing geographical features
    for feature in bucket_features:
        df[feature] = SimpleImputer(strategy='mean').fit_transform(df[[feature]])
        discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
        df[feature] = discretizer.fit_transform(df[[feature]])
    
    # One-hot encoding for categorical string features
    for feature in categorical_string_features:
        df_filled = SimpleImputer(strategy='constant', fill_value='missing').fit_transform(df[[feature]].astype(str))
        df_encoded = one_hot_encode(pd.Series(df_filled.flatten(), name=feature), num_categories=None)
        df = pd.concat([df, df_encoded], axis=1)
        df.drop(columns=[feature], inplace=True)
    
    # One-hot encoding for categorical numerical features
    for feature in categorical_numerical_features:
        df_filled = SimpleImputer(strategy='most_frequent').fit_transform(df[[feature]].astype(str))
        df_encoded = one_hot_encode(pd.Series(df_filled.flatten(), name=feature), num_categories=None)
        df = pd.concat([df, df_encoded], axis=1)
        df.drop(columns=[feature], inplace=True)
    
    # Fill missing values for the label (fare)
    df['fare'] = SimpleImputer(strategy='mean').fit_transform(df[['fare']])
    
    return df

In [20]:
# Assuming your DataFrame after dropping columns is named df
df_processed = preprocess_data(df)

# Now you can display the processed DataFrame
df_processed.head()

Unnamed: 0,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_latitude,pickup_longitude,dropoff_latitude,...,dropoff_community_area_71.0,dropoff_community_area_72.0,dropoff_community_area_73.0,dropoff_community_area_74.0,dropoff_community_area_75.0,dropoff_community_area_76.0,dropoff_community_area_77.0,dropoff_community_area_8.0,dropoff_community_area_9.0,dropoff_community_area_nan
0,0.052148,-0.613203,11.0,-0.602814,-0.009487,-0.038608,-0.097897,5.0,7.0,6.0,...,0,0,0,0,0,0,0,1,0,0
1,-0.560844,-0.609551,3.25,-0.602814,-0.009487,-0.038608,-0.221185,6.0,6.0,6.0,...,0,0,0,0,0,0,0,0,0,1
2,0.257163,2.071362,36.75,3.591021,-0.009487,0.080586,0.569445,8.0,0.0,6.0,...,0,0,0,0,0,0,0,0,0,0
3,-0.20822,-0.377619,7.5,-0.087178,-0.009487,-0.038608,-0.12176,6.0,7.0,6.0,...,0,0,0,0,0,0,0,0,0,0
4,0.21616,0.135553,14.5,0.944092,-0.009487,-0.038608,0.037321,6.0,7.0,6.0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
df.columns

Index(['trip_seconds', 'trip_miles', 'pickup_census_tract',
       'dropoff_census_tract', 'pickup_community_area',
       'dropoff_community_area', 'fare', 'tips', 'tolls', 'extras',
       'trip_total', 'payment_type', 'company', 'pickup_latitude',
       'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
       'trip_start_timestamp_day', 'trip_start_timestamp_month',
       'trip_start_timestamp_hour', 'trip_start_hour', 'trip_start_day',
       'trip_start_month'],
      dtype='object')

### Defining X & y

In [22]:
y =df_processed['fare']
# Dropping unnecessary columns 
X= df_processed.drop(columns='fare')


### Train Test Split

In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Fitting the model and evaluating it

In [24]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define the ANN functions
def _build_ann_model():
    """Creates a simple artificial neural network model."""
    ann_model = MLPRegressor(hidden_layer_sizes=(100, 70, 50, 20),
                             activation='relu',
                             solver='adam',
                             learning_rate_init=0.0005,
                             random_state=42)
    return ann_model

def _train_ann_model(model, train_data, train_labels):
    """Trains the artificial neural network model."""
    model.fit(train_data, train_labels)

def _evaluate_ann_model(model, eval_data, eval_labels):
    """Evaluates the artificial neural network model."""
    eval_predictions = model.predict(eval_data)
    mse = ((eval_predictions - eval_labels) ** 2).mean()
    return mse

# Initialize models including the ANN model
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "ANN": _build_ann_model()  
}

# Define a function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    if isinstance(model, MLPRegressor):
        # Train and evaluate ANN model separately
        _train_ann_model(model, X_train, y_train)
        mse = _evaluate_ann_model(model, X_test, y_test)
        return mse, np.sqrt(mse), None  # No R2 score for ANN model
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        return mse, rmse, r2

# Evaluate each model
results = {}
for model_name, model in models.items():
    mse, rmse, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[model_name] = {"MSE": mse, "RMSE": rmse, "R2": r2}

# Print the results
for model_name, metrics in results.items():
    if metrics['R2'] is not None:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}, R2: {metrics['R2']:.4f}")
    else:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}")


Linear Regression - MSE: 2438653910139406.0000, RMSE: 49382728.8649, R2: -12413898526467.4023
Decision Tree - MSE: 1981.1061, RMSE: 44.5096, R2: -9.0848
Random Forest - MSE: 3948.6124, RMSE: 62.8380, R2: -19.1003
Gradient Boosting - MSE: 6124.0131, RMSE: 78.2561, R2: -30.1741
ANN - MSE: 2.0556, RMSE: 1.4337


##### Here ANN model is giving the best result compared to other models. So we are selecting the ANN model

### Hyperparameter Tuning

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define the ANN functions
def _build_ann_model():
    """Creates a simple artificial neural network model."""
    ann_model = MLPRegressor(random_state=42)
    return ann_model
def _train_ann_model(model, train_data, train_labels):
    """Trains the artificial neural network model."""
    model.fit(train_data, train_labels)
def _evaluate_ann_model(model, eval_data, eval_labels):
    """Evaluates the artificial neural network model."""
    eval_predictions = model.predict(eval_data)
    mse = mean_squared_error(eval_labels, eval_predictions)
    return mse
# Hyperparameter grids
param_grids = {
    "Linear Regression": {},
    "Decision Tree": {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20]
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20]
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    "ANN": {
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 70, 50, 20)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'learning_rate_init': [0.001, 0.0005, 0.0001]
    }
}
# Initialize models including the ANN model
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "ANN": _build_ann_model()  # Include the ANN model here
}

# Define a function to evaluate models
def evaluate_model(model, param_grid, X_train, X_test, y_train, y_test):
    if isinstance(model, MLPRegressor):
        grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        _train_ann_model(best_model, X_train, y_train)
        mse = _evaluate_ann_model(best_model, X_test, y_test)
        return mse, np.sqrt(mse), None, best_model
    else:
        grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=3)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        return mse, rmse, r2, best_model

# Evaluate each model
results = {}
best_models = {}
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    mse, rmse, r2, best_model = evaluate_model(model, param_grid, X_train, X_test, y_train, y_test)
    results[model_name] = {"MSE": mse, "RMSE": rmse, "R2": r2}
    best_models[model_name] = best_model

# Print the results
for model_name, metrics in results.items():
    if metrics['R2'] is not None:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}, R2: {metrics['R2']:.4f}")
    else:
        print(f"{model_name} - MSE: {metrics['MSE']:.4f}, RMSE: {metrics['RMSE']:.4f}")

# Print the best model for each algorithm
for model_name, best_model in best_models.items():
    print(f"Best model for {model_name}: {best_model}")
