In [6]:
from google.cloud import bigquery
import warnings
warnings.filterwarnings('ignore')



# Location of the source tables in BigQuery
project_id = "idmpproject-441123"
dataset_id = "uberFareEstimation"
uber_table_id = 'uber_data'
weather_table_id = 'weather_data'

# Authenticate using the service-account key file at secrets/serviceKey.json
client = bigquery.Client.from_service_account_json("secrets/serviceKey.json")

# Fully qualified table name: <project>.<dataset>.<table>
table_path = ".".join((project_id, dataset_id, uber_table_id))

# Select every column of the Uber table
query = f"""
    SELECT *
    FROM `{table_path}`
"""

# Submit the query, block until it has finished, then load the rows into a DataFrame
query_job = client.query(query)
results = query_job.result()
df = results.to_dataframe()



RefreshError: ('invalid_grant: Invalid JWT Signature.', {'error': 'invalid_grant', 'error_description': 'Invalid JWT Signature.'})

In [23]:
# importing libraries for machine learning

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, r2_score


# change features depending on correlation metrics/what will make model most accurate
features = ['distance', 'cab_type', 'time_stamp', 'destination', 'source', 'surge_multiplier']

# variable that is being predicted
target_variable = 'price'

# Extract X (features) and y (target) from the data.
# X is an explicit copy: later preprocessing assigns new columns onto X, and
# doing that on a bare selection of df triggers pandas' SettingWithCopyWarning
# and leaves it ambiguous whether df itself is affected.
X = df[features].copy()
y = df[target_variable]


Unnamed: 0,distance,cab_type,time_stamp,destination,source,price,surge_multiplier,id,name
0,0.94,Uber,2018-11-28 23:30:00,North End,North Station,4.5,1.0,39765,UberPool
1,0.94,Uber,2018-12-14 19:30:00,North End,North Station,4.5,1.0,437984,UberPool
2,0.63,Uber,2018-11-27 21:30:00,Financial District,South Station,4.5,1.0,1644,UberPool
3,0.63,Uber,2018-12-15 15:00:00,Financial District,South Station,4.5,1.0,10780,UberPool
4,0.63,Uber,2018-12-15 13:30:00,Financial District,South Station,4.5,1.0,21598,UberPool


In [22]:
# preprocessing for model

# Parse 'time_stamp' into datetimes. It is read from df (not from X) so that
# this cell does not depend on X still containing the column, which makes the
# cell safe to re-run.
time_stamps = pd.to_datetime(df['time_stamp'])

# Build X as a new frame instead of mutating it in place:
#   - start from the selected feature columns,
#   - drop the raw 'time_stamp' column (the model does not use it directly),
#   - append the derived time-based features.
# `dt.dayofweek` encodes Monday as 0 through Sunday as 6.
X = (
    df[features]
    .drop(columns='time_stamp')
    .assign(
        hour=time_stamps.dt.hour,
        day_of_week=time_stamps.dt.dayofweek,
    )
)

In [None]:
# Preprocessing step for the model: one-hot encode the categorical columns and
# pass the numerical columns through untouched.

# Columns grouped by how they are treated
categorical_features = ['cab_type', 'source', 'destination']
numerical_features = ['distance', 'surge_multiplier', 'hour', 'day_of_week']

# handle_unknown='ignore': categories not seen during fit do not raise at transform time
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(transformers=[
    ('cat', one_hot_encoder, categorical_features),
    ('num', 'passthrough', numerical_features),
])

# Show the configured transformer to confirm the setup
print(preprocessor)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Build the full modelling pipeline and fit it

# Random Forest regressor: 100 trees, fixed seed for repeatable fits
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Preprocessing runs first, then its output is fed to the regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', rf_model),
])

# Fit both steps on the training split
pipeline.fit(X_train, y_train)
print("Pipeline training complete.")

In [None]:
# making predictions and evaluating model

# mean_absolute_error is imported here because the notebook's sklearn.metrics
# import does not include it; without this line the cell raises NameError.
from sklearn.metrics import mean_absolute_error

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model using MAE (average absolute difference between the
# predicted and the actual target values)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Evaluate the model using R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

In [None]:
# Rank the model's input columns by their Random Forest importance

# Fitted steps pulled out of the pipeline
rf_model_trained = pipeline.named_steps['regressor']
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']

# Column names in the order the preprocessor emits them:
# one-hot encoded categorical columns first, then the passthrough numerical ones
feature_names = fitted_encoder.get_feature_names_out(categorical_features)
all_features = [*feature_names, *numerical_features]
feature_importances = rf_model_trained.feature_importances_

# Pair each column with its importance, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': all_features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Display the feature importances
print(feature_importance_df)
