# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import time


## IF USING GOOGLE DRIVE

In [2]:
'''
#Connection with Google Drive
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

#Set the current directory
import os
os.chdir('/content/drive/My Drive/')
'''

"\n#Connection with Google Drive\nfrom google.colab import drive\ndrive.mount('/content/drive/', force_remount=True)\n\n#Set the current directory\nimport os\nos.chdir('/content/drive/My Drive/')\n"

# Import Data
Data should already be preprocessed

In [3]:
%cd Advanced_Python_Project

[Errno 2] No such file or directory: 'Advanced_Python_Project'
/scratch/mbh425/my_env/Advanced_Python_Project


In [4]:
clean_df = pd.read_csv('./Data/clean_data.csv', nrows=5_000_000)

In [5]:
train_df = clean_df[clean_df['year'].isin([2009, 2010, 2011, 2012, 2013, 2014])]
test_df = clean_df[clean_df['year'].isin([2015])]

In [6]:
X_train, y_train = train_df.drop('fare_amount', axis=1), train_df['fare_amount']
X_test, y_test = test_df.drop('fare_amount', axis=1), test_df['fare_amount']

# Models

In [7]:
# To store Results
res_df = pd.DataFrame(columns=['model', 'train_mae', 'test_mae', 'train_r2', 'test_r2', 'train_time'])

## Linear Regression

In [8]:
start_time = time.time()

# Initialize model
lr = LinearRegression()

# Train model
lr.fit(X_train, y_train)

# Predict on train and test data
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# Calculate metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Linear Regression: Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}, Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
print(f"Linear Regression: Train time: {time.time() - start_time:.2f} seconds")

# Store results in the dataframe
res_df = pd.concat([res_df, pd.DataFrame({'model': ['linear_regression'], 'train_mae': [train_mae], 'test_mae': [test_mae], 'train_r2': [train_r2], 'test_r2': [test_r2], 'train_time': [time.time() - start_time]})], ignore_index=True)


Linear Regression: Train MAE: 1.9932, Test MAE: 2.3602, Train R^2: 0.7737, Test R^2: 0.7793
Linear Regression: Train time: 30.51 seconds


## Random Forest

In [9]:
start_time = time.time()

# Initialize model
rf = RandomForestRegressor(n_estimators=100, max_depth=10)

# Train model
rf.fit(X_train, y_train)

# Predict on train and test data
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

# Calculate metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Random Forest: Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}, Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
print(f"Random Forest: Train time: {time.time() - start_time:.2f} seconds")

# Store results in the dataframe
res_df = pd.concat([res_df, pd.DataFrame({'model': ['random_forest'], 'train_mae': [train_mae], 'test_mae': [test_mae], 'train_r2': [train_r2], 'test_r2': [test_r2], 'train_time': [time.time() - start_time]})], ignore_index=True)

Random Forest: Train MAE: 1.7098, Test MAE: 1.9167, Train R^2: 0.8326, Test R^2: 0.8377
Random Forest: Train time: 3028.52 seconds


## XGBoost

In [10]:
start_time = time.time()

# Define hyperparameters for XGBoost
params = {
    'objective': 'reg:squarederror',
    'max_depth': 9,
    'eta': 0.1,
    'subsample': 0.5,
    'num_round': 100,
    'tree_method':'hist', #was gpu_hist for gpu model
    'learning_rate': 0.3,
    'colsample_bytree' : 1,
    'alpha' : .01
}

# Initialize model
xgb = XGBRegressor(**params)

# Train model
xgb.fit(X_train, y_train)

# Predict on train and test data
y_train_pred = xgb.predict(X_train)
y_test_pred = xgb.predict(X_test)

# Calculate metrics
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"XGBoost: Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}, Train R^2: {train_r2:.4f}, Test R^2: {test_r2:.4f}")
print(f"XGBoost: Train time: {time.time() - start_time:.2f} seconds")

# Store results in the dataframe
res_df = pd.concat([res_df, pd.DataFrame({'model': ['xgboost'], 'train_mae': [train_mae], 'test_mae': [test_mae], 'train_r2': [train_r2], 'test_r2': [test_r2], 'train_time': [time.time() - start_time]})], ignore_index=True)

Parameters: { "num_round" } are not used.

XGBoost: Train MAE: 1.6331, Test MAE: 1.8610, Train R^2: 0.8491, Test R^2: 0.8471
XGBoost: Train time: 135.56 seconds


# Results

In [11]:
print(res_df)

               model  train_mae  test_mae  train_r2   test_r2   train_time
0  linear_regression   1.993208  2.360248  0.773681  0.779267    30.505146
1      random_forest   1.709813  1.916723  0.832561  0.837747  3028.520471
2            xgboost   1.633082  1.861046  0.849128  0.847062   135.563422


In [14]:
res_df.to_csv('./results/results_cpu_5mil.csv', index=False)