In [1]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, asin
from datetime import datetime
import altair as alt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# 1. Data importation and manipulation

In [2]:
def load_data(paths):
    dfs = [pd.read_csv(file) for file in paths]
    return pd.concat(dfs, ignore_index=True)


def haversine(row):
    lon1, lat1, lon2, lat2 = row["start_lng"], row["start_lat"], row["end_lng"], row["end_lat"]
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers
    return c * r


def ride_duration(row):
    start_time = datetime.strptime(row["started_at"], "%Y-%m-%d %H:%M:%S")
    end_time = datetime.strptime(row["ended_at"], "%Y-%m-%d %H:%M:%S")
    return (end_time - start_time).total_seconds() / 60


def preprocess_data(file_paths):
    df = load_data(file_paths)
    df["Haversine"] = df.apply(haversine, axis=1)
    df["Minutes"] = df.apply(ride_duration, axis=1)
    df = df.dropna()
    return df


In [3]:
FILE_PATHS = [f"data/JC-2022{i:02d}-citibike-tripdata.csv" for i in range(1, 12)]
df = preprocess_data(FILE_PATHS)

# 2. Exploratory data analysis and visualization

In [4]:
def visualize_data(df):
    dropdown = alt.binding_select(options=df.start_station_name.unique())
    select = alt.selection_single(fields=['start_station_name'], bind=dropdown, name="start_station_name")
    
    chart1 = alt.Chart(df.groupby(["member_casual", "start_station_name"]).sum().reset_index()).mark_bar().encode(
        color="member_casual", y='Minutes', x='start_station_name:N'
    ).add_selection(select).transform_filter(select).properties(title='Station-wise Minutes by Member and Casual')

    chart2 = alt.Chart(df.groupby(["member_casual", "start_station_name"]).sum().reset_index()).mark_bar().encode(
        color="member_casual", y='Haversine', x='start_station_name:N'
    ).add_selection(select).transform_filter(select).properties(title='Station-wise Distance (km) by Member and Casual')

    return chart1 | chart2


In [5]:
visualize_data(df)

# 3. Data preprocessing

In [6]:
def prepare_data(df):
    # Encoding categorical columns
    label_encoder = LabelEncoder()
    df['member_casual'] = label_encoder.fit_transform(df['member_casual'])
    df['start_station_name'] = label_encoder.fit_transform(df['start_station_name'])
    df['end_station_name'] = label_encoder.fit_transform(df['end_station_name'])

    # Split features and target
    X = df[['start_station_name', 'end_station_name', 'Haversine', 'Minutes']].to_numpy()
    y = df['member_casual'].to_numpy()

    # Scaling input features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
X_train, X_test, y_train, y_test = prepare_data(df)


# 4. Model selection, training, and evaluation

In [8]:
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("F1 Score:", f1_score(y_test, predictions))
    print("ROC AUC Score:", roc_auc_score(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy: 0.665

The accuracy represents the proportion of correct predictions out of the total predictions made. An accuracy of 0.665 indicates that the model correctly classified 66.5% of the instances in the test data.
F1 Score: 0.791

The F1 Score is the harmonic mean of Precision and Recall, providing a balance between these two metrics. An F1 Score of 0.791 for class 1 (positive class) suggests that the model has a reasonable performance in identifying the positive class, but there is still room for improvement.
ROC AUC Score: 0.530

The ROC AUC Score measures the ability of the model to correctly classify the positive and negative classes across various classification thresholds. An ROC AUC Score of 0.530 is only slightly better than random guessing (0.5) and indicates that the model does not perform well at distinguishing between the two classes.

In [9]:
# Logistic Regression

logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
print("Logistic Regression Evaluation")
evaluate_model(logreg, X_test, y_test)


Logistic Regression Evaluation
Accuracy: 0.6651874322346709
F1 Score: 0.7911940110998619
ROC AUC Score: 0.5303534930395548
Confusion Matrix:
 [[  5209  53408]
 [  3102 107062]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.09      0.16     58617
           1       0.67      0.97      0.79    110164

    accuracy                           0.67    168781
   macro avg       0.65      0.53      0.47    168781
weighted avg       0.65      0.67      0.57    168781



In [None]:
# Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print("Random Forest Evaluation")
evaluate_model(rf, X_test, y_test)

In [None]:
# XGBoost Classifier
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)
print("XGBoost Evaluation")
evaluate_model(xgb, X_test, y_test)


# 5. Hyperparameter tuning and cross-validation

In [None]:
# GridSearchCV for RandomForestClassifier
param_grid = {
    'n_estimators': [10, 50, 100], # can do more but memory limits it
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid, cv=5)
grid_search_rf.fit(X_train, y_train)
print("Best Parameters for RandomForestClassifier: ", grid_search_rf.best_params_)

In [None]:
# Updating the Random Forest model
best_rf = grid_search_rf.best_estimator_
print("Random Forest Evaluation (After GridSearchCV)")
evaluate_model(best_rf, X_test, y_test)


# 6. Model Comparison and final model selection

In [None]:
print("Logistic Regression Cross-Validated Score:", np.mean(cross_val_score(logreg, X_train, y_train, cv=5)))

print("Random Forest Cross-Validated Score:", np.mean(cross_val_score(best_rf, X_train, y_train, cv=5)))

print("XGBoost Cross-Validated Score:", np.mean(cross_val_score(xgb, X_train, y_train, cv=5)))

# Based on evaluation metrics and cross-validated score, choose the best model and finalize it for deployment.
best_model = best_rf