### Section 1: Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Section 2: Loading Data

In [3]:
# Read the previously cleaned dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
final_df_clean = pd.read_csv(filepath_or_buffer='cleaned_bike_stations_pois.csv')

### Section 3: Building a Regression Model

#### 3.1 Feature Selection and Encoding

In [4]:
# Column the regression will predict
target = 'number_of_bikes'

# Model inputs: categorical columns are one-hot encoded, numeric columns are used as-is.
# Keeping the names in one place lets the encoding call and the prefix matching below
# stay in sync.
categorical_cols = ['poi_price', 'poi_category', 'source']
numeric_cols = ['poi_rating', 'poi_latitude', 'poi_longitude']

# Fail early, naming the offending columns, if the loaded data lacks an expected column
missing_cols = [
    col for col in categorical_cols + numeric_cols + [target]
    if col not in final_df_clean.columns
]
if missing_cols:
    raise ValueError(f"Missing expected columns: {missing_cols}")

# One-hot encode categorical variables; drop_first omits one level per variable so the
# dummies are not perfectly collinear with the intercept.
# dtype=float keeps the dummies numeric regardless of pandas version: pandas >= 2.0
# produces bool dummies by default, and bool columns are not np.number columns.
final_df_encoded = pd.get_dummies(
    final_df_clean,
    columns=categorical_cols,
    drop_first=True,
    dtype=float,
)

# Select the one-hot encoded columns dynamically by their generated "<column>_" prefix,
# then append the plain numeric features
dummy_prefixes = tuple(f'{col}_' for col in categorical_cols)
dummy_features = [col for col in final_df_encoded.columns if col.startswith(dummy_prefixes)]
all_features = dummy_features + numeric_cols

# Defensive guard: cannot trigger while numeric_cols is non-empty, kept in case the
# feature lists above are edited down to nothing
if not all_features:
    raise ValueError("No features selected")

#### 3.2 Data Preprocessing

In [5]:
# Convert features and target to numeric; values that cannot be parsed become NaN
X = final_df_encoded[all_features].apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(final_df_encoded[target], errors='coerce')

# Keep numeric and boolean columns and store them as floats. Boolean columns (the
# default one-hot dtype in pandas >= 2.0) are not np.number, so they have to be
# included explicitly or the encoded features would be discarded here.
X = X.select_dtypes(include=[np.number, 'bool']).astype(float)

# Drop columns that could not be converted at all: coercion leaves them entirely NaN,
# and keeping them would make the row filter below discard every observation.
X = X.dropna(axis=1, how='all')

# Without any usable feature column there is nothing to regress on
if X.shape[1] == 0:
    raise ValueError("No usable numeric features remain after conversion")

# Drop any rows with missing values in features or target
missing_rows = X.isna().any(axis=1) | y.isna()
X = X.loc[~missing_rows]
y = y.loc[~missing_rows]

# Check for sufficient data: fewer observations than feature columns cannot be fitted
if len(X) < len(X.columns):
    raise ValueError("Not enough data to fit the model")

#### 3.3 Fitting the OLS Regression Model

In [6]:
# Prepend a column of ones so the regression estimates an intercept
X = sm.add_constant(X)

# Fit ordinary least squares with y as the response and X as the design matrix
model = sm.OLS(endog=y, exog=X).fit()

# Show the fitted model's summary table
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        number_of_bikes   R-squared:                       0.380
Model:                            OLS   Adj. R-squared:                  0.161
Method:                 Least Squares   F-statistic:                     1.737
Date:                Mon, 23 Sep 2024   Prob (F-statistic):           5.24e-06
Time:                        18:27:57   Log-Likelihood:                -2055.9
No. Observations:                 607   AIC:                             4430.
Df Residuals:                     448   BIC:                             5131.
Df Model:                         158                                         
Covariance Type:            nonrobust                                         
                                                                    coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------