Build a regression model.

In [None]:
# Import required Libraries
import sqlite3
import pandas as pd
import statsmodels.api as sm

In [5]:
# Connect to the SQLite database, load the tables, then release the handle.
connection = sqlite3.connect('../data/bikestation.sqlite')
try:
    # Load tables into DataFrames. Each read_sql call (no chunksize) pulls the
    # whole result set into memory, so the frames do not need the connection
    # to remain open afterwards.
    stations_df = pd.read_sql('SELECT * FROM stations', connection)
    pois_df = pd.read_sql('SELECT * FROM pois', connection)
    station_pois_df = pd.read_sql('SELECT * FROM station_pois', connection)
finally:
    # Close explicitly, even if a query fails. `with connection:` would only
    # manage the transaction; it does not close a sqlite3 connection.
    connection.close()

In [None]:
# Join every station-POI link to its POI record (key: poi_id) so the POI
# attributes, including 'rating', are available on each link row.
merged_df = pd.merge(station_pois_df, pois_df, on='poi_id')

# Collapse to one row per station using named aggregation:
#   num_pois       -> count of non-null poi_id values linked to the station
#   avg_poi_rating -> mean of the linked POIs' 'rating' values
poi_features = (
    merged_df
    .groupby('station_id')
    .agg(
        num_pois=('poi_id', 'count'),
        avg_poi_rating=('rating', 'mean'),
    )
    .reset_index()
)

# Left-join onto the station table so every row of stations_df is kept, then
# replace the NaN left in either feature column (e.g. stations with no
# matching POIs) with 0.
# NOTE(review): 0 as a stand-in for a missing average rating is presumably
# outside the real rating scale -- confirm this imputation is intended.
# NOTE(review): the saved preview output shows the same num_pois and
# avg_poi_rating on every displayed row -- verify that station_pois is not
# linking every station to every POI.
model_df = (
    stations_df
    .merge(poi_features, on='station_id', how='left')
    .fillna({'num_pois': 0, 'avg_poi_rating': 0})
)

# Preview the response column next to the two engineered features
preview_cols = ['bikes_available', 'num_pois', 'avg_poi_rating']
print(model_df[preview_cols].head())

    bikes_available  num_pois  avg_poi_rating
0                 9     500.0          4.3084
1                 5     500.0          4.3084
2                 6     500.0          4.3084
3                 4     500.0          4.3084
4                 0     500.0          4.3084
5                17     500.0          4.3084
6                 9     500.0          4.3084
7                 0     500.0          4.3084
8                 0     500.0          4.3084
9                 4     500.0          4.3084
10                3     500.0          4.3084
11                4     500.0          4.3084
12                2     500.0          4.3084
13                5     500.0          4.3084
14                1     500.0          4.3084
15                2     500.0          4.3084
16               19     500.0          4.3084
17                1     500.0          4.3084
18                3     500.0          4.3084
19                1     500.0          4.3084
20                5     500.0     

Provide model output and an interpretation of the results. 

In [9]:
# Response variable: bikes available, taken from model_df
y = model_df['bikes_available']

# Design matrix: the two POI features plus an intercept column.
# NOTE(review): add_constant's default has_constant='skip' leaves the frame
# unchanged if a column is already constant -- check that 'const' appears in
# the summary.
X = sm.add_constant(model_df[['num_pois', 'avg_poi_rating']])

# Fit ordinary least squares of y on X
model = sm.OLS(endog=y, exog=X).fit()

# Show the full regression results table
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        bikes_available   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     1.767
Date:                Tue, 29 Jul 2025   Prob (F-statistic):              0.171
Time:                        18:54:07   Log-Likelihood:            -1.4772e+06
No. Observations:              444649   AIC:                         2.954e+06
Df Residuals:                  444646   BIC:                         2.954e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              6.4167      0.011    568.

**Interpretation:**
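The model finds no evidence that bike availability is related to the number or average rating of nearby POIs: R-squared is 0.000 and the overall F-test is not significant (F = 1.767, p = 0.171).
The model therefore has essentially no power to predict `bikes_available`.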

**Possible reasons:**
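- The features are poorly chosen.
- There is not enough variance in `num_pois` or `avg_poi_rating`: the preview rows above all show the same values (500 POIs, average rating 4.3084).
- Bike availability may depend on other variables, such as time of day, weather, or user demand, rather than on POIs.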



# Stretch

How can you turn the regression model into a classification model?

Instead of predicting how many bikes are available (regression), we can classify each bike station into an availability category.

### Classification Problem Setup
**Goal: Predict whether a station is:**

Low availability (e.g., < 3 bikes)

Medium availability (e.g., 3–6 bikes)

High availability (e.g., > 6 bikes)