In [8]:
# import numpy
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
from pathlib import Path
# import linear_model and datasets from sklearn
from sklearn import linear_model, datasets


In [9]:
merge_data = pd.read_csv('merge_data.csv')
merge_data.head(5)

Unnamed: 0,review_count,rating,bike_station,free_bikes,empty_slots,latitude,longitude,distance
0,6.0,5.0,0,7,39,41.96909,-87.674237,64
1,6.0,5.0,0,7,39,41.96909,-87.674237,42
2,6.0,5.0,0,7,39,41.96909,-87.674237,172
3,6.0,5.0,0,7,39,41.96909,-87.674237,50
4,6.0,5.0,0,7,39,41.96909,-87.674237,47


In [10]:
merge_data.describe()

Unnamed: 0,review_count,rating,bike_station,free_bikes,empty_slots,latitude,longitude,distance
count,108040.0,108040.0,108040.0,108040.0,108040.0,108040.0,108040.0,108040.0
mean,112.556831,4.243613,292.118937,6.043132,9.433728,41.879777,-87.653774,116.6
std,297.096537,0.848937,169.374708,4.576836,7.862702,0.078254,0.040165,64.9221
min,0.0,0.0,0.0,0.0,0.0,41.648501,-87.774704,42.0
25%,11.0,4.0,146.0,2.0,5.0,41.834734,-87.677856,50.0
50%,38.0,4.5,292.0,5.0,8.0,41.886875,-87.652855,110.5
75%,116.0,4.5,435.25,9.0,12.0,41.931248,-87.629544,162.0
max,9792.0,5.0,600.0,25.0,51.0,42.064854,-87.528232,249.0


In [11]:
y = merge_data['free_bikes']
indep = merge_data.drop('free_bikes', axis=1)
X = [sm.add_constant(indep[column]) for column in indep.columns] 
X

[        const  review_count
 0         1.0           6.0
 1         1.0           6.0
 2         1.0           6.0
 3         1.0           6.0
 4         1.0           6.0
 ...       ...           ...
 108035    1.0          27.0
 108036    1.0          27.0
 108037    1.0          27.0
 108038    1.0          27.0
 108039    1.0          27.0
 
 [108040 rows x 2 columns],
         const  rating
 0         1.0     5.0
 1         1.0     5.0
 2         1.0     5.0
 3         1.0     5.0
 4         1.0     5.0
 ...       ...     ...
 108035    1.0     4.5
 108036    1.0     4.5
 108037    1.0     4.5
 108038    1.0     4.5
 108039    1.0     4.5
 
 [108040 rows x 2 columns],
         const  bike_station
 0         1.0             0
 1         1.0             0
 2         1.0             0
 3         1.0             0
 4         1.0             0
 ...       ...           ...
 108035    1.0           600
 108036    1.0           600
 108037    1.0           600
 108038    1.0           6

In [12]:
Models = [sm.OLS(y,x) for x in X] #list of models
Results = [model.fit() for model in Models] #list of results
Adj_Rsquared = [results.rsquared_adj for results in Results] #list of rsquared
Pval = [results.pvalues for results in Results] #list of p-values
Params = [results.params for results in Results] #list of params

In [13]:
for i in range(len(Adj_Rsquared)):
     print(f'adj_R2: {Adj_Rsquared[i]:.3f}, P-values: {*Pval[i],}, column: {indep.columns[i]}')

adj_R2: 0.000, P-values: (0.0, 0.0005452682179099855), column: review_count
adj_R2: 0.004, P-values: (0.0, 3.853907691545418e-96), column: rating
adj_R2: 0.002, P-values: (0.0, 1.9480893692146179e-50), column: bike_station
adj_R2: 0.310, P-values: (0.0, 0.0), column: empty_slots
adj_R2: 0.006, P-values: (1.0250750530468502e-137, 1.172156385090497e-146), column: latitude
adj_R2: 0.000, P-values: (0.054348568283230016, 0.08449435348667043), column: longitude
adj_R2: -0.000, P-values: (0.0, 0.9999999999981914), column: distance


In [14]:
#run full model
y = merge_data['free_bikes']
X = merge_data.drop('free_bikes', axis=1)
X = sm.add_constant(X) #adds a column of 1's so the model will contain an intercept

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.365
Model:                            OLS   Adj. R-squared:                  0.365
Method:                 Least Squares   F-statistic:                     8884.
Date:                Sat, 03 Jun 2023   Prob (F-statistic):               0.00
Time:                        19:46:37   Log-Likelihood:            -2.9307e+05
No. Observations:              108040   AIC:                         5.862e+05
Df Residuals:                  108032   BIC:                         5.862e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         1677.5734     26.374     63.608   