In [62]:
# set display options to show only last expression (is set to all by default on my system)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [112]:
# import libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

## Prep dataframe for regression

In [99]:
# load data from json
df = pd.read_json('../data/yelp_citybikes_clean.json')

In [117]:
# create new dataframe with only the columns we need
mod_df = df[['review_count', 'rating', 'price', 'venue_to_station_distance', 'total_slots', 'total_bikes_available']]

# create new column with ratio of bikes available to total slots
mod_df['bikes_available_ratio'] = mod_df['total_bikes_available'] / mod_df['total_slots']

# drop total_bikes_available and total_slots columns
mod_df = mod_df.drop(['total_bikes_available', 'total_slots'], axis=1)

# drop rows where price, review_count, or rating is null
mod_df = mod_df.dropna(axis=0, subset=['price', 'review_count', 'rating'])

# convert price to int (was float to support NaN values)
mod_df['price'] = mod_df['price'].astype(int)

# drop rows where bikes_available_ratio is null
mod_df = mod_df.dropna(axis=0, subset=['bikes_available_ratio'])

# drop rows where venue_to_station_distance is null
mod_df = mod_df.dropna(axis=0, subset=['venue_to_station_distance'])

# create dummy variables for price (ordinal categorical variable)
mod_df = pd.get_dummies(mod_df, columns=['price'])

# convert price dummies to int
mod_df[['price_1', 'price_2', 'price_3', 'price_4']] = mod_df[['price_1', 'price_2', 'price_3', 'price_4']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mod_df['bikes_available_ratio'] = mod_df['total_bikes_available'] / mod_df['total_slots']


- bike availability ratio (bikes available / total slots) is being used in place of total bikes available, as it normalizes the data for stations with different numbers of slots, presumably due to: space constraints at the station location, density of stations in the area, and other factors

In [118]:
mod_df.head()

Unnamed: 0,review_count,rating,venue_to_station_distance,bikes_available_ratio,price_1,price_2,price_3,price_4
0,499,4.0,108.898789,0.3125,0,0,1,0
1,199,4.5,106.315327,0.3125,0,0,0,1
2,321,4.0,116.982898,0.3125,0,0,1,0
3,205,4.0,157.66564,0.3125,0,0,0,1
4,218,4.0,158.535072,0.3125,0,1,0,0


In [119]:
mod_df.describe()



Unnamed: 0,review_count,rating,venue_to_station_distance,bikes_available_ratio,price_1,price_2,price_3,price_4
count,8169.0,8169.0,8169.0,8169.0,8169.0,8169.0,8169.0,8169.0
mean,222.231852,3.921349,590.658114,0.477867,0.124862,0.706329,0.127678,0.041131
std,159.894786,0.451317,321.286249,0.255078,0.330583,0.455471,0.333751,0.198606
min,2.0,1.5,6.605632,0.0,0.0,0.0,0.0,0.0
25%,96.0,3.5,321.300341,0.277778,0.0,0.0,0.0,0.0
50%,174.0,4.0,567.688388,0.461538,0.0,1.0,0.0,0.0
75%,301.0,4.0,855.473501,0.714286,0.0,1.0,0.0,0.0
max,684.0,5.0,1411.407068,1.0,1.0,1.0,1.0,1.0


- from the summary statistics, can see that price_2 category makes up 70% of venues
    - this will be set as the reference category for the price dummy variables during regression

In [123]:
# check for null values
mod_df.isnull().sum()

review_count                 0
rating                       0
venue_to_station_distance    0
bikes_available_ratio        0
price_1                      0
price_2                      0
price_3                      0
price_4                      0
dtype: int64

In [None]:
# check that all columns are numeric
mod_df.dtypes

In [124]:
mod_df.corr()

Unnamed: 0,review_count,rating,venue_to_station_distance,bikes_available_ratio,price_1,price_2,price_3,price_4
review_count,1.0,0.100809,0.101001,0.082109,-0.213237,-0.023814,0.197123,0.07829
rating,0.100809,1.0,0.156954,0.115083,-0.035511,-0.076046,0.074804,0.107804
venue_to_station_distance,0.101001,0.156954,1.0,-0.01242,-0.042727,-0.008288,0.038171,0.025982
bikes_available_ratio,0.082109,0.115083,-0.01242,1.0,-0.054825,-0.011994,0.044251,0.044403
price_1,-0.213237,-0.035511,-0.042727,-0.054825,1.0,-0.585801,-0.14451,-0.078232
price_2,-0.023814,-0.076046,-0.008288,-0.011994,-0.585801,1.0,-0.593324,-0.321202
price_3,0.197123,0.074804,0.038171,0.044251,-0.14451,-0.593324,1.0,-0.079236
price_4,0.07829,0.107804,0.025982,0.044403,-0.078232,-0.321202,-0.079236,1.0


In [126]:
# count values for each price level, set the most common to the reference level for the regression
mod_df[].sum() # price_2 is the most common (5770/) so we'll use that as the reference level

review_count                 1.815412e+06
rating                       3.203350e+04
venue_to_station_distance    4.825086e+06
bikes_available_ratio        3.903697e+03
price_1                      1.020000e+03
price_2                      5.770000e+03
price_3                      1.043000e+03
price_4                      3.360000e+02
dtype: float64

## Multivariate Regression

In [127]:
# build a regression model to predict available bikes ratio

# dependent variable
y = mod_df['bikes_available_ratio']
# independent variables - omit price_2 to avoid multicollinearity
X = mod_df[['rating', 'review_count', 'price_1', 'price_3', 'price_4', 'venue_to_station_distance']]

X = sm.add_constant(X) # adding a constant
lin_reg = sm.OLS(y, X)

model = lin_reg.fit()
print_model = model.summary()
print(print_model)


                              OLS Regression Results                             
Dep. Variable:     bikes_available_ratio   R-squared:                       0.022
Model:                               OLS   Adj. R-squared:                  0.021
Method:                    Least Squares   F-statistic:                     30.84
Date:                   Mon, 11 Dec 2023   Prob (F-statistic):           7.66e-37
Time:                           21:37:10   Log-Likelihood:                -338.85
No. Observations:                   8169   AIC:                             691.7
Df Residuals:                       8162   BIC:                             740.8
Df Model:                              6                                         
Covariance Type:               nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------

#### Disccusion of Output

- Dependent Variable: The dependent variable for the model is bikes_available_ratio.

 - R-squared: The R-squared value is 0.022, which means that approximately 2.2% of the variance in the bikes_available_ratio can be explained by the independent variables included in the model. This is a very low value, suggesting that the model does not explain much of the variability in the dependent variable.

- F-statistic: The F-statistic is 30.84 with a very low probability (p-value) of 7.66e-37, suggesting that the overall model is statistically significant.

- all independent variables have p-values below threshhold, therefore are statistically significant
##### Coefficients:

- The const coefficient indicates that the average bikes_available_ratio, when all other variables are zero, is 19.12%.
- rating has a positive effect on the bikes_available_ratio with a coefficient of 0.0616
- review_count shows a very small positive effect, higher review counts correlates with more bikes available
- price categories must be interpreted in relation to the reference category: price_2
    - price_1 has a negative coefficient, implying that a proximity to lower priced venues correlates with less bikes available
    - price_3, is positive, having a coefficient of 0.0174, immplying that proximity to higher priced venues correlates with more bikes available
    - price_4 continues the trend, its coeffient is double that of price_3, meaning that the highest priced venues are most strongly correlated with more bike availability
- venue_to_station_distance has a negative effect on the bikes_available_ratio, meaning that the higher its distance to nearby venues, the less bikes will be available at the station

##### Interpretation
Its possible to interpret bike availability in 2 contradicting ways:

1. A higher availability ratio is due to that station being LESS popular, implying that the nearby venues are LESS desirable.
2. A higher availability ratio is due to that station is MORE popular: customers frequently ride their bikes TO the area and dropping them off at that station, implying the nearby venues are MORE desirable.

- It becomes difficult to draw meaningful conclusions without more supporting data that would help determine which of these two scenarios is most likely.

- Additionally, one has to consider that the station data was a snapshot, representing the state of the bike stations at the time of the API call. The station data for this project was taken on 2023-12-09 @ 13:39:21. (2PM on a Saturday during winter). The inferences drawn from the bike station data must be considered in the context of day vs weekday, season, local weather, and time of day. These will all have a massive influence on whether the bike availability is representative of use patterns for patrons of nearby venues.

Provide model output and an interpretation of the results. 

## Stretch - Multinomial Logistic Regression

How can you turn the regression model into a classification model?

To create a classification model from the data, the availability ratio can be transformed into classes (low, med, high):

In [None]:
# define bins for availability ratio
bins = [0, 0.33, 0.66, 1]
# Define category labels
labels = ['low', 'medium', 'high']
# Create a categorical variable
mod_df['availability_class'] = pd.cut(mod_df['bikes_available_ratio'], bins=bins, labels=labels, include_lowest=True)

In [129]:
# investigate distribution of availability classes to determine reference level
mod_df['availability_class'].value_counts()

availability_class
medium    3086
low       2705
high      2378
Name: count, dtype: int64

- medium availability (33-66% of slots have bikes) is most common in the data, set this as reference level

In [130]:
# reorder categories so that 'medium' is baseline for regression
mod_df['availability_class'] = mod_df['availability_class'].cat.reorder_categories(['medium', 'low', 'high'])

In [131]:



# omit price_2 to avoid multicollinearity
X = mod_df[['rating', 'review_count', 'price_1', 'price_3', 'price_4', 'venue_to_station_distance']]
y = mod_df['availability_class']

X = sm.add_constant(X)  # Add a constant to the model

mn_model = sm.MNLogit(y, X)
mn_result = mn_model.fit()
print(mn_result.summary())



Optimization terminated successfully.
         Current function value: 1.075808
         Iterations 5
                          MNLogit Regression Results                          
Dep. Variable:     availability_class   No. Observations:                 8169
Model:                        MNLogit   Df Residuals:                     8155
Method:                           MLE   Df Model:                           12
Date:                Mon, 11 Dec 2023   Pseudo R-squ.:                 0.01570
Time:                        22:07:00   Log-Likelihood:                -8788.3
converged:                       True   LL-Null:                       -8928.5
Covariance Type:            nonrobust   LLR p-value:                 6.077e-53
   availability_class=low       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                         1.5584      0.233      6.685      0.000       1.

#### Discussion of Output

##### 'Low' availability class (0-33% of station slots have bikes in them)

- const: The intercept for "low" is 1.5584. When all other variables are at zero, the log odds of being in the "low" category (as opposed to the "med" category) are 1.5584.

- rating: The coefficient is -0.4581. For each one-unit increase in rating, the log odds of being in the "low" availability class (vs. "med") decrease by 0.4581.

- review_count: The coefficient is -0.0013, indicating that as the review count increases by one, the log odds of being in the "low" availability class (vs. "med") decrease by 0.0013.

- price_1: The coefficient is 0.1858, suggesting that the log odds of being in the "low" availability class (vs. "med") increase by 0.1858 when the price category is price_1.

- price_3: The coefficient is -0.1651, which is marginally not significant (p = 0.053). It suggests a slight decrease in the log odds of being in the "low" availability class compared to "med" for price_3.

- price_4: The coefficient is -0.4817. This indicates that the log odds of being in the "low" availability class (vs. "med") decrease by 0.4817 when the price category is price_4.

- venue_to_station_distance: The coefficient is 0.0006, meaning for each one-unit increase in the distance to the station, the log odds of being in the "low" availability class (vs. "med") increase by 0.0006.

##### 'High' availability class (66-100% of station slots have bikes in them)
- const: The intercept for "high" is -1.0464, indicating that when all other variables are held at zero, the log odds of being in the "high" category (as opposed to "med") are -1.0464.

- rating: The coefficient is 0.1573. For each one-unit increase in rating, the log odds of being in the "high" availability class (vs. "med") increase by 0.1573.

- review_count: The coefficient is -0.0001, which is not significant (p = 0.493), suggesting that review count doesn't have a meaningful distinction between "high" and "med" availability classes.

- price_1: The coefficient is -0.0450, but it's not statistically significant (p = 0.617), suggesting that price_1 doesn't provide a meaningful difference in the odds between the "high" and "med" availability classes.

- price_3: The coefficient is 0.0591, which is not significant (p = 0.467), indicating that there's no clear distinction in the odds of being in the "high" versus "med" availability classes for price_3.

- price_4: The coefficient is 0.0207 and not statistically significant (p = 0.872), suggesting that price_4 does not significantly affect the odds of being in the "high" availability class compared to "med".

- venue_to_station_distance: The coefficient is 0.0003, indicating that for each one-unit increase in the distance to the station, the log odds of being in the "high" availability class (vs. "med") increase slightly by 0.0003.

##### Interpretation
- The Pseudo R-squared value is 0.01570, which is quite low, suggesting that the model has limited explanatory power.
- The LLR p-value is highly significant, indicating that the model as a whole fits significantly better than an empty model (one with no predictors).
- p-values for the high availability class for price and review count indicate these variables are not statistically significant predictors for distinguishing between 'high' and 'low' bike availability
- all p-values for low availability class are significant, indicating that the predictor variables have a meaningful (if limited) influence over whether a station has low availability