In [1]:
import sys
print(sys.executable)

# Verify statsmodels installation
try:
    import statsmodels.api as sm
    print("Statsmodels imported successfully.")
except ImportError as e:
    print("Error importing statsmodels:", e)


c:\Users\spencer.fargey\AppData\Local\anaconda3\python.exe
Statsmodels imported successfully.


In [12]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load the data
bike_stations_df = pd.read_csv("Bike_stations.csv")
merged_restaurants_df = pd.read_csv("merged_restaurants.csv")

In [11]:
# Filter the data to keep only Yelp data
yelp_df = merged_restaurants_df[merged_restaurants_df['source'] == 'Yelp']

# Drop rows with missing values in the target variable (rating)
yelp_df.dropna(subset=['rating'], inplace=True)

# Merge bike_stations data with Yelp data using 'name' instead of 'station_name'
merged_df = pd.merge(yelp_df, bike_stations_df[['name', 'free_bikes']], left_on='station_name', right_on='name', how='left')

# Handle missing values in features
merged_df['review_count'] = merged_df['review_count'].fillna(0)
merged_df['distance'] = merged_df['distance'].fillna(0)
merged_df['free_bikes'] = merged_df['free_bikes'].fillna(0)

# Define the target variable and features
target = 'rating'
features = ['free_bikes']

# Prepare the feature matrix (X) and target vector (y)
X = merged_df[features]
y = merged_df[target]

# Add a constant to the features (required for statsmodels)
X = sm.add_constant(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the regression model
model = sm.OLS(y_train, X_train).fit()

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = model.rsquared

print(model.summary())
print(f'MAE: {mae}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')


                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     36.44
Date:                Fri, 12 Jul 2024   Prob (F-statistic):           1.72e-09
Time:                        10:12:32   Log-Likelihood:                -1768.5
No. Observations:                4030   AIC:                             3541.
Df Residuals:                    4028   BIC:                             3554.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.0140      0.009    427.547      0.0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yelp_df.dropna(subset=['rating'], inplace=True)


In [None]:
#Stretch
#How can you turn the regression model into a classification model?