Build a regression model.

In [9]:
import os
import requests
import numpy as np
from math import radians, sin, cos, sqrt, atan2
import pandas as pd
import statsmodels.api as sm


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 6371 km, converted to metres.
    """
    R = 6371.0  # sphere radius in kilometres

    # Conversion degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    # Rounding can leave `a` a hair outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # Conversion from km to m
    distance = R * c * 1000
    return distance


# Fetch the current station list for the "velib" network from the CityBikes API.
url = "http://api.citybik.es/v2/networks/velib"
# A timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(url, timeout=10)

bike_columns = ['latitude', 'longitude', 'free_bikes']

if response.status_code == 200:
    data = response.json()
    stations = data['network']['stations']

    # Keep only the fields used later: position and number of free bikes.
    station_details = [
        {
            'latitude': station['latitude'],
            'longitude': station['longitude'],
            'free_bikes': station['free_bikes']
        }
        for station in stations
    ]

    df_bikes = pd.DataFrame(station_details, columns=bike_columns)
else:
    print(f"Error: {response.status_code}")
    # Still define df_bikes so later cells fail on "no data" rather than
    # on a NameError that hides the real cause.
    df_bikes = pd.DataFrame(columns=bike_columns)

# The Yelp API key is read from the environment so it never lands in the notebook.
yelp_api_key = os.getenv('YELP_API_KEY')
lat, lon = 48.879359419425, 2.3665961623192
yelp_url = "https://api.yelp.com/v3/businesses/search"
# Let requests build and URL-encode the query string.
yelp_params = {'latitude': lat, 'longitude': lon, 'radius': 1000}

headers = {
    'Authorization': f'Bearer {yelp_api_key}',
    'Accept': 'application/json'
}

# A timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(yelp_url, headers=headers, params=yelp_params, timeout=10)

place_columns = ['Name', 'Address', 'City', 'Country', 'Rating', 'latitude', 'longitude']
places_list = []

if response.status_code == 200:
    data = response.json()

    for place in data.get('businesses', []):
        name = place.get('name', 'No name')
        # `or {}` also covers keys that are present but null in the JSON.
        location = place.get('location') or {}
        # Skip empty/null address parts so they do not show up as "None".
        address_parts = [location.get(key) for key in ('address1', 'address2', 'address3')]
        address = ", ".join(str(part) for part in address_parts if part) or 'No address provided'
        city = location.get('city', 'No city provided')
        country = location.get('country', 'No country provided')
        rating = place.get('rating', 'No rating available')
        coordinates = place.get('coordinates') or {}
        # Separate names so the query coordinates in lat/lon are not overwritten.
        place_lat = coordinates.get('latitude')
        place_lon = coordinates.get('longitude')

        if place_lat is not None and place_lon is not None:
            places_list.append({
                'Name': name,
                'Address': address,
                'City': city,
                'Country': country,
                'Rating': rating,
                'latitude': place_lat,
                'longitude': place_lon
            })
        else:
            print(f"Missing latitude/longitude for place: {name}")
else:
    print(f"Error: {response.status_code}")

# Always define df_places with the expected columns, even when the request
# failed or returned no businesses, so later cells can select them safely.
df_places = pd.DataFrame(places_list, columns=place_columns)


# Drop rows with missing latitude or longitude
df_bikes = df_bikes.dropna(subset=['latitude', 'longitude'])
df_places = df_places.dropna(subset=['latitude', 'longitude'])

# Maximum distance threshold = 100 meters
max_distance = 100

# Convert each frame to plain records once; indexing with .iloc inside the
# nested loop builds a new Series per access and upcasts mixed-type rows.
bike_records = df_bikes.to_dict('records')
place_records = df_places.to_dict('records')

merged_columns = ['latitude', 'longitude', 'free_bikes', 'Name', 'Address',
                  'City', 'Country', 'Rating', 'Distance (m)']
merged_data = []

# Pair every bike station with every place and keep the pairs that are close.
for bike in bike_records:
    for place in place_records:
        # Distance between the bike station and the place
        distance = haversine(bike['latitude'], bike['longitude'],
                             place['latitude'], place['longitude'])

        # Check for distance is <= 100m
        if distance <= max_distance:
            merged_data.append({
                'latitude': bike['latitude'],
                'longitude': bike['longitude'],
                'free_bikes': bike['free_bikes'],
                'Name': place['Name'],
                'Address': place['Address'],
                'City': place['City'],
                'Country': place['Country'],
                'Rating': place['Rating'],
                'Distance (m)': distance
            })

# Explicit columns keep the frame's schema stable even when no pair matched.
df_merged = pd.DataFrame(merged_data, columns=merged_columns)


# Coerce the model columns to numbers and drop incomplete rows: a missing or
# non-numeric rating would otherwise break the fit.
regression_data = (
    df_merged[['Distance (m)', 'Rating', 'free_bikes']]
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)

# Pass pandas objects (not bare arrays) so the summary labels the
# coefficients with the column names instead of x1/x2.
X = sm.add_constant(regression_data[['Distance (m)', 'Rating']])
y = regression_data['free_bikes']

# Ordinary least squares: free_bikes ~ const + Distance (m) + Rating
model = sm.OLS(y, X).fit()

print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.069
Model:                            OLS   Adj. R-squared:                 -0.138
Method:                 Least Squares   F-statistic:                    0.3338
Date:                Mon, 18 Nov 2024   Prob (F-statistic):              0.725
Time:                        18:02:53   Log-Likelihood:                -40.435
No. Observations:                  12   AIC:                             86.87
Df Residuals:                       9   BIC:                             88.33
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -9.9931     21.340     -0.468      0.6

  res = hypotest_fun_out(*samples, **kwds)


Provide model output and an interpretation of the results. 

The low R-squared value (6.9%) suggests that the model explains very little of the variability in the number of free bikes. Neither distance nor rating is a statistically significant predictor of the number of free bikes, as indicated by their high p-values. The negative adjusted R-squared (-0.138) and the overall F-test (p = 0.725) point the same way, and with only 12 observations the estimates are very imprecise. More data is needed to help improve the model's performance and significance.

# Stretch

How can you turn the regression model into a classification model?

In [None]:
# Categorize bikes to Low, Medium, High
# With right=False the bins are [0, 5), [5, 15) and [15, inf).
bins = [0, 5, 15, float('inf')]
labels = ['Low', 'Medium', 'High']
# pd.cut already returns a categorical column; values that are missing or
# fall outside the bins become NaN.
df_merged['free_bikes_category'] = pd.cut(
    pd.to_numeric(df_merged['free_bikes'], errors='coerce'),
    bins=bins, labels=labels, right=False
)

# Build the modelling frame: numeric predictors plus the class label, without
# incomplete rows. A NaN category would otherwise get code -1 and be fitted
# as if it were a real class.
features = df_merged[['Distance (m)', 'Rating']].apply(pd.to_numeric, errors='coerce')
classification_data = features.assign(
    free_bikes_category=df_merged['free_bikes_category']
).dropna()

X = sm.add_constant(classification_data[['Distance (m)', 'Rating']])
# Drop categories with no observations so the integer codes cover only the
# classes that actually occur.
y = (
    classification_data['free_bikes_category']
    .cat.remove_unused_categories()
    .cat.codes
    .rename('free_bikes_category')
)

if y.nunique() < 2:
    raise ValueError("Need at least two free_bikes categories to fit a classification model.")

# Multinomial logit; with very few observations the optimiser may still fail
# to converge, so check the "converged" field in the summary.
model = sm.MNLogit(y, X).fit()

print(model.summary())



   free_bikes free_bikes_category
0        14.0              Medium
1        26.0                High
2         9.0              Medium
3        11.0              Medium
4         6.0              Medium
         Current function value: nan
         Iterations: 35
                          MNLogit Regression Results                          
Dep. Variable:                      y   No. Observations:                   12
Model:                        MNLogit   Df Residuals:                        6
Method:                           MLE   Df Model:                            4
Date:                Mon, 18 Nov 2024   Pseudo R-squ.:                     nan
Time:                        18:05:55   Log-Likelihood:                    nan
converged:                      False   LL-Null:                       -11.021
Covariance Type:            nonrobust   LLR p-value:                       nan
       y=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------

  logprob = np.log(self.cdf(np.dot(self.exog,params)))
  return np.sum(d * logprob)
  bse = np.sqrt(np.diag(self.cov_params()))
