In [15]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import warnings
# NOTE(review): with no category/module arguments this silences every warning
# for the rest of the session, including deprecation notices raised by the
# libraries above — consider narrowing the filter to the specific warning.
warnings.filterwarnings("ignore")

In [2]:
# Load the three CSV exports used in this notebook.
# Only the first 500 city-bike rows are kept: the data was capped at 500 to
# reduce the number of API requests, and this notebook works on that subset.
df_citybikes = pd.read_csv('Paris_data.csv').head(500)
api_df = pd.read_csv('Yelp_data.csv')
four_sq_df = pd.read_csv('four_sq_data.csv')

In [4]:
# Join the bike-station rows to the Yelp rows that share the same "lat&lon"
# key (pandas' default inner join), then summarise the merged columns.
Paris_merged_df = df_citybikes.merge(api_df, on="lat&lon")
Paris_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name_of_station   820 non-null    object 
 1   empty_slots       820 non-null    int64  
 2   free_bikes        820 non-null    int64  
 3   latitude          820 non-null    float64
 4   longitude         820 non-null    float64
 5   timestamp         820 non-null    object 
 6   lat&lon           820 non-null    object 
 7   name              820 non-null    object 
 8   rating            820 non-null    float64
 9   reviews           820 non-null    int64  
 10  price             728 non-null    float64
 11  address           813 non-null    object 
 12  distance_on_yelp  820 non-null    float64
dtypes: float64(5), int64(3), object(5)
memory usage: 83.4+ KB


In [27]:
# Keep only the rows that have a non-null price.
# Rebinding the name (rather than inplace=True) keeps the cell safe to re-run,
# and passing the subset as a list also works on pandas versions that do not
# accept a bare column label for `subset`.
Paris_merged_df = Paris_merged_df.dropna(subset=['price'])
Paris_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 728 entries, 0 to 819
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name_of_station   728 non-null    object 
 1   empty_slots       728 non-null    int64  
 2   free_bikes        728 non-null    int64  
 3   latitude          728 non-null    float64
 4   longitude         728 non-null    float64
 5   timestamp         728 non-null    object 
 6   lat&lon           728 non-null    object 
 7   name              728 non-null    object 
 8   rating            728 non-null    float64
 9   reviews           728 non-null    int64  
 10  price             728 non-null    float64
 11  address           723 non-null    object 
 12  distance_on_yelp  728 non-null    float64
dtypes: float64(5), int64(3), object(5)
memory usage: 79.6+ KB


In [35]:
# Boolean mask: True where the recorded Yelp distance is at most 1000.
filtered_group_df = Paris_merged_df['distance_on_yelp'].le(1000)
# Keep only the rows selected by the mask and show the resulting shape.
new_group_df = Paris_merged_df.loc[filtered_group_df]
new_group_df.shape

(604, 13)

In [36]:
# One summary row per business name, ordered by total free bikes (largest first).
# `name_of_station` becomes the number of station rows matched to that business,
# i.e. the count of stations for a business in that set of coordinates (lat&lon).
grouping_df = (
    new_group_df
    .groupby('name')
    .agg(
        name_of_station=('name_of_station', 'count'),
        free_bikes=('free_bikes', 'sum'),
        distance_on_yelp=('distance_on_yelp', 'max'),
        price=('price', 'mean'),
        rating=('rating', 'mean'),
    )
    .sort_values(by='free_bikes', ascending=False)
)
grouping_df

Unnamed: 0_level_0,name_of_station,free_bikes,distance_on_yelp,price,rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
John Viande,8,48,941.0,2.0,4.500000
L'Orange Bleue,4,43,997.0,2.0,4.000000
Schwartz's Deli,3,37,931.0,2.0,3.666667
L'Elfe Assis,2,31,514.0,2.0,4.500000
La Maison Bleue,7,31,595.0,3.0,4.500000
...,...,...,...,...,...
Au Bon Coin,1,0,236.0,2.0,4.000000
Le Bistrot des Soupirs,1,0,218.0,2.0,4.000000
Café Martin,1,0,255.0,2.0,4.000000
Shingané,1,0,266.0,2.0,4.000000


#### We have already established in the joining-data notebook that the correlations are weak at best, but we are still going to go ahead and build a model. There are definitely model assumptions that are being ignored.

In [37]:
# Pairwise Pearson correlation between the aggregated per-business columns.
grouping_df.corr(method='pearson')

Unnamed: 0,name_of_station,free_bikes,distance_on_yelp,price,rating
name_of_station,1.0,0.492873,0.373568,-0.01539,0.121266
free_bikes,0.492873,1.0,0.221025,-0.059305,0.096248
distance_on_yelp,0.373568,0.221025,1.0,0.09263,0.080099
price,-0.01539,-0.059305,0.09263,1.0,0.117011
rating,0.121266,0.096248,0.080099,0.117011,1.0


Build a regression model.

#### Backward Selection to see if the number of stations is related to any of the other columns

In [43]:
# Full model: the response is the per-business station count and the
# predictors are every other aggregated column, plus an intercept term.
y = grouping_df['name_of_station']
X = sm.add_constant(grouping_df.drop(columns='name_of_station'))

# Fit ordinary least squares and print the regression table.
model = sm.OLS(y, X)
results1 = model.fit()
print(results1.summary())



                            OLS Regression Results                            
Dep. Variable:        name_of_station   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.314
Method:                 Least Squares   F-statistic:                     50.68
Date:                Sun, 10 Dec 2023   Prob (F-statistic):           5.99e-35
Time:                        22:03:49   Log-Likelihood:                -422.41
No. Observations:                 435   AIC:                             854.8
Df Residuals:                     430   BIC:                             875.2
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.2859      0.253  

#### We see that the p-value for price is above the threshold of 0.05, so we are going to drop that column and see how the model changes.

In [44]:
# Second pass: same response, but `price` is removed from the predictors.
y = grouping_df['name_of_station']
X = sm.add_constant(grouping_df.drop(columns=['name_of_station', 'price']))

# Refit ordinary least squares and print the regression table.
model = sm.OLS(y, X)
results2 = model.fit()
print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:        name_of_station   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.315
Method:                 Least Squares   F-statistic:                     67.58
Date:                Sun, 10 Dec 2023   Prob (F-statistic):           7.85e-36
Time:                        22:03:58   Log-Likelihood:                -422.57
No. Observations:                 435   AIC:                             853.1
Df Residuals:                     431   BIC:                             869.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.2386      0.239  

#### We see that the R-squared value has not changed and there is only a slight change in the adjusted R-squared, so we can move forward with dropping the rating column.

In [45]:
# Third pass: same response, with both `price` and `rating` removed, leaving
# the remaining aggregated columns and an intercept as predictors.
y = grouping_df['name_of_station']
X = sm.add_constant(grouping_df.drop(columns=['name_of_station', 'price', 'rating']))

# Refit ordinary least squares and print the regression table.
model = sm.OLS(y, X)
results3 = model.fit()
print(results3.summary())

                            OLS Regression Results                            
Dep. Variable:        name_of_station   R-squared:                       0.317
Model:                            OLS   Adj. R-squared:                  0.313
Method:                 Least Squares   F-statistic:                     100.0
Date:                Sun, 10 Dec 2023   Prob (F-statistic):           1.98e-36
Time:                        22:04:08   Log-Likelihood:                -423.64
No. Observations:                 435   AIC:                             853.3
Df Residuals:                     432   BIC:                             865.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                0.5722      0.069  

Provide model output and an interpretation of the results. 

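#### I will go with `results3` created above. Because the dependent variable is the count of stations matched to a business, the coefficients read as follows: each additional free bike is associated with an increase of about 0.03 in the station count, and each additional unit of `distance_on_yelp` is associated with an increase of about 0.0008 in the station count, holding the other predictor constant. With an R-squared of 0.317, the model explains only about a third of the variation in the station count.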

# Stretch

How can you turn the regression model into a classification model?