In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression
from yellowbrick.regressor import ResidualsPlot
from sklearn.metrics import mean_squared_error
import statsmodels.formula.api as smf

In [2]:
df = pd.read_excel("regression_1_final_data1.xlsx")
col_remain = df.drop(columns = ['Unnamed: 0', 'street_p', 'pickups', 'street_a', 'avail', 'part_of_day', 
                                'cluster_id']).columns

dict1 = {}
count = 0

for col in col_remain:
    dict1[col] = 'poi_'+str(count)
    count += 1

df = df.rename(columns = dict1)

df = pd.get_dummies(columns = ['cluster_id', 'part_of_day'], drop_first = True, data = df)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4185 entries, 0 to 4184
Columns: 126 entries, Unnamed: 0 to part_of_day_morning
dtypes: float64(1), int64(35), uint8(90)
memory usage: 1.5 MB


In [3]:
col_remain = df.drop(columns = ['Unnamed: 0', 'street_p', 'pickups', 'street_a', 'avail']).columns

In [4]:
eq_string = col_remain[0]
for i in range(1, len(col_remain)):
    eq_string = eq_string + "+" + col_remain[i]
    

In [5]:
eq_string

'poi_0+poi_1+poi_2+poi_3+poi_4+poi_5+poi_6+poi_7+poi_8+poi_9+poi_10+poi_11+poi_12+poi_13+poi_14+poi_15+poi_16+poi_17+poi_18+poi_19+poi_20+poi_21+poi_22+poi_23+poi_24+poi_25+poi_26+poi_27+poi_28+poi_29+poi_30+cluster_id_142+cluster_id_218+cluster_id_356+cluster_id_389+cluster_id_418+cluster_id_436+cluster_id_465+cluster_id_515+cluster_id_530+cluster_id_533+cluster_id_555+cluster_id_589+cluster_id_590+cluster_id_742+cluster_id_755+cluster_id_1249+cluster_id_1599+cluster_id_1651+cluster_id_1730+cluster_id_1786+cluster_id_1853+cluster_id_1930+cluster_id_1933+cluster_id_1998+cluster_id_2000+cluster_id_2005+cluster_id_2026+cluster_id_2028+cluster_id_2157+cluster_id_2160+cluster_id_2162+cluster_id_2165+cluster_id_2394+cluster_id_2398+cluster_id_2413+cluster_id_2448+cluster_id_2456+cluster_id_2542+cluster_id_2556+cluster_id_2670+cluster_id_2947+cluster_id_3141+cluster_id_3319+cluster_id_3697+cluster_id_3722+cluster_id_3803+cluster_id_3808+cluster_id_4042+cluster_id_4107+cluster_id_4208+cluster

In [6]:
print(len(df))
df = df[df['avail'] > 1]
print(len(df))

4185
2887


In [7]:
print(len(df))
df = df[df['pickups'] > 1]
print(len(df))

2887
2825


In [8]:
df['avail']

0       1.538976
1       2.783342
2       3.862148
4       2.408545
9       2.551539
          ...   
4178    3.291049
4179    3.126173
4180    1.622353
4182    1.220441
4184    4.819888
Name: avail, Length: 2825, dtype: float64

In [9]:
df['ln_mean_avail'] = np.log(df['avail'].values)
df['ln_pickups'] = np.log(df['pickups'].values)

In [10]:
formula = 'ln_pickups ~ ln_mean_avail' + '+' + eq_string

In [11]:
formula

'ln_pickups ~ ln_mean_avail+poi_0+poi_1+poi_2+poi_3+poi_4+poi_5+poi_6+poi_7+poi_8+poi_9+poi_10+poi_11+poi_12+poi_13+poi_14+poi_15+poi_16+poi_17+poi_18+poi_19+poi_20+poi_21+poi_22+poi_23+poi_24+poi_25+poi_26+poi_27+poi_28+poi_29+poi_30+cluster_id_142+cluster_id_218+cluster_id_356+cluster_id_389+cluster_id_418+cluster_id_436+cluster_id_465+cluster_id_515+cluster_id_530+cluster_id_533+cluster_id_555+cluster_id_589+cluster_id_590+cluster_id_742+cluster_id_755+cluster_id_1249+cluster_id_1599+cluster_id_1651+cluster_id_1730+cluster_id_1786+cluster_id_1853+cluster_id_1930+cluster_id_1933+cluster_id_1998+cluster_id_2000+cluster_id_2005+cluster_id_2026+cluster_id_2028+cluster_id_2157+cluster_id_2160+cluster_id_2162+cluster_id_2165+cluster_id_2394+cluster_id_2398+cluster_id_2413+cluster_id_2448+cluster_id_2456+cluster_id_2542+cluster_id_2556+cluster_id_2670+cluster_id_2947+cluster_id_3141+cluster_id_3319+cluster_id_3697+cluster_id_3722+cluster_id_3803+cluster_id_3808+cluster_id_4042+cluster_id_4

In [12]:
mod = smf.ols(formula= formula, data=df)
res = mod.fit()
results_summary = res.summary()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:             ln_pickups   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     87.67
Date:                Tue, 23 Mar 2021   Prob (F-statistic):               0.00
Time:                        21:14:24   Log-Likelihood:                -2024.9
No. Observations:                2825   AIC:                             4294.
Df Residuals:                    2703   BIC:                             5019.
Df Model:                         121                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [13]:
results_as_html = results_summary.tables[1].as_html()
df_results1 = pd.read_html(results_as_html, header=0, index_col=0)[0]

In [14]:
df_results1 = df_results1.reset_index()

In [15]:
df_results1

Unnamed: 0,index,coef,std err,t,P>|t|,[0.025,0.975]
0,Intercept,3.4014,0.100,33.976,0.000,3.205,3.598
1,ln_mean_avail,1.0925,0.019,58.979,0.000,1.056,1.129
2,poi_0,0.0452,0.022,2.082,0.037,0.003,0.088
3,poi_1,0.0339,0.014,2.455,0.014,0.007,0.061
4,poi_2,0.0021,0.007,0.283,0.778,-0.013,0.017
...,...,...,...,...,...,...,...
118,cluster_id_8304,-0.7805,0.127,-6.131,0.000,-1.030,-0.531
119,part_of_day_late_evening,-1.1530,0.030,-38.570,0.000,-1.212,-1.094
120,part_of_day_late_morning,-1.3531,0.030,-44.743,0.000,-1.412,-1.294
121,part_of_day_mid_day,-0.5438,0.030,-18.036,0.000,-0.603,-0.485


In [16]:
dict_p = dict(zip(df_results1['index'].values, df_results1['P>|t|'].values))

In [17]:
col_drop1 = []
for key, value in dict_p.items():
    if value > 0.005:
        col_drop1.append(key)

In [18]:
col_drop2 = []
for key, value in dict_p.items():
    if value > 0.05:
        col_drop2.append(key)

In [19]:
df = df.drop(columns = ['Unnamed: 0', 'street_p', 'pickups', 'street_a', 'avail'])
df.columns

Index(['poi_0', 'poi_1', 'poi_2', 'poi_3', 'poi_4', 'poi_5', 'poi_6', 'poi_7',
       'poi_8', 'poi_9',
       ...
       'cluster_id_8012', 'cluster_id_8075', 'cluster_id_8076',
       'cluster_id_8304', 'part_of_day_late_evening',
       'part_of_day_late_morning', 'part_of_day_mid_day',
       'part_of_day_morning', 'ln_mean_avail', 'ln_pickups'],
      dtype='object', length=123)

In [20]:
df1 = df.drop(columns = col_drop1)
col_formula1 = df1.drop(columns = ['ln_pickups', 'ln_mean_avail']).columns

eq_string1 = col_formula1[0]
for i in range(1, len(col_formula1)):
    eq_string1 = eq_string1 + "+" + col_formula1[i]

eq_string1

'poi_8+poi_10+poi_19+cluster_id_142+cluster_id_218+cluster_id_356+cluster_id_418+cluster_id_436+cluster_id_465+cluster_id_515+cluster_id_533+cluster_id_590+cluster_id_1599+cluster_id_1853+cluster_id_1930+cluster_id_2000+cluster_id_2005+cluster_id_2028+cluster_id_2160+cluster_id_2162+cluster_id_2398+cluster_id_2448+cluster_id_2456+cluster_id_2542+cluster_id_2670+cluster_id_3141+cluster_id_3319+cluster_id_3697+cluster_id_3722+cluster_id_4042+cluster_id_4208+cluster_id_4230+cluster_id_4271+cluster_id_4306+cluster_id_4329+cluster_id_5228+cluster_id_5266+cluster_id_5534+cluster_id_5709+cluster_id_5714+cluster_id_5760+cluster_id_6077+cluster_id_6510+cluster_id_6577+cluster_id_6593+cluster_id_6602+cluster_id_6606+cluster_id_6625+cluster_id_6637+cluster_id_7184+cluster_id_7335+cluster_id_7487+cluster_id_7584+cluster_id_7635+cluster_id_7933+cluster_id_8012+cluster_id_8075+cluster_id_8076+cluster_id_8304+part_of_day_late_evening+part_of_day_late_morning+part_of_day_mid_day+part_of_day_morning'

In [21]:
formula1 = 'ln_pickups ~ ln_mean_avail' + '+' + eq_string1
mod = smf.ols(formula= formula1, data=df1)
res = mod.fit()
results_summary = res.summary()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:             ln_pickups   R-squared:                       0.783
Model:                            OLS   Adj. R-squared:                  0.778
Method:                 Least Squares   F-statistic:                     156.0
Date:                Tue, 23 Mar 2021   Prob (F-statistic):               0.00
Time:                        21:14:24   Log-Likelihood:                -2115.6
No. Observations:                2825   AIC:                             4361.
Df Residuals:                    2760   BIC:                             4748.
Df Model:                          64                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [22]:
df2 = df.drop(columns = col_drop2)
col_formula2 = df2.drop(columns = ['ln_pickups', 'ln_mean_avail']).columns

eq_string2 = col_formula2[0]
for i in range(1, len(col_formula2)):
    eq_string2 = eq_string2 + "+" + col_formula2[i]

eq_string2

'poi_0+poi_1+poi_3+poi_4+poi_7+poi_8+poi_10+poi_14+poi_15+poi_19+poi_21+poi_22+poi_26+poi_28+poi_30+cluster_id_142+cluster_id_218+cluster_id_356+cluster_id_389+cluster_id_418+cluster_id_436+cluster_id_465+cluster_id_515+cluster_id_533+cluster_id_555+cluster_id_590+cluster_id_1249+cluster_id_1599+cluster_id_1651+cluster_id_1786+cluster_id_1853+cluster_id_1930+cluster_id_2000+cluster_id_2005+cluster_id_2026+cluster_id_2028+cluster_id_2157+cluster_id_2160+cluster_id_2162+cluster_id_2398+cluster_id_2448+cluster_id_2456+cluster_id_2542+cluster_id_2670+cluster_id_2947+cluster_id_3141+cluster_id_3319+cluster_id_3697+cluster_id_3722+cluster_id_3803+cluster_id_4042+cluster_id_4208+cluster_id_4230+cluster_id_4271+cluster_id_4306+cluster_id_4329+cluster_id_4619+cluster_id_5228+cluster_id_5266+cluster_id_5534+cluster_id_5709+cluster_id_5714+cluster_id_5760+cluster_id_6077+cluster_id_6510+cluster_id_6577+cluster_id_6593+cluster_id_6602+cluster_id_6606+cluster_id_6625+cluster_id_6637+cluster_id_7184

In [23]:
formula2 = 'ln_pickups ~ ln_mean_avail' + '+' + eq_string2
mod = smf.ols(formula= formula2, data=df2)
res = mod.fit()
results_summary = res.summary()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:             ln_pickups   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.788
Method:                 Least Squares   F-statistic:                     119.9
Date:                Tue, 23 Mar 2021   Prob (F-statistic):               0.00
Time:                        21:16:09   Log-Likelihood:                -2044.2
No. Observations:                2825   AIC:                             4266.
Df Residuals:                    2736   BIC:                             4796.
Df Model:                          88                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               