In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import numpy as np

In [7]:
import pandas as pd
import statsmodels.api as sm


def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    initial_list = []
    included = list(initial_list)
    while True:
        changed=False
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        if not changed:
            break

    return included

def backward_regression(X, y,
                           threshold_out,
                           verbose=False):
    included=list(X.columns)
    while True:
        changed=False
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

In [8]:
df = pd.read_excel(r'data_statsproj.xlsx')
print (df)

    Section   BusStat1   BusStat2  Gender  SocialMediaHrs  SleepTime  \
0         1  25.000000  64.444444       1             7.0        8.0   
1         1  94.444444  95.000000       1             7.0        8.0   
2         1  50.000000  78.333333       0             7.0        9.0   
3         1  22.000000  82.194444       0             6.0        7.0   
4         1  69.444444  81.111111       1             5.5        8.5   
..      ...        ...        ...     ...             ...        ...   
57        4  66.666667  64.444444       1             7.0        9.0   
58        4  44.444444  81.111111       1             3.5        7.0   
59        4  41.666700  56.111100       0             3.0        7.0   
60        4  61.111111  61.666667       0             3.0        7.5   
61        4  58.333333  70.000000       1             3.0        8.0   

    SleepHrs  Attention  Type   Format  
0        7.0         60     0  Offline  
1        4.5         90     0  Offline  
2        6.5

In [9]:
y='BusStat2'
df_reg=df

In [15]:
lasso = Lasso()

parameters = {"alpha":[1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regression = GridSearchCV(lasso, parameters, scoring='neg_mean_squared_error', cv=3)
reg_out_sec1_lasso=lasso_regression.fit(df_reg[['SocialMediaHrs','SleepTime','SleepHrs','Attention','Gender','Type']].values, df_reg[[y]])
reg_out_sec1_lasso.best_estimator_.coef_



array([-0.        , -0.        , -0.        ,  0.08518315,  0.        ,
       -0.        ])

In [13]:
lasso_regression.best_params_

{'alpha': 10}

In [13]:
linreg=LinearRegression()
reg_out_sec1=linreg.fit(df_reg[['Type','SleepHrs','SleepTime','SocialMediaHrs','Attention','Gender']].values.astype(float), df_reg[[y]])
reg_out_sec1.coef_

array([[-10.73506901,  -0.89512101,   0.06028131,  -0.18255267,
          0.09999384,   1.18832987]])

In [14]:
from statsmodels.regression import linear_model
import statsmodels.api as sm
model = sm.OLS( df_reg[[y]], df_reg[['Type','SleepHrs','SleepTime','SocialMediaHrs','Attention','Gender']].values.astype(float))
results = model.fit()
print(results.params)
print(results.pvalues)
print(results.summary())

x1   -9.911430
x2    4.128690
x3    3.023989
x4    2.029305
x5    0.185701
x6    2.886581
dtype: float64
x1    0.022938
x2    0.000221
x3    0.001911
x4    0.059940
x5    0.007261
x6    0.496724
dtype: float64
                            OLS Regression Results                            
Dep. Variable:               BusStat2   R-squared:                       0.948
Model:                            OLS   Adj. R-squared:                  0.943
Method:                 Least Squares   F-statistic:                     171.9
Date:                Mon, 30 Nov 2020   Prob (F-statistic):           3.35e-34
Time:                        08:32:53   Log-Likelihood:                -255.05
No. Observations:                  62   AIC:                             522.1
Df Residuals:                      56   BIC:                             534.9
Df Model:                           6                                         
Covariance Type:            nonrobust                                         


In [18]:
results.params.values.round(4)

array([-9.9114,  4.1287,  3.024 ,  2.0293,  0.1857,  2.8866])

In [19]:
results.pvalues.values.round(4)

array([2.290e-02, 2.000e-04, 1.900e-03, 5.990e-02, 7.300e-03, 4.967e-01])

In [None]:
fig = plt.figure(figsize=(12,8))
fig = sm.graphics.plot_regress_exog(model, 'assists', fig=fig)
plt.savefig('test2full_resid.png',type=png,dpi=600)

In [10]:
out=forward_regression(df_reg[['SocialMediaHrs','SleepTime','SleepHrs','Attention','Gender','Type']],df_reg[[y]],threshold_in=0.9,verbose=True)

  del sys.path[0]


Add  Type                           with p-value 0.00222679
Add  Attention                      with p-value 0.0782229
Add  SleepHrs                       with p-value 0.488172
Add  Gender                         with p-value 0.733388
Add  SocialMediaHrs                 with p-value 0.844292
