## 1. Simple Linear Regression

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
tips = sns.load_dataset('tips')

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


> ### i). Using Stats Model

In [6]:
import statsmodels.formula.api as smf

In [13]:
model = smf.ols(formula = 'tip ~ total_bill', data = tips)

In [14]:
results = model.fit()

In [28]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.457
Model:                            OLS   Adj. R-squared:                  0.454
Method:                 Least Squares   F-statistic:                     203.4
Date:                Wed, 24 Aug 2022   Prob (F-statistic):           6.69e-34
Time:                        14:55:27   Log-Likelihood:                -350.54
No. Observations:                 244   AIC:                             705.1
Df Residuals:                     242   BIC:                             712.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9203      0.160      5.761      0.0

### y = mx + b

y = response variable

m = slope (predicted parameter)

x = predictor

b = intercept

#### Using the predicted result above 

y = (0.1050)x + 0.9203 

#### Translated as: 
- For every one unit (one dollar) increase in total bill, the tip increases by 10.5 cents.

In [17]:
print(results.params)

Intercept     0.920270
total_bill    0.105025
dtype: float64


In [18]:
print(results.conf_int())

                   0         1
Intercept   0.605622  1.234918
total_bill  0.090517  0.119532


> ### ii). Using Sklearn

In [29]:
from sklearn  import linear_model

In [30]:
lr = linear_model.LinearRegression()

In [32]:
predicted = lr.fit(X= tips['total_bill'].values.reshape(-1, 1), y = tips['tip'])

In [33]:
print(predicted.coef_)

[0.10502452]


In [34]:
print(predicted.intercept_)

0.9202696135546735


## 2. Multiple Regression

> ### i). Using StatsModels

In [35]:
model = smf.ols(formula = 'tip ~ total_bill + size', data = tips).fit()

In [36]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.468
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     105.9
Date:                Wed, 24 Aug 2022   Prob (F-statistic):           9.67e-34
Time:                        15:08:42   Log-Likelihood:                -347.99
No. Observations:                 244   AIC:                             702.0
Df Residuals:                     241   BIC:                             712.5
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.6689      0.194      3.455      0.0

### Interpreted: 

- For ever one unit increase (one dollar) in total bill, the tips increase by 9 cents as long as the size of group remains consistent.

> ### ii). Using Stats Model with categorical variables

In [38]:
model = smf.ols(formula = 'tip ~ total_bill + size + sex + smoker + day + time', data = tips).fit()

In [39]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    tip   R-squared:                       0.470
Model:                            OLS   Adj. R-squared:                  0.452
Method:                 Least Squares   F-statistic:                     26.06
Date:                Wed, 24 Aug 2022   Prob (F-statistic):           1.20e-28
Time:                        15:16:11   Log-Likelihood:                -347.48
No. Observations:                 244   AIC:                             713.0
Df Residuals:                     235   BIC:                             744.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.5908      0.256      2.

### Interpretation:

- Same as above for continuous variables. But for categorical variables- example `sex[t.Female]` means tip increased by a unit of 3 cents from male to female.

> ### iii). Using SkLearn

In [40]:
lr = linear_model.LinearRegression()

In [41]:
predicted  = lr.fit(X=tips[['total_bill', 'size']], y = tips['tip'])

In [46]:
print(predicted.coef_)

[0.09271334 0.19259779]


In [43]:
print(predicted.intercept_)

0.6689447408125031


> ### iv). Using sklearn with Categorical Variables

In [50]:
tips_dummy = pd.get_dummies(tips[['total_bill', 'size', 'sex', 'smoker', 'day', 'time']])

In [54]:
print(tips_dummy.head())

   total_bill  size  sex_Male  sex_Female  smoker_Yes  smoker_No  day_Thur  \
0       16.99     2         0           1           0          1         0   
1       10.34     3         1           0           0          1         0   
2       21.01     3         1           0           0          1         0   
3       23.68     2         1           0           0          1         0   
4       24.59     4         0           1           0          1         0   

   day_Fri  day_Sat  day_Sun  time_Lunch  time_Dinner  
0        0        0        1           0            1  
1        0        0        1           0            1  
2        0        0        1           0            1  
3        0        0        1           0            1  
4        0        0        1           0            1  


In [55]:
x_tips_dummy_ref = pd.get_dummies(tips[['total_bill', 'size', 'sex', 'smoker', 'day', 'time']], drop_first = True)

In [56]:
print(x_tips_dummy_ref.head())

   total_bill  size  sex_Female  smoker_No  day_Fri  day_Sat  day_Sun  \
0       16.99     2           1          1        0        0        1   
1       10.34     3           0          1        0        0        1   
2       21.01     3           0          1        0        0        1   
3       23.68     2           0          1        0        0        1   
4       24.59     4           1          1        0        0        1   

   time_Dinner  
0            1  
1            1  
2            1  
3            1  
4            1  


In [57]:
lr = linear_model.LinearRegression()
predicted = lr.fit(X= x_tips_dummy_ref, y = tips['tip'])

In [61]:
print(predicted.coef_)

[ 0.09448701  0.175992    0.03244094  0.08640832  0.1622592   0.04080082
  0.13677854 -0.0681286 ]


In [62]:
print(predicted.intercept_)

0.5908374259513773


### Keeping index Labels from SkLearn
        

In [76]:
import numpy as np

In [77]:
lr = linear_model.LinearRegression()
predicted = lr.fit(X= x_tips_dummy_ref, y = tips['tip'])

values = np.append(predicted.intercept_, predicted.coef_)

names = np.append('intercept', x_tips_dummy_ref.columns)

In [78]:
print(results)

<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fa813242040>
