In [1]:
import pandas as pd
import numpy as np

In [2]:
# Record the library versions this notebook was run with.
print(
    f'pandas  version: {pd.__version__}',
    f'numpy   version: {np.__version__}',
    sep='\n',
)

pandas  version: 0.23.4
numpy   version: 1.15.4


### An Introduction to Statistical Learning with Applications in R (ISLR)
"An Introduction to Statistical Learning, with applications in R" (Springer, 2013), with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani
- http://www-bcf.usc.edu/~gareth/ISL/index.html
<img src="http://www-bcf.usc.edu/~gareth/ISL/ISL%20Cover%202.jpg" width="300"/>

In [3]:
# Advertising data (ISLR), read from a GitHub-hosted copy of the CSV.
# Previously used location: http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv
url = 'https://github.com/prasertcbs/basic-dataset/raw/master/ISLR/Advertising.csv'

# Read only columns 1-4 (TV, Radio, Newspaper, Sales) and skip column 0.
# Other ways to get there: usecols=list(range(1, 5)), or index_col=0 to turn
# column 0 into the index instead of dropping it.
wanted_columns = [1, 2, 3, 4]
df = pd.read_csv(url, usecols=wanted_columns)
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## sklearn: train_test_split

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Number of rows in the full data set, before any splitting.
len(df)

200

## method 1: split into (train, test)

In [6]:
# Split the whole frame into 70% training rows and 30% test rows.
# random_state=7 fixes the shuffle so the same rows are selected on every run.
# Both fractions are spelled out so the test size is never inferred from
# train_size alone (older scikit-learn releases warn about that inference).
# Passing only test_size=0.3 gives the same 140/60 row counts.
train, test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=7)



In [7]:
# Row counts of the two splits: training first, then test.
print(len(train), len(test), sep='\n')

140
60


In [8]:
# First rows of the training split; the index still carries the original row labels.
train.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
88,88.3,25.5,73.4,12.9
58,210.8,49.6,37.7,23.8
113,209.6,20.6,10.7,15.9
149,44.7,25.8,20.6,10.1
36,266.9,43.8,5.0,25.4


In [9]:
# First rows of the test split.
test.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
86,76.3,27.5,16.0,12.0
120,141.3,26.8,46.2,15.5
22,13.2,15.9,49.6,5.6
11,214.7,24.0,4.0,17.4
195,38.2,3.7,13.8,7.6


## method 2: split into (X_train, X_test, y_train, y_test)

In [10]:
# Column labels available for choosing predictors and the response.
df.columns

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [11]:
# Predictors are the TV, Radio and Newspaper columns; the response is Sales.
predictor_names = ['TV', 'Radio', 'Newspaper']
X = df[predictor_names]
y = df['Sales']

In [12]:
# Split predictors and response together, holding out 30% of the rows.
# random_state=7 fixes the shuffle so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=7)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Preview the training predictors.
X_train.head()

Unnamed: 0,TV,Radio,Newspaper
88,88.3,25.5,73.4
58,210.8,49.6,37.7
113,209.6,20.6,10.7
149,44.7,25.8,20.6
36,266.9,43.8,5.0


In [14]:
# Preview the test predictors.
X_test.head()

Unnamed: 0,TV,Radio,Newspaper
86,76.3,27.5,16.0
120,141.3,26.8,46.2
22,13.2,15.9,49.6
11,214.7,24.0,4.0
195,38.2,3.7,13.8


In [15]:
# Preview the training response values.
y_train.head()

88     12.9
58     23.8
113    15.9
149    10.1
36     25.4
Name: Sales, dtype: float64

In [16]:
# Preview the test response values.
y_test.head()

86     12.0
120    15.5
22      5.6
11     17.4
195     7.6
Name: Sales, dtype: float64

## Scikit-Learn: LinearRegression 

In [17]:
import sklearn
from sklearn.linear_model import LinearRegression

In [18]:
# Record the scikit-learn version this notebook was run with.
print(f'sklearn version: {sklearn.__version__}')

sklearn version: 0.20.2


In [19]:
# Create an unfitted linear regression estimator with default settings,
# then display it to show the parameters in effect.
model = LinearRegression()
model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
# Estimate the intercept and coefficients from the training split only.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [21]:
model.score(X_train, y_train) # R-squared on the training data (in-sample fit)

0.8970470429900155

In [22]:
# Fitted intercept term.
model.intercept_

2.597191399021307

In [23]:
# Fitted coefficients, in the column order of X_train: TV, Radio, Newspaper.
model.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])

In [24]:
# Reminder of the predictor column order the model was fitted on.
X_train.head()

Unnamed: 0,TV,Radio,Newspaper
88,88.3,25.5,73.4
58,210.8,49.6,37.7
113,209.6,20.6,10.7
149,44.7,25.8,20.6
36,266.9,43.8,5.0


In [25]:
# Predict Sales for one new row, passed as a 2-D list: [[TV, Radio, Newspaper]].
model.predict([[200, 40, 70]])

array([19.66054756])

In [26]:
# Predict Sales for several rows at once; each inner list is [TV, Radio, Newspaper].
new_rows = [
    [200, 40, 70],
    [100, 80, 50],
    [40, 20, 10],
]
model.predict(new_rows)

array([19.66054756, 22.58785835,  8.30179607])

In [27]:
# Fitted (in-sample) predictions, one per training row, in X_train row order.
y_hat=model.predict(X_train)
y_hat

array([11.62718541, 22.00361874, 16.40893908,  9.63081303, 23.54028882,
        4.19019633,  6.33372661,  9.16205331, 18.6252243 ,  9.30278314,
       10.02220298, 12.81990978, 16.3270569 , 15.55142217,  8.18788222,
       17.73045095, 24.32738343,  7.98376479, 12.34504657, 23.42048119,
       10.19514611, 17.25432468, 15.17745615, 16.45100472, 16.83421976,
       16.94913148, 14.92719588, 12.08456358, 17.31673623, 20.75868218,
       12.31079013, 14.68033357, 12.00487231,  4.23083643, 11.88897069,
       18.47946596, 17.82345369, 10.62147609, 21.24463139,  3.40353011,
       20.37338186,  9.53396325,  8.95973195, 10.92777341,  8.97930171,
       17.62344551, 20.54153871, 12.60435152, 13.71875285, 17.33050151,
       15.22001133, 20.47732861,  9.86186951, 14.93210984, 13.95134117,
       12.31718956, 16.32085396,  7.16859379, 18.16059791, 20.91470709,
       20.02490033, 18.17883475, 18.49821442, 17.0839661 , 19.35270232,
        4.19833505, 12.50911724, 17.95656145,  6.91422545,  9.90

In [28]:
# Rebuild a single training frame by placing the response next to its predictors
# (the two pieces share the same row labels).
train = pd.concat([X_train, y_train], axis=1)

In [29]:
# Training predictors and response side by side.
train.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
88,88.3,25.5,73.4,12.9
58,210.8,49.6,37.7,23.8
113,209.6,20.6,10.7,15.9
149,44.7,25.8,20.6,10.1
36,266.9,43.8,5.0,25.4


In [30]:
# Put the in-sample predictions next to the training rows. reset_index() moves
# the original row labels into an 'index' column and renumbers the rows from 0,
# so they line up with the freshly built 'predicted' Series.
train_predictions = pd.Series(y_hat, name='predicted')
dc = pd.concat([train.reset_index(), train_predictions], axis='columns')
dc.head()

Unnamed: 0,index,TV,Radio,Newspaper,Sales,predicted
0,88,88.3,25.5,73.4,12.9,11.627185
1,58,210.8,49.6,37.7,23.8,22.003619
2,113,209.6,20.6,10.7,15.9,16.408939
3,149,44.7,25.8,20.6,10.1,9.630813
4,36,266.9,43.8,5.0,25.4,23.540289


In [31]:
# Predictions for the held-out test rows, in X_test row order.
y_hat_test=model.predict(X_test)
y_hat_test

array([11.44476229, 14.37367315,  6.25500193, 17.29877053,  5.1037914 ,
       12.17276377,  7.62662219, 10.33214076,  8.77992131, 13.09010986,
        3.29044059, 12.57762584,  9.56979963,  7.62505392, 18.72987897,
        5.0057581 ,  7.93923309, 16.93193167, 18.25136197,  8.71447974,
       16.07575914, 19.69512239,  8.61453225, 17.3519232 ,  9.57182285,
       18.01945212, 23.4962362 , 12.71455161, 19.77063344, 12.10705488,
       12.68239566,  5.87563284, 14.18270814, 20.87041368, 19.5800746 ,
       15.21483369, 10.39322617, 24.36171125, 11.29656239, 11.88775796,
        7.69691954, 22.33739127,  5.49999179, 13.61147654, 17.10119141,
       16.39861924, 15.14556732,  8.21775584, 11.24015588, 15.47711028,
        7.46373581,  8.64334225,  6.35011873,  8.56203042, 15.64802928,
       19.49746243,  9.7732074 , 18.48500068, 11.70700465, 15.32110391])

In [32]:
# Rebuild a single test frame by placing the response next to its predictors.
test = pd.concat([X_test, y_test], axis=1)
test.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
86,76.3,27.5,16.0,12.0
120,141.3,26.8,46.2,15.5
22,13.2,15.9,49.6,5.6
11,214.7,24.0,4.0,17.4
195,38.2,3.7,13.8,7.6


In [33]:
# Put the test-set predictions next to the test rows. reset_index() moves the
# original row labels into an 'index' column and renumbers the rows from 0,
# so they line up with the freshly built 'predicted' Series.
test_predictions = pd.Series(y_hat_test, name='predicted')
dt = pd.concat([test.reset_index(), test_predictions], axis='columns')
dt.head()

Unnamed: 0,index,TV,Radio,Newspaper,Sales,predicted
0,86,76.3,27.5,16.0,12.0,11.444762
1,120,141.3,26.8,46.2,15.5,14.373673
2,22,13.2,15.9,49.6,5.6,6.255002
3,11,214.7,24.0,4.0,17.4,17.298771
4,195,38.2,3.7,13.8,7.6,5.103791


In [34]:
# Pairwise correlations on the test rows; the Sales/predicted entry shows how
# closely the predictions track the actual values.
# NOTE(review): the matrix also includes the 'index' column created by
# reset_index(); it holds row labels, so its correlations carry no meaning.
dt.corr()

Unnamed: 0,index,TV,Radio,Newspaper,Sales,predicted
index,1.0,0.216008,-0.091564,0.001199,0.095705,0.128056
TV,0.216008,1.0,0.128668,0.079286,0.812386,0.868771
Radio,-0.091564,0.128668,1.0,0.364127,0.587212,0.60288
Newspaper,0.001199,0.079286,0.364127,1.0,0.227696,0.245546
Sales,0.095705,0.812386,0.587212,0.227696,1.0,0.946814
predicted,0.128056,0.868771,0.60288,0.245546,0.946814,1.0


## Statsmodels: Multiple linear regression

In [35]:
# Re-create the 70/30 split of the full frame for the statsmodels section.
# random_state=7 fixes the shuffle so the same rows are selected on every run.
# Both fractions are spelled out so the test size is never inferred from
# train_size alone (older scikit-learn releases warn about that inference).
# Passing only test_size=0.3 gives the same 140/60 row counts.
train, test = train_test_split(df, train_size=0.7, test_size=0.3, random_state=7)



In [36]:
# First rows of the training split used for the statsmodels fit.
train.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
88,88.3,25.5,73.4,12.9
58,210.8,49.6,37.7,23.8
113,209.6,20.6,10.7,15.9
149,44.7,25.8,20.6,10.1
36,266.9,43.8,5.0,25.4


In [37]:
# First rows of the matching test split.
test.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
86,76.3,27.5,16.0,12.0
120,141.3,26.8,46.2,15.5
22,13.2,15.9,49.6,5.6
11,214.7,24.0,4.0,17.4
195,38.2,3.7,13.8,7.6


In [38]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [39]:
# formula: response ~ predictor1 + predictor2
model_a = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=train).fit()
# model_a = smf.ols(formula='Sales ~ TV + Radio', data=df).fit()

In [40]:
# Print the full OLS results table (fit statistics and per-coefficient estimates).
print(model_a.summary())

                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.895
Method:                 Least Squares   F-statistic:                     395.0
Date:                Tue, 19 Feb 2019   Prob (F-statistic):           6.42e-67
Time:                        15:55:16   Log-Likelihood:                -271.78
No. Observations:                 140   AIC:                             551.6
Df Residuals:                     136   BIC:                             563.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.5972      0.398      6.533      0.0

In [41]:
model_a.params # coefficient estimates: intercept plus one slope per predictor

Intercept    2.597191
TV           0.047126
Radio        0.190988
Newspaper   -0.000019
dtype: float64

In [42]:
# p-value attached to each coefficient estimate.
model_a.pvalues

Intercept    1.189005e-09
TV           1.024402e-57
Radio        7.808066e-40
Newspaper    9.978470e-01
dtype: float64

In [43]:
model.intercept_ # sklearn intercept, shown for comparison with the statsmodels estimate

2.597191399021307

In [44]:
# sklearn coefficients (TV, Radio, Newspaper), for comparison with the statsmodels params.
model.coef_

array([ 4.71259657e-02,  1.90987993e-01, -1.93812266e-05])