In [1]:
# import pandas
import pandas as pd

In [2]:
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

You can find the data [here](https://drive.google.com/file/d/1WvCUF4A4IVsF6chbqTJ3qEjtDwAmyJCM/view?usp=sharing).

In [3]:
# Load the salary dataset; the relative path means salary_data.csv must be in
# the notebook's working directory (download link is in the markdown cell above).
df = pd.read_csv('salary_data.csv')

In [4]:
df.shape

(30, 2)

In [5]:
df.dtypes

YearsExperience    float64
Salary             float64
dtype: object

In [6]:
df.head(10)

Unnamed: 0,YearsExperience,Salary
0,1.1,39343.0
1,1.3,46205.0
2,1.5,37731.0
3,2.0,43525.0
4,2.2,39891.0
5,2.9,56642.0
6,3.0,60150.0
7,3.2,54445.0
8,3.2,64445.0
9,3.7,57189.0


In [7]:
# Split the frame into the predictor (X) and the target (y).
# Single-label indexing returns a 1-D Series for each.
X, y = df['YearsExperience'], df['Salary']

In [8]:
X.shape, y.shape

((30,), (30,))

In [9]:
# scikit-learn expects X as a 2-D array of shape (n_samples, n_features); y could stay 1-D.
# We convert both Series to single-column DataFrames so X and y are handled the same way.

In [10]:
X.to_frame()

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2
5,2.9
6,3.0
7,3.2
8,3.2
9,3.7


In [11]:
# Build the 2-D predictor directly from df with a list selector, which yields a
# single-column DataFrame of shape (n_samples, 1).
# Unlike `X = X.to_frame()`, this cell is safe to re-run: DataFrame has no
# `to_frame` method, so the old form raised AttributeError on a second execution.
X = df[['YearsExperience']]

In [12]:
X

Unnamed: 0,YearsExperience
0,1.1
1,1.3
2,1.5
3,2.0
4,2.2
5,2.9
6,3.0
7,3.2
8,3.2
9,3.7


In [13]:
# Build the target as a single-column DataFrame straight from df so the cell is
# idempotent (re-running `y = y.to_frame()` fails once y is already a DataFrame).
# y is kept 2-D because later cells index regressor.intercept_[0] and
# regressor.coef_[0][0], which assumes the 2-D target layout.
y = df[['Salary']]

In [14]:
y

Unnamed: 0,Salary
0,39343.0
1,46205.0
2,37731.0
3,43525.0
4,39891.0
5,56642.0
6,60150.0
7,54445.0
8,64445.0
9,57189.0


In [15]:
X.shape, y.shape

((30, 1), (30, 1))

## Task
Create the linear regression model which predicts Salary using variable YearsExperience. 

- Use both sklearn and statsmodels.
- Are the computed coefficients the same?

1) First, we will use sklearn to create the linear regression model.

In [16]:
# Fit ordinary least squares with scikit-learn. `fit` returns the estimator
# itself, so the trailing expression still shows it as the cell output.
regressor = LinearRegression().fit(X, y)
regressor

LinearRegression()

In [17]:
print(regressor.coef_)

[[9449.96232146]]


In [18]:
print(regressor.intercept_)

[25792.20019867]


In [19]:
regressor.score(X,y)

0.9569566641435086

2) Now, we will use statsmodels to create the linear regression model.

In [20]:
# Add a 'const' column of ones so the OLS fit below estimates an intercept term.
# NOTE: this rebinds X. Re-running the sklearn fit cell after this point would
# pass the 'const' column to LinearRegression as an extra feature.
X = sm.add_constant(X) # adding a constant

In [21]:
X

Unnamed: 0,const,YearsExperience
0,1.0,1.1
1,1.0,1.3
2,1.0,1.5
3,1.0,2.0
4,1.0,2.2
5,1.0,2.9
6,1.0,3.0
7,1.0,3.2
8,1.0,3.2
9,1.0,3.7


In [22]:
# statsmodels takes the response (endog) first and the design matrix (exog)
# second; spelling out the keywords makes that order explicit.
lin_reg = sm.OLS(endog=y, exog=X)

In [23]:
# Fit the OLS model and print its plain-text summary table.
# `model` is reused by the cells below to read the fitted parameters and to
# generate predictions; `print_model` only holds the summary object.
model = lin_reg.fit()
print_model = model.summary()
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                 Salary   R-squared:                       0.957
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                     622.5
Date:                Wed, 06 Jan 2021   Prob (F-statistic):           1.14e-20
Time:                        14:38:36   Log-Likelihood:                -301.44
No. Observations:                  30   AIC:                             606.9
Df Residuals:                      28   BIC:                             609.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            2.579e+04   2273.053     

In [24]:
model.params

const              25792.200199
YearsExperience     9449.962321
dtype: float64

In [25]:
model.params["YearsExperience"]

9449.962321455077

3) Now, we will determine if the computed coefficients are the same.

In [26]:
# Collect the scikit-learn estimates as [intercept, slope].
# With a 2-D target, intercept_ is a length-1 array and coef_ is a 1x1 matrix,
# so the scalars are pulled out before stacking.
sk_intercept = regressor.intercept_[0]
sk_slope = regressor.coef_[0, 0]
coeffs1 = np.array([sk_intercept, sk_slope])
coeffs1

array([25792.20019867,  9449.96232146])

In [27]:
# Pull the statsmodels estimates out by label so the order is explicitly
# [intercept, slope], matching coeffs1, rather than depending on the column
# order of the design matrix.
coeffs2 = model.params[['const', 'YearsExperience']].to_numpy()
coeffs2

array([25792.20019867,  9449.96232146])

In [28]:
# Exact equality is False only because the two estimates differ in their last
# floating-point digits; np.allclose compares them within a tolerance, which is
# the appropriate test for "are the coefficients the same?".
np.array_equal(coeffs1, coeffs2), np.allclose(coeffs1, coeffs2)

(False, True)

In [29]:
coeffs1[0]

25792.20019866871

In [30]:
coeffs2[0]

25792.200198668703

In [31]:
coeffs1[1], coeffs2[1]

(9449.962321455074, 9449.962321455077)

Yes, the computed coefficients are the same up to floating-point precision: 
each pair agrees to about 15 significant digits and differs only in the last one or two digits.  
Since both values are numerical approximations of the same least-squares solution, 
it is reasonable to consider the computed coefficients to be the same.

# The following is extra.  

I just want to see the difference between the actual and predicted results.

In [32]:
# In-sample predictions: X still carries the 'const' column the model was fitted with.
predictions = model.predict(exog=X)


In [33]:
# Side-by-side table of the observed salaries and the model's predictions,
# aligned on the shared row index.
df2 = pd.concat(
    [df['Salary'].rename("Actual"), predictions.rename("Predicted")],
    axis=1,
)

df2

Unnamed: 0,Actual,Predicted
0,39343.0,36187.158752
1,46205.0,38077.151217
2,37731.0,39967.143681
3,43525.0,44692.124842
4,39891.0,46582.117306
5,56642.0,53197.090931
6,60150.0,54142.087163
7,54445.0,56032.079627
8,64445.0,56032.079627
9,57189.0,60757.060788
