In [1]:
# For these lessons we will need NumPy, pandas, matplotlib and seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# and of course the actual regression (machine learning) module
# `linear_model` is the namespace the p-value-aware LinearRegression subclass below inherits from
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

In [2]:
# Read the lesson's CSV into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer="1.02.+Multiple+linear+regression.csv")
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [3]:
# Inputs: the two feature columns; target: the GPA column
x = data.loc[:, ['SAT', 'Rand 1,2,3']]
y = data.loc[:, 'GPA']
# Show both shapes to confirm the observation counts line up
print(x.shape, y.shape)

(84, 2) (84,)


# Since the p-values are obtained through certain statistics, we need the 'stat' module from scipy.stats
import scipy.stats as stat

# Since Python is an object-oriented language, we can simply define our own
# LinearRegression class that inherits from sklearn's class of the same name
# The code below overrides part of that class with a version that also computes p-values
# Here's the full source code of the ORIGINAL class: https://github.com/scikit-learn/scikit-learn/blob/7b136e9/sklearn/linear_model/base.py#L362


class LinearRegression(linear_model.LinearRegression):
    """
    sklearn's LinearRegression, extended to calculate t-statistics
    and two-sided p-values for the model coefficients (betas).

    Additional attributes available after .fit():

    t : ndarray of shape (n_targets, n_features)
        t-statistic of every coefficient; shape (1, n_features) when y is 1-D.
    p : ndarray
        Two-sided p-value of every coefficient. Same layout as `t`, with
        length-1 axes squeezed out, i.e. shape (n_features,) for a 1-D y.

    The intercept is fitted by default (fit_intercept=True). When it is
    fitted, it is accounted for in the standard errors and in the degrees
    of freedom, so X must NOT contain its own column of ones. Pass
    fit_intercept=False if X already includes a constant column.

    Parameters
    ----------
    fit_intercept, normalize, copy_X, n_jobs
        Stored unchanged as attributes for the parent estimator.
    positive : bool, default=False
        scikit-learn's option that constrains coefficients to be
        non-negative when True. The default matches scikit-learn's.
    """

    def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
                 n_jobs=1, positive=False):
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.n_jobs = n_jobs
        # Must default to False: True would silently turn the fit into
        # non-negative least squares and clip negative coefficients.
        self.positive = positive

    def fit(self, X, y, n_jobs=1):
        """
        Fit the linear model, then compute `self.t` and `self.p`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
        n_jobs : ignored
            Kept only so existing calls keep working. It is deliberately
            not forwarded: the parent's third `fit` argument is
            `sample_weight`, not a job count.

        Returns
        -------
        self
        """
        # Zero-argument super() does not look up the (shadowed) global name
        # `LinearRegression`, so re-running this cell cannot confuse it.
        super().fit(X, y)

        features = np.asarray(X, dtype=float)
        residuals = np.asarray(self.predict(X)) - np.asarray(y)

        # The covariance of the estimates is sigma^2 * (D^T D)^-1, where D is
        # the design matrix actually used by the fit, including the column
        # of ones whenever an intercept was estimated.
        if self.fit_intercept:
            design = np.column_stack([np.ones(features.shape[0]), features])
            first_coef = 1  # skip the intercept's diagonal entry
        else:
            design = features
            first_coef = 0

        # Residual degrees of freedom: observations minus estimated parameters
        dof = design.shape[0] - design.shape[1]

        # Residual variance, one value per target
        sigma2 = np.atleast_1d(np.sum(residuals ** 2, axis=0) / float(dof))

        # Standard error of every coefficient, shape (n_targets, n_features)
        gram_inv_diag = np.diagonal(np.linalg.inv(np.dot(design.T, design)))
        se = np.sqrt(np.outer(sigma2, gram_inv_diag[first_coef:]))

        # compute the t-statistic for each feature
        self.t = self.coef_ / se
        # find the two-sided p-value for each feature; the survival function
        # stays accurate for tiny p-values where 1 - cdf would round to 0
        self.p = np.squeeze(2 * stat.t.sf(np.abs(self.t), dof))
        return self

In [4]:
# Create the regression object and fit it on the features and the target
reg = LinearRegression()
reg.fit(X=x, y=y)

In [5]:
# Pull the fitted slope coefficients and the intercept out of the model
coef, intercept = reg.coef_, reg.intercept_
print(coef, intercept)

[ 0.00165354 -0.00826982] 0.29603261264909486


In [6]:
# score() evaluates the fitted model on the same data it was trained on
rSquared = reg.score(X=x, y=y)
rSquared

0.40668119528142843

In [7]:
# Predict for a single new observation: SAT = 1740, Rand 1,2,3 = 3
reg.predict(pd.DataFrame([[1740, 3]], columns=['SAT', 'Rand 1,2,3']))

array([3.14838588])

In [8]:
# Function to calculate the adjusted R-squared
def adjustedR2(n, p, r2):
    """Return the adjusted R-squared.

    n  -- number of observations
    p  -- number of features (predictors)
    r2 -- ordinary R-squared of the fitted model

    Computes 1 - (1 - r2) * (n - 1) / (n - p - 1).
    """
    # Unexplained share of variance, scaled by the total degrees of freedom
    scaled_unexplained = (1 - r2) * (n - 1)
    residual_dof = n - p - 1
    return 1 - scaled_unexplained / residual_dof

In [9]:
# x is two-dimensional, so its shape unpacks into (observations, features)
observationsSize, featuresOrInputSize = x.shape

adjustedR2(observationsSize, featuresOrInputSize, rSquared)

0.39203134825134023

In [10]:
from sklearn.feature_selection import f_regression
# The result is indexed per feature; element [1] holds the p-values
fRegression = f_regression(X=x, y=y)
pValues = fRegression[1]
pValues

array([7.19951844e-11, 6.76291372e-01])

In [11]:
# The p-values above are printed in scientific notation and are hard to read, so we round them to make them understandable
# If the p-value of a feature/input is above the usual 0.05 significance level (value > 0.05), consider removing that feature/input.
np.round(pValues, 3)

array([0.   , 0.676])

# As you can see above, the p-value of "Rand 1,2,3" is well above 0.05 (0.676 > 0.05), so we can drop that feature