In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

<h3>Data Splitting Methods</h3>

In [167]:
###########################################
# pandas quick reference
#   (a) x = pd.read_csv("filepath") -- load a CSV file from the given path into x
#   (b) x.shape -- attribute (not a method call) holding a tuple with the dimensions of x
#   (c) x.head(n) -- the first n rows of x (n defaults to 5)
#   (d) pd.DataFrame -- the type of x above; a labelled, tabular data container
###########################################
df2 = pd.DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('abc'))
print(df2)
print(df2.head(1))
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(data=d)
print(df)
print(df.shape, df2.shape)
##########################################
# numpy quick reference
#   (a) np.random.rand(d0, d1, ...) -- uniform samples in [0, 1) arranged in the given shape
#       np.random.randn(d0, d1, ...) -- samples from the standard normal distribution
#       np.random.shuffle(x) -- shuffle the sequence x in place
#       np.random.permutation(x) -- return a randomly permuted copy/range
#       np.random.randint(lo, hi, size, dtype) -- random integers from lo (inclusive) to hi (exclusive)
#       np.random.ranf(size) -- random floats in [0, 1)
#   (b) np.sum(a, axis=None, keepdims=...)
#         axis=None -- sum over every element
#         axis=k    -- sum along axis k
#         keepdims=True  -- reduced axes stay as length-1 dimensions, so the
#                           result still broadcasts against the input array
#         keepdims=False -- reduced axes are dropped from the result
#       np.mean takes the same arguments but averages instead of summing
#   (c) np.dot(a, b) -- matrix product for 2-D arrays, inner product for 1-D
#                       arrays, plain multiplication for scalars
#   (d) np.identity(n) -- the n x n identity matrix
##########################################
arr = df2.to_numpy()
print(arr)
np.random.seed(17)
arr = np.random.rand(2, 3)
print(arr)
# Flatten the 2x3 sample into a single column
arr2 = arr.reshape(-1, 1)
print(arr2)
# Row sums, kept as a (2, 1) column
sd = arr.sum(axis=1, keepdims=True)
print(sd)
# Column means, kept as a (1, 3) row
md = arr.mean(axis=0, keepdims=True)
print(md)
arr3 = np.ones_like(arr)
print(arr3 - arr)

   a  b  c
0  1  2  3
1  4  5  6
2  7  8  9
   a  b  c
0  1  2  3
   col1  col2
0     1     3
1     2     4
(2, 2) (3, 3)
[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[0.294665   0.53058676 0.19152079]
 [0.06790036 0.78698546 0.65633352]]
[[0.294665  ]
 [0.53058676]
 [0.19152079]
 [0.06790036]
 [0.78698546]
 [0.65633352]]
[[1.01677255]
 [1.51121934]]
[[0.18128268 0.65878611 0.42392715]]
[[0.705335   0.46941324 0.80847921]
 [0.93209964 0.21301454 0.34366648]]


In [187]:
def split_Data(data, ratio, seed=47):
    """Randomly partition the rows of ``data`` into a train and a test subset.

    Parameters
    ----------
    data : pandas.DataFrame (or any object supporting ``len`` and ``.iloc``)
        The rows to split.
    ratio : float
        Fraction of rows, in [0, 1], assigned to the training subset. The
        training size is ``int(len(data) * ratio)`` (rounded down).
    seed : int, optional
        Seed for the shuffle. The default (47) reproduces the split produced
        by the previous hard-coded seed.

    Returns
    -------
    (train, test) : tuple
        ``data.iloc[...]`` selections holding the shuffled training rows and
        the remaining shuffled test rows.

    Raises
    ------
    ValueError
        If ``ratio`` is not within [0, 1].
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got %r" % (ratio,))
    n_rows = len(data)
    # A private RandomState produces the same permutation as seeding the
    # global generator would, but leaves np.random's global state untouched
    # for the rest of the notebook.
    shuffled_positions = np.random.RandomState(seed).permutation(n_rows)
    train_size = int(n_rows * ratio)
    train = shuffled_positions[:train_size]
    test = shuffled_positions[train_size:]
    return data.iloc[train], data.iloc[test]

In [188]:
# Read the advertising data and hold out a quarter of the rows for testing
data = pd.read_csv("Advertising Dataset.csv")
train_set, test_set = split_Data(data, ratio=0.75)
for label, subset in (("Train Set Dims : ", train_set), ("Test Set Dims : ", test_set)):
    print(label, subset.shape)

Train Set Dims :  (150, 5)
Test Set Dims :  (50, 5)


In [189]:
# Show the first three rows of the full data and of each split
for caption, frame in (
    ("Original Data : ", data),
    ("Train Data : ", train_set),
    ("Test Data : ", test_set),
):
    print(caption)
    print(frame.head(3))

Original Data : 
   Unnamed: 0     TV  radio  newspaper  sales
0           1  230.1   37.8       69.2   22.1
1           2   44.5   39.3       45.1   10.4
2           3   17.2   45.9       69.3    9.3
Train Data : 
    Unnamed: 0     TV  radio  newspaper  sales
5            6    8.7   48.9       75.0    7.2
91          92   28.6    1.5       33.0    7.3
0            1  230.1   37.8       69.2   22.1
Test Data : 
     Unnamed: 0     TV  radio  newspaper  sales
174         175  222.4    3.4       13.1   11.5
165         166  234.5    3.4       84.8   11.9
144         145   96.2   14.8       38.9   11.4


In [190]:
# Design matrix laid out as (features, samples): a row of ones for the
# intercept followed by one row per advertising channel. The sample count is
# taken from the split itself so a different ratio or dataset still works.
n_train = len(train_set)
X_train = np.array([np.ones((n_train,)), train_set['TV'].values, train_set['radio'].values, train_set['newspaper'].values])
# Targets as a single row so they line up with theta @ X_train
Y_train = train_set['sales'].values.reshape((1, n_train))
print(Y_train.shape, X_train.shape)

(1, 150) (4, 150)


<h3>Linear Regression Using Iterative Gradient Descent</h3>

In [191]:
def Iterative_Linear_Regression(n, alpha, X, Y):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    n : int
        Number of gradient-descent iterations.
    alpha : float
        Step size applied to the gradient on every iteration.
    X : numpy.ndarray, shape (features, samples)
        Design matrix with one row per feature (include a row of ones if an
        intercept is wanted).
    Y : numpy.ndarray, shape (1, samples)
        Target values as a single row.

    Returns
    -------
    numpy.ndarray, shape (1, features)
        The weight row vector after ``n`` updates, starting from all zeros.

    Notes
    -----
    The update uses the summed (not averaged) squared-error gradient
    ``(theta @ X - Y) @ X.T``, so a suitable ``alpha`` shrinks as the number
    of samples grows.
    """
    # Size the weights from X rather than assuming exactly four features
    theta = np.zeros((1, X.shape[0]))
    for _ in range(n):
        residual = np.dot(theta, X) - Y
        theta = theta - alpha * np.dot(residual, X.T)
    return theta

In [192]:
# 1000 gradient-descent steps with a very small step size; the intercept in
# the recorded output is still far from the closed-form value further down,
# so this run has not converged.
theta = Iterative_Linear_Regression(n=1000, alpha=1e-7, X=X_train, Y=Y_train)

In [193]:
# Fitted gradient-descent parameters: a single row, intercept weight first
print(theta)

[[0.00845456 0.05328747 0.20729808 0.03067559]]


In [194]:
#Testing
# Build the held-out design matrix the same way as the training one; the
# sample count comes from the split instead of a hard-coded 50.
n_test = len(test_set)
X_test = np.array([np.ones((n_test,)), test_set['TV'].values, test_set['radio'].values, test_set['newspaper'].values])
Y_test = test_set['sales'].values.reshape((1, n_test))
Y_pred = np.dot(theta, X_test)
# Mean squared error over the test rows
err = (np.sum((Y_pred - Y_test)**2)) / n_test
print(err)

4.297953140785725


<h3>Linear Regression Using Normal Equations</h3>

In [195]:
# Normal equations (X X^T) theta = X Y^T, with X laid out as (features, samples).
# Solving the linear system directly avoids forming an explicit inverse,
# which is less accurate when X X^T is poorly conditioned.
# theta2 is a (features, 1) column vector.
theta2 = np.linalg.solve(np.dot(X_train, X_train.T), np.dot(Y_train, X_train.T).T)

In [196]:
# Side-by-side look at the two estimates: gradient descent (row vector)
# first, normal equations (column vector) second
for params in (theta, theta2):
    print(params)

[[0.00845456 0.05328747 0.20729808 0.03067559]]
[[2.78784865]
 [0.04592847]
 [0.18642685]
 [0.00390952]]


In [197]:
#Testing
# theta2 is a column vector, so transpose it to predict one row of outputs
Y_pred2 = np.dot(theta2.T, X_test)
# Mean squared error; divide by the actual number of test targets rather
# than a hard-coded 50
err2 = (np.sum((Y_pred2 - Y_test)**2)) / Y_test.size
print(err2)

2.244136558294949


<h3>Linear Regression Using Library Function</h3>

In [200]:
# Using scikit-learn's LinearRegression
# Features as (features, samples) without the ones row: the estimator fits
# its own intercept. Transposed below because fit/score are given one row
# per sample.
X_train1 = np.array([train_set['TV'].values, train_set['radio'].values, train_set['newspaper'].values])
# Row of targets sized from the data instead of a hard-coded 150
Y_train1 = train_set['sales'].values.reshape((1, -1))
reg = LinearRegression().fit(X_train1.T, Y_train1.T)
# Keep the training-set score instead of discarding it; it is not printed so
# the cell output stays unchanged.
train_score = reg.score(X_train1.T, Y_train1.T)
print(reg.intercept_)
print(reg.coef_)
# Note: the normal equations and the scikit-learn library give the same optimal parameters.

[2.78784865]
[[0.04592847 0.18642685 0.00390952]]
