In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv('/home/kasat/startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
data = pd.get_dummies(data, ['State'])
target = data['Profit']
del data['Profit']
features = data
features.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [4]:
col_names = ['R&D Spend', 'Administration', 'Marketing Spend']
for col in col_names:
    data[col] = (data[col] - data[col].min())/(data[col].max() - data[col].min())

In [5]:
target = (target - target.min()) / (target.max() - target.min())
print('printing data')
print(data.head())

printing data
   R&D Spend  Administration  Marketing Spend  State_California  \
0   1.000000        0.651744         1.000000                 0   
1   0.983359        0.761972         0.940893                 1   
2   0.927985        0.379579         0.864664                 0   
3   0.873136        0.512998         0.812235                 0   
4   0.859438        0.305328         0.776136                 0   

   State_Florida  State_New York  
0              0               1  
1              0               0  
2              1               0  
3              0               1  
4              1               0  


In [6]:
def predict(features, theta):
    X = np.matrix(features)
    rows, cols = X.shape
    extra_ones = np.ones((rows,1))
    X = np.hstack((extra_ones, X))
    result = np.dot(X, theta)
    return result
    

In [7]:
def linear_regression(features, target, alpha, iteration):
    X = np.matrix(features.values)
    y = np.matrix(target).transpose()
    rows, cols = features.shape
    extra_ones = np.ones((rows,1))
    X = np.hstack((extra_ones, X))
    XT = X.transpose()
#     print('shape of x', X.shape)
#     print('shape of y', y.shape)
    beta = np.matrix(np.ones(cols + 1)).transpose()
#     print('shape of beta', beta.shape)
    for i in range(iteration):
        y_hat = np.dot(X, beta)
        error = y_hat - y
        gradient = np.dot(XT, error) / rows
        beta = beta - alpha * gradient
    
    return beta

In [8]:
def normal_equation(features, target):
    X = np.matrix(features.values)
    y = np.matrix(target).transpose()
    rows, cols = features.shape
    extra_ones = np.ones((rows,1))
    X = np.hstack((extra_ones, X))
    XT = X.transpose()
#     print('shape of x', X.shape)
#     print('shape of y', y.shape)
    XTX = np.dot(XT, X)
    XTX_inv = np.linalg.inv(XTX)
#     print('shape of xtx_inv', XTX_inv.shape)
    XTy = np.dot(XT, y)
    beta = np.dot(XTX_inv, XTy)
    return beta


In [9]:
def main():
    theta = linear_regression(features, target, 0.005, 2000)
    result1 = predict([[144372.41, 118671.85, 383199.62, 0, 0, 1],[142107.34,91391.77, 366168.42, 0, 1, 0]], theta)
    print('result using gradient descent')
    print(result1)
    
    theta2 = normal_equation(features, target)
    result = predict([[144372.41, 118671.85, 383199.62, 0, 0, 1],[142107.34, 91391.77, 366168.42, 0, 1, 0]], theta)
    print('result using normal equation')
    print(result)

#     print('theta1')
#     print(theta)
#     print('theta2')
#     print(theta2)

In [10]:
main()

result using gradient descent
[[214183.69141495]
 [202497.18477106]]
result using normal equation
[[214183.69141495]
 [202497.18477106]]
