# using gradient descent for the CCPP dataset

In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [6]:
def gd_cost(X, Y, coef):
    cost = 0
    N = len(Y)
    cost = np.sum((Y - (X.dot(coef))) **2)
    cost = cost/N
    return cost

In [7]:
# batch gradient descent 
def gd_step(X, Y, l_rate, coef):
    Y = np.reshape(Y, (len(Y), 1))
    N = len(Y)
    hox = X.dot(coef)
    loss = hox - Y
    # vector multiplication to calculate gradient:
    c = X.T
    grad = c.dot(loss)
    grad = grad*(2/N)
    
    # updated values of coefficients:
    coef = coef - l_rate*(grad)
    return coef

In [8]:
def gd_runner(X, Y, learning_rate=0.02, num_iter=1):
    #initialising to random values in interval - [0,1)
    coef = np.random.rand(5,1)
    print("cost before gd = " + str(gd_cost(X, Y, coef)))
    for i in range(num_iter):
        coef = gd_step(X, Y, learning_rate, coef)
    print("cost = " + str(gd_cost(X, Y, coef)))
    return coef

In [9]:
def predict(X_test, coef):
    Y_pred = X_test.dot(coef)
    print(Y_pred.shape)
    return Y_pred    

In [14]:
# LOADING DATA 
train_data = np.genfromtxt('/Users/prachigarg/Documents/MLninja/ML/GradientDescent/training_ccpp.csv', delimiter = ',')
X_test = np.genfromtxt('/Users/prachigarg/Documents/MLninja/ML/GradientDescent/test_ccpp.csv', delimiter = ',')

In [15]:
X_train = train_data[:,0:4]
Y_train = train_data[:,4]
print(X_train.shape)
print(len(Y_train))

(7176, 4)
7176


In [16]:
# PREPROCESSING 
Scaler = preprocessing.StandardScaler().fit(X_train)
X_train = Scaler.transform(X_train)
X_test = Scaler.transform(X_test)

In [17]:
num_iter = 70
rate = 0.05
arr = np.full((len(Y_train), 1), 1)
X = np.append(arr, X_train, axis=1)
arr = np.full((len(X_test), 1), 1)
X_test1 = np.append(arr, X_test, axis=1)


coef = gd_runner(X, Y_train, rate, num_iter)
Y_pred = predict(X_test1, coef)
print(Y_pred.shape)
for i in Y_pred[:10]:
    print(i)

cost before gd = 1482855558.6301448
cost = 3999483.5974098127
(2392, 1)
(2392, 1)
[470.5642738]
[471.75702859]
[432.26085214]
[458.7555672]
[466.11437971]
[447.95918654]
[478.01723864]
[444.22218918]
[481.74319573]
[438.85938815]


In [18]:
np.savetxt('/Users/prachigarg/Documents/results2.csv', Y_pred)

In [19]:
from sklearn import linear_model

In [20]:
clf = linear_model.SGDRegressor(alpha=0.05, max_iter=70, n_iter=1000)
clf.fit(X_train, Y_train)
Y_pred2 = clf.predict(X_test)

for i in Y_pred2[:10]:
    print(i)



470.2018289548771
471.34660832420724
433.61978609314224
457.76884473518317
465.1494397559101
448.3198059374654
477.97131607100266
445.7259935459965
482.2105009174596
439.8011642243105


In [21]:
clf.coef_

array([-12.07379503,  -4.44974782,   1.0004359 ,  -1.33569224])

In [289]:
# max score i got from this is 0.928
# this is much faster as compared to the one where vectorised implementation hasn't been used