In [29]:
import numpy as np
from sklearn import preprocessing
import pandas as pd

def add_features(x):
    """Expand a feature matrix with polynomial terms up to degree 3 (plus x0**4).

    The returned columns are, in order:
      1. the original ``n`` columns of ``x``;
      2. every degree-2 product ``x_i * x_j`` with ``i <= j``;
      3. every degree-3 product ``x_i * x_j * x_k`` with ``i <= j <= k``;
      4. ``x_0 ** 4``, emitted twice.

    Parameters
    ----------
    x : array-like of shape (rows, n)
        Raw feature matrix; must have at least one column.

    Returns
    -------
    numpy.ndarray of shape (rows, n + n(n+1)/2 + n(n+1)(n+2)/6 + 2)
    """
    # Work on a plain ndarray: building the result from a list of columns
    # avoids inserting hundreds of columns one by one into a DataFrame.
    x = np.asarray(x)
    n = x.shape[1]

    columns = [x[:, i] for i in range(n)]

    # power 2 terms
    columns += [x[:, i] * x[:, j] for i in range(n) for j in range(i, n)]

    # power 3 terms
    columns += [
        x[:, i] * x[:, j] * x[:, k]
        for i in range(n)
        for j in range(i, n)
        for k in range(j, n)
    ]

    # NOTE(review): the original code appends x0**4 twice; the second copy is
    # probably a copy-paste slip (a different column may have been intended).
    # It is kept so the number and order of output columns stay unchanged.
    fourth_power = x[:, 0] ** 4
    columns += [fourth_power, fourth_power]

    return np.column_stack(columns)

def step_gradient(x, y, learning_rate, m):
    """Perform one gradient-descent step for mean-squared-error linear regression.

    Parameters
    ----------
    x : array-like of shape (rows, features)
        Design matrix.
    y : array-like of shape (rows, 1) or (rows,)
        Targets; only the first column is used.
    learning_rate : float
        Step size applied to the gradient.
    m : numpy.ndarray of shape (features,)
        Current weights. Updated IN PLACE and also returned.

    Returns
    -------
    numpy.ndarray
        The same ``m`` object holding the updated weights.
    """
    x = np.asarray(x, dtype=float)
    num_of_data_points = x.shape[0]

    # With no rows there is no gradient; leave the weights untouched
    # (and avoid dividing by zero below).
    if num_of_data_points == 0:
        return m

    targets = np.asarray(y, dtype=float).reshape(num_of_data_points, -1)[:, 0]
    residuals = targets - x @ m

    # d/dm of mean((y - x.m)^2)  =  -2/N * x^T (y - x.m)
    m_slope = (-2.0 / num_of_data_points) * (x.T @ residuals)

    # Slice assignment keeps the in-place update callers may rely on.
    m[:] = m - learning_rate * m_slope
    return m

def cost(x, y, m):
    """Return the mean squared error of the linear model ``x @ m`` against ``y``.

    Parameters
    ----------
    x : array-like of shape (rows, features)
        Design matrix.
    y : array-like of shape (rows, 1) or (rows,)
        Targets; only the first column is used.
    m : array-like of shape (features,)
        Model weights.

    Returns
    -------
    float
        ``mean((y - x @ m) ** 2)``; ``0`` when ``x`` has no rows.
    """
    x = np.asarray(x, dtype=float)
    num_of_data_points = x.shape[0]

    # An empty data set contributes no error (matches the loop-based original).
    if num_of_data_points == 0:
        return 0

    targets = np.asarray(y, dtype=float).reshape(num_of_data_points, -1)[:, 0]
    residuals = targets - x @ np.asarray(m, dtype=float)
    return float(residuals @ residuals) / num_of_data_points


def gd(x, y, learning_rate, num_iterations, verbose=True):
    """Fit linear-regression weights with batch gradient descent.

    Parameters
    ----------
    x : numpy.ndarray of shape (rows, features)
        Design matrix.
    y : numpy.ndarray of shape (rows, 1)
        Targets.
    learning_rate : float
        Step size passed to ``step_gradient``.
    num_iterations : int
        Number of gradient steps to take.
    verbose : bool, default True
        When True, print the iteration index and current cost after every
        step (the original behaviour). Pass False to skip both the printing
        and the extra per-iteration cost evaluation.

    Returns
    -------
    (m, cst) : tuple
        Final weights and the cost evaluated at those weights.
    """
    # Start from all-zero weights, one per feature column.
    m = np.zeros(x.shape[1])

    for i in range(num_iterations):
        m = step_gradient(x, y, learning_rate, m)
        if verbose:
            print(i, "cost: ", cost(x, y, m))

    cst = cost(x, y, m)
    return m, cst

def score(Y_true, Y_pred):
    """Return the coefficient of determination (R^2) of predictions.

    Computed as ``1 - u / v`` where ``u`` is the residual sum of squares and
    ``v`` is the total sum of squares of ``Y_true`` around its mean.

    Parameters
    ----------
    Y_true : array-like
        Observed targets.
    Y_pred : array-like
        Predicted targets with the same number of elements as ``Y_true``.

    Returns
    -------
    float
        R^2 score. The result is nan/inf when ``Y_true`` is constant (v == 0).
    """
    # Flatten both inputs: subtracting an (n, 1) array from an (n,) array
    # would otherwise broadcast to an (n, n) matrix and silently give a
    # wrong score.
    y_true = np.asarray(Y_true, dtype=float).ravel()
    y_pred = np.asarray(Y_pred, dtype=float).ravel()

    u = ((y_true - y_pred) ** 2).sum()
    v = ((y_true - y_true.mean()) ** 2).sum()
    return 1 - (u / v)


def get_adjusted_x(x, scaler=None, fit=True):
    """Build the design matrix: polynomial features, standardization, bias column.

    Parameters
    ----------
    x : numpy.ndarray of shape (rows, n)
        Raw feature matrix.
    scaler : object with ``fit`` / ``transform`` methods, optional
        Scaler to use. When omitted, a fresh ``preprocessing.StandardScaler``
        is created (the original behaviour).
    fit : bool, default True
        Whether to fit ``scaler`` on this data before transforming. To scale
        held-out data with the training statistics, pass the scaler that was
        fitted on the training set together with ``fit=False``.

    Returns
    -------
    numpy.ndarray of shape (rows, features + 1)
        Scaled polynomial features with a trailing column of ones.
    """
    # add features
    x = add_features(x)

    # feature scaling
    if scaler is None:
        scaler = preprocessing.StandardScaler()
    if fit:
        scaler.fit(x)
    x = scaler.transform(x)

    # append a column of ones as the last column so the final weight acts as
    # the intercept c
    x = np.append(x, np.ones((x.shape[0], 1)), axis=1)

    return x
    

def predict(x, m, output_path="output_ccpp.csv"):
    """Compute linear-model predictions ``x @ m`` and optionally save them.

    Parameters
    ----------
    x : array-like of shape (rows, features)
        Design matrix (already including the bias column, if any).
    m : array-like of shape (features,)
        Model weights.
    output_path : str or None, default "output_ccpp.csv"
        File the predictions are written to, one value per line with five
        decimals. The default keeps the original behaviour of writing
        ``output_ccpp.csv`` on every call; pass ``None`` to skip writing.

    Returns
    -------
    numpy.ndarray of shape (rows, 1)
        Predicted values as a column vector.
    """
    y_predict = np.asarray(x, dtype=float) @ np.asarray(m, dtype=float)

    if output_path is not None:
        np.savetxt(output_path, y_predict, fmt="%.5f")

    return y_predict.reshape(-1, 1)
    

In [None]:
# actual output
def run():
    """Train on the CCPP training file and predict for the CCPP test file.

    Reads ``training_ccpp_x_y_train.csv`` (features followed by the target in
    the last column), fits the weights with gradient descent, prints the
    final training cost and R^2 score, then predicts for
    ``test_ccpp_x_test.csv``. ``predict`` writes its predictions to
    ``output_ccpp.csv`` on each call, so after this function returns that
    file holds the test-set predictions (the training-set call is made first
    and overwritten).
    """
    file_path_train = "training_ccpp_x_y_train.csv"
    learning_rate = 0.04
    num_iterations = 1000

    def with_bias(features):
        # Trailing column of ones so the last weight acts as the intercept.
        return np.append(features, np.ones((features.shape[0], 1)), axis=1)

    data = np.loadtxt(file_path_train, delimiter=",")

    x_train = data[:, 0:-1]
    y_train = data[:, -1:]

    # Fit the scaler on the training features only and reuse it for the test
    # features below. Fitting a second scaler on the test set would
    # standardize it with different means/variances than the ones the weights
    # were learned under.
    scaler = preprocessing.StandardScaler()
    x_train_adjusted = with_bias(scaler.fit_transform(add_features(x_train)))

    m, cst = gd(x_train_adjusted, y_train, learning_rate, num_iterations)

    y_train_predicted = predict(x_train_adjusted, m)
    s1 = score(y_train, y_train_predicted)

    print("cost for actual training data: ", cst)
    print("score on actual training data: ", s1)

    x_test = np.loadtxt("test_ccpp_x_test.csv", delimiter=",")
    x_test_adjusted = with_bias(scaler.transform(add_features(x_test)))

    # Called for its side effect: this writes the test predictions to disk.
    predict(x_test_adjusted, m)
    
run()

0 cost:  174999.16263014532
1 cost:  148094.64435558778
2 cost:  125331.41190795664
3 cost:  106070.62996606906
4 cost:  89772.44548915455
5 cost:  75980.51084942077
6 cost:  64308.97541850761
7 cost:  54431.532278535924
8 cost:  46072.186212872024
9 cost:  38997.46648225483
10 cost:  33009.854155463916
11 cost:  27942.231711669814
12 cost:  23653.193945496096
13 cost:  20023.08515701055
14 cost:  16950.64919301437
15 cost:  14350.196912212152
16 cost:  12149.210706272454
17 cost:  10286.31833029127
18 cost:  8709.578892976517
19 cost:  7375.032766696638
20 cost:  6245.4746779252055
21 cost:  5289.415558754626
22 cost:  4480.204070088262
23 cost:  3795.2832049812096
24 cost:  3215.561178354745
25 cost:  2724.8790173618636
26 cost:  2309.559977588393
27 cost:  1958.0282017596287
28 cost:  1660.485975050097
29 cost:  1408.6405694945238
30 cost:  1195.473055749239
31 cost:  1015.042632669467
32 cost:  862.3210168607468
33 cost:  733.0522734068318
34 cost:  623.6341789192886
35 cost:  531.

281 cost:  18.79457619244196
282 cost:  18.793522789340667
283 cost:  18.792481696187032
284 cost:  18.791452745098308
285 cost:  18.79043577054279
286 cost:  18.789430609306496
287 cost:  18.7884371004603
288 cost:  18.78745508532795
289 cost:  18.786484407454015
290 cost:  18.785524912573337
291 cost:  18.784576448578658
292 cost:  18.78363886549188
293 cost:  18.78271201543273
294 cost:  18.781795752590135
295 cost:  18.780889933192395
296 cost:  18.779994415478466
297 cost:  18.779109059670134
298 cost:  18.778233727943583
299 cost:  18.777368284401852
300 cost:  18.77651259504793
301 cost:  18.775666527757945
302 cost:  18.774829952254326
303 cost:  18.774002740080387
304 cost:  18.77318476457455
305 cost:  18.772375900844153
306 cost:  18.771576025742178
307 cost:  18.770785017841053
308 cost:  18.770002757409394
309 cost:  18.76922912638751
310 cost:  18.768464008364496
311 cost:  18.76770728855432
312 cost:  18.766958853773403
313 cost:  18.7662185924178
314 cost:  18.765486394

559 cost:  18.691306965152368
560 cost:  18.69116978275531
561 cost:  18.6910330418818
562 cost:  18.69089673865175
563 cost:  18.690760869232882
564 cost:  18.690625429839926
565 cost:  18.690490416733915
566 cost:  18.69035582622226
567 cost:  18.69022165465711
568 cost:  18.69008789843548
569 cost:  18.689954553998298
570 cost:  18.68982161783001
571 cost:  18.68968908645781
572 cost:  18.689556956451362
573 cost:  18.689425224421655
574 cost:  18.689293887021012
575 cost:  18.689162940942545
576 cost:  18.689032382919027
577 cost:  18.688902209723192
578 cost:  18.688772418166142
579 cost:  18.688643005097948
580 cost:  18.688513967406255
581 cost:  18.68838530201652
582 cost:  18.688257005890726
583 cost:  18.688129076027543
584 cost:  18.68800150946131
585 cost:  18.68787430326201
586 cost:  18.687747454534676
587 cost:  18.687620960418847
588 cost:  18.68749481808794
589 cost:  18.68736902474909
590 cost:  18.687243577642445
591 cost:  18.687118474041153
592 cost:  18.6869937112

In [None]:
# just for testing data by splitting
from sklearn.model_selection import train_test_split

def test_data(random_state=None):
    """Evaluate the model on a 75/25 split of the Boston training file.

    Reads ``training_boston_x_y_train.csv`` (features followed by the target
    in the last column), splits it into train and test parts, fits the
    weights on the train part and prints the final cost plus the R^2 score on
    both parts.

    Parameters
    ----------
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original
        unseeded (non-reproducible) split; pass an int for repeatable results.
    """
    file_path_train = "training_boston_x_y_train.csv"
    learning_rate = 0.1
    num_iterations = 1000

    def with_bias(features):
        # Trailing column of ones so the last weight acts as the intercept.
        return np.append(features, np.ones((features.shape[0], 1)), axis=1)

    data = np.loadtxt(file_path_train, delimiter=",")

    x = data[:, 0:-1]
    y = data[:, -1:]

    # Polynomial expansion is row-wise, so it can safely happen before the
    # split.
    features = add_features(x)

    features_train, features_test, y_train, y_test = train_test_split(
        features, y, test_size=0.25, random_state=random_state
    )

    # Fit the scaler on the training rows only; fitting on all rows before
    # splitting leaks test-set statistics into training.
    scaler = preprocessing.StandardScaler()
    x_train = with_bias(scaler.fit_transform(features_train))
    x_test = with_bias(scaler.transform(features_test))

    m, cst = gd(x_train, y_train, learning_rate, num_iterations)

    y_train_predicted = predict(x_train, m)
    y_test_predicted = predict(x_test, m)

    s1 = score(y_train, y_train_predicted)
    s2 = score(y_test, y_test_predicted)

    print("(spliting traning data into train and test) - cost: ", cst)
    print("(spliting traning data into train and test) - score on training data: ", s1)
    print("(spliting traning data into train and test) - score on testing data: ", s2)

test_data()

In [28]:
# file_path_train = "training_ccpp_x_y_train.csv"
# data = np.loadtxt(file_path_train, delimiter=",")
    
# x = data[:,0:-1]
# y_train = data[:, -1:]
    
# df = pd.DataFrame(x)


# n = x.shape[1]
# curr = n


# #   power 2 terms
# for i in range(n):
#     for j in range(i, n):
#         df[curr] = df[i] * df[j]
#         curr += 1

# # power 3 terms
# for i in range(n):
#     for j in range(i, n):
#         for k in range(j, n):
#             df[curr] = df[i] * df[j] * df[k]
#             curr += 1

# df[curr] = df[0] ** 4
# df[curr + 1] = df[0] ** 4


# df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,8.58,38.38,1021.03,84.37,73.6164,329.3004,8760.4374,723.8946,1473.0244,39187.1314,...,124279.068628,4.001124e+07,3.306218e+06,273200.235022,1.064426e+09,8.795592e+07,7.267995e+06,600570.709453,5.419374e+03,5.419374e+03
1,21.79,58.20,1017.21,66.74,474.8041,1268.1780,22165.0059,1454.2646,3387.2400,59201.6220,...,226064.397600,6.022048e+07,3.951116e+06,259236.046320,1.052524e+09,6.905696e+07,4.530885e+06,297275.150024,2.254389e+05,2.254389e+05
2,16.64,48.92,1011.55,78.76,276.8896,814.0288,16832.1920,1310.5664,2393.1664,49485.0260,...,188485.785664,5.005658e+07,3.897441e+06,303457.491392,1.035052e+09,8.058986e+07,6.274784e+06,488559.117376,7.666785e+04,7.666785e+04
3,31.38,71.32,1009.17,60.42,984.7044,2238.0216,31667.7546,1895.9796,5086.5424,71974.0044,...,307328.891808,7.263401e+07,4.348669e+06,260359.108848,1.027763e+09,6.153318e+07,3.684052e+06,220567.826088,9.696428e+05,9.696428e+05
4,9.20,40.03,1017.05,92.46,84.6400,368.2760,9356.8600,850.6320,1602.4009,40712.5115,...,148157.987214,4.140666e+07,3.764279e+06,342210.529548,1.052027e+09,9.563976e+07,8.694610e+06,790426.818936,7.163930e+03,7.163930e+03
5,26.82,69.23,1013.28,50.86,719.3124,1856.7486,27176.1696,1364.0652,4792.7929,70149.3744,...,243761.446894,7.108096e+07,3.567797e+06,179079.982508,1.040371e+09,5.221981e+07,2.621092e+06,131561.576056,5.174103e+05,5.174103e+05
6,9.48,40.80,1023.82,78.98,89.8704,386.7840,9705.8136,748.7304,1664.6400,41771.8560,...,131473.267200,4.276686e+07,3.299141e+06,254503.888320,1.073176e+09,8.278742e+07,6.386426e+06,492664.634792,8.076689e+03,8.076689e+03
7,9.41,41.54,1019.48,82.19,88.5481,390.8914,9593.3068,773.4079,1725.5716,42349.1992,...,141824.729804,4.317416e+07,3.480681e+06,280610.845994,1.059586e+09,8.542331e+07,6.886787e+06,555209.567459,7.840766e+03,7.840766e+03
8,31.03,69.59,1007.77,63.96,962.8609,2159.3777,31271.1031,1984.6788,4842.7681,70130.7143,...,309743.447676,7.067563e+07,4.485560e+06,284684.450544,1.023492e+09,6.495780e+07,4.122668e+06,261652.787136,9.271011e+05,9.271011e+05
9,18.24,59.15,1012.00,84.40,332.6976,1078.8960,18458.8800,1539.4560,3498.7225,59859.8000,...,295292.179000,6.057812e+07,5.052167e+06,421346.744000,1.036434e+09,8.643775e+07,7.208840e+06,601211.584000,1.106877e+05,1.106877e+05
