In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from collections import defaultdict

In [149]:
x_train = pd.read_csv('train.csv')

In [150]:
x_train.shape

(1460, 81)

In [151]:
del x_train['Id']

In [152]:
y_train = x_train['SalePrice']

In [153]:
del x_train['SalePrice']

In [154]:
y_train = y_train.to_numpy()

In [155]:
y_train.shape

(1460,)

In [156]:
def differentiate_features(x_train):
    """Split the columns of ``x_train`` into discrete and continuous features.

    A column is continuous when pandas reports a numeric dtype for it (any
    integer or float width; pandas also counts booleans as numeric). Every
    other column is discrete.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(discrete_features, continuous_features)``, each in column order.
    """
    discrete_features = []
    continuous_features = []
    for name, dtype in zip(x_train, x_train.dtypes):
        # An exact int64/float64 comparison would push int32/float32/unsigned
        # columns into the discrete list, where string handling then fails.
        if pd.api.types.is_numeric_dtype(dtype):
            continuous_features.append(name)
        else:
            discrete_features.append(name)
    return discrete_features, continuous_features

In [157]:
discrete_features,continuous_features = differentiate_features(x_train)

In [158]:
def generate_index_map(x_train,discrete_features):
    """Build a per-column lookup from lower-cased category value to integer code.

    Codes start at 1 and are handed out in order of first appearance within
    each column. Values that differ only in case share one code. Missing
    values and empty strings receive no code.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame holding the categorical columns.
    discrete_features : list
        Column names to index; selection goes through ``DataFrame.filter``.

    Returns
    -------
    dict
        ``{column: {lower_cased_value: code}}``.
    """
    index_map = {}
    for column in x_train.filter(discrete_features):
        codes = {}
        for raw in x_train[column].fillna('').unique():
            # '' is the placeholder for missing entries and never gets a code.
            if raw == '':
                continue
            # setdefault leaves an existing code untouched for case variants.
            codes.setdefault(raw.lower(), len(codes) + 1)
        index_map[column] = codes
    return index_map

In [159]:
index_map = generate_index_map(x_train,discrete_features)

In [160]:
def generate_feature_mean(x_train,continuous_features,discrete_features,index_map):
    """Compute one representative "mean" per feature.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame providing the continuous columns.
    continuous_features : list
        Columns whose mean is taken with ``Series.mean``.
    discrete_features : list
        Columns whose mean is the average of the codes stored in ``index_map``.
    index_map : dict
        ``{column: {value: code}}`` mapping for the discrete columns.

    Returns
    -------
    dict
        ``{column: mean}``, continuous columns first, then discrete ones.
    """
    # Continuous columns: the column mean as computed by pandas.
    means = {name: x_train[name].mean() for name in continuous_features}
    # Discrete columns: average of the assigned codes, not weighted by how
    # often each category occurs in the data.
    for name in discrete_features:
        codes = index_map[name].values()
        means[name] = sum(codes) / len(codes)
    return means

In [161]:
feat_mean_dict = generate_feature_mean(x_train,continuous_features,discrete_features,index_map)

In [162]:
def remove_noise_cont_feature(data,continuous_features,feat_mean_dict):
    """Fill missing values in the continuous columns with their stored means.

    All columns are filled by a single ``DataFrame.fillna`` call, so the frame
    is copied once instead of once per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute; it is not modified.
    continuous_features : list
        Columns to fill.
    feat_mean_dict : dict
        ``{column: fill_value}``; must contain every listed column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns imputed.

    Raises
    ------
    KeyError
        If a listed column has no entry in ``feat_mean_dict``.
    """
    fill_values = {col: feat_mean_dict[col] for col in continuous_features}
    if not fill_values:
        # Nothing to impute; still return a fresh frame so the caller never
        # receives an alias of its own input.
        return data.copy()
    return data.fillna(value=fill_values)

In [163]:
x_train = remove_noise_cont_feature(x_train,continuous_features,feat_mean_dict)

In [164]:
def remove_noise_disc_feature(data,discrete_features,feat_mean_dict,index_map):
    """Replace categorical values with their integer codes.

    Each value is lower-cased and looked up in ``index_map[col]``. Missing
    values and categories without a code are replaced by
    ``feat_mean_dict[col]``. The input frame is left untouched; the encoded
    result is returned as a new frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw (string) categorical columns.
    discrete_features : list
        Columns to encode.
    feat_mean_dict : dict
        ``{column: fallback_value}`` used for missing/unknown categories.
    index_map : dict
        ``{column: {lower_cased_value: code}}``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the listed columns encoded.

    Raises
    ------
    KeyError
        If a listed column is missing from ``index_map`` or ``feat_mean_dict``.
    AttributeError
        If a listed column holds non-string values (no ``.lower()``).
    """
    # Work on a copy so the caller's frame is not overwritten.
    encoded = data.copy()
    for col in discrete_features:
        codes = index_map[col]
        fallback = feat_mean_dict[col]
        # fillna('') turns missing entries into a key that is never in `codes`,
        # so they take the fallback value like any unseen category.
        encoded[col] = encoded[col].fillna('').apply(
            lambda value: codes.get(value.lower(), fallback)
        )
    return encoded

In [165]:
x_train = remove_noise_disc_feature(x_train,discrete_features,feat_mean_dict,index_map)

In [166]:
def get_feature_mean(x_train,feat_mean_dict):
    """Return the per-feature means as an array aligned with ``x_train.columns``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame whose column order defines the order of the result.
    feat_mean_dict : dict
        ``{column: mean}``; must contain every column of ``x_train``.

    Returns
    -------
    numpy.ndarray
        One entry per column of ``x_train``, in column order.

    Raises
    ------
    KeyError
        If a column of ``x_train`` has no entry in ``feat_mean_dict``.
    """
    # A missing column is a real error: let the KeyError surface directly
    # instead of retrying the same lookup in an exception handler.
    return np.array([feat_mean_dict[col] for col in x_train.columns])

In [167]:
feature_mean = get_feature_mean(x_train,feat_mean_dict)

In [168]:
x = x_train.to_numpy()

In [169]:
xstd = x.std(0)
xstd

array([4.22860820e+01, 7.98035153e-01, 2.20164789e+01, 9.97784611e+03,
       6.39742160e-02, 1.24790592e-01, 5.82096688e-01, 6.41790325e-01,
       2.61622319e-02, 1.01001250e+00, 2.76137848e-01, 5.83578349e+00,
       1.20819277e+00, 3.74062912e-01, 1.06774188e+00, 1.32430032e+00,
       1.38252284e+00, 1.11241818e+00, 3.01925588e+01, 2.06383353e+01,
       6.11693689e-01, 5.39417807e-01, 2.50623645e+00, 2.66143012e+00,
       6.30364387e-01, 1.80507263e+02, 5.74565686e-01, 4.40071025e-01,
       8.30892497e-01, 7.28083400e-01, 4.63862373e-01, 1.12950294e+00,
       1.47813595e+00, 4.55941866e+02, 1.07477212e+00, 1.61264017e+02,
       4.41715605e+02, 4.38555057e+02, 3.06073965e-01, 9.59172174e-01,
       2.46646679e-01, 5.32355275e-01, 3.86455322e+02, 4.36378914e+02,
       4.86064268e+01, 5.25300394e+02, 5.18732867e-01, 2.38670868e-01,
       5.50727099e-01, 5.02713131e-01, 8.15498620e-01, 2.20262727e-01,
       7.06129505e-01, 1.62483655e+00, 7.16968556e-01, 6.44445572e-01,
      

In [170]:
# Standardise every feature column: subtract feature_mean, divide by the column std.
# feature_mean comes from feat_mean_dict, which for encoded categorical columns is the
# plain average of the code values rather than the column's empirical mean, so those
# columns are not necessarily centred on zero afterwards.
# There is no guard against zero entries in xstd (a constant column would divide by zero).
x = (x - feature_mean)/xstd

In [171]:
x.shape

(1460, 79)

In [172]:
# Prepend a column of ones so that the first weight acts as the intercept term.
x = np.concatenate([np.ones((len(x),1)),x],axis=1)

In [173]:
m,n = x.shape
m,n

(1460, 80)

In [174]:
# Keep the target's mean and standard deviation on the original scale; they are
# reused later to put y_test on the same scale as the standardised y_train.
ymean = y_train.mean()
ystd = y_train.std()
ymean,ystd

(180921.19589041095, 79415.29188606751)

In [175]:
y_train = (y_train-y_train.mean())/y_train.std()

In [176]:
# half mean squared error: sum of squared residuals / (2 * number of predictions)
def mse(preds,actual):
    """Return half of the mean squared error between ``preds`` and ``actual``.

    The extra factor of 2 in the denominator means the value is MSE / 2.

    Parameters
    ----------
    preds : numpy.ndarray
        Predicted values.
    actual : numpy.ndarray
        Target values, broadcastable against ``preds``.

    Returns
    -------
    float
        ``sum((preds - actual) ** 2) / (2 * len(preds))``.
    """
    residual = preds - actual
    squared_total = (residual ** 2).sum()
    return squared_total / (2 * len(preds))

In [177]:
def derivative_w(y,y_train,x):
    """Gradient of ``sum((y - y_train) ** 2) / (2 * len(x))`` w.r.t. the weights.

    Assumes ``y`` was produced as ``x @ w``. The average is taken over the
    rows of the ``x`` passed in, so the function gives the right scale for any
    batch and does not depend on a notebook-level row count.

    Parameters
    ----------
    y : numpy.ndarray
        Predictions, one per row of ``x``.
    y_train : numpy.ndarray
        Targets, same shape as ``y``.
    x : numpy.ndarray
        Design matrix with one row per sample.

    Returns
    -------
    numpy.ndarray
        Gradient with one entry per column of ``x``.
    """
    n_samples = len(x)
    dw = np.matmul((y - y_train), x) / n_samples
    return dw

In [178]:
# Gradient-descent settings and initial weights.
epochs = 50  # upper bound on the number of passes of the training loop
# n weights drawn uniformly from [0, 1); no seed is set in the visible cells,
# so the starting point (and the printed loss) differs from run to run.
w = np.random.random((n))
lr = 1e-1  # learning rate: step size applied to the gradient

In [179]:
y_train

array([ 0.34727322,  0.00728832,  0.53615372, ...,  1.07761115,
       -0.48852299, -0.42084081])

In [180]:
# Batch gradient descent on the weight vector w.
loss=0
for epoch in range(epochs):
    y = np.matmul(x,w)  # predictions for every training row with the current weights
    loss = mse(y,y_train)
    print('Epoch ',epoch,':-',loss)
    dw = derivative_w(y,y_train,x)
    print(dw.shape)
    w = w - lr*dw  # gradient step scaled by the learning rate
    # The unconditional break below ends the loop after the first pass, so only
    # one weight update is applied no matter what `epochs` is set to.
    # NOTE(review): looks like leftover debugging - confirm before relying on w.
    break

Epoch  0 :- 1815.1500663987924
(80,)


In [101]:
# NOTE(review): the execution counts from this cell onward (101-114) are lower than
# those of the training cells above (149-180), so the outputs shown below were
# produced before the training cells last ran. Re-run top to bottom before trusting them.
x_test = pd.read_csv('test.csv')

In [102]:
# NOTE(review): the evaluation target is read from sample_submission.csv - confirm this
# file holds real sale prices rather than placeholder values before interpreting the error.
y_test = pd.read_csv('sample_submission.csv')

In [103]:
y_test = y_test['SalePrice'].to_numpy()

In [104]:
del x_test['Id']

In [105]:
x_test = remove_noise_cont_feature(x_test,continuous_features,feat_mean_dict)

In [106]:
x_test = remove_noise_disc_feature(x_test,discrete_features,feat_mean_dict,index_map)

In [107]:
x_test = x_test.to_numpy()

In [108]:
# Scale the test features with the statistics derived from the training data
# (feature_mean and xstd), not with statistics of the test set itself.
x_test = (x_test - feature_mean)/xstd

In [109]:
x_test = np.concatenate([np.ones((len(x_test),1)),x_test],axis=1)

In [110]:
preds = np.matmul(x_test,w)

In [111]:
# Put the test target on the same scale as y_train, using the training mean and std.
y_test = (y_test-ymean)/ystd

In [112]:
mse(preds,y_test)

4.870142399367679e+193

In [113]:
preds

array([-9.89224886e+96, -9.97286431e+96, -1.02085759e+97, ...,
       -1.01339575e+97, -9.74616783e+96, -1.00393682e+97])

In [114]:
y_test

array([-0.14662344,  0.08609423,  0.03352613, ...,  0.48229033,
        0.05040696,  0.08588611])