In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the training table; the path is relative to the notebook's working directory.
dataset = pd.read_csv('train.csv')

In [5]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Feature columns: every column except the first (Id in the preview above)
# and the last (SalePrice in the preview above).
# NOTE: the name x_train is rebound further down to the normalised array
# returned by get_normalized.
x_train = dataset.iloc[:,1:-1]

In [7]:
# Target: the last column of the table (SalePrice in the preview) as a NumPy array.
y = dataset.iloc[:,-1].values

In [8]:
def differentiate_features(x_train):
    """Split the columns of a DataFrame into discrete and continuous features.

    A column counts as continuous when its dtype is numeric (integer or
    floating point of any width); every other column (strings/objects,
    booleans, categoricals, ...) counts as discrete.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature table to inspect.

    Returns
    -------
    tuple of (list, list)
        ``(discrete_features, continuous_features)``: column labels, each list
        in the order the columns appear in ``x_train``.
    """
    discrete_features = []
    continuous_features = []
    for column, dtype in zip(x_train.columns, x_train.dtypes):
        # Accept every numeric width (int32, float32, unsigned, nullable
        # ints, ...), not only int64/float64. Booleans stay discrete, exactly
        # as they did under the original exact-type comparison.
        is_numeric = pd.api.types.is_numeric_dtype(dtype)
        is_bool = pd.api.types.is_bool_dtype(dtype)
        if is_numeric and not is_bool:
            continuous_features.append(column)
        else:
            discrete_features.append(column)
    return discrete_features, continuous_features

In [9]:
# Column labels of the feature table, split by dtype into discrete and continuous.
discrete_features,continuous_features = differentiate_features(x_train)

In [10]:
def get_imputer(x_continuous):
    """Fit a column-wise mean imputer on the continuous feature matrix.

    ``sklearn.preprocessing.Imputer`` was deprecated in scikit-learn 0.20 and
    removed in 0.22, so ``sklearn.impute.SimpleImputer`` is used whenever it
    is available. It always imputes per column, which is what ``axis=0``
    meant for the legacy class. On older scikit-learn releases the legacy
    class imported at the top of the notebook is used instead.

    Parameters
    ----------
    x_continuous : array-like of shape (n_samples, n_features)
        Numeric data, with missing entries encoded as ``np.nan``.

    Returns
    -------
    imputer
        A fitted imputer exposing ``transform``.
    """
    try:
        from sklearn.impute import SimpleImputer
    except ImportError:
        # scikit-learn < 0.20: only the legacy class exists.
        imputer = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0)
    else:
        imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
    imputer.fit(x_continuous)
    return imputer

In [11]:
# Raw values of the continuous columns, used here to fit the mean imputer.
x_continuous = dataset.filter(continuous_features).values
imputer = get_imputer(x_continuous)



In [12]:
def get_data_from_imputer(imputer,x_continuous):
    """Apply an already-fitted imputer to the continuous features.

    Parameters
    ----------
    imputer : object
        Fitted imputer exposing ``transform``.
    x_continuous : array-like
        Continuous feature data to run through the imputer.

    Returns
    -------
    Whatever ``imputer.transform`` returns for ``x_continuous``.
    """
    return imputer.transform(x_continuous)

In [13]:
# Run the continuous matrix through the fitted imputer.
x_continuous = get_data_from_imputer(imputer,x_continuous)

In [14]:
x_continuous.shape

(1460, 36)

In [15]:
def get_label_encoder(discrete_features,x_discrete):
    """Fit one LabelEncoder per discrete column.

    Side effect: missing values in each listed column of ``x_discrete`` are
    replaced by the empty string *in place*.

    Every encoder is fitted on the column's unique string values plus ``''``,
    so the empty string is always a known class even if the column had no
    missing values.

    Parameters
    ----------
    discrete_features : iterable
        Labels of the columns to encode.
    x_discrete : pandas.DataFrame
        Frame holding those columns; modified in place (see above).

    Returns
    -------
    dict
        Maps each column label to its fitted ``LabelEncoder``.
    """
    encoders = {}
    for name in discrete_features:
        x_discrete[name] = x_discrete[name].fillna('')
        classes = x_discrete[name].astype(str).unique()
        if '' not in classes:
            classes = np.append(classes, [''])
        encoder = LabelEncoder()
        encoder.fit(classes)
        encoders[name] = encoder
    return encoders

In [16]:
# Sub-frame holding only the discrete columns.
# NOTE: get_label_encoder fills missing values in this frame in place.
x_discrete = dataset.filter(discrete_features)

In [17]:
# One fitted LabelEncoder per discrete column, keyed by column label.
label_encoder_dict = get_label_encoder(discrete_features,x_discrete)

In [18]:
def get_discrete_encoder(label_encoder_dict,x_discrete):
    """Label-encode the discrete columns of a DataFrame.

    The input frame is left untouched: encoding is written to a copy, so the
    caller's data is not silently overwritten.

    Parameters
    ----------
    label_encoder_dict : dict
        Maps column labels to fitted encoders exposing ``transform``.
    x_discrete : pandas.DataFrame
        Frame containing (at least) the columns named in
        ``label_encoder_dict``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``x_discrete`` in which every column named in
        ``label_encoder_dict`` is replaced by its encoded values.
    """
    encoded = x_discrete.copy()
    for col, encoder in label_encoder_dict.items():
        encoded[col] = encoder.transform(x_discrete[col])
    return encoded

In [19]:
# Replace the category strings by the integer codes of the fitted label encoders.
x_discrete = get_discrete_encoder(label_encoder_dict,x_discrete)

In [20]:
# Number of discrete columns that will be one-hot encoded.
try:
    n_discrete = x_discrete.shape[1]
except (AttributeError, IndexError):
    # x_discrete has no ``shape`` or is not 2-D: treat it as "no discrete
    # columns". Any other error (e.g. an undefined name from running cells
    # out of order) is no longer swallowed and surfaces immediately.
    n_discrete = 0
n_discrete

43

In [21]:
def generate_onehot_encoder(n_discrete, data=None):
    """Fit a OneHotEncoder on label-encoded discrete features.

    Parameters
    ----------
    n_discrete : int
        The first ``n_discrete`` columns of the data are treated as
        categorical and one-hot encoded.
    data : array-like of shape (n_samples, n_columns), optional
        Label-encoded data to fit on. When omitted, the notebook-level
        ``x_discrete`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    OneHotEncoder
        The fitted encoder.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        data = x_discrete
    # NOTE(review): ``categorical_features`` was deprecated in scikit-learn
    # 0.20 and removed in 0.22 -- confirm the pinned scikit-learn version
    # before upgrading the environment.
    onehotencoder = OneHotEncoder(categorical_features = list(range(n_discrete)))
    onehotencoder.fit(data)
    return onehotencoder

In [22]:
# One-hot encoder fitted on the label-encoded discrete columns.
onehotencoder = generate_onehot_encoder(n_discrete)



In [23]:
def get_onehot_encoder(onehotencoder,x_continuous,x_discrete):
    """Assemble the design matrix: bias column, continuous block, one-hot block.

    Parameters
    ----------
    onehotencoder : object
        Fitted encoder whose ``transform`` result exposes ``toarray()``.
    x_continuous : 2-D array
        Continuous features, one row per sample.
    x_discrete : array-like
        Label-encoded discrete features passed to ``onehotencoder.transform``.

    Returns
    -------
    numpy.ndarray
        Columns in order: a column of ones, the columns of ``x_continuous``,
        then the dense one-hot columns.
    """
    encoded = onehotencoder.transform(x_discrete).toarray()
    features = np.concatenate([x_continuous, encoded], axis=1)
    # Leading constant column acts as the intercept term of the linear model.
    bias = np.ones((features.shape[0], 1))
    return np.concatenate([bias, features], axis=1)

In [24]:
# Full (un-normalised) design matrix: ones column + continuous + one-hot columns.
x = get_onehot_encoder(onehotencoder,x_continuous,x_discrete)

In [25]:
def get_standardscaler(x,y):
    """Fit separate StandardScalers for the features and for the target.

    Parameters
    ----------
    x : 2-D array
        Feature matrix the feature scaler is fitted on.
    y : numpy.ndarray
        Target values; reshaped to a single column before fitting.

    Returns
    -------
    tuple
        ``(feature_scaler, target_scaler)``, both fitted.
    """
    feature_scaler = StandardScaler().fit(x)
    target_column = y.reshape(-1, 1)
    target_scaler = StandardScaler().fit(target_column)
    return feature_scaler, target_scaler

In [26]:
# Scalers fitted on the training design matrix and on the training target.
sc_X,sc_Y = get_standardscaler(x,y)

In [27]:
def get_normalized(sc_X,sc_Y,x,y):
    """Scale features and target with already-fitted scalers.

    Parameters
    ----------
    sc_X, sc_Y : objects exposing ``transform``
        Fitted scalers for the features and for the target respectively.
    x : array-like
        Feature matrix passed to ``sc_X.transform``.
    y : numpy.ndarray
        Target values; reshaped to one column for ``sc_Y.transform``.

    Returns
    -------
    tuple
        ``(scaled_x, scaled_y)`` where ``scaled_y`` is flattened to 1-D.
    """
    scaled_features = sc_X.transform(x)
    scaled_target = sc_Y.transform(y.reshape(-1, 1))
    flat_target = scaled_target.reshape(-1)
    return scaled_features, flat_target

In [28]:
# Normalised training features and target.
# NOTE: this rebinds x_train, which previously held the raw feature DataFrame.
x_train,y_train = get_normalized(sc_X,sc_Y,x,y)

In [29]:
# Width of the design matrix, i.e. the number of weights the model needs.
num_features = x.shape[1]
num_features

305

In [30]:
# halved mean squared error
def mse(preds,actual):
    """Return half of the mean squared error between ``preds`` and ``actual``.

    Computes ``sum((preds - actual)**2) / (2 * len(preds))``.

    Parameters
    ----------
    preds, actual : numpy.ndarray
        Predicted and reference values of matching shape.
    """
    residual = preds - actual
    squared_total = (residual * residual).sum()
    return squared_total / (2 * len(preds))

In [31]:
def derivative_w(y,y_train,x):
    """Gradient of the halved mean squared error with respect to the weights.

    Parameters
    ----------
    y : numpy.ndarray
        Current predictions, one per row of ``x``.
    y_train : numpy.ndarray
        Reference target values.
    x : numpy.ndarray
        Feature matrix with one row per sample.

    Returns
    -------
    numpy.ndarray
        ``(y - y_train) @ x / len(x)``: one entry per column of ``x``.
    """
    residual = y - y_train
    return (residual @ x) / len(x)

In [32]:
# Training hyper-parameters and initial weights.
SEED = 42  # fixed so the random initialisation (and so the loss curve) is reproducible
np.random.seed(SEED)
epochs = 100  # number of full-batch gradient steps
w = np.random.random((num_features))  # one weight per design-matrix column, drawn from [0, 1)
lr = 1e-1  # gradient-descent step size

In [33]:
# Full-batch gradient descent on the linear model pred_y = x_train @ w.
loss=0
for epoch in range(epochs):
    # Predictions for the whole training set with the current weights.
    pred_y = np.matmul(x_train,w)
    loss = mse(pred_y,y_train)
    print('Epoch ',epoch,':-',loss)
    # Gradient of the loss with respect to w, followed by one step of size lr.
    dw = derivative_w(pred_y,y_train,x_train)
    w = w - lr*dw

Epoch  0 :- 46.23505027350291
Epoch  1 :- 19.366891220419816
Epoch  2 :- 13.399753746102759
Epoch  3 :- 10.396575039114754
Epoch  4 :- 8.352821638972772
Epoch  5 :- 6.8423904269761096
Epoch  6 :- 5.685237546757652
Epoch  7 :- 4.77770384297777
Epoch  8 :- 4.053177787297743
Epoch  9 :- 3.4664550773937677
Epoch  10 :- 2.985709793593149
Epoch  11 :- 2.587890439690364
Epoch  12 :- 2.25590913921902
Epoch  13 :- 1.9768486019223093
Epoch  14 :- 1.740778739053541
Epoch  15 :- 1.5399535725339761
Epoch  16 :- 1.368253181069503
Epoch  17 :- 1.220787836582441
Epoch  18 :- 1.093611990709451
Epoch  19 :- 0.9835141744043612
Epoch  20 :- 0.8878603107696557
Epoch  21 :- 0.8044752303111989
Epoch  22 :- 0.7315519250133325
Epoch  23 :- 0.6675812284765317
Epoch  24 :- 0.6112967361573427
Epoch  25 :- 0.5616312373455659
Epoch  26 :- 0.5176819435625092
Epoch  27 :- 0.4786825114199808
Epoch  28 :- 0.4439803665587961
Epoch  29 :- 0.41301820218465135
Epoch  30 :- 0.3853187934336294
Epoch  31 :- 0.3604724662955873

In [34]:
x_train.shape

(1460, 305)

In [35]:
def get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             sc_Y
                            ):
    """Run held-out data through the preprocessing steps fitted on the training set.

    Steps, in order: impute the continuous columns, fill missing discrete
    values with ``''`` and label-encode them, one-hot encode and assemble the
    design matrix, then scale features and target.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Raw feature table.
    y_test : numpy.ndarray
        Raw target values.
    continuous_features, discrete_features : list
        Column labels selecting the two feature groups from ``x_test``.
    imputer, label_encoder_dict, onehotencoder, sc_X, sc_Y
        Already-fitted transformers, passed on to the helper functions.

    Returns
    -------
    tuple
        ``(x_test, y_test)`` after normalisation.
    """
    # Continuous branch: select the columns, then impute.
    continuous_part = get_data_from_imputer(imputer, x_test.filter(continuous_features))

    # Discrete branch: select the columns, blank out missing values, label-encode.
    discrete_filled = x_test.filter(discrete_features).fillna('')
    discrete_part = get_discrete_encoder(label_encoder_dict, discrete_filled)

    design_matrix = get_onehot_encoder(onehotencoder, continuous_part, discrete_part)

    scaled_features, scaled_target = get_normalized(sc_X, sc_Y, design_matrix, y_test)
    return scaled_features, scaled_target
    

In [36]:
# Raw held-out feature table; the path is relative to the notebook's working directory.
x_test = pd.read_csv('test.csv')

In [37]:
# NOTE(review): the evaluation target is read from sample_submission.csv --
# confirm this file holds real sale prices rather than placeholder predictions.
y_test = pd.read_csv('sample_submission.csv')

In [38]:
# Keep only the SalePrice column, as a NumPy array.
y_test = y_test['SalePrice'].to_numpy()

In [39]:
# Push the held-out data through the transformers fitted on the training set.
# NOTE: x_test and y_test are rebound from the raw inputs to their normalised versions.
x_test, y_test = get_normalized_test_data(x_test,
                             y_test,
                             continuous_features,
                             discrete_features,
                             imputer,
                             label_encoder_dict,
                             onehotencoder,
                             sc_X,
                             sc_Y
                            )

In [40]:
# Predictions of the trained linear model on the normalised held-out features.
pred_y = np.matmul(x_test,w)

In [41]:
# Loss on the held-out data; both arguments are in the scaled target units produced by sc_Y.
mse(pred_y,y_test)

1.1904110775449077