In [1]:
#importing the libraries
import numpy as np
import pandas as pd

In [2]:
#reading the data
df = pd.read_csv("FoDS-Assignment-2.csv")

# shuffle the DataFrame rows
df = df.sample(frac = 1)

df.isnull().sum()

bedrooms          0
bathrooms         0
sqft_living      14
sqft_lot          0
floors           13
waterfront        0
view              0
condition         0
grade             0
sqft_above       14
sqft_basement     0
sqft_living15     0
sqft_lot15        0
price             0
dtype: int64

In [3]:
#handling missing values-----dropping rows method
df = df.dropna(axis = 0)
df.isnull().sum()

bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
sqft_living15    0
sqft_lot15       0
price            0
dtype: int64

In [4]:
Features_list = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "waterfront", "view", "condition", "grade", "sqft_above", "sqft_basement", "sqft_living15", "sqft_lot15"]

#Feature Scaling
#normalising the data values
df = (df - df.min()) / (df.max() - df.min())

In [5]:
for x in Features_list:
    q90, q10 = np.percentile(df.loc[:, x],[90, 10])
    IQR = q90-q10
 
    max = q90 + (2 * IQR)
    min = q10 - (2 * IQR)
 
    df.loc[df[x] < min, x] = np.nan
    df.loc[df[x] > max, x] = np.nan

print(df.isnull().sum())
print("\n")
df = df.dropna(axis = 0)
print(df.isnull().sum())

bedrooms          0
bathrooms         0
sqft_living       1
sqft_lot         35
floors            0
waterfront        4
view             11
condition         0
grade             0
sqft_above        0
sqft_basement     1
sqft_living15     0
sqft_lot15       35
price             0
dtype: int64


bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
sqft_living15    0
sqft_lot15       0
price            0
dtype: int64


In [6]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

#splitting the dataset into training data and testing data
splitData = int(0.7*len(X))
train_X, test_X, train_y, test_y = X[:splitData], X[splitData:], y[:splitData], y[splitData:]

In [7]:
WeightV = np.zeros(13)
for n in range(13):
    WeightV[n] = np.random.randn()

In [8]:
#finding the weights based on the training data
def fit(X, Y, iters, learning_rate, F_selected, F_trial):
    global bias
    bias = 0  
    
    for itr in range(iters):
        sumItrError = 0   
        for z in range(len(Y)):        # each row in input data
            column = Y[z]
            dataP_error = 0            # calculating error in each data point
            
            for m in range(13):
                dataP_error += (F_selected[m] + F_trial[m]) * WeightV[m] * X[z][m]    # summation of (w1*x1 + w2*x2 + w3*x3 + w4*x4 ...)
                
            dataP_error += bias
            dataP_error -= column      # (w0 + w1*x1 + w2*x1^2 + w3*x1*x2 + w4*x2^2 ...) - yn

            # for each parameter(w0, w1, w2,...)    
            for m in range(13):                   
                    WeightV[m] -= (learning_rate/len(X)) * dataP_error * X[z][m]   # calculating w0, w1, w2,... for each iteration
            bias -= (learning_rate/len(Y)) * dataP_error
            sumItrError +=  0.5*dataP_error**2/len(Y)
            
        sumItrError = (sumItrError)**0.5
    return sumItrError    

In [9]:
def predict(X, Y, F_selected, F_trial):
    testingError = 0
    for z in range(len(Y)):  
        dataP_error = 0            
            
        for m in range(13):
            dataP_error += (F_selected[m] + F_trial[m]) * WeightV[m] * X[z][m]   
        dataP_error += bias
        
        dataP_error -= Y[z]      
        testingError += (dataP_error**2)/(2*len(Y))
    testingError = (testingError)**0.5
    return testingError

In [10]:
F_selected = np.zeros(13)
F_trial = np.zeros(13)
finalFeatures = np.zeros(13)
finalMinE = float('inf')

for i in range(13):
    minErrorIn_i = float('inf')
    minIndex = i
    for j in range(13):
        if(F_selected[j]==1):
            continue
        F_trial[j] = 1
        error_j = fit(train_X, train_y, 500, 0.1, F_selected, F_trial)
        if(error_j <= minErrorIn_i):
            minErrorIn_i = error_j
            minIndex = j
        F_trial[j] = 0
    F_selected[minIndex] = 1 
    if(minErrorIn_i <= finalMinE):
        finalMinE = minErrorIn_i
        finalIndex = i
        for u in range(13):
            finalFeatures[u] = F_selected[u]
    print("Minimum error for", i+1, "feature(s) is", minErrorIn_i)

print("-----------------------------------------------------------------------------------------------------------------------")    
print("Minimum training error is", finalMinE) 
print("Number of features in the optimal model is/are", finalIndex+1)  
print("\nList of features giving minimum training error are")
for i in range(13):
    if(finalFeatures[i]==1):
        print(Features_list[i], end=", ")

Minimum error for 1 feature(s) is 0.03441623556617811
Minimum error for 2 feature(s) is 0.033271898736949246
Minimum error for 3 feature(s) is 0.03297106900495711
Minimum error for 4 feature(s) is 0.032995278532587394
Minimum error for 5 feature(s) is 0.03305601627595661
Minimum error for 6 feature(s) is 0.03239778626255488
Minimum error for 7 feature(s) is 0.03093537520928108
Minimum error for 8 feature(s) is 0.03072508976359503
Minimum error for 9 feature(s) is 0.03124878925690184
Minimum error for 10 feature(s) is 0.031779568759163615
Minimum error for 11 feature(s) is 0.034375854939350095
Minimum error for 12 feature(s) is 0.03486378224096311
Minimum error for 13 feature(s) is 0.04140851341534551
-----------------------------------------------------------------------------------------------------------------------
Minimum training error is 0.03072508976359503
Number of features in the optimal model is/are 8

List of features giving minimum training error are
bedrooms, bathrooms, fl

In [11]:
fit(train_X, train_y, 500, 0.1, finalFeatures, F_trial)

#making predictions on test data
testingE = predict(test_X, test_y, finalFeatures, F_trial)
print("Minimum testing error considering these", finalIndex+1, "features is", testingE)

Minimum testing error considering these 8 features is 0.036602420215217805
