In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from copy import deepcopy

In [2]:
def R2_coef(y_true,y_pred):
    """
    Calculates coefficient of determination 
    """
    u,v = 0,0
    for i in range(len(y_true)):
        u += (y_true[i] - y_pred[i])**2
        v += (y_true[i] - np.mean(y_true))**2
    print("Coefficient of determination is: ",float(1 - u/v))

In [3]:
## Loading data
TRAIN_FILEPATH = "Data/train.csv"
TEST_FILEPATH  = "Data/test.csv"

Training_data = pd.read_csv(TRAIN_FILEPATH)
Test_data = pd.read_csv(TEST_FILEPATH)

In [4]:
a = np.array([1,2,3,4,5])
[i for i in range(2*len(a)) if i not in a]

[0, 6, 7, 8, 9]

In [5]:
## Viewing data
Training_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [6]:
## Extracting sales prices (labels) as numpy array
Labels = Training_data["SalePrice"].values
Labels

array([208500, 181500, 223500, ..., 266500, 142125, 147500])

In [7]:
## Extraxting feature names 
Feature_names = Training_data.columns[1:-1]
Feature_names

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [8]:
## Getting features not represented by a number 
NaN_features = []
for index, datatype in enumerate(Training_data.dtypes):
    if datatype == "O" or datatype == object:
        NaN_features.append(Training_data.columns[index])
print("Nr. of NaN features:",len(NaN_features))
NaN_features

Nr. of NaN features: 43


['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [9]:
## Dictionary of dictionaries
"""A dictionary for each NaN feature, mapping k'th possible val for that feature to int(k)"""
NaN_feature_dicts = {}
for feature_idx,feature in enumerate(NaN_features):
    ith_feature_dict = {}
    for val_idx, possible_val in enumerate(np.unique(list(Training_data[NaN_features[feature_idx]]))):
        ith_feature_dict[str(possible_val)] = val_idx
    NaN_feature_dicts[feature] = ith_feature_dict   


In [10]:
## For instance
feature = NaN_features[0]
print("#"*20+" First NaN feature:", feature,"#"*20)
print("")
feature_dict = NaN_feature_dicts[feature]
print("Dict of possible vals for first NaN feature:\n", feature_dict)
print("")
print("List of keys: \n",list(feature_dict.keys()))
print("")
print("list of values:\n",list(feature_dict.values()))

#################### First NaN feature: MSZoning ####################

Dict of possible vals for first NaN feature:
 {'C (all)': 0, 'FV': 1, 'RH': 2, 'RL': 3, 'RM': 4}

List of keys: 
 ['C (all)', 'FV', 'RH', 'RL', 'RM']

list of values:
 [0, 1, 2, 3, 4]


In [11]:
## Mapping all NaN features to numbers in compliance with dictionaries above
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Training_data[feature]))):
        value = current_dict[str(Training_data.at[index,feature])] ## Getting
        Training_data.at[index,feature] = value                    ## Setting

In [12]:
## Viewing mapped data
Training_data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,3,3,0,...,0,3,4,4,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,2,3,3,0,...,0,3,4,4,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,2,0,3,0,...,0,3,4,4,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,2,0,3,0,...,0,3,4,4,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,2,0,3,0,...,0,3,4,4,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,2,3,3,0,...,0,3,4,4,0,8,2007,8,4,175000
1456,1457,20,3,85.0,13175,1,2,3,3,0,...,0,3,2,4,0,2,2010,8,4,210000
1457,1458,70,3,66.0,9042,1,2,3,3,0,...,0,3,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,68.0,9717,1,2,3,3,0,...,0,3,4,4,0,4,2010,8,4,142125


In [13]:
## Transforming from Pandas datafram to numpy array 
Training_data   = Training_data.values
## Removing first column of data (only a numbering), and last column (labels)
Feature_vectors = Training_data[:,[i+1 for i in range(Training_data.shape[1]-2)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Feature_vectors = Feature_vectors.astype("float64")
Feature_vectors = np.nan_to_num(Feature_vectors)
Feature_vectors


array([[  60.,    3.,   65., ..., 2008.,    8.,    4.],
       [  20.,    3.,   80., ..., 2007.,    8.,    4.],
       [  60.,    3.,   68., ..., 2008.,    8.,    4.],
       ...,
       [  70.,    3.,   66., ..., 2010.,    8.,    4.],
       [  20.,    3.,   68., ..., 2010.,    8.,    4.],
       [  20.,    3.,   75., ..., 2008.,    8.,    4.]])

## Normal linear regression
\begin{equation}
\bold{W} = (\bold{X}^T\bold{X})^{-1}\bold{X}^T\bold{y}
\end{equation}

In [14]:
############ LINEAR REGRESSION ############
X_init = deepcopy(Feature_vectors)
# Adding 1 as first coord to all feature vectors
X = np.ones((X_init.shape[0],X_init.shape[1]+1)) 
for col in range(1,X.shape[1]):
    X[:,col] = X_init[:,col-1]
# Computing optimal weight vector
Y = Labels
XT = X.T
XTX = XT @ X
XTX_INVERSE = np.linalg.inv(XTX)
W_optimal = (XTX_INVERSE @ XT) @ Y
# Checking predictions
def predict(feature_vector,weight):
    prediction = 0
    # Computing inner product
    for i in range(len(feature_vector)):
        prediction += feature_vector[i]*weight[i]
    return prediction

deviation = 0
for i in range(len(X)):
    deviation += np.abs(predict(X[i],W_optimal)-Labels[i])/np.abs(Labels[i])
deviation *= 1./X.shape[0]
print("Avg. deviation before normalization =",np.round(deviation,2),"(",np.round(deviation*100,2),"%)")

######## Normalizing s.t. each feature (column) has mean = 0 and then variance = 1 ########
# Labels
Label_mean, Label_var = np.mean(Labels), np.std(Labels) 
Labels = Labels - Label_mean     ## mean
Labels = Labels * 1./Label_var   ## variance
# Feature vectors
for col in range(Feature_vectors.shape[1]):
    Feature_vectors[:,col] = Feature_vectors[:,col] - np.mean(Feature_vectors[:,col])       ## mean
    Feature_vectors[:,col] = Feature_vectors[:,col] * 1./np.std(Feature_vectors[:,col])     ## variance


X_init_norm = deepcopy(Feature_vectors)
# Adding 1 as first coord to all feature vectors
X_norm = np.ones((X_init_norm.shape[0],X_init_norm.shape[1]+1)) 
for col in range(1,X_norm.shape[1]):
    X_norm[:,col] = X_init_norm[:,col-1]
# Computing optimal weight vector
Y_norm = Labels
XT_norm = X_norm.T
XTX_norm = XT_norm @ X_norm
XTX_INVERSE_norm = np.linalg.inv(XTX_norm)
W_optimal_norm = (XTX_INVERSE_norm @ XT_norm) @ Y_norm

deviation = 0
for i in range(len(X_norm)):
    deviation += np.abs(predict(X_norm[i],W_optimal_norm)*Label_var+Label_mean-(Y_norm[i]*Label_var+Label_mean))/np.abs(Y_norm[i]*Label_var+Label_mean)
deviation *= 1./X.shape[0]
print("Avg. deviation after normalization =",np.round(deviation,2),"(",np.round(deviation*100,2),"%)")
y_pred = np.array([predict(X_norm[i],W_optimal_norm)*Label_var+Label_mean for i in range(X_norm.shape[0])]) 
y_true = Y_norm*Label_var+Label_mean
R2_coef(y_true,y_pred)


Avg. deviation before normalization = 18.65 ( 1864.98 %)
Avg. deviation after normalization = 0.21 ( 21.01 %)
Coefficient of determination is:  0.6186502417886162


In [15]:
# Single predictions (accounting for normalization)
nr = 10
print("-"*40)
print("#### Predicted price, actual price ####")
print("    ",np.round(predict(X_norm[nr],W_optimal_norm)*Label_var+Label_mean,1),"      ,",(Y_norm[nr]*Label_var)+Label_mean,"")
print("-"*40)
print("####           Deviation           ####")
dev = np.round(100*np.abs(predict(X_norm[nr],W_optimal_norm)*Label_var+Label_mean-(Y_norm[nr]*Label_var+Label_mean))/np.abs(Y_norm[nr]*Label_var+Label_mean),1)
print(f'                 {dev} %')
print("-"*40)

----------------------------------------
#### Predicted price, actual price ####
     164322.4       , 129500.0 
----------------------------------------
####           Deviation           ####
                 26.9 %
----------------------------------------


In [16]:
######### Variance-Covariance matrix #########

### Removing ones
X_norm_new = X_norm[:,[i+1 for i in range(X_norm.shape[1]-1)]]

### Calculating square inverse matrix
XT_norm_new = X_norm_new.T
XTX_norm_new = XT_norm_new @ X_norm_new
XTX_INVERSE_norm_new = np.linalg.inv(XTX_norm_new)

### Calculating Covariance-variance matrix
N, D = X_norm_new.shape[0], X_norm_new.shape[1]-1
Y_predictions_norm_new = X_norm_new @ W_optimal_norm[1:]
variance = 1./(N-D-1)*np.sum((Y_norm-Y_predictions_norm_new)**2)
variance_matrix = XTX_INVERSE_norm_new * variance

Plot_n_save = False
if Plot_n_save:
    ### Plotting and saving 
    from heatmap_functions import heatmap
    from heatmap_functions import annotate_heatmap

    fig, ax = plt.subplots(1,1,figsize=(20,20))
    ax.set_title("Covariance matrix of normed features",size=20)
    im, cbar = heatmap(data = variance_matrix, row_labels = Feature_names, 
                       col_labels = Feature_names, ax=ax, cbarlabel=" ",cmap="YlGn")
    texts = annotate_heatmap(im, valfmt="{x:.1e}",size=2)
    fig.tight_layout()
    plt.savefig("Covariance-MatrixV2.jpg",dpi=800)

## Regularization by Ridge regression
\begin{equation}
\bold{W} = (\bold{X}^T\bold{X}+\lambda\bold{I})^{-1}\bold{X}^T\bold{y},\quad \lambda\geq0
\end{equation}

In [17]:
## Loading data
TRAIN_FILEPATH = "Data/train.csv"
TEST_FILEPATH  = "Data/test.csv"

Training_data = pd.read_csv(TRAIN_FILEPATH)
Test_data = pd.read_csv(TEST_FILEPATH)

## Extracting sales prices (labels) as numpy array
Labels = Training_data["SalePrice"].values

## Extraxting feature names 
Feature_names = Training_data.columns[1:-1]

## Getting features not represented by a number 
NaN_features = []
for index, datatype in enumerate(Training_data.dtypes):
    if datatype == "O" or datatype == object:
        NaN_features.append(Training_data.columns[index])
print("Nr. of NaN features:",len(NaN_features))

## Dictionary of dictionaries for integer embedding of NaN features
"""A dictionary for each NaN feature, mapping k'th possible val for that feature to int(k)"""
NaN_feature_dicts = {}
for feature_idx,feature in enumerate(NaN_features):
    ith_feature_dict = {}
    for val_idx, possible_val in enumerate(np.unique(list(Training_data[NaN_features[feature_idx]]))):
        ith_feature_dict[str(possible_val)] = val_idx
    NaN_feature_dicts[feature] = ith_feature_dict   

## Mapping all NaN features to numbers in compliance with dictionaries above (embedding)
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Training_data[feature]))):
        value = current_dict[str(Training_data.at[index,feature])] ## Getting
        Training_data.at[index,feature] = value                    ## Setting

## Transforming from Pandas datafram to numpy array 
Training_data   = Training_data.values
## Removing first column of data (only a numbering), and last column (labels)
Feature_vectors = Training_data[:,[i+1 for i in range(Training_data.shape[1]-2)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Feature_vectors = Feature_vectors.astype("float64")
Feature_vectors = np.nan_to_num(Feature_vectors)


Nr. of NaN features: 43


In [18]:

######## Normalizing s.t. each feature (column) has mean = 0 and then variance = 1 ########
# Labels
Label_mean, Label_var = np.mean(Labels), np.std(Labels) 
Labels = Labels - Label_mean     ## mean
Labels = Labels * 1./Label_var   ## variance
# Feature vectors
for col in range(Feature_vectors.shape[1]):
    Feature_vectors[:,col] = Feature_vectors[:,col] - np.mean(Feature_vectors[:,col])       ## mean
    Feature_vectors[:,col] = Feature_vectors[:,col] * 1./np.std(Feature_vectors[:,col])     ## variance


X_init_norm_ridge = deepcopy(Feature_vectors)
# Adding 1 as first coord to all feature vectors
X_norm_ridge = np.ones((X_init_norm_ridge.shape[0],X_init_norm_ridge.shape[1]+1)) 
for col in range(1,X_norm_ridge.shape[1]):
    X_norm_ridge[:,col] = X_init_norm_ridge[:,col-1]

lmbda = 0.1
I = np.identity(X_norm_ridge.shape[1])
# Computing optimal weight vector
Y_norm_ridge = Labels
XT_norm_ridge = X_norm_ridge.T
XTX_norm_ridge = XT_norm_ridge @ X_norm_ridge
XTX_INVERSE_norm_ridge = np.linalg.inv(XTX_norm_ridge + lmbda * I)
W_optimal_norm_ridge = ((XTX_INVERSE_norm_ridge @ XT_norm_ridge) @ Y_norm_ridge)

deviation = 0
for i in range(len(X_norm)):
    deviation += np.abs(predict(X_norm_ridge[i],W_optimal_norm_ridge)*Label_var+Label_mean-(Y_norm_ridge[i]*Label_var+Label_mean))/np.abs(Y_norm_ridge[i]*Label_var+Label_mean)
deviation *= 1./X.shape[0]
print("Avg. deviation after normalization (ridge regression) =",np.round(deviation,4),"(",np.round(deviation*100,2),"%)")
y_pred = np.array([predict(X_norm_ridge[i],W_optimal_norm_ridge)*Label_var+Label_mean for i in range(X_norm_ridge.shape[0])]) 
y_true = Y_norm_ridge*Label_var+Label_mean
R2_coef(y_true,y_pred)

Avg. deviation after normalization (ridge regression) = 0.1089 ( 10.89 %)
Coefficient of determination is:  0.8546261977386495


In [19]:
# Single predictions (accounting for normalization)
nr = 10
print("-"*40)
print("#### Predicted price, actual price ####")
print("    ",np.round(predict(X_norm_ridge[nr],W_optimal_norm_ridge)*Label_var+Label_mean,1),"      ,",(Y_norm[nr]*Label_var)+Label_mean,"")
print("-"*40)
print("####           Deviation           ####")
dev = np.round(100*np.abs(predict(X_norm_ridge[nr],W_optimal_norm_ridge)*Label_var+Label_mean-(Y_norm_ridge[nr]*Label_var+Label_mean))/np.abs(Y_norm_ridge[nr]*Label_var+Label_mean),1)
print(f'                 {dev} %')
print("-"*40)

----------------------------------------
#### Predicted price, actual price ####
     124685.4       , 129500.0 
----------------------------------------
####           Deviation           ####
                 3.7 %
----------------------------------------


## Regularization by Lasso regression (no closed form)

In [20]:
from sklearn import linear_model

## Loading data
TRAIN_FILEPATH = "Data/train.csv"
TEST_FILEPATH  = "Data/test.csv"

Training_data = pd.read_csv(TRAIN_FILEPATH)
Test_data = pd.read_csv(TEST_FILEPATH)

## Extracting sales prices (labels) as numpy array
Labels = Training_data["SalePrice"].values
 
## Mapping all NaN features to numbers in compliance with dictionaries above (embedding)
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Training_data[feature]))):
        value = current_dict[str(Training_data.at[index,feature])] ## Getting
        Training_data.at[index,feature] = value                    ## Setting

## Transforming from Pandas datafram to numpy array 
Training_data   = Training_data.values
## Removing first column of data (only a numbering), and last column (labels)
Feature_vectors = Training_data[:,[i+1 for i in range(Training_data.shape[1]-2)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Feature_vectors = Feature_vectors.astype("float64")
Feature_vectors = np.nan_to_num(Feature_vectors)

######## Normalizing s.t. each feature (column) has mean = 0 and then variance = 1 ########
# Labels
Label_mean, Label_var = np.mean(Labels), np.std(Labels) 
Labels = Labels - Label_mean     ## mean
Labels = Labels * 1./Label_var   ## variance
# Feature vectors
for col in range(Feature_vectors.shape[1]):
    Feature_vectors[:,col] = Feature_vectors[:,col] - np.mean(Feature_vectors[:,col])       ## mean
    Feature_vectors[:,col] = Feature_vectors[:,col] * 1./np.std(Feature_vectors[:,col])     ## variance

# Adding 1 as first coord to all feature vectors
X_norm_lasso = np.ones((Feature_vectors.shape[0],Feature_vectors.shape[1]+1)) 
for col in range(1,X_norm_lasso.shape[1]):
    X_norm_lasso[:,col] = Feature_vectors[:,col-1]

## Performing lasso regression
clf = linear_model.Lasso(alpha=0.007)
clf.fit(X=X_norm_lasso,y=Labels)
W_optimal_norm_lasso = np.concatenate([[clf.intercept_],clf.coef_[1:]])

deviation = 0
for i in range(len(X_norm)):
    deviation += np.abs(predict(X_norm_lasso[i],W_optimal_norm_lasso)*Label_var+Label_mean-(Y_norm_ridge[i]*Label_var+Label_mean))/np.abs(Labels[i]*Label_var+Label_mean)
deviation *= 1./X.shape[0]
print("Avg. deviation after normalization (lasso regression) =",np.round(deviation,4),"(",np.round(deviation*100,2),"%)")
y_pred = np.array([predict(X_norm_lasso[i],W_optimal_norm_lasso)*Label_var+Label_mean for i in range(X_norm_lasso.shape[0])]) 
y_true = Y_norm_ridge*Label_var+Label_mean
R2_coef(y_true,y_pred)


Avg. deviation after normalization (lasso regression) = 0.1066 ( 10.66 %)
Coefficient of determination is:  0.8505634691722745


In [21]:
# Single predictions (accounting for normalization)
nr = 10
print("-"*40)
print("#### Predicted price, actual price ####")
print("    ",np.round(predict(X_norm_lasso[nr],W_optimal_norm_lasso)*Label_var+Label_mean,1),"      ,",(Y_norm[nr]*Label_var)+Label_mean,"")
print("-"*40)
print("####           Deviation           ####")
dev = np.round(100*np.abs(predict(X_norm_lasso[nr],W_optimal_norm_lasso)*Label_var+Label_mean-(Y_norm_ridge[nr]*Label_var+Label_mean))/np.abs(Y_norm_ridge[nr]*Label_var+Label_mean),1)
print(f'                 {dev} %')
print("-"*40)

----------------------------------------
#### Predicted price, actual price ####
     124204.2       , 129500.0 
----------------------------------------
####           Deviation           ####
                 4.1 %
----------------------------------------


## Using Regression Tree with AdaBoost

In [22]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

## Loading data
TRAIN_FILEPATH = "Data/train.csv"
TEST_FILEPATH  = "Data/test.csv"

Training_data = pd.read_csv(TRAIN_FILEPATH)
Test_data = pd.read_csv(TEST_FILEPATH)

## Extracting sales prices (labels) as numpy array
Labels = Training_data["SalePrice"].values
 
## Mapping all NaN features to numbers in compliance with dictionaries above (embedding)
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Training_data[feature]))):
        value = current_dict[str(Training_data.at[index,feature])] ## Getting
        Training_data.at[index,feature] = value                    ## Setting

## Transforming from Pandas datafram to numpy array 
Training_data   = Training_data.values
## Removing first column of data (only a numbering), and last column (labels)
Feature_vectors = Training_data[:,[i+1 for i in range(Training_data.shape[1]-2)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Feature_vectors = Feature_vectors.astype("float64")
Feature_vectors = np.nan_to_num(Feature_vectors)

######## Normalizing s.t. each feature (column) has mean = 0 and then variance = 1 ########
# Labels
Label_mean, Label_var = np.mean(Labels), np.std(Labels) 
Labels = Labels - Label_mean     ## mean
Labels = Labels * 1./Label_var   ## variance

# Feature vectors
for col in range(Feature_vectors.shape[1]):
    Feature_vectors[:,col] = Feature_vectors[:,col] - np.mean(Feature_vectors[:,col])       ## mean
    Feature_vectors[:,col] = Feature_vectors[:,col] * 1./np.std(Feature_vectors[:,col])     ## variance

# Adding 1 as first coord to all feature vectors
X_norm_Ada = np.ones((Feature_vectors.shape[0],Feature_vectors.shape[1]+1)) 
for col in range(1,X_norm_Ada.shape[1]):
    X_norm_Ada[:,col] = Feature_vectors[:,col-1]

# Fit regression model
regression_loss_funcs = ['linear','square','exponential']
base_reg = DecisionTreeRegressor(max_depth=15)
regr_2 = AdaBoostRegressor(base_estimator=base_reg, n_estimators=68, 
                           learning_rate = 0.775, loss=regression_loss_funcs[1], random_state=0)

regr_2.fit(X_norm_Ada, Labels.flatten())

# Predict
y_pred = regr_2.predict(X_norm_Ada)*Label_var+Label_mean
y_true = Labels*Label_var+Label_mean

# Score
deviation = 0
for i in range(len(X_norm)):
    deviation += np.abs(y_pred[i]-y_true[i])/np.abs(y_true[i])
deviation *= 1./X_norm_Ada.shape[0]
print("Avg. deviation after normalization (Reg. tree w. AdaBoost) =",np.round(deviation,4),"(",np.round(deviation*100,2),"%)")
R2_coef(y_true,y_pred)


Avg. deviation after normalization (Reg. tree w. AdaBoost) = 0.0124 ( 1.24 %)
Coefficient of determination is:  0.9977179786400762


## Testing Adaboost model

In [23]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor


########## TRAINING DATA ##########
## Loading data
TRAIN_FILEPATH = "Data/train.csv"
Training_data = pd.read_csv(TRAIN_FILEPATH)

## Getting features not represented by a number 
NaN_features = []
for index, datatype in enumerate(Training_data.dtypes):
    if datatype == "O" or datatype == object:
        NaN_features.append(Training_data.columns[index])

## Dictionary of dictionaries
"""A dictionary for each NaN feature, mapping k'th possible val for that feature to int(k)"""
NaN_feature_dicts = {}
for feature_idx,feature in enumerate(NaN_features):
    ith_feature_dict = {}
    for val_idx, possible_val in enumerate(np.unique(list(Training_data[NaN_features[feature_idx]]))):
        ith_feature_dict[str(possible_val)] = val_idx
    NaN_feature_dicts[feature] = ith_feature_dict   

## Mapping all NaN features to numbers in compliance with dictionaries above (embedding)
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Training_data[feature]))):
        if str(Training_data.at[index,feature]) not in list(current_dict.keys()):
            print(str(Training_data.at[index,feature]),'Not in dictionary')
        value = current_dict[str(Training_data.at[index,feature])] ## Getting
        Training_data.at[index,feature] = value                    ## Setting

## Extracting sales prices (labels) as numpy array
Training_Labels = Training_data["SalePrice"].values

## Transforming from Pandas datafram to numpy array 
Training_data = Training_data.values
## Removing first column of data (only a numbering), and last column (labels)
Training_Feature_vectors = Training_data[:,[i+1 for i in range(Training_data.shape[1]-2)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Training_Feature_vectors = Training_Feature_vectors.astype("float64")
Training_Feature_vectors = np.nan_to_num(Training_Feature_vectors)

## Normalizing s.t. each feature (column) has mean = 0 and then variance = 1 
# Labels
Training_Label_mean, Training_Label_var = np.mean(Training_Labels), np.std(Training_Labels) 
Training_Labels = Training_Labels - Training_Label_mean     ## mean
Training_Labels = Training_Labels * 1./Training_Label_var   ## variance

# Feature vectors
for col in range(Training_Feature_vectors.shape[1]):
    Training_Feature_vectors[:,col] = Training_Feature_vectors[:,col] - np.mean(Training_Feature_vectors[:,col])       ## mean
    Training_Feature_vectors[:,col] = Training_Feature_vectors[:,col] * 1./np.std(Training_Feature_vectors[:,col])     ## variance

Training_X_norm_Ada = Training_Feature_vectors


########## TEST DATA ##########
## Loading data
TEST_FILEPATH = "Data/test.csv"
Test_data = pd.read_csv(TEST_FILEPATH)
## Getting features not represented by a number 
NaN_features = []
for index, datatype in enumerate(Test_data.dtypes):
    if datatype == "O" or datatype == object:
        NaN_features.append(Test_data.columns[index])

## Dictionary of dictionaries
"""A dictionary for each NaN feature, mapping k'th possible val for that feature to int(k)"""
NaN_feature_dicts = {}
for feature_idx,feature in enumerate(NaN_features):
    ith_feature_dict = {}
    for val_idx, possible_val in enumerate(np.unique(list(Test_data[NaN_features[feature_idx]]))):
        ith_feature_dict[str(possible_val)] = val_idx
    NaN_feature_dicts[feature] = ith_feature_dict   

## Mapping all NaN features to numbers in compliance with dictionaries above (embedding)
for feature in NaN_features:
    current_dict = NaN_feature_dicts[feature] 
    for index in range(len(list(Test_data[feature]))):
        if str(Test_data.at[index,feature]) not in list(current_dict.keys()):
            print(str(Test_data.at[index,feature]),'Not in dictionary')
        value = current_dict[str(Test_data.at[index,feature])] ## Getting
        Test_data.at[index,feature] = value                    ## Setting

## Transforming from Pandas datafram to numpy array 
Test_data = Test_data.values
## Removing first column of data (only a numbering), and last column (labels)
Test_Feature_vectors = Test_data[:,[i+1 for i in range(Test_data.shape[1]-1)]] 
## Assuring appropriate dtype of all entries in array (also setting any Nan = 0)
Test_Feature_vectors = Test_Feature_vectors.astype("float64")
Test_Feature_vectors = np.nan_to_num(Test_Feature_vectors)

# Feature vectors
for col in range(Test_Feature_vectors.shape[1]):
    Test_Feature_vectors[:,col] = Test_Feature_vectors[:,col] - np.mean(Test_Feature_vectors[:,col])       ## mean
    Test_Feature_vectors[:,col] = Test_Feature_vectors[:,col] * 1./np.std(Test_Feature_vectors[:,col])     ## variance

Test_X_norm_Ada = Test_Feature_vectors

######### Fit regression model #########
regression_loss_funcs = ['linear','square','exponential']
base_reg = DecisionTreeRegressor(max_depth=15)
reg = AdaBoostRegressor(base_estimator=base_reg, n_estimators=65, 
                        learning_rate = 0.775, loss=regression_loss_funcs[1], random_state=0)

## Training model on training data
reg.fit(Training_X_norm_Ada, Training_Labels.flatten())

## Predict
y_pred_test = reg.predict(Test_X_norm_Ada)*Training_Label_var+Training_Label_mean
y_pred_training = reg.predict(Training_X_norm_Ada)*Training_Label_var+Training_Label_mean
y_true_training = Training_Labels*Training_Label_var+Training_Label_mean

# Score
deviation = 0
for i in range(len(X_norm)):
    deviation += np.abs(y_pred_training[i]-y_true_training[i])/np.abs(y_true_training[i])
deviation *= 1./Training_X_norm_Ada.shape[0]
print("Avg. deviation after normalization (Reg. tree w. AdaBoost) =",np.round(deviation,4),"(",np.round(deviation*100,2),"%)")
R2_coef(y_true_training,y_pred_training)


Avg. deviation after normalization (Reg. tree w. AdaBoost) = 0.0118 ( 1.18 %)
Coefficient of determination is:  0.9977963065695571


In [24]:
import csv  

TEST_FILEPATH = "Data/test.csv"
Test_data = pd.read_csv(TEST_FILEPATH)
Test_data_Id = Test_data.values[:,0]
id_n_prediction = []
for i in range(len(Test_data_Id)):
    id_n_prediction.append([int(Test_data_Id[i]),y_pred_test[i]])
id_n_prediction = np.array(id_n_prediction)

header = ['Id', 'SalePrice']
with open('Submissions/predictions4.csv', 'w', ) as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write the data
    for i in range(len(Test_data_Id)):
        writer.writerow([int(id_n_prediction[i][0]),id_n_prediction[i][1]])


In [268]:
id_n_prediction.shape

(1459, 2)