In [41]:
import os
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [42]:
# Maps a dataset file prefix (as used in "<prefix>_train.csv" / "<prefix>_test.csv")
# to the name of the column that holds the regression target in that dataset.
file_to_target = dict(
    salary="salary",
    maisons="price",
)

In [43]:
def normalize(train_data, test_data, col_regr, method='mean_std'):
    """
    Normalizes all the features by linear transformation *except* for the target regression column
    specified as `col_regr`.
    Two normalization methods are implemented:
      -- `mean_std` shifts by the mean and divides by the standard deviation
      -- `maxmin` shifts by the min and divides by the difference between max and min
      *Note*: mean/std/max/min are computed on the training data only and then applied
      to both the training and the test data.
      *Note*: a feature whose scale (std, or max - min) is exactly zero on the training data
      is divided by 1 instead, so a constant feature is mapped to 0 on the training data
      rather than to NaN/inf.
    The function returns a pair normalized_train, normalized_test. For example,
    if you had `train` and `test` pandas DataFrames with the regression col stored in column `Col`, you can do

        train_norm, test_norm = normalize(train, test, 'Col')

    to get the normalized `train_norm` and `test_norm`.

    Parameters:
      train_data -- pandas DataFrame containing the features and the column `col_regr`
      test_data  -- pandas DataFrame with the same columns as `train_data`
      col_regr   -- name of the target column, which is left unscaled
      method     -- either 'mean_std' (default) or 'maxmin'

    Returns:
      A pair of DataFrames (train, test); in each one the unscaled target column comes first,
      followed by the normalized feature columns.

    Raises:
      ValueError -- if `method` is not one of the two supported methods
    """
    # removing the target column so that it is not scaled
    features_train = train_data.drop(col_regr, axis=1)
    features_test = test_data.drop(col_regr, axis=1)

    # the shift/scale statistics always come from the training features
    if method == 'mean_std':
        shift = features_train.mean()
        scale = features_train.std()
    elif method == 'maxmin':
        shift = features_train.min()
        scale = features_train.max() - shift
    else:
        # a bare string cannot be raised; wrap the message in a real exception type
        raise ValueError(f"Unknown method {method}")

    # a zero scale would turn the column into NaN/inf; dividing by 1 keeps it finite
    scale = scale.where(scale != 0, 1)

    normalized_train = (features_train - shift) / scale
    normalized_test = (features_test - shift) / scale

    # gluing back the target column and returning
    return (
        pd.concat([train_data[col_regr], normalized_train], axis=1),
        pd.concat([test_data[col_regr], normalized_test], axis=1),
    )

In [44]:
def get_data(base_path="../csv", file_prefix="maisons", feature_cols=None, target=None, norm=False):
    """
    Reads `<file_prefix>_train.csv` and `<file_prefix>_test.csv` from `base_path`, prints a
    summary of each one, and splits both into features and target.

    Parameters:
      base_path    -- directory that holds the two CSV files
      file_prefix  -- dataset name; must be "salary" or "maisons"
      feature_cols -- list of column names to use as features; when None, every column
                      of the training data except `target` is used
      target       -- name of the target column; when None it is looked up in
                      `file_to_target` using `file_prefix`
      norm         -- when True, both datasets go through `normalize` (default method)
                      before the features are selected

    Returns:
      The tuple (X_train, y_train, X_test, y_test).
    """
    assert file_prefix in ["salary", "maisons"], "Unknown file"
    target = file_to_target[file_prefix] if target is None else target

    csv_paths = (
        os.path.join(base_path, f"{file_prefix}_train.csv"),
        os.path.join(base_path, f"{file_prefix}_test.csv"),
    )

    # load and summarise the train file first, then the test file
    loaded = []
    for csv_path in csv_paths:
        frame = pd.read_csv(csv_path, header=0)
        summary(frame)
        loaded.append(frame)
    train_dataset, test_dataset = loaded

    if norm:
        train_dataset, test_dataset = normalize(train_dataset, test_dataset, target)

    # default feature set: everything in the training data but the target
    if feature_cols is None:
        feature_cols = train_dataset.columns.to_list()
        feature_cols.remove(target)
    assert isinstance(feature_cols, list), "feature_cols must be a list"

    # every requested feature has to exist in the training data
    missing = set(feature_cols).difference(train_dataset.columns)
    assert not missing, f"Missing columns {missing}"

    return (
        train_dataset[feature_cols],
        train_dataset[target],
        test_dataset[feature_cols],
        test_dataset[target],
    )

In [45]:
def summary(dataset):
    """
    Prints a quick overview of `dataset`: its shape, its first five rows,
    the output of `describe()`, and finally a few blank lines as a separator.
    """
    rows_to_preview = 5

    print(f'Shape of the data {dataset.shape}')

    preview = dataset.head(rows_to_preview)
    print(preview)

    statistics = dataset.describe()
    print(statistics)

    # blank separator between consecutive summaries
    print('\n\n')

In [46]:
def fit_and_predict(X_train, y_train, X_test, y_test, regressor, verbose=False):
    """
    Fits `regressor` on the training data and returns its predictions for `X_test`.

    Parameters:
      X_train, y_train -- training features and targets passed to `regressor.fit`
      X_test           -- features passed to `regressor.predict`
      y_test           -- true targets; only used for printing when `verbose` is True
      regressor        -- a LinearRegression or a KNeighborsRegressor instance
      verbose          -- when True, prints every (true value, predicted value) pair

    For a LinearRegression the fitted intercept and coefficients are always printed.

    Returns:
      The predictions returned by `regressor.predict(X_test)`.
    """
    is_linear = isinstance(regressor, LinearRegression)
    assert is_linear or isinstance(regressor, KNeighborsRegressor)

    regressor.fit(X_train, y_train)

    # only the linear model has an intercept and coefficients to show
    if is_linear:
        print(f'\tintercept = {regressor.intercept_}')
        print(f'\tcoefficient = {regressor.coef_}')

    predictions = regressor.predict(X_test)
    if not verbose:
        return predictions

    for truth, guess in zip(y_test, predictions):
        print(f'  true value: {truth} \t predicted value: {guess}')
    return predictions

In [47]:
def evaluate_performance(y_test, y_pred):
    """
    Prints three error measures of the predictions `y_pred` against the true values `y_test`:
    the mean absolute error, the mean squared error, and the square root of the latter.
    The report is surrounded by blank lines.
    """
    print('\n\n')

    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('Mean Absolute Error:', mae)

    # the MSE is reused to derive the RMSE
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

    print('\n')

In [50]:
# Experiment: nearest-neighbour regression on the "maisons" data with the features left
# unnormalized (norm=False). get_data also prints a summary of the train and test files.
X_train, y_train, X_test, y_test = get_data(file_prefix="maisons", norm=False)
# n_neighbors=1: each prediction is taken from the single closest training sample
regressor = KNeighborsRegressor(n_neighbors=1)
# verbose is left at its default, so the individual predictions are not printed
y_pred = fit_and_predict(X_train, y_train, X_test, y_test, regressor)
# prints the MAE, MSE and RMSE of the predictions on the test data
evaluate_performance(y_test, y_pred)

Shape of the data (33, 8)
   price  sqft  age  feats  ne  cust  cor   tax
0    699  1400   45      1   0     1    1   481
1   1900  2580    4      4   1     0    0  1534
2    870  1273    4      4   0     0    0   638
3    940  1305    5      3   0     0    0   647
4    975  1739   13      3   0     0    0   880
             price         sqft        age      feats         ne       cust  \
count    33.000000    33.000000  33.000000  33.000000  33.000000  33.000000   
mean   1154.484848  1794.696970  14.454545   3.878788   0.575758   0.242424   
std     374.196650   530.913804  12.530191   1.430856   0.501890   0.435194   
min     699.000000  1083.000000   2.000000   1.000000   0.000000   0.000000   
25%     900.000000  1400.000000   6.000000   3.000000   0.000000   0.000000   
50%    1050.000000  1700.000000   8.000000   4.000000   1.000000   0.000000   
75%    1270.000000  1920.000000  18.000000   4.000000   1.000000   0.000000   
max    2150.000000  2931.000000  45.000000   8.000000 

In [51]:
# Experiment: same nearest-neighbour setup on the "maisons" data, but with norm=True, so
# get_data normalizes the features (not the target) using statistics of the training data.
# NOTE: this rebinds X_train, y_train, X_test, y_test, regressor and y_pred at notebook level.
X_train, y_train, X_test, y_test = get_data(file_prefix="maisons", norm=True)
# n_neighbors=1: each prediction is taken from the single closest training sample
regressor = KNeighborsRegressor(n_neighbors=1)
y_pred = fit_and_predict(X_train, y_train, X_test, y_test, regressor)
# prints the MAE, MSE and RMSE of the predictions on the test data
evaluate_performance(y_test, y_pred)

Shape of the data (33, 8)
   price  sqft  age  feats  ne  cust  cor   tax
0    699  1400   45      1   0     1    1   481
1   1900  2580    4      4   1     0    0  1534
2    870  1273    4      4   0     0    0   638
3    940  1305    5      3   0     0    0   647
4    975  1739   13      3   0     0    0   880
             price         sqft        age      feats         ne       cust  \
count    33.000000    33.000000  33.000000  33.000000  33.000000  33.000000   
mean   1154.484848  1794.696970  14.454545   3.878788   0.575758   0.242424   
std     374.196650   530.913804  12.530191   1.430856   0.501890   0.435194   
min     699.000000  1083.000000   2.000000   1.000000   0.000000   0.000000   
25%     900.000000  1400.000000   6.000000   3.000000   0.000000   0.000000   
50%    1050.000000  1700.000000   8.000000   4.000000   1.000000   0.000000   
75%    1270.000000  1920.000000  18.000000   4.000000   1.000000   0.000000   
max    2150.000000  2931.000000  45.000000   8.000000 

In [34]:
# Reports the fitted linear-model parameters of whatever `regressor` currently is.
# NOTE(review): the cells above leave `regressor` bound to a KNeighborsRegressor, so reading
# `intercept_` / `coef_` unconditionally would fail when the notebook is run top to bottom.
# The prints are therefore guarded on the attributes actually being present.
if all(hasattr(regressor, attr) for attr in ("intercept_", "coef_")):
    print(f'\tintercept = {regressor.intercept_}')
    print(f'\tcoefficient = {regressor.coef_}')
else:
    print(f'\t{type(regressor).__name__} has no fitted intercept/coefficients to report')

	intercept = 19099.67826777992
	coefficient = [  523.77577788  -148.38565385 -1834.00035723  5017.27702189
 10866.00460673 -1515.0452763 ]
