## Least-squares linear regression

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test tables from the local data directory.
h_train = pd.read_csv('./data/train.csv')
h_test = pd.read_csv('./data/test.csv')

# Fill missing values with each table's own column means.
# numeric_only=True keeps this working on pandas >= 2.0, where DataFrame.mean()
# raises TypeError on non-numeric columns instead of silently skipping them.
# Any non-numeric columns are therefore left unfilled.
h_train = h_train.fillna(h_train.mean(numeric_only=True))
h_test = h_test.fillna(h_test.mean(numeric_only=True))

# Candidate features are every column present in the test file;
# the regression target is the SalePrice column of the training file.
train_columns = h_test.columns
X = h_train[train_columns]
y = h_train['SalePrice']

# Hold out part of the training rows for validation (default split proportions,
# fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
h_test.shape

(1459, 80)

In [2]:
from sklearn.linear_model import LinearRegression

# Hand-picked predictor columns used for this model.
X_train_cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
                'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']

# Ordinary least-squares fit on the training split.
linreg = LinearRegression().fit(X_train[X_train_cols], y_train)

# Build the submission from a derived frame instead of overwriting h_test.
# Re-indexing h_test in place made this cell fail with KeyError('Id') when it
# was run a second time, because 'Id' was no longer a column.
submission = (
    h_test
    .assign(SalePrice=linreg.predict(h_test[X_train_cols]))
    .set_index('Id')['SalePrice']
)
submission.to_csv("./data/least_squares_1_subm.csv", index=True, header=True)

Least-squares linear regression using the features
'OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea'.


## Ridge linear regression

In [3]:
from sklearn.linear_model import Ridge

# Reload the test table so this cell does not depend on what earlier cells
# did to h_test.
h_test = pd.read_csv('./data/test.csv')
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() meets
# non-numeric columns; only numeric columns are imputed.
h_test = h_test.fillna(h_test.mean(numeric_only=True))

# Hand-picked predictor columns used for this model.
X_train_cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
                'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']

# Ridge regression with a fixed regularization strength on the unscaled features.
linridge = Ridge(alpha=20.0).fit(X_train[X_train_cols], y_train)

# Derive the submission from h_test without turning h_test itself into the
# submission frame; the written CSV has Id as the index column and SalePrice
# as the only value column.
submission = (
    h_test
    .assign(SalePrice=linridge.predict(h_test[X_train_cols]))
    .set_index('Id')['SalePrice']
)
submission.to_csv("./data/ridge_1_subm.csv", index=True, header=True)


Ridge linear regression (alpha = 20) using the features 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea'.

## Ridge linear regression with feature normalization

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge

# Fresh scaler for this cell; it is fitted on the training split below.
scaler = MinMaxScaler()

# Reload the test table so this cell does not depend on earlier mutations.
h_test = pd.read_csv('./data/test.csv')
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() meets
# non-numeric columns; only numeric columns are imputed.
h_test = h_test.fillna(h_test.mean(numeric_only=True))

# Hand-picked predictor columns used for this model.
X_train_cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
                'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']

# Fit the scaling on the training split only, then apply the same transform to
# the test-file features. Note that X_test_scaled here holds the scaled
# test.csv rows, not the held-out X_test split.
X_train_scaled = scaler.fit_transform(X_train[X_train_cols])
X_test_scaled = scaler.transform(h_test[X_train_cols])

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

# Derive the submission from h_test without turning h_test itself into the
# submission frame; Id becomes the index column of the written CSV.
submission = (
    h_test
    .assign(SalePrice=linridge.predict(X_test_scaled))
    .set_index('Id')['SalePrice']
)
submission.to_csv("./data/ridge_2_subm.csv", index=True, header=True)


  return self.partial_fit(X, y)


Ridge linear regression (alpha = 20) with MinMax feature normalization, using the features 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea'.

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge

# Fresh scaler for this cell; it is fitted on the training split below.
scaler = MinMaxScaler()

# Reload the test table so this cell does not depend on earlier mutations.
h_test = pd.read_csv('./data/test.csv')
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() meets
# non-numeric columns; only numeric columns are imputed.
h_test = h_test.fillna(h_test.mean(numeric_only=True))

# Wider feature set than the previous cells.
# NOTE(review): 'Id' looks like a row identifier rather than a property of the
# house (it is used as the submission index below); consider dropping it from
# the feature list — confirm before changing, since it alters the predictions.
X_train_cols = ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold']

# Fit the scaling on the training split only, then apply the same transform to
# the test-file features. X_test_scaled here holds the scaled test.csv rows,
# not the held-out X_test split.
X_train_scaled = scaler.fit_transform(X_train[X_train_cols])
X_test_scaled = scaler.transform(h_test[X_train_cols])

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

# Derive the submission from h_test without turning h_test itself into the
# submission frame; Id becomes the index column of the written CSV.
submission = (
    h_test
    .assign(SalePrice=linridge.predict(X_test_scaled))
    .set_index('Id')['SalePrice']
)
submission.to_csv("./data/ridge_3_subm.csv", index=True, header=True)

  return self.partial_fit(X, y)


Ridge linear regression (alpha = 20) with MinMax feature normalization, using the features
'Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
'MiscVal', 'MoSold', 'YrSold'.

## Ridge linear regression with regularization parameter: alpha

In [None]:
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge

# Fresh scaler for this cell; it is fitted on the training split below.
scaler = MinMaxScaler()

# Reload the test table so this cell does not depend on earlier mutations.
h_test = pd.read_csv('./data/test.csv')
# numeric_only=True: pandas >= 2.0 raises TypeError when mean() meets
# non-numeric columns; only numeric columns are imputed.
h_test = h_test.fillna(h_test.mean(numeric_only=True))

# Hand-picked predictor columns used for this experiment.
X_train_cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
                'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea']

# Scale using statistics from the training split; here X_test_scaled is the
# held-out validation split (X_test), which has known targets in y_test.
X_train_scaled = scaler.fit_transform(X_train[X_train_cols])
X_test_scaled = scaler.transform(X_test[X_train_cols])

# Compare regularization strengths: for each alpha, report how many
# coefficients have magnitude above 1.0 and the R^2 on both splits.
for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha = this_alpha).fit(X_train_scaled, y_train)
    r2_train = linridge.score(X_train_scaled, y_train)
    r2_test = linridge.score(X_test_scaled, y_test)
    num_coeff_bigger = np.sum(np.abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, r-squared training: {:.2f}, r-squared test: {:.2f}\n'
         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

# Scaled test.csv features, prepared with the same fitted scaler.
h_test_scaled = scaler.transform(h_test[X_train_cols])
    

## Lasso linear regression