
**Authors:** Zofia Walczewska

In [1]:
#Setup / Imports
import pandas as pd
import matplotlib as plt
import numpy as np
import math
import statsmodels.api as sm
# Train / Test split:
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
houses_dataset = pd.read_csv("/content/houses (1).csv", index_col=0)
houses_dataset

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
6681,3.500,3,2.25,1860,8378,2.0,0.0,0.0,3,7,1860,0,1995,0.0,98038,47.3875,-122.032,1870,8378
17798,5.925,4,3.00,2170,8240,1.0,0.0,0.0,4,8,1370,800,1968,0.0,98052,47.6291,-122.093,2020,7944
18854,2.555,2,1.00,1440,43560,1.0,0.0,0.0,4,7,1150,290,1965,0.0,98027,47.4916,-122.082,1870,56628
13478,13.300,4,2.25,3260,4640,2.0,0.0,0.0,5,9,2360,900,1907,0.0,98112,47.6272,-122.312,3240,5800
10509,3.891,2,1.00,840,5400,1.0,0.0,0.0,4,7,840,0,1948,0.0,98118,47.5489,-122.271,1340,5400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16125,2.900,2,1.00,930,7740,1.0,0.0,0.0,3,6,930,0,1924,0.0,98125,47.7091,-122.292,1250,7740
19004,3.150,3,2.50,1730,6368,2.0,0.0,0.0,3,7,1730,0,1993,0.0,98038,47.3505,-122.032,1780,6597
9094,6.850,3,2.50,3450,8000,3.0,0.0,0.0,4,8,2970,480,1927,1975.0,98116,47.5605,-122.402,1880,6135
3537,3.260,6,1.50,1930,8400,1.0,0.0,0.0,3,7,1030,900,1971,0.0,98146,47.4869,-122.340,1780,9520


In [3]:
houses_dataset.shape

(13397, 19)

1. A quick statistical study about the dataset : predictive varibales, target variable, link between the variables.

In [4]:
def my_regression(data, idx_p, idx_t):
    X = data.iloc[:,idx_p]
    X = sm.add_constant(X) 
    Y = data.iloc[:,idx_t]
    model = sm.OLS(Y, X).fit() 
    return model

In [5]:
#target variable- Y
Y = houses_dataset['price']
columns = houses_dataset.columns
max_R_2 = 0
min_I_r = 1000000000000000000000000000000000000
for i in range(1, 19):
  model = my_regression(houses_dataset, [i], 0) # fit the model to predict Y using X
  print(columns[i])
  print("R^2",model.rsquared)
  R_2 = model.rsquared
  I_t=sum(model.resid**2)
  print("I_t", I_t)
  I_m=model.rsquared*I_t
  print('I_m',I_m)
  I_r=I_t-I_m
  print('I_r', I_r)
  print('critical probability', model.pvalues)
  critic_prob = model.pvalues
  print()
  if R_2 > max_R_2 and I_r < min_I_r:
    min_r_2 = R_2
    min_I_r = I_r
    min_criti_prob = critic_prob
    best_variable = i
  print('best var', best_variable)
print('best var based on statictics:', columns[13])

significant_vars = np.array([])
#check how the best variable is behaving with other variables
for i in range(1, 19):
  if i != best_variable:
    model = my_regression(houses_dataset, [best_variable, i], 0)
    if(model.pvalues[1] < 0.05):
      significant_vars = np.append(significant_vars, i)

print('variables that are significant in the presence of the best variable:', significant_vars)

bedrooms
R^2 0.0918534076592672
I_t 169063.80737826164
I_m 15529.086819543292
I_r 153534.72055871834
critical probability const        1.305121e-31
bedrooms    1.276790e-282
dtype: float64

best var 1
bathrooms
R^2 0.27690175512124326
I_t 134614.54727550482
I_m 37275.00440543886
I_r 97339.54287006595
critical probability const        0.678856
bathrooms    0.000000
dtype: float64

best var 2
sqft_living
R^2 0.49640102806105757
I_t 93751.78006044979
I_m 46538.48000456143
I_r 47213.30005588836
critical probability const          6.023324e-22
sqft_living    0.000000e+00
dtype: float64

best var 3
sqft_lot
R^2 0.007529065473369179
I_t 184761.92755494983
I_m 1391.0846495471103
I_r 183370.8429054027
critical probability const       0.000000e+00
sqft_lot    8.191470e-24
dtype: float64

best var 3
floors
R^2 0.06673028102799827
I_t 173740.81820156818
I_m 11593.773624625002
I_r 162147.04457694318
critical probability const     1.885723e-190
floors    3.539163e-203
dtype: float64

best var 3
wate

2. Simple regression models : you can use only one predictive variable from the 18. You are asked to choose the best one and evaluate its generalization error.

In [6]:
model = my_regression(houses_dataset, [13], 0)
model.summary()

train , test = train_test_split(houses_dataset, test_size = 0.25) 
print(train.shape)
print(test.shape)

def my_prediction(my_model, data):
    X_new = data[my_model.model.exog_names[1:]]
    X_new = sm.add_constant(X_new)
    predictions = my_model.predict(X_new)
    return predictions

def generalization_error_split_1(train, test, idx_p, idx_t):
  model = my_regression(train, idx_p, idx_t)
  predictions = my_prediction(model, test)
  MSE=np.mean((predictions-test.iloc[:,idx_t])**2)
  return MSE

#generalization_error_split(train, test, [13], 0)
min_gen_error_simple_regression = 384441531451994100000000
for var in range(1,19):
  error = generalization_error_split_1(train, test, [var], 0)
  print(columns[var],generalization_error_split_1(train, test, [var], 0))
  if error < min_gen_error_simple_regression:
    best_var = columns[var]
    min_gen_error_simple_regression = error
print()
print('smallest gen error has: ', best_var)
print('the error for simple regression: ', min_gen_error_simple_regression)

(10047, 19)
(3350, 19)
bedrooms 12.823332355833903
bathrooms 10.208452025657175
sqft_living 7.017563087396969
sqft_lot 14.037867664522041
floors 13.271413353320012
waterfront 13.024297177462978
view 12.491076142353661
condition 14.179990129729239
grade 8.052645393015863
sqft_above 8.658522831401445
sqft_basement 12.914233273027044
yr_built 14.131748555142527
yr_renovated 14.03828772373208
zipcode 14.125603818244285
lat 12.844807958972153
long 14.168979434868318
sqft_living15 9.505683089667501
sqft_lot15 14.055965688816476

smallest gen error has:  sqft_living
the error for simple regression:  7.017563087396969


3. Multiple regression : use the 18 variables to predict the sale price. Estimate the generalization error of this model.

In [7]:
error_multiple_regression_18_vars = generalization_error_split_1(train, test, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
                                                      11, 12, 13, 14, 15, 16, 17, 18], 0)

print('error of prediction using all 18 variables:', error_multiple_regression_18_vars)

error of prediction using all 18 variables: 4.270437542475231


4. Variable selection : apply the different variable selection techniques seen during the course to select interesting models. Can you find better models than before ? (according to the estimation of the generalization error)

In [8]:
def my_prediction(my_model, data):
  X_new = data[my_model.model.exog_names[1:]]
  X_new = sm.add_constant(X_new)
  predictions = my_model.predict(X_new)
  return predictions

def generalization_error_split(model, test, idx_t):
  predictions = my_prediction(model, test)
  MSE=np.mean((predictions-test.iloc[:,idx_t])**2)
  return MSE 

def step_selection_gen_error(train, test, v_s, v_nu, idx_t):
  best_error = 1000000000000
  best_v_s = []
  for var in v_nu:
    new_v_s = np.append(v_s,[var])
    model2=my_regression(train, new_v_s, idx_t)
    error = generalization_error_split(model2, test, idx_t)
    if(error < best_error):
      best_error = error
      best_v_s = var
  return best_v_s, best_error, model
     
def forward_selection_gen_error(data, idx_p, idx_t):
  train, test = train_test_split(data, test_size=0.25, random_state=20)
  v_s = []
  best_error = 10000000000
  v_nu = np.array(idx_p)
  stop = False
  while stop == False:
    result = step_selection_gen_error( train, test, v_s, v_nu, idx_t )
    var = int(result[0])
    error = result[1]
    if (result[1] < best_error):
      v_s = np.append( v_s , [ var ] ).astype(int)
      best_error = error
      v_nu = np.delete( v_nu, np.where(v_nu == var) )
    else:
      stop = True
  return v_s, best_error
    
best_vars = forward_selection_gen_error(train, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                                12, 13, 14, 15, 16, 17, 18], 0)
print('the array of best prediction variables:', best_vars[0])
error_multiple_regression_best_choice = best_vars[1]
print('error for thiose chosen variables:', error_multiple_regression_best_choice)

the array of best prediction variables: [ 3 15  7  9 12  6 14 16  2 17  8  5]
error for thiose chosen variables: 3.4994216578150064


5. Non-linear models : add some non-linear variables (that you can couple with variable selection techniques) to try to design more accurate models. First - strict stopping criteria.

The model with all the variables and their power 2:

In [9]:
def add_polynomial_feature(data, idx_p, power):
  new_data = data.copy(deep = True)
  for i in range(0, len(idx_p)):
      for j in power:
          for k in range(2, j+1):
              new_data['{}_pow_{}'.format(new_data.columns[idx_p[i]],k)] = new_data.iloc[:,idx_p[i]]**k
  return(new_data)

train_poly_all_var_2_pow = train
test_poly_all_var_2_pow = test
for i in range(1, 19):
    train_poly_all_var_2_pow = add_polynomial_feature(train_poly_all_var_2_pow, [i], [2])
    test_poly_all_var_2_pow = add_polynomial_feature(test_poly_all_var_2_pow, [i], [2])

model=my_regression(train_poly_all_var_2_pow,[*range(1, 37)],0)
Y_pred = my_prediction(model, test_poly_all_var_2_pow)
error_poly_all_variables = generalization_error_split(model, test_poly_all_var_2_pow, 0)
print('generalization error for second degree polynomials for every variable:', error_poly_all_variables)

generalization error for second degree polynomials for every variable: 3.3708521209557523


The model with 9 polynomials for each variable:

In [10]:
train_poly_2 = train
test_poly_2 = test
min = 351134689174875540000
print()
print('generalization errors for predictions based on 9 polynomials for each variable:')
for i in range(1, 19):
  for j in range(2, 10):
    train_poly2 = add_polynomial_feature(train_poly_2, [i], [j])
    test_poly2 = add_polynomial_feature(test_poly_2, [i], [j])
  model2=my_regression(train_poly2,[i,19, 20, 21, 22, 23, 24, 25, 26],0)
  Y_pred_2 = my_prediction(model2, test_poly2)
  error2 = generalization_error_split(model2, test_poly2, 0)
  if (error2 < min):
    min = error2
    min_column = columns[i]
    min_column_nb = i
  print(columns[i])
  print(error2)

print()
print('the smallest error:', min_column)
print(min)


generalization errors for predictions based on 9 polynomials for each variable:
bedrooms
12.729314852944812
bathrooms
9.239283997629402
sqft_living
7.536283058415072
sqft_lot
31.508485699724638
floors
13.042625004906043
waterfront
13.02429717746298
view
12.425719437856076
condition
14.073576342838045
grade
7.277724871663395
sqft_above
9.614725170591603
sqft_basement
28.46334027914823
yr_built
13.80728230433764
yr_renovated
41.299505267735405
zipcode
13.734210931526887
lat
11.642130734302112
long
14.1226878729471
sqft_living15
9.603850057551691
sqft_lot15
22.894910479820705

the smallest error: grade
7.277724871663395


The model with the best amount of polynomials for the chosen variable that had the smallest error with the polynomial 2.

In [11]:
### checking the best amount of polynomials for the best variable chosen before 
for j in range(3, 10):
    train_poly = add_polynomial_feature(train, [min_column_nb], [j])
    test_poly = add_polynomial_feature(test, [min_column_nb], [j])

v_s = [min_column_nb,19]
X = train_poly.iloc[:,min_column_nb] 
X = sm.add_constant(X) 
Y = train_poly['price']
model2=my_regression(train_poly,v_s,0)
error_poly_one_variable = generalization_error_split(model2, test_poly, 0)
best_v_s = v_s

for i in range(20, 27):
  v_s = np.append(v_s,[i])
  X = train_poly.iloc[:,min_column_nb] 
  X = sm.add_constant(X) 
  Y = train_poly['price']
  model2=my_regression(train_poly,v_s,0)

  if (generalization_error_split(model2, test_poly, 0) < min):
    error_poly_one_variable = generalization_error_split(model2, test_poly, 0)
    best_v_s = v_s

print('best min error for this variable: ', error_poly_one_variable)
print('the v_s that won: ', best_v_s)

best min error for this variable:  7.276393148195483
the v_s that won:  [ 9 19 20 21 22 23 24 25]


In [12]:
print('error for polynomial regression with best variable power 2: ', error_poly_one_variable)
print('error for polynomial regression with all variables power 2: ', error_poly_all_variables)
print('error for multiple regression best choice: ', error_multiple_regression_best_choice)
print('error for multiple regression all variables: ', error_multiple_regression_18_vars)
print('error for simple regression and best variable: ', min_gen_error_simple_regression)

error for polynomial regression with best variable power 2:  7.276393148195483
error for polynomial regression with all variables power 2:  3.3708521209557523
error for multiple regression best choice:  3.4994216578150064
error for multiple regression all variables:  4.270437542475231
error for simple regression and best variable:  7.017563087396969


Generation of predtictions for the competition dataset based on the model that had the smallest error out of all the models tested above:

In [14]:
competition = pd.read_csv("/content/houses_competition.csv", index_col=0)

for i in range(2, 19):
    houses_dataset = add_polynomial_feature(houses_dataset, [i], [2])
    competition = add_polynomial_feature(competition, [i-1], [2])

model=my_regression(houses_dataset,[*range(1, 36)],0)

# If you have one regression model named ’my_model’
pred = my_prediction(model, competition)

pred = pd.DataFrame({'ID': pred.index, 'Price':pred})
pred.to_csv('my_submission.csv', index=False)
# This will create a csv file ’my_submission.csv’ with the predictions
# of the competition dataset

6. Non-linear model with the second stopping criteria. Stop the forward selection until the 3rd unsuccesfull iteration (meaning that the error isnt getting better).


In [15]:
def forward_selection_gen_error_succes_iteration(data, idx_p, idx_t, max_nb_of_iter):
  train, test = train_test_split(data, test_size=0.25, random_state=20)
  v_s = []
  best_error = 10000000000
  v_nu = np.array(idx_p)
  stop = False
  nb_of_iterations = 0
  while stop == False and v_nu != []:
    result = step_selection_gen_error( train, test, v_s, v_nu, idx_t )
    var = int(result[0])
    error = result[1]
    if (result[1] < best_error):
      best_error = error
      best_v_s = v_s
      model = result[2]
    else:
      #count the number of iterations that havent found a better error
      nb_of_iterations = nb_of_iterations +1

    v_s = np.append( v_s , [ var ] ).astype(int)
    v_nu = np.delete( v_nu, np.where(v_nu == var) )

    #if the i-th unsuccesfull iteration isn't better, stop the search.
    if( nb_of_iterations == max_nb_of_iter):
      stop = True

  return best_v_s, best_error, model

best_vars = forward_selection_gen_error_succes_iteration(train, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                                12, 13, 14, 15, 16, 17, 18], 0, 5)
print('the array of best prediction variables with 3rd iteration stopping criteria:', best_vars)
error_multiple_regression_with_stopping_criteria_1 = best_vars[1]
print(error_multiple_regression_with_stopping_criteria_1)


the array of best prediction variables with 3rd iteration stopping criteria: (array([ 3, 15,  7,  9, 12,  6, 14, 16,  2, 17,  8]), 3.4994216578150064, <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fbbf817f490>)
3.4994216578150064


Using forward selection with iteration stopping criteria for finding the best variables from all the variables and their polynomials of 2nd degree:

In [17]:
#generating 2nd polynomials of all variables existing in the table
train_poly_all_var_2_pow = train
test_poly_all_var_2_pow = test
for i in range(1, 20):
    train_poly_all_var_2_pow = add_polynomial_feature(train_poly_all_var_2_pow, [i], [2])
    test_poly_all_var_2_pow = add_polynomial_feature(test_poly_all_var_2_pow, [i], [2])

best_vars_2nd_polys_iter_criteria = forward_selection_gen_error_succes_iteration(train_poly_all_var_2_pow, [*range(1, 37)], 0, 5)
print()
print('vars chosen from the array with all 2nd polynomials chosen by forward selection with 3rd non-success iteration')
print(best_vars_2nd_polys_iter_criteria[0])
print('error for this model', best_vars_2nd_polys_iter_criteria[1])


vars chosen from the array with all 2nd polynomials chosen by forward selection with 3rd non-success iteration
[27 12  3 15  6 33  7 14 34 35  2  8 30 16 13  4 32 22 18 24 23]
error for this model 3.1639397777785296


7. Non-linear model with the third stopping criteria- exhaustive method. Check all the possible variations and chose the best one. Not suitable for dataset with a lot of variables.

In [18]:
def forward_selection_gen_error_exhaustive(data, idx_p, idx_t):
  train, test = train_test_split(data, test_size=0.25, random_state=20)
  v_s = []
  best_error = 10000000000
  v_nu = np.array(idx_p)
  stop = False
  nb_of_iterations = 0
  #check the error while v_nu has still values -> exhaustive method, trying 
  #every possibility and chosing the one with the smallest error
  while v_nu.size > 0 :
    result = step_selection_gen_error( train, test, v_s, v_nu, idx_t )
    var = int(result[0])
    error = result[1]
    if (result[1] < best_error):
      best_error = error
      best_v_s = v_s
      model = result[2]

    v_s = np.append( v_s , [ var ] ).astype(int)
    v_nu = np.delete( v_nu, np.where(v_nu == var) )

  return best_v_s, best_error, model

best_vars = forward_selection_gen_error_exhaustive(train, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                                12, 13, 14, 15, 16, 17, 18], 0)
print('the array of best prediction variables with exhastive stopping criteria:', best_vars)
error_multiple_regression_with_stopping_criteria_2 = best_vars[1]
print(error_multiple_regression_with_stopping_criteria_2)

the array of best prediction variables with exhastive stopping criteria: (array([ 3, 15,  7,  9, 12,  6, 14, 16,  2, 17,  8]), 3.4994216578150064, <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fbbf817f490>)
3.4994216578150064


Using exhaustive forward selection to choose the best variables among all the variables and their 2nd degree polynomials:

In [19]:
best_vars_2nd_polys_exhauxtive = forward_selection_gen_error_exhaustive(train_poly_all_var_2_pow, [*range(1, 37)], 0)
print()
print('vars chosen from the array with all 2nd polynomials chosen by exhaustive forward selection')
print(best_vars_2nd_polys_exhauxtive[0])
print('error for this model', best_vars_2nd_polys_exhauxtive[1])


vars chosen from the array with all 2nd polynomials chosen by exhaustive forward selection
[27 12  3 15  6 33  7 14 34 35  2  8 30 16 13  4 32 22 18 24 23]
error for this model 3.1639397777785296


8. Use the critical probability as a performance criteria for variable selection (for this performance
criteria, only one particular stopping criterion can be used : stop when all selected variables are significant and no one from the other variables are significant when added). Significant means with a critical probability less than 0.05.

In [20]:
def step_selection_gen_error_critical_probability(train, test, v_s, v_nu, idx_t):
  best_error = 1000000000000
  best_v_s = []
  for var in v_nu:
    new_v_s = np.append(v_s,[var])
    model2=my_regression(train, new_v_s, idx_t)
    error = generalization_error_split(model2, test, idx_t)
    significance_of_addeded_var = model2.pvalues[-1]
    if(error < best_error):
      best_error = error
      best_v_s = var
  return best_v_s, best_error, significance_of_addeded_var
     
def forward_selection_gen_error_critical_probability(data, idx_p, idx_t):
  train, test = train_test_split(data, test_size=0.25, random_state=20)
  v_s = []
  best_error = 10000000000000000000000000000000
  v_nu = np.array(idx_p)
  stop = False
  while stop == False:
    result = step_selection_gen_error_critical_probability( train, test, v_s, v_nu, idx_t )
    var = int(result[0])
    error = result[1]
    significance = result[2]
    if (result[1] < best_error and significance < 0.05):
      v_s = np.append( v_s , [ var ] ).astype(int)
      best_error = error
      v_nu = np.delete( v_nu, np.where(v_nu == var) )
    else:
      stop = True
  return v_s, best_error
    
best_vars = forward_selection_gen_error_critical_probability(train, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                                12, 13, 14, 15, 16, 17, 18], 0)
print('the array of best prediction variables:', best_vars[0])
error_forward_selection_with_critical_probability = best_vars[1]
print('error for thiose chosen variables:', error_forward_selection_with_critical_probability)

the array of best prediction variables: [ 3 15  7  9 12  6 14 16  2]
error for thiose chosen variables: 3.539976938346405


9.  Implement the backward selection procedure with exhautive stopping criteria.

In [22]:
def backward_step_selection_gen_error(train, test, v_s, idx_t):
  best_error = 1000000000000000000000000000000000
  for var in v_s:
    new_v_s = np.delete( v_s, np.where(v_s == var) )
    model2=my_regression(train, new_v_s, idx_t)
    error = generalization_error_split(model2, test, idx_t)
    if(error < best_error):
      best_error = error
      best_var = var
  return best_var, best_error, model2

def backward_selection_gen_error(data, idx_p, idx_t):
  train, test = train_test_split(data, test_size=0.25, random_state=20)
  v_s = np.array(idx_p)
  model2=my_regression(train, v_s, idx_t)
  best_error = generalization_error_split(model2, test, idx_t)

  #check the error while v_nu has still values -> exhaustive method, trying 
  #every possibility and chosing the one with the smallest error
  while v_s.size > 0 :
    result = backward_step_selection_gen_error( train, test, v_s, idx_t )
    var = int(result[0])
    error = result[1]
    if (result[1] < best_error):
      best_error = error
      best_v_s = np.delete( v_s, np.where(v_s == var))
      model = result[2]
    v_s = np.delete( v_s, np.where(v_s == var))
  return best_v_s, best_error, model

train, test = train_test_split(houses_dataset, test_size=0.25, random_state=20)    
best_vars = backward_selection_gen_error(train, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
                                                12, 13, 14, 15, 16, 17, 18], 0)
print('the array of best prediction variables chosen in backward selection:', best_vars[0])
error_backward_selection = best_vars[1]
print('error of the backward selection', error_backward_selection)

the array of best prediction variables chosen in backward selection: [ 1  2  3  6  7  8  9 10 11 12 13 14 15 16 18]
error of the backward selection 4.633464308049005


Using backward selection to find the best variables among all the variables and theis 2nd degree polynomials:

In [23]:
best_vars_2nd_polys_backward_selection = backward_selection_gen_error(train_poly_all_var_2_pow, [*range(1, 37)], 0)
print()
print('vars chosen from the array with all 2nd polynomials chosen by exhaustive forward selection')
print(best_vars_2nd_polys_backward_selection[0])
print('error for this model', best_vars_2nd_polys_backward_selection[1])


vars chosen from the array with all 2nd polynomials chosen by exhaustive forward selection
[ 2  3  4  5  6  7  8 12 13 14 15 16 18 22 23 24 27 30 32 33 34 35]
error for this model 3.163939679836004


Errors from the models with different stopping criteria and backward selection used on the set of all the variables and their 2nd degree polynomials:

In [25]:
print('backward selection', best_vars_2nd_polys_backward_selection[1])
print('exhaustive forward selection',  best_vars_2nd_polys_exhauxtive[1])
print('forward selection with 5 iterations criteria', best_vars_2nd_polys_iter_criteria[1])

backward selection 3.163939679836004
exhaustive forward selection 3.1639397777785296
forward selection with 5 iterations criteria 3.1639397777785296


Backward selection seems to have the best results- for finding the best variable among all the variables and 2nd polynomials. 

Lets try to generate the predictions of prices for the competition dataset based on this model that choses the best variables among all the variables and theis 2nd degree polynomials:

In [28]:
result = backward_selection_gen_error(houses_dataset, [*range(1, 36)], 0)
model = result[2]

# If you have one regression model named ’my_model’
pred = my_prediction(model, competition)
print(pred)
pred = pd.DataFrame({'ID': pred.index, 'Price':pred})
pred.to_csv('my_submission.csv', index=False)
# This will create a csv file ’my_submission.csv’ with the predictions
# of the competition dataset

15529     3.129591
3233      4.055427
14381     5.350966
3201     12.147862
3425      2.054009
           ...    
7989      5.073475
9973      3.057983
9007      3.378706
16345    14.711418
13582     1.719877
Length: 2365, dtype: float64
