## Import packages

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from evolutionary_search import EvolutionaryAlgorithmSearchCV
from hyperopt import fmin, tpe, hp, STATUS_OK
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn.ensemble import BaggingRegressor

## Step I. Load data

In [3]:
train = pd.read_csv('train.csv')
X_test = pd.read_csv('X_test.csv')

X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]

## Build the model and perform parameter tuning

In [4]:
def modelfit(clf, train, target, performCV=True, cv=10):
    clf.fit(train, target)
    pred = clf.predict(train)
    train_r2_score = r2_score(target, pred)
    
    print('\nModel Report')
    print('Train_coef_of_det: %0.6f' %(train_r2_score))
    
    if performCV:
        cv_r2_score = cross_val_score(clf, train, target, cv=cv, scoring='r2', n_jobs=-1)
        print('CV_coef_of_det: Mean-%0.6f | Std-%0.6f | Min-%0.6f | Max-%0.6f' %(np.mean(cv_r2_score),
            np.std(cv_r2_score), np.min(cv_r2_score), np.max(cv_r2_score)))   

### Baseline model

In [5]:
params = {}
grid = GridSearchCV(estimator=BaggingRegressor(), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{}
0.85706230754


In [6]:
params = {}
grid = GridSearchCV(estimator=BaggingRegressor(base_estimator=None, n_estimators=10, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False,
        n_jobs=1, random_state=10, verbose=0), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{}
0.850850639252


### Grid search

In [25]:
params = {}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=10, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=-1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


{}
0.850850639252


In [7]:
params = {'n_estimators': range(10,210,10)}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=10, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=-1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


{'n_estimators': 70}
0.867074503041


In [8]:
params = {'n_estimators': range(70,700,50)}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=10, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'n_estimators': 70}
0.867074503041


In [9]:
params = {'max_samples': [i/10.0 for i in range(1,11)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_samples': 0.9}
0.867136741441


In [10]:
params = {'max_samples': [i/100.0 for i in range(84,96)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_samples': 0.9}
0.867136741441


In [11]:
params = {'max_samples': [i/1000.0 for i in range(894,906)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_samples': 0.896}
0.868980362984


In [12]:
params = {'max_samples': [i/10000.0 for i in range(8954,8966)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=1.0,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_samples': 0.8958}
0.868980362984


In [13]:
params = {'max_features': [i/10.0 for i in range(1,11)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_features': 0.5}
0.87535570631


In [14]:
params = {'max_features': [i/100.0 for i in range(44,56)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_features': 0.53}
0.876106932589


In [15]:
params = {'max_features': [i/1000.0 for i in range(524,536)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_features': 0.529}
0.876106932589


In [16]:
params = {'max_features': [i/10000.0 for i in range(5284,5296)]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'max_features': 0.529}
0.876106932589


In [17]:
params = {'bootstrap': [True,False]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=0.529, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'bootstrap': True}
0.876106932589


In [18]:
params = {'bootstrap_features': [True,False]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=0.529, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'bootstrap_features': False}
0.876106932589


In [19]:
params = {'oob_score': [True,False]}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=0.529, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{'oob_score': True}
0.876106932589


### Final model

In [20]:
params = {}
grid = GridSearchCV(estimator=BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=0.529, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10), param_grid=params, cv=10, scoring='r2', n_jobs=-1)
grid.fit(X_train, y_train)
print grid.best_params_
print grid.best_score_

{}
0.876106932589


In [21]:
reg1 = BaggingRegressor(n_estimators=70, max_samples=0.896,
        max_features=0.529, bootstrap=True, bootstrap_features=False, oob_score=False, base_estimator=None,
        verbose=0, warm_start=False, n_jobs=1, random_state=10)
modelfit(reg1, X_train, y_train)


Model Report
Train_coef_of_det: 0.976409
CV_coef_of_det: Mean-0.876107 | Std-0.035142 | Min-0.792982 | Max-0.909830


### Grid search + Random search

In [7]:
params = {
          'n_estimators': range(10,150,10),
          'max_samples': [i/100.0 for i in range(85,94)],
          'max_features': [i/100.0 for i in range(49,58)],
          'bootstrap': [True,False],
          'bootstrap_features': [True,False],
          'oob_score': [False],
          'base_estimator': [None],
          'warm_start': [True],
          'verbose': [0],
          'n_jobs': [1],
          'random_state': [10]
         }
rand = RandomizedSearchCV(estimator=BaggingRegressor(), param_distributions=params, cv=10, scoring='r2',
         n_iter=1000, random_state=10, n_jobs=-1)
rand.fit(X_train, y_train)
print 'Best parameters: \n', rand.best_params_
print '\tBest score: ', rand.best_score_

  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


Best parameters: 
{'warm_start': True, 'max_samples': 0.9, 'oob_score': False, 'n_jobs': -1, 'verbose': 0, 'bootstrap': False, 'base_estimator': None, 'n_estimators': 110, 'random_state': 10, 'max_features': 0.48, 'bootstrap_features': False}
	Best score:  0.878835856234


In [None]:
params = {}
reg1 = BaggingRegressor(
                        n_estimators = params['n_estimators'],
                        max_samples = params['max_samples'],
                        max_features = params['max_features'],
                        bootstrap = params['bootstrap'],
                        bootstrap_features = params['bootstrap_features'],
                        oob_score = params['oob_score'],
                        base_estimator = params['base_estimator'],
                        warm_start = params['warm_start'],
                        verbose = params['verbose'],
                        n_jobs = params['n_jobs'],
                        random_state = params['random_state']  
                        )
modelfit(reg1, X_train, y_train)

### Grid search + Bayesian optimization ( 'hyperopt' package )

In [24]:
min_score = float('inf')
best_params = ''
c = 0

def score(params):
    global c, min_score, best_params
    c += 1
    print "Interation no.: ", c
    print "Training with params : "
    print params
    reg = BaggingRegressor(
                            n_estimators = params['n_estimators'],
                            max_samples = params['max_samples'],
                            max_features = params['max_features'],
                            bootstrap = params['bootstrap'],
                            bootstrap_features = params['bootstrap_features'],
                            oob_score = params['oob_score'],
                            base_estimator = params['base_estimator'],
                            warm_start = params['warm_start'],
                            verbose = params['verbose'],
                            n_jobs = params['n_jobs'],
                            random_state = params['random_state']        
                            )
    cv_r2_score = cross_val_score(reg, X_train, y_train, cv=10, scoring='r2', n_jobs=-1)
    score = 1 - np.mean(cv_r2_score) 
    if score < min_score:
        min_score = score
        best_params = params     
    print "\tScore: {0}".format(1-score)
    print "\tBest score: {0}\n".format(1-min_score)
    return {'loss': score, 'status': STATUS_OK}

def optimize():
    space = {
              'n_estimators': hp.choice('n_estimators', range(10,150,10)),
              'max_samples': hp.uniform('max_samples', 0.85, 0.94),
              'max_features': hp.uniform('max_features', 0.49, 0.58),
              'bootstrap': hp.choice('bootstrap', [True,False]),
              'bootstrap_features': hp.choice('bootstrap_features', [True,False]),
              'oob_score': hp.choice('oob_score', [False]),
              'base_estimator': hp.choice('base_estimator', [None]),
              'warm_start': hp.choice('warm_start', [True]),
              'verbose': hp.choice('verbose', [0]),
              'n_jobs': hp.choice('n_jobs', [1]),
              'random_state': hp.choice('random_state', [10])        
             }
    best = fmin(score, space, algo=tpe.suggest, max_evals=1000)    
  
optimize()
print 'Best parameters:'
print best_params
print "\tScore {0}".format(1-min_score)

Interation no.:  1
Training with params : 
{'warm_start': True, 'max_samples': 0.9177721030539293, 'base_estimator': None, 'n_jobs': 1, 'verbose': 0, 'bootstrap': False, 'oob_score': False, 'n_estimators': 120, 'random_state': 10, 'max_features': 0.5419723659990853, 'bootstrap_features': True}
	Score: 0.876591206057
	Best score: 0.876591206057

Interation no.:  2
Training with params : 
{'warm_start': True, 'max_samples': 0.911932804200888, 'base_estimator': None, 'n_jobs': 1, 'verbose': 0, 'bootstrap': False, 'oob_score': False, 'n_estimators': 40, 'random_state': 10, 'max_features': 0.5423947088349592, 'bootstrap_features': True}
	Score: 0.877033499488
	Best score: 0.877033499488

Interation no.:  3
Training with params : 
{'warm_start': True, 'max_samples': 0.8929581495632761, 'base_estimator': None, 'n_jobs': 1, 'verbose': 0, 'bootstrap': False, 'oob_score': False, 'n_estimators': 10, 'random_state': 10, 'max_features': 0.5283738286578658, 'bootstrap_features': True}
	Score: 0.8454

In [26]:
params = {'warm_start': True, 'max_samples': 0.9030810827110213, 'base_estimator': None, 'n_jobs': 1,
          'verbose': 0, 'bootstrap': False, 'oob_score': False, 'n_estimators': 110, 'random_state': 10,
          'max_features': 0.5351477142010489, 'bootstrap_features': False}
reg1 = BaggingRegressor(
                        n_estimators = params['n_estimators'],
                        max_samples = params['max_samples'],
                        max_features = params['max_features'],
                        bootstrap = params['bootstrap'],
                        bootstrap_features = params['bootstrap_features'],
                        oob_score = params['oob_score'],
                        base_estimator = params['base_estimator'],
                        warm_start = params['warm_start'],
                        verbose = params['verbose'],
                        n_jobs = params['n_jobs'],
                        random_state = params['random_state']  
                        )
modelfit(reg1, X_train, y_train)


Model Report
Train_coef_of_det: 0.998682
CV_coef_of_det: Mean-0.882370 | Std-0.036003 | Min-0.783940 | Max-0.911897


### Grid search + Genetic programming ( 'sklearn-deap' package )

In [None]:
params = {
          'n_estimators': range(10,150,10),
          'max_samples': [i/100.0 for i in range(85,94)],
          'max_features': [i/100.0 for i in range(49,58)],
          'bootstrap': [True,False],
          'bootstrap_features': [True,False],
          'oob_score': [False],
          'base_estimator': [None],
          'warm_start': [True],
          'verbose': [0],
          'n_jobs': [1],
          'random_state': [10]
         }
evol = EvolutionaryAlgorithmSearchCV(estimator=BaggingRegressor(), params=params, cv=10,
        scoring='r2', population_size=50, gene_mutation_prob=0.1, gene_crossover_prob=0.5, tournament_size=3,
        generations_number=10, verbose=True, n_jobs=-1)
evol.fit(X_train, y_train)

In [None]:
params = {}
reg1 = BaggingRegressor(
                        n_estimators = params['n_estimators'],
                        max_samples = params['max_samples'],
                        max_features = params['max_features'],
                        bootstrap = params['bootstrap'],
                        bootstrap_features = params['bootstrap_features'],
                        oob_score = params['oob_score'],
                        base_estimator = params['base_estimator'],
                        warm_start = params['warm_start'],
                        verbose = params['verbose'],
                        n_jobs = params['n_jobs'],
                        random_state = params['random_state']  
                        )
modelfit(reg1, X_train, y_train)

### Make predictions and save results

In [15]:
test_pred = reg1.predict(X_test)
dictn = {'Id': range(1461,2920), 'SalePrice': test_pred}
res = pd.DataFrame(dictn)
res.to_csv('submission.csv', index=0)