<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/Heamy_WalkThru.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install -U heamy

Requirement already up-to-date: heamy in /usr/local/lib/python3.6/dist-packages (0.0.7)


In [3]:
!pip3 install -U xgboost

Requirement already up-to-date: xgboost in /usr/local/lib/python3.6/dist-packages (0.90)


In [0]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score, max_error, mean_squared_log_error, median_absolute_error
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBClassifier

In [0]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

In [0]:
np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)

np.random.seed(1000)

In [0]:
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

In [0]:
# Load Dataset
dataset = Dataset(X_train, y_train, X_test)

In [0]:
# Stacking

In [0]:
rf = Regressor(dataset = dataset, estimator=RandomForestRegressor, parameters = {'n_estimators': 50}, name='rf')
lr = Regressor(dataset = dataset, estimator=LinearRegression, parameters = { 'normalize': True }, name='lr')

In [0]:
pipeline = ModelsPipeline(rf, lr)
stack_ds = pipeline.stack(k=10, seed=111)

In [0]:
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

In [42]:
vresults = stacker.validate(k=10, scorer=mean_absolute_error)

Metric: mean_absolute_error
Folds accuracy: [2.5485272320904206, 1.585947203705934, 1.9646762148757155, 1.9104764887776098, 2.6955293634588458, 2.685735645061458, 1.6081967683742928, 2.5313994479366007, 2.5056647989554, 2.2947297623230067]
Mean accuracy: 2.2330882925559288
Standard Deviation: 0.4087462156004626
Variance: 0.16707346876769985


In [26]:
mean_absolute_error(results, y_test)

2.410665001260684

In [28]:
mean_squared_error(results, y_test)

10.198920797936402

In [30]:
explained_variance_score(y_test, results)

0.9266825841328894

In [32]:
max_error(results, y_test)

8.153455088353994

In [45]:
mean_squared_log_error(results, y_test)

0.02119252764908774

In [48]:
median_absolute_error(results, y_test)

1.8758802030729207

In [50]:
r2_score(results, y_test)

0.8866574102398905

In [0]:
knn = Regressor(dataset = dataset, estimator=KNeighborsRegressor, parameters = { "n_neighbors" : 5}, name='knn')

In [0]:
new_pipeline = ModelsPipeline(rf, lr, knn)

In [54]:
w = new_pipeline.find_weights(mean_absolute_error)

Best Score (mean_absolute_error): 2.1733346665315545
Best Weights: [0.907493 0.092507 0.      ]


In [0]:
wresults = pipeline.weight(w)

In [0]:
# Stacking using heamy using allstate-claims-severity

In [65]:
!unzip /content/sample_data/allstate-claims-severity.zip

Archive:  /content/sample_data/allstate-claims-severity.zip
  inflating: train.csv               
  inflating: sample_submission.csv   
  inflating: test.csv                


In [0]:
import logging

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

"""
Read more about heamy:
https://github.com/rushter/heamy
"""
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline

In [0]:
logging.basicConfig(level = logging.DEBUG)

In [0]:
ID = 'id'
TARGET = 'loss'

DATA_DIR = "./"
SUBMISSION_FILE = "{0}/sample_submission.csv".format(DATA_DIR)

In [0]:
def first_dataset():
    train = pd.read_csv("{0}/train.csv".format(DATA_DIR))
    test = pd.read_csv("{0}/test.csv".format(DATA_DIR))

    y_train = train[TARGET].ravel()

    train.drop([ID, TARGET], axis=1, inplace=True)
    test.drop([ID], axis=1, inplace=True)

    train_test = pd.concat((train, test)).reset_index(drop=True)

    ntrain = train.shape[0]

    features = train.columns
    cats = [feat for feat in features if 'cat' in feat]
    for feat in cats:
        train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]
    x_train = np.array(train_test.iloc[:ntrain, :])
    x_test = np.array(train_test.iloc[ntrain:, :])

    return {'X_train': x_train, 'X_test': x_test, 'y_train': y_train}

In [0]:
def xgb_first(X_train, y_train, X_test, y_test=None):
  params = {
      
      'seed': 1111,
      'colsample_bytree': 0.7,
      'silent': 1,
      'subsample': .85,
      'learing_rate': 0.005,
      'objective': 'reg:linear',
      'max_depth': 10,
      'num_estimators': 550,
      'gamma': 0.05
  }
  
  X_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
  model = xgb.train(params, X_train, params['num_estimators'])
  return model.predict(xgb.DMatrix(X_test, missing=np.nan))


def xgb_stack(X_train, y_train, X_test, y_test=None):
  params = {
      
      'seed': 11111, 
      'colsample_bytree': 0.6,
      'silent': 1,
      'subsample': .85,
      'learning_rate': 0.004,
      'objective' : 'reg:linear',
      'max_depth': 10,
      'num_estimators': 550,
      'gamma': 0.05
     
  }
  
  X_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
  model = xgb.train(params, X_train, params['num_estimators'])
  return model.predict(xgb.DMatrix(X_test, missing=np.nan))

In [0]:
et_params = {'n_estimators': 100, 'max_features': 0.5,
             'max_depth': 18, 'min_samples_leaf': 4,
             'n_jobs': -1}
rf_params = {'n_estimators': 125, 'max_features': 0.2,
             'max_depth': 25, 'min_samples_leaf': 4,
             'n_jobs': -1}

In [0]:
ds = Dataset(preprocessor = first_dataset, use_cache=False)

In [0]:
pipeline = ModelsPipeline(    
    Regressor(estimator=xgb_first, dataset=ds, use_cache=False),
    Regressor(estimator=ExtraTreesRegressor, dataset=ds, use_cache=False, parameters=et_params),
    Regressor(estimator=RandomForestRegressor, dataset=ds, use_cache=False, parameters=rf_params),
    Regressor(estimator=LinearRegression, dataset=ds, use_cache=False)
)

In [90]:
stack_ds = pipeline.stack(k=4, seed=1111, add_diff=False, full_test=True)

INFO:heamy.estimator:Calculating xgb_first(cb454fe8cd594320e8d8cce3b8a17970)'s fold #1
INFO:heamy.estimator:Calculating xgb_first(cb454fe8cd594320e8d8cce3b8a17970)'s fold #2
INFO:heamy.estimator:Calculating xgb_first(cb454fe8cd594320e8d8cce3b8a17970)'s fold #3
INFO:heamy.estimator:Calculating xgb_first(cb454fe8cd594320e8d8cce3b8a17970)'s fold #4
INFO:heamy.estimator:Calculating xgb_first(cb454fe8cd594320e8d8cce3b8a17970)'s test data
INFO:heamy.estimator:Calculating ExtraTreesRegressor(39d55ae2d4a3f848b866d0495d9f34b6)'s fold #1
INFO:heamy.estimator:Calculating ExtraTreesRegressor(39d55ae2d4a3f848b866d0495d9f34b6)'s fold #2
INFO:heamy.estimator:Calculating ExtraTreesRegressor(39d55ae2d4a3f848b866d0495d9f34b6)'s fold #3
INFO:heamy.estimator:Calculating ExtraTreesRegressor(39d55ae2d4a3f848b866d0495d9f34b6)'s fold #4
INFO:heamy.estimator:Calculating ExtraTreesRegressor(39d55ae2d4a3f848b866d0495d9f34b6)'s test data
INFO:heamy.estimator:Calculating RandomForestRegressor(3d7183a0f6f6d8de37282

In [0]:
stacker = Regressor(dataset=stack_ds, estimator=xgb_stack, use_cache=False)
predictions = stacker.predict()

In [94]:
stacker.validate(k=4)

INFO:heamy.estimator:Calculating xgb_stack(8fa65385ca453ce2b157804792a5cb70)'s fold #1
INFO:heamy.estimator:Calculating xgb_stack(8fa65385ca453ce2b157804792a5cb70)'s fold #2
INFO:heamy.estimator:Calculating xgb_stack(8fa65385ca453ce2b157804792a5cb70)'s fold #3
INFO:heamy.estimator:Calculating xgb_stack(8fa65385ca453ce2b157804792a5cb70)'s fold #4


([array([10280.2 ,   585.18,  6609.32, ...,  1198.62,  5762.64,  1562.87]),
  array([2213.18,  939.85, 5142.87, ...,  839.41,  896.57, 4751.72]),
  array([1283.6 , 1132.22, 1071.77, ..., 1173.3 , 4659.57, 1108.34]),
  array([ 3005.09,  2763.85,  6184.59, ..., 12065.38,  2161.12,   804.28])],
 [array([7671.0513, 1143.5234, 8783.805 , ..., 2927.4187, 5030.2974,
         3088.1677], dtype=float32),
  array([1698.3885, 1037.8986, 4558.643 , ..., 2146.2117, 1641.1965,
         3036.1824], dtype=float32),
  array([1888.4546 , 1349.8948 ,  973.0861 , ..., 1393.9312 , 2217.946  ,
          983.81116], dtype=float32),
  array([3985.874 , 3441.2153, 5960.281 , ..., 5091.6724, 2159.5203,
         1683.6952], dtype=float32)])