In [25]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [26]:
import h2o
from h2o.automl import H2OAutoML
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

In [27]:
# h2o.init()

In [29]:
train_df = pd.read_csv('../input/ail302m/train.csv', sep=';')
test_df = pd.read_csv('../input/ail302m/test.csv', sep=';')

In [30]:
train_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white
1,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red
2,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red
3,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white
4,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.50,9.1,5,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6709,7.2,0.2,0.19,7.7,0.045,53,176,0.9958,3.17,0.38,9.5,5,white
6710,6.7,0.3,0.34,7.5,0.036,39,124,0.9912,2.99,0.32,12.4,8,white
6711,6.6,0.3,0.24,3.3,0.034,29,99,0.9903,3.10,0.40,12.3,7,white
6712,8.0,0.2,0.31,5.6,0.049,24,97,0.9930,3.10,0.42,10.9,5,white


In [31]:
def create_new_feature(data):
    # combine fixed and volatile acidity to create total acidity
    # and mean acidity
    acidity_features = ['fixed acidity', 'volatile acidity']
    data['total_acidity'] = data[acidity_features].sum(axis=1)
    data['average_acidity'] = data[acidity_features].mean(axis=1)
    
    #combine salts into total minerals and average minerals
    salt_features = ['chlorides','sulphates']
    data['total_minerals'] = data[salt_features].sum(axis=1)
    data['average_minerals'] = data[salt_features].mean(axis=1)
    
    #the sulfur that is not free
    sulfur_features = ['total sulfur dioxide','free sulfur dioxide']
    data['non_free_sulfur_dioxide'] = data[sulfur_features[0]] - data[sulfur_features[1]]
    
    #percentage of free sulfur
    data['percentage_free_sulfur'] = data[sulfur_features[1]] / data[sulfur_features[0]]
    
    #determine from all free sulfur how much is as salt
    data['percentage_salt_sulfur'] = data['sulphates'] / data['free sulfur dioxide']
    return data

In [32]:
train_df = create_new_feature(train_df)
test_df = create_new_feature(test_df)

In [33]:
type_dict = {cat:i for (i, cat) in enumerate(train_df['type'].unique())}
train_df['type']=train_df['type'].map(type_dict)
test_df['type']=test_df['type'].map(type_dict)

In [34]:
type_dict

{'white': 0, 'red': 1}

In [35]:
target = 'quality'
features = [c for c in train_df.columns if c not in target]

In [36]:
train_df.duplicated().sum()/len(train_df)


0.1636878165028299

In [37]:
test_df[features].duplicated().sum()/len(test_df[features])


0.13658536585365855

Should we drop duplicates?

In [38]:
train_df = train_df.drop_duplicates().reset_index(drop=True)

In [39]:
# merge train and test
train_df['isTest'] = 0
test_df['isTest'] = 1
data = pd.concat([train_df,test_df],axis=0)
data.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['isTest'] = 0


(6435, 22)

In [40]:
# Normalization data
from sklearn.preprocessing import StandardScaler
normal = StandardScaler()
col_normal = [col for col in data.columns if col != 'quality' and col != 'isTest' and col != 'type']
data[col_normal] = normal.fit_transform(data[col_normal])
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,isTest,id
0,-0.488985,-0.307637,0.280039,-0.85207,-0.588504,0.753716,0.234512,-1.264388,-1.352002,0.571834,...,0,-0.510532,-0.510532,0.375857,0.375857,0.001848,0.436843,-0.439836,0,
1,0.323198,0.849153,-0.383382,-0.696625,0.094075,-1.172312,-1.429334,0.644361,1.022693,0.634706,...,1,0.417115,0.417115,0.581358,0.581358,-1.331855,0.026457,1.00239,0,
2,0.840042,0.849153,0.213697,-0.474561,0.448005,-0.492537,0.252026,0.979229,0.022821,0.571834,...,1,0.916618,0.916618,0.603574,0.603574,0.50472,-0.980178,-0.047762,0,
3,0.175528,0.270758,0.081013,-0.008225,-0.335697,0.017294,0.322082,-0.159323,0.585249,-0.622747,...,0,0.203043,0.203043,-0.623877,-0.623877,0.3954,-0.467443,-0.43944,0,
4,-0.636655,-0.886032,-0.449725,3.367154,0.62497,0.300533,0.777451,1.715939,-0.35213,-0.245511,...,0,-0.724605,-0.724605,-0.079577,-0.079577,0.854544,-0.5164,-0.449171,0,


In [41]:
# split data
train_df = data[data['isTest'] == 0]
test_df = data[data['isTest'] == 1]
train_df.drop('isTest',axis=1,inplace=True)
test_df.drop(['isTest','quality'],axis=1,inplace=True)
train_df.shape,test_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop('isTest',axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['isTest','quality'],axis=1,inplace=True)


((5615, 21), (820, 20))

In [None]:
# processing imbalanced data using smogn
from smogn import smoter
train = smoter(data=train_df, y='quality', k=5)

In [33]:
# train_df = train_df[train_df['density']<1.01]

In [42]:
train_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,quality,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,id
0,-0.488985,-0.307637,0.280039,-0.852070,-0.588504,0.753716,0.234512,-1.264388,-1.352002,0.571834,...,6.0,0,-0.510532,-0.510532,0.375857,0.375857,0.001848,0.436843,-0.439836,
1,0.323198,0.849153,-0.383382,-0.696625,0.094075,-1.172312,-1.429334,0.644361,1.022693,0.634706,...,5.0,1,0.417115,0.417115,0.581358,0.581358,-1.331855,0.026457,1.002390,
2,0.840042,0.849153,0.213697,-0.474561,0.448005,-0.492537,0.252026,0.979229,0.022821,0.571834,...,5.0,1,0.916618,0.916618,0.603574,0.603574,0.504720,-0.980178,-0.047762,
3,0.175528,0.270758,0.081013,-0.008225,-0.335697,0.017294,0.322082,-0.159323,0.585249,-0.622747,...,6.0,0,0.203043,0.203043,-0.623877,-0.623877,0.395400,-0.467443,-0.439440,
4,-0.636655,-0.886032,-0.449725,3.367154,0.624970,0.300533,0.777451,1.715939,-0.352130,-0.245511,...,5.0,0,-0.724605,-0.724605,-0.079577,-0.079577,0.854544,-0.516400,-0.449171,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6708,-0.119811,0.849153,-1.909252,-0.541180,0.448005,-0.492537,-0.360970,0.945742,1.710104,0.383216,...,5.0,1,-0.011030,-0.011030,0.436952,0.436952,-0.260519,-0.473458,-0.084254,
6709,-0.045976,-0.886032,-0.847777,0.591349,-0.335697,1.320195,1.110220,0.376466,-0.352130,-0.999984,...,5.0,0,-0.153745,-0.153745,-0.957122,-0.957122,0.876408,0.113563,-0.630942,
6710,-0.415150,-0.307637,0.147355,0.546936,-0.563224,0.527124,0.199483,-1.163928,-1.476986,-1.377220,...,8.0,0,-0.439175,-0.439175,-1.340354,-1.340354,0.045576,0.221339,-0.604495,
6712,0.544703,-0.886032,-0.051672,0.125013,-0.234574,-0.322594,-0.273399,-0.561165,-0.789574,-0.748493,...,5.0,0,0.417115,0.417115,-0.712743,-0.712743,-0.216792,-0.319107,-0.367065,


In [35]:
# sns.boxplot(train_df['density'])

In [36]:
# train_df[train_df['type']==0]

In [43]:
train_df_red = train_df[train_df['type']==type_dict['red']]
train_df_white = train_df[train_df['type']==type_dict['white']]
test_df_red = test_df[test_df['type']==type_dict['red']]
test_df_white = test_df[test_df['type']==type_dict['white']]

In [44]:
test_df_red.id

0     -1.036521
5      1.187546
7      1.034199
12    -1.589210
13     1.703496
         ...   
804    1.291375
809   -0.637179
811   -0.180865
814    1.204052
817    1.004381
Name: id, Length: 233, dtype: float64

In [45]:
# train = h2o.H2OFrame(train_df)
# test = h2o.H2OFrame(test_df)


In [45]:
def stratify(df, seed=42):
    stratify = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    df['fold'] = np.zeros(len(df), dtype='int')
    
    for (i, (train_idx, val_idx)) in enumerate(stratify.split(df, y=df['quality'])):
        df['fold'].iloc[val_idx] = i
        
    return df

In [46]:
train_df_red = stratify(train_df_red, seed=30)
train_df_red

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fold'] = np.zeros(len(df), dtype='int')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fold'].iloc[val_idx] = i


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,id,fold
1,0.323198,0.849153,-0.383382,-0.696625,0.094075,-1.172312,-1.429334,0.644361,1.022693,0.634706,...,1,0.417115,0.417115,0.581358,0.581358,-1.331855,0.026457,1.002390,,1
2,0.840042,0.849153,0.213697,-0.474561,0.448005,-0.492537,0.252026,0.979229,0.022821,0.571834,...,1,0.916618,0.916618,0.603574,0.603574,0.504720,-0.980178,-0.047762,,0
18,1.135381,0.270758,1.142487,-0.496767,0.624970,-0.662481,-0.028201,1.180150,0.460265,0.383216,...,1,1.130690,1.130690,0.475831,0.475831,0.220488,-1.005891,0.037385,,4
20,-0.193646,0.270758,-0.847777,-0.607799,0.321601,-0.662481,-1.341763,0.041598,1.022693,0.131725,...,1,-0.153745,-0.153745,0.187018,0.187018,-1.419311,1.715434,-0.019380,,4
21,0.692372,0.270758,0.412723,-0.541180,0.877777,-0.379241,-0.921423,1.079690,0.522757,1.074815,...,1,0.702545,0.702545,1.142320,1.142320,-1.003895,0.775670,-0.025550,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,0.618537,-0.307637,0.280039,-0.630006,-0.259855,0.300533,-1.044022,0.108572,0.272789,2.960996,...,1,0.559830,0.559830,2.558611,2.558611,-1.419311,3.007294,-0.076955,,2
6695,-0.710490,-0.307637,1.076145,-0.718831,0.271040,-0.662481,-0.903909,0.409953,1.335153,1.514924,...,1,-0.724605,-0.724605,1.397808,1.397808,-0.872711,0.064794,0.292828,,2
6703,-0.193646,0.849153,-1.511199,-0.674419,0.574408,-1.115664,-1.692046,0.041598,1.272661,0.320343,...,1,-0.082387,-0.082387,0.409182,0.409182,-1.681678,2.722325,0.693021,,0
6704,-1.448838,2.005943,-0.847777,-0.785451,2.596865,-1.285607,-0.886395,-0.226296,2.460008,0.446088,...,1,-1.152749,-1.152749,0.964590,0.964590,-0.610343,-1.402678,1.411909,,3


In [47]:
train_df_white = stratify(train_df_white, seed=29)
train_df_white

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fold'] = np.zeros(len(df), dtype='int')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fold'].iloc[val_idx] = i


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,type,total_acidity,average_acidity,total_minerals,average_minerals,non_free_sulfur_dioxide,percentage_free_sulfur,percentage_salt_sulfur,id,fold
0,-0.488985,-0.307637,0.280039,-0.852070,-0.588504,0.753716,0.234512,-1.264388,-1.352002,0.571834,...,0,-0.510532,-0.510532,0.375857,0.375857,0.001848,0.436843,-0.439836,,4
3,0.175528,0.270758,0.081013,-0.008225,-0.335697,0.017294,0.322082,-0.159323,0.585249,-0.622747,...,0,0.203043,0.203043,-0.623877,-0.623877,0.395400,-0.467443,-0.439440,,4
4,-0.636655,-0.886032,-0.449725,3.367154,0.624970,0.300533,0.777451,1.715939,-0.352130,-0.245511,...,0,-0.724605,-0.724605,-0.079577,-0.079577,0.854544,-0.516400,-0.449171,,4
5,0.249363,-0.307637,1.341514,1.812703,-0.411539,1.773378,0.619823,1.247124,-1.539478,-0.434129,...,0,0.203043,0.203043,-0.473917,-0.473917,0.089304,1.007889,-0.617273,,2
6,-0.636655,-0.307637,0.545408,0.258252,-0.462101,0.923660,0.987621,0.209032,-0.477114,-0.497002,...,0,-0.653247,-0.653247,-0.540566,-0.540566,0.876408,-0.119609,-0.558647,,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6707,-0.267481,0.270758,2.004935,1.524019,-0.335697,1.320195,0.759936,0.945742,-0.664590,-0.559875,...,0,-0.225102,-0.225102,-0.568336,-0.568336,0.439128,0.424549,-0.597204,,1
6709,-0.045976,-0.886032,-0.847777,0.591349,-0.335697,1.320195,1.110220,0.376466,-0.352130,-0.999984,...,0,-0.153745,-0.153745,-0.957122,-0.957122,0.876408,0.113563,-0.630942,,4
6710,-0.415150,-0.307637,0.147355,0.546936,-0.563224,0.527124,0.199483,-1.163928,-1.476986,-1.377220,...,0,-0.439175,-0.439175,-1.340354,-1.340354,0.045576,0.221339,-0.604495,,4
6712,0.544703,-0.886032,-0.051672,0.125013,-0.234574,-0.322594,-0.273399,-0.561165,-0.789574,-0.748493,...,0,0.417115,0.417115,-0.712743,-0.712743,-0.216792,-0.319107,-0.367065,,2


In [48]:
train_white = h2o.H2OFrame(train_df_white)
train_red = h2o.H2OFrame(train_df_red)

test_white = h2o.H2OFrame(test_df_white)
test_red = h2o.H2OFrame(test_df_red)

H2OConnectionError: Not connected to a cluster. Did you run `h2o.init()` or `h2o.connect()`?

In [59]:
len(test_red)

233

In [60]:
target = 'quality'
features = [c for c in train_df.columns if c not in [target, 'type', 'fold']]
features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [61]:
# type_dict = {cat:i for (i, cat) in enumerate(train['type'].unique().as_data_frame())}
# # type_dict
# # {'white': 0, 'red': 1}
# train['type']=train['type'].map(type_dict)
# test['type']=test['type'].map(type_dict)

# train['type'] = train['type'].asfactor()
# test['type'] = test['type'].asfactor()

In [63]:
# Run AutoML for 20 base models
aml = H2OAutoML(
    max_models=20, 
                seed=1, 
#                 max_runtime_secs = 60*60
               )
aml.train(x=features, y=target, training_frame=train_white,     fold_column='fold')

AutoML progress: |
02:58:43.118: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_1_AutoML_2_20220614_25843

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.06302859647409952
RMSE: 0.25105496703729946
MAE: 0.19068940572564008
RMSLE: 0.03765740627978781
R^2: 0.9223828627185502
Mean Residual Deviance: 0.06302859647409952
Null degrees of freedom: 4051
Residual degrees of freedom: 4045
Null deviance: 3290.4057255677694
Residual deviance: 255.39187291305123
AIC: 314.6742292782539

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.43822090447423756
RMSE: 0.661982556019596
MAE: 0.5066079744365157
RMSLE: 0.09870809284592329
R^2: 0.46034888915612726
Mean Residual Deviance: 0.43

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.506664,0.007139,0.498832,0.501848,0.515388,0.504411,0.512841
1,mean_residual_deviance,0.438212,0.011033,0.452261,0.426507,0.43647,0.42927,0.446554
2,mse,0.438212,0.011033,0.452261,0.426507,0.43647,0.42927,0.446554
3,null_deviance,658.08276,4.311973,660.8355,661.98724,653.3879,653.3879,660.8152
4,r2,0.460364,0.012725,0.444967,0.477477,0.458911,0.467837,0.452628
5,residual_deviance,355.12787,8.971119,366.78333,345.89722,353.5408,347.709,361.70892
6,rmse,0.661934,0.008325,0.672503,0.653075,0.660659,0.655187,0.668247
7,rmsle,0.0987,0.001338,0.100322,0.097189,0.098854,0.097522,0.099615




In [78]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)


model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_2_20220614_25843,0.661983,0.438221,0.506608,0.0987081,0.438221
StackedEnsemble_BestOfFamily_1_AutoML_2_20220614_25843,0.663791,0.440618,0.508002,0.0989468,0.440618
DRF_1_AutoML_2_20220614_25843,0.671783,0.451293,0.515737,0.100347,0.451293
XRT_1_AutoML_2_20220614_25843,0.674161,0.454493,0.520131,0.100607,0.454493
GBM_4_AutoML_2_20220614_25843,0.69488,0.482859,0.536771,0.10323,0.482859
XGBoost_grid_1_AutoML_2_20220614_25843_model_2,0.699323,0.489052,0.536927,0.104254,0.489052
GBM_5_AutoML_2_20220614_25843,0.700658,0.490922,0.544808,0.103951,0.490922
GBM_3_AutoML_2_20220614_25843,0.702317,0.49325,0.544696,0.104259,0.49325
GBM_2_AutoML_2_20220614_25843,0.702567,0.4936,0.546465,0.104122,0.4936
GBM_grid_1_AutoML_2_20220614_25843_model_1,0.706437,0.499054,0.547563,0.104927,0.499054




In [100]:
# mean_squared_error(aml.predict(train_white).as_data_frame(), train_white[target].as_data_frame(), squared=False)

In [101]:
# type(aml.predict(train_white))

In [77]:
# pred = aml.predict(test)
# pred.head()

In [None]:
# submission_df = pd.DataFrame({'id':test.as_data_frame()['id'], 'quality':pred.as_data_frame()['predict']})

# submission_df

In [None]:
# submission_df.to_csv('submission.csv', index=False)

In [None]:
# aml.leader.base_models

In [79]:
aml.leader.metalearner()

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  metalearner_AUTO_StackedEnsemble_AllModels_1_AutoML_2_20220614_25843


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.006094 )","nlambda = 100, lambda.max = 0.769, lambda.min = 0.006094, lambda.1...",20,6,53,levelone_training_StackedEnsemble_AllModels_1_AutoML_2_20220614_25843




ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 0.43604564810319363
RMSE: 0.660337525893534
MAE: 0.5049370934764137
RMSLE: 0.09847742809132419
R^2: 0.46302762836053024
Mean Residual Deviance: 0.43604564810319363
Null degrees of freedom: 4051
Residual degrees of freedom: 4045
Null deviance: 3290.4057255677694
Residual deviance: 1766.8569661141405
AIC: 8151.884064705501

ModelMetricsRegressionGLM: glm
** Reported on cross-validation data. **

MSE: 0.43822090447423756
RMSE: 0.661982556019596
MAE: 0.5066079744365157
RMSLE: 0.09870809284592329
R^2: 0.46034888915612726
Mean Residual Deviance: 0.43822090447423756
Null degrees of freedom: 4051
Residual degrees of freedom: 4044
Null deviance: 3290.413677499588
Residual deviance: 1775.6711049296107
AIC: 8174.047610886789

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.506664,0.007139,0.498832,0.501848,0.515388,0.504411,0.512841
1,mean_residual_deviance,0.438212,0.011033,0.452261,0.426507,0.43647,0.42927,0.446554
2,mse,0.438212,0.011033,0.452261,0.426507,0.43647,0.42927,0.446554
3,null_deviance,658.08276,4.311973,660.8355,661.98724,653.3879,653.3879,660.8152
4,r2,0.460364,0.012725,0.444967,0.477477,0.458911,0.467837,0.452628
5,residual_deviance,355.12787,8.971119,366.78333,345.89722,353.5408,347.709,361.70892
6,rmse,0.661934,0.008325,0.672503,0.653075,0.660659,0.655187,0.668247
7,rmsle,0.0987,0.001338,0.100322,0.097189,0.098854,0.097522,0.099615



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,alpha,iterations,training_rmse,training_deviance,training_mae,training_r2
0,,2022-06-14 03:12:46,0.000 sec,1,0.77,1,0.812045,0.5,,,,,
1,,2022-06-14 03:12:46,0.001 sec,2,0.7,4,0.767216,0.5,,,,,
2,,2022-06-14 03:12:46,0.005 sec,3,0.64,5,0.725177,0.5,,,,,
3,,2022-06-14 03:12:46,0.018 sec,4,0.58,5,0.686577,0.5,,,,,
4,,2022-06-14 03:12:46,0.021 sec,5,0.53,5,0.65289,0.5,5.0,0.808016,0.65289,0.616408,0.195993
5,,2022-06-14 03:12:46,0.027 sec,6,0.48,5,0.623215,0.5,,,,,
6,,2022-06-14 03:12:46,0.029 sec,7,0.44,7,0.595352,0.5,,,,,
7,,2022-06-14 03:12:46,0.030 sec,8,0.4,8,0.571953,0.5,,,,,
8,,2022-06-14 03:12:46,0.032 sec,9,0.37,8,0.551662,0.5,,,,,
9,,2022-06-14 03:12:46,0.033 sec,10,0.33,8,0.534561,0.5,10.0,0.731137,0.534561,0.56778,0.34171



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,DRF_1_AutoML_2_20220614_25843,0.19198,1.0,0.299988
1,XRT_1_AutoML_2_20220614_25843,0.13328,0.694238,0.208263
2,XGBoost_2_AutoML_2_20220614_25843,0.103393,0.538564,0.161563
3,XGBoost_grid_1_AutoML_2_20220614_25843_model_2,0.096037,0.500244,0.150067
4,DeepLearning_grid_3_AutoML_2_20220614_25843_model_1,0.071658,0.373256,0.111972
5,XGBoost_1_AutoML_2_20220614_25843,0.043611,0.227164,0.068146
6,GBM_4_AutoML_2_20220614_25843,0.0,0.0,0.0
7,GBM_5_AutoML_2_20220614_25843,0.0,0.0,0.0
8,GBM_3_AutoML_2_20220614_25843,0.0,0.0,0.0
9,GBM_2_AutoML_2_20220614_25843,0.0,0.0,0.0




In [None]:
# h2o.get_model('XRT_1_AutoML_3_20220609_25602')

In [None]:
# h2o.get_model('XGBoost_grid_1_AutoML_3_20220609_25602_model_2')

In [64]:
# Run AutoML for 20 base models
aml_red = H2OAutoML(
    max_models=20, 
                seed=5, 
#                 max_runtime_secs = 60*60
               )
aml_red.train(x=features, y=target, training_frame=train_red, fold_column='fold')

AutoML progress: |
03:13:23.194: Fold column fold will be used for cross-validation. nfolds parameter will be ignored.

███████████████████████████████████████████████████████████████| (done) 100%
Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_1_AutoML_3_20220614_31323

No model summary for this model

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 0.06986048775837325
RMSE: 0.2643113462535675
MAE: 0.1981077576945904
RMSLE: 0.04145903240750547
R^2: 0.9000835982955395
Mean Residual Deviance: 0.06986048775837325
Null degrees of freedom: 1562
Residual degrees of freedom: 1557
Null deviance: 1092.833013435689
Residual deviance: 109.19194236633739
AIC: 290.0601996327403

ModelMetricsRegressionGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.3820225353967769
RMSE: 0.6180797160535014
MAE: 0.46675943400852704
RMSLE: 0.09570796219271935
R^2: 0.4536208043825767
Mean Residual Deviance: 0.38

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.468004,0.012151,0.460161,0.4698,0.455771,0.487369,0.466917
1,mean_residual_deviance,0.38255,0.013809,0.380028,0.388215,0.365409,0.402443,0.376655
2,mse,0.38255,0.013809,0.380028,0.388215,0.365409,0.402443,0.376655
3,null_deviance,218.57353,3.229615,216.17413,222.17413,222.01868,216.66078,215.83992
4,r2,0.452687,0.022915,0.44975,0.453076,0.484821,0.420267,0.455522
5,residual_deviance,119.582436,4.221552,118.948845,121.51142,114.37312,125.56228,117.516495
6,rmse,0.618426,0.01114,0.616464,0.623069,0.604491,0.634384,0.613723
7,rmsle,0.095735,0.002037,0.094811,0.09754,0.094416,0.098261,0.093645




In [80]:
# View the AutoML Leaderboard
lb_red = aml_red.leaderboard
lb_red.head(rows=lb_red.nrows)  # Print all rows instead of default (10 rows)


model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_BestOfFamily_1_AutoML_3_20220614_31323,0.61808,0.382023,0.466759,0.095708,0.382023
StackedEnsemble_AllModels_1_AutoML_3_20220614_31323,0.619038,0.383208,0.466648,0.0958107,0.383208
XRT_1_AutoML_3_20220614_31323,0.625475,0.391219,0.476175,0.0969651,0.391219
DRF_1_AutoML_3_20220614_31323,0.628785,0.39537,0.475492,0.0975401,0.39537
GBM_grid_1_AutoML_3_20220614_31323_model_1,0.642278,0.412521,0.487909,0.0993878,0.412521
XGBoost_grid_1_AutoML_3_20220614_31323_model_1,0.643812,0.414494,0.486062,0.0996406,0.414494
GBM_5_AutoML_3_20220614_31323,0.644639,0.41556,0.490895,0.0996822,0.41556
GBM_3_AutoML_3_20220614_31323,0.646489,0.417948,0.4938,0.100196,0.417948
GBM_4_AutoML_3_20220614_31323,0.64707,0.418699,0.494308,0.100116,0.418699
GBM_2_AutoML_3_20220614_31323,0.651403,0.424326,0.49869,0.100809,0.424326




In [81]:
aml_red.leader.metalearner()

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  metalearner_AUTO_StackedEnsemble_BestOfFamily_1_AutoML_3_20220614_31323


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.004975 )","nlambda = 100, lambda.max = 0.6278, lambda.min = 0.004975, lambda....",6,5,53,levelone_training_StackedEnsemble_BestOfFamily_1_AutoML_3_20220614...




ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 0.3800391406615523
RMSE: 0.6164731467481388
MAE: 0.4651134064847684
RMSLE: 0.09544645119514805
R^2: 0.4564575103871016
Mean Residual Deviance: 0.3800391406615523
Null degrees of freedom: 1562
Residual degrees of freedom: 1557
Null deviance: 1092.833013435689
Residual deviance: 594.0011768540063
AIC: 2937.429005181186

ModelMetricsRegressionGLM: glm
** Reported on cross-validation data. **

MSE: 0.3820225353967769
RMSE: 0.6180797160535014
MAE: 0.46675943400852704
RMSLE: 0.09570796219271935
R^2: 0.4536208043825767
Mean Residual Deviance: 0.3820225353967769
Null degrees of freedom: 1562
Residual degrees of freedom: 1557
Null deviance: 1092.867654226258
Residual deviance: 597.1012228251623
AIC: 2945.5649686297297

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.468004,0.012151,0.460161,0.4698,0.455771,0.487369,0.466917
1,mean_residual_deviance,0.38255,0.013809,0.380028,0.388215,0.365409,0.402443,0.376655
2,mse,0.38255,0.013809,0.380028,0.388215,0.365409,0.402443,0.376655
3,null_deviance,218.57353,3.229615,216.17413,222.17413,222.01868,216.66078,215.83992
4,r2,0.452687,0.022915,0.44975,0.453076,0.484821,0.420267,0.455522
5,residual_deviance,119.582436,4.221552,118.948845,121.51142,114.37312,125.56228,117.516495
6,rmse,0.618426,0.01114,0.616464,0.623069,0.604491,0.634384,0.613723
7,rmsle,0.095735,0.002037,0.094811,0.09754,0.094416,0.098261,0.093645



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,alpha,iterations,training_rmse,training_deviance,training_mae,training_r2
0,,2022-06-14 03:18:15,0.000 sec,1,0.63,1,0.699189,0.5,,,,,
1,,2022-06-14 03:18:15,0.001 sec,2,0.57,2,0.672465,0.5,,,,,
2,,2022-06-14 03:18:15,0.001 sec,3,0.52,5,0.635874,0.5,,,,,
3,,2022-06-14 03:18:15,0.002 sec,4,0.47,5,0.599842,0.5,,,,,
4,,2022-06-14 03:18:15,0.003 sec,5,0.43,5,0.568605,0.5,5.0,0.754059,0.568605,0.633652,0.186765
5,,2022-06-14 03:18:15,0.005 sec,6,0.39,5,0.541475,0.5,,,,,
6,,2022-06-14 03:18:15,0.006 sec,7,0.36,5,0.518034,0.5,,,,,
7,,2022-06-14 03:18:15,0.006 sec,8,0.33,5,0.497819,0.5,,,,,
8,,2022-06-14 03:18:15,0.007 sec,9,0.3,6,0.480388,0.5,,,,,
9,,2022-06-14 03:18:15,0.007 sec,10,0.27,6,0.464699,0.5,10.0,0.681688,0.464699,0.551561,0.335375



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,XRT_1_AutoML_3_20220614_31323,0.181427,1.0,0.310956
1,DRF_1_AutoML_3_20220614_31323,0.120882,0.666281,0.207184
2,GBM_grid_1_AutoML_3_20220614_31323_model_1,0.103474,0.570334,0.177349
3,XGBoost_grid_1_AutoML_3_20220614_31323_model_1,0.090013,0.496138,0.154277
4,DeepLearning_grid_1_AutoML_3_20220614_31323_model_1,0.087654,0.483134,0.150233
5,GLM_1_AutoML_3_20220614_31323,0.0,0.0,0.0




In [82]:
test_y_red = aml_red.predict(test_red[features]).as_data_frame()['predict']
test_y_red

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


0      6.896752
1      5.217917
2      6.431069
3      5.811726
4      5.503815
         ...   
228    6.632493
229    6.657870
230    6.204942
231    5.550302
232    6.039355
Name: predict, Length: 233, dtype: float64

In [83]:
test_y_white = aml.predict(test_white[features]).as_data_frame()['predict']
test_y_white

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


0      5.588221
1      4.896008
2      6.127342
3      5.742914
4      5.236204
         ...   
582    5.825178
583    6.487025
584    5.607108
585    6.579523
586    5.965495
Name: predict, Length: 587, dtype: float64

In [84]:
test_df_red

Unnamed: 0,id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type
0,1257,7.2,0.25,0.37,2.5,0.063,11.0,41.0,0.99439,3.52,0.80,12.4,1
5,5434,10.2,0.54,0.37,15.4,0.214,55.0,95.0,1.00369,3.18,0.77,9.0,1
7,5146,6.2,0.39,0.43,2.0,0.071,14.0,24.0,0.99428,3.45,0.87,11.2,1
12,219,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,1
13,6403,10.4,0.41,0.55,3.2,0.076,22.0,54.0,0.99960,3.15,0.89,9.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
804,5629,7.2,0.39,0.44,2.6,0.066,22.0,48.0,0.99494,3.30,0.84,11.5,1
809,2007,8.5,0.32,0.42,2.3,0.075,12.0,19.0,0.99434,3.14,0.71,11.8,1
811,2864,10.6,0.36,0.57,2.3,0.087,6.0,20.0,0.99676,3.14,0.72,11.1,1
814,5465,9.1,0.52,0.33,1.3,0.070,9.0,30.0,0.99780,3.24,0.60,9.3,1


In [85]:
submission_df_red = pd.DataFrame({'id':test_df_red['id'].reset_index(drop=True), 'quality':test_y_red})

submission_df_red

Unnamed: 0,id,quality
0,1257,6.896752
1,5434,5.217917
2,5146,6.431069
3,219,5.811726
4,6403,5.503815
...,...,...
228,5629,6.632493
229,2007,6.657870
230,2864,6.204942
231,5465,5.550302


In [86]:
submission_df_white = pd.DataFrame({'id':test_df_white.id.reset_index(drop=True), 'quality':test_y_white})

submission_df_white

Unnamed: 0,id,quality
0,6409,5.588221
1,136,4.896008
2,1631,6.127342
3,6084,5.742914
4,1094,5.236204
...,...,...
582,1366,5.825178
583,4646,6.487025
584,734,5.607108
585,1579,6.579523


In [87]:
submission_df = pd.concat([submission_df_red, submission_df_white])

In [88]:
submission_df.to_csv('submission.csv', index=False)

In [89]:
submission_df

Unnamed: 0,id,quality
0,1257,6.896752
1,5434,5.217917
2,5146,6.431069
3,219,5.811726
4,6403,5.503815
...,...,...
582,1366,5.825178
583,4646,6.487025
584,734,5.607108
585,1579,6.579523
