In [53]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [54]:
name_list=['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weigh','Rings']

In [55]:
train=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=name_list)

In [56]:
train.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weigh,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [57]:
dummies_Sex=pd.get_dummies(train['Sex'],prefix='Sex')

train = pd.concat([train, dummies_Sex], axis=1)
train = train.drop(['Sex'], axis=1)

train.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weigh,Rings,Sex_F,Sex_I,Sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,0,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,0,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,1,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,0,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,0,1,0


In [58]:
numerical_features=['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weigh']
from sklearn.preprocessing import MinMaxScaler
for i in numerical_features: 
    scaler = MinMaxScaler()
    train[i] = scaler.fit_transform(train[i])




In [59]:
train.head(5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weigh,Rings,Sex_F,Sex_I,Sex_M
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,15,0,0,1
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,7,0,0,1
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,9,1,0,0
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,10,0,0,1
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,7,0,1,0


In [60]:
features=train.drop("Rings",axis=1)
outcomes=train["Rings"].values
features.head(5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weigh,Sex_F,Sex_I,Sex_M
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0,0,1
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0,0,1
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,1,0,0
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0,0,1
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0,1,0


In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(features, outcomes, test_size=0.25, random_state=10)

In [67]:
def scatter_y(true_y, predicted_y):
    """Scatter-plot the predicted vs true number of rings
    
    Plots:
       * predicted vs true number of rings
       * perfect agreement line
       * +2/-2 number dotted lines

    Returns the root mean square of the error
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.plot(true_y, predicted_y, '.k')
    
    ax.plot([0, 30], [0, 30], '--k')
    ax.plot([0, 30], [2, 32], ':k')
    ax.plot([2, 32], [0, 30], ':k')
    
    rms = (true_y - predicted_y).std()
    
    ax.text(25, 3,
            "Root Mean Square Error = %.2g" % rms,
            ha='right', va='bottom')

    ax.set_xlim(0, 30)
    ax.set_ylim(0, 30)
    
    ax.set_xlabel('True number of rings')
    ax.set_ylabel('Predicted number of rings')
    
    return rms

# Random forest by sklearn

In [63]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [64]:
from sklearn import metrics
from matplotlib import pylab

final_model = RandomForestRegressor(n_estimators=200,max_features=None,bootstrap=True,oob_score=True,max_depth=8)
final_model.fit(X_train.values, Y_train)
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))

4.4344787456
0.55803060339


In [65]:
from sklearn import metrics
from matplotlib import pylab

final_model = RandomForestRegressor(n_estimators=1,max_features=None,bootstrap=False,max_depth=8)
final_model.fit(X_train.values, Y_train)
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))

5.7948568664
0.422300358515


In [66]:
from sklearn import metrics
from matplotlib import pylab

final_model = RandomForestRegressor(n_estimators=200,max_features='auto',bootstrap=True,max_depth=8)
final_model.fit(X_train.values, Y_train)
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))

4.44670479962
0.556765466485


# Random forest by xgboost

In [52]:
from xgboost import XGBRegressor

final_model = XGBRegressor(n_estimators=100,num_boost_round=1,max_depth=5,subsample=0.632,colsample_bytree=0.8)
final_model.fit(X_train.values, Y_train.astype(int))
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))


4.33718376647
0.568121705519


# Decision tree by Xgboost

In [57]:
from xgboost import XGBRegressor

final_model = XGBRegressor(n_estimators=1,num_boost_round=1,max_depth=5,subsample=1,colsample_bytree=1)
final_model.fit(X_train.values, Y_train.astype(int))
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))




79.0755040467
0.101002152246


# Bagged decision trees by Xgboost

In [58]:
from xgboost import XGBRegressor

final_model = XGBRegressor(n_estimators=100,num_boost_round=1,max_depth=5,subsample=0.632,colsample_bytree=1)
final_model.fit(X_train.values, Y_train.astype(int))
y_pred = final_model.predict(X_test.values)

from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(mean_squared_error(Y_test, y_pred))
print(explained_variance_score(Y_test, y_pred))

4.35065438353
0.566516590474


In [67]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor


def base_model():
    model = Sequential()
    model.add(Dense(128, input_dim=10, init='normal', activation='sigmoid'))
    model.add(Dense(128, init='normal', activation='relu'))
    model.add(Dense(1, init='normal'))
    model.compile(loss='mean_squared_error', optimizer = 'adam')
    return model

estimator = KerasRegressor(build_fn = base_model,epochs=200,batch_size=32,)
estimator.fit(X_train.values, Y_train )
y_pred = estimator.predict(X_test.values )
rmse = mean_squared_error( Y_test, y_pred)



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [68]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(np.sqrt(mean_squared_error(Y_test, y_pred)))
print(explained_variance_score(Y_test, y_pred))

2.14163382494
0.54284387288


# NEW MODEL

In [69]:
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint,History
from keras.layers import Dense, Activation, Dropout, Input
from keras import optimizers
import keras
from keras.models import Model

In [70]:
def new_model():
    input_1 = Input(shape=(1,))
    input_2 = Input(shape=(1,))
    input_3 = Input(shape=(1,))
    input_4 = Input(shape=(1,))
    input_5 = Input(shape=(1,))
    input_6 = Input(shape=(1,))
    input_7 = Input(shape=(1,))
    input_8 = Input(shape=(1,))
    input_9 = Input(shape=(1,))
    input_10 = Input(shape=(1,))

    hidden_1 = Dense(32, activation='sigmoid')(input_1)
    hidden_2 = Dense(32, activation='sigmoid')(input_2)
    hidden_3 = Dense(32, activation='sigmoid')(input_3)
    hidden_4 = Dense(32, activation='sigmoid')(input_4)
    hidden_5 = Dense(32, activation='sigmoid')(input_5)
    hidden_6 = Dense(32, activation='sigmoid')(input_6)
    hidden_7 = Dense(32, activation='sigmoid')(input_7)
    hidden_8 = Dense(32, activation='sigmoid')(input_8)
    hidden_9 = Dense(32, activation='sigmoid')(input_9)
    hidden_10 = Dense(32, activation='sigmoid')(input_10)



    x = keras.layers.concatenate([hidden_1,hidden_2,hidden_3,hidden_4,hidden_5,hidden_6,hidden_7,hidden_8,
                             hidden_9,hidden_10])
    x = Dense(256, activation='relu')(x)
    x = Dense(1)(x)
    model = Model(inputs=[input_1,input_2,input_3,input_4,input_5,input_6,input_7,input_8,
                     input_9,input_10], outputs=[x])
    model.compile(loss='mean_squared_error', optimizer = 'adam')
    return model

value_list=[X_train[['Length']].values,
            X_train[['Diameter']].values,
            X_train[['Height']].values,
            X_train[['Whole weight']].values,
            X_train[['Shucked weight']].values,
            X_train[['Viscera weight']].values,
            X_train[['Shell weigh']].values,
            X_train[['Sex_F']].values,
            X_train[['Sex_I']].values,
            X_train[['Sex_M']].values]

value_list_test=[X_test[['Length']].values,
            X_test[['Diameter']].values,
            X_test[['Height']].values,
            X_test[['Whole weight']].values,
            X_test[['Shell weigh']].values,
            X_test[['Viscera weight']].values,
            X_test[['Shell weigh']].values,
            X_test[['Sex_F']].values,
            X_test[['Sex_I']].values,
            X_test[['Sex_M']].values]
    
    

In [71]:
estimator = KerasRegressor(build_fn = new_model,epochs=200,batch_size=32,)
estimator.fit(value_list, Y_train )


y_pred = estimator.predict(value_list_test)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [73]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score

print(np.sqrt(mean_squared_error(Y_test, y_pred)))
print(explained_variance_score(Y_test, y_pred))

2.50009578856
0.391797793801


# Neural network ensemble with bagging

In [47]:
from sklearn.model_selection import train_test_split
data_train,data_val=train_test_split(train,test_size=0.25, random_state=10)
X_val=data_val.drop(['Rings'], axis=1).values
y_val=data_val['Rings'].ravel()

In [48]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.metrics import mean_squared_error



def train_nn_simple(data_train,X_val,y_val):
    data_train_new=data_train.sample(frac=0.632,replace=True)
    X_train=data_train_new.drop(['Rings'], axis=1).values
    y_train=data_train_new['Rings'].ravel()
    estimator = KerasRegressor(build_fn = base_model,epochs=200,batch_size=15,)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_val)
    rmse =np.sqrt(mean_squared_error( y_val, y_pred)) 
    return rmse
    
    
def base_model():
    model = Sequential()
    model.add(Dense(128, input_dim=10, init='normal', activation='sigmoid'))
    model.add(Dense(128, init='normal', activation='relu'))
    model.add(Dense(1, init='normal',activation='relu'))
    model.compile(loss='mean_squared_error', optimizer = 'adam')
    return model
  

In [49]:
result=[]
for i in range(50):
    rmse=train_nn_simple(train,X_val,y_val)
    result.append(rmse)



Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [51]:
rmse=np.sum(result)/50
rmse

2.1027799226339274