In [None]:
from math import sqrt, inf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
# Training table, indexed by the 'Id' column.
train = pd.read_csv('../input/petfinder-pawpularity-score/train.csv').set_index('Id')
# Split the regression target off so `train` keeps only the remaining columns.
targets = train.pop('Pawpularity')

# Sample submission file and the rows to be scored, both indexed by 'Id'.
preds = pd.read_csv('../input/petfinder-pawpularity-score/sample_submission.csv').set_index('Id')
test = pd.read_csv('../input/petfinder-pawpularity-score/test.csv').set_index('Id')

In [None]:
# Summary statistics for the columns of `train` (the target has already been
# removed). As the cell's last expression, the resulting table is displayed.
train.describe()

In [None]:
# Draw a histogram for each column of `train` to eyeball the distributions.
train.hist()
# Render the figure explicitly; also keeps the return value of hist() from
# being echoed as the cell output.
plt.show()

In [None]:
# Hold out 20% of the labelled rows for validation. The fixed seed keeps the
# partition identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    targets,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Grid search over forest size and tree depth for the random forest, scored by
# RMSE on the validation split. The search is slow, so it is skipped unless the
# flag below is set to True. It is kept as real, guarded code rather than a
# string literal so it stays syntax-checked and does not overwrite IPython's
# `_` last-output variable.
RUN_RF_SEARCH = False

if RUN_RF_SEARCH:
    # Integer grid, so the winning value can be passed straight back to the model.
    n_estimators = range(100, 1001, 100)
    max_depths = range(1, 20, 2)
    best_model = (None, None)  # (n_estimators, max_depth) of the best run so far
    min_error = inf            # lowest validation RMSE seen so far
    for n in n_estimators:
        print('estimators:', n)
        for d in max_depths:
            print('    depth:', d)
            rf = RandomForestRegressor(n_estimators=n, max_features='sqrt',
                                       max_depth=d)
            rf.fit(X_train, y_train)
            pred = rf.predict(X_val)
            error = sqrt(mean_squared_error(y_val, pred))
            if error < min_error:
                best_model = (n, d)
                min_error = error
            print('        Error:', error)

In [None]:
# Disabled report: the print statements are parked inside a string literal
# bound to `_`, so running this cell outputs nothing. If restored as code, it
# needs `min_error` and `best_model` to have been set by a completed search.
_="""print('Best error:', min_error)
print('Best n_estimator:', best_model[0])
print('Best depth:', best_model[1])"""

In [None]:
# Grid search over the number of boosting rounds and the learning rate for
# XGBoost, scored by RMSE on the validation split. It is skipped unless the
# flag below is set to True. It is kept as real, guarded code rather than a
# string literal so it stays syntax-checked and does not overwrite IPython's
# `_` last-output variable.
RUN_XGB_SEARCH = False

if RUN_XGB_SEARCH:
    # Integer grid, so the winning value can be passed straight back to the model.
    n_estimators = range(100, 1001, 100)
    learning_rates = np.logspace(-1, -4, 10)  # 1e-1 down to 1e-4, log-spaced
    best_model = (None, None)  # (n_estimators, learning_rate) of the best run so far
    min_error = inf            # lowest validation RMSE seen so far
    for n in n_estimators:
        print('estimators:', n)
        for lr in learning_rates:
            print('    learning_rate:', lr)
            xgb = XGBRegressor(n_estimators=n, learning_rate=lr)
            xgb.fit(X_train, y_train)
            pred = xgb.predict(X_val)
            error = sqrt(mean_squared_error(y_val, pred))
            if error < min_error:
                best_model = (n, lr)
                min_error = error
            print('        Error:', error)

In [None]:
# Disabled report: the print statements are parked inside a string literal
# bound to `_`, so running this cell outputs nothing. If restored as code, it
# needs `min_error` and `best_model` to have been set by a completed search.
_="""print('Best error:', min_error)
print('Best n_estimator:', best_model[0])
print('Best learning rate:', best_model[1])"""

In [None]:
# Grid search over the number of boosting rounds and the learning rate for
# LightGBM, scored by RMSE on the validation split. It is skipped unless the
# flag below is set to True. It is kept as real, guarded code rather than a
# string literal so it stays syntax-checked and does not overwrite IPython's
# `_` last-output variable.
RUN_LGBM_SEARCH = False

if RUN_LGBM_SEARCH:
    # Integer grid, so the winning value can be passed straight back to the model.
    n_estimators = range(100, 1001, 100)
    learning_rates = np.logspace(-1, -4, 10)  # 1e-1 down to 1e-4, log-spaced
    best_model = (None, None)  # (n_estimators, learning_rate) of the best run so far
    min_error = inf            # lowest validation RMSE seen so far
    for n in n_estimators:
        print('estimators:', n)
        for lr in learning_rates:
            print('    learning_rate:', lr)
            lgbm = LGBMRegressor(n_estimators=n, learning_rate=lr)
            lgbm.fit(X_train, y_train)
            pred = lgbm.predict(X_val)
            error = sqrt(mean_squared_error(y_val, pred))
            if error < min_error:
                best_model = (n, lr)
                min_error = error
            print('        Error:', error)

In [None]:
# Disabled report: the print statements are parked inside a string literal
# bound to `_`, so running this cell outputs nothing. If restored as code, it
# needs `min_error` and `best_model` to have been set by a completed search.
_="""print('Best error:', min_error)
print('Best n_estimator:', best_model[0])
print('Best learning rate:', best_model[1])"""

In [None]:
# Grid search over the number of boosting rounds and the learning rate for
# CatBoost, scored by RMSE on the validation split. It is skipped unless the
# flag below is set to True. It is kept as real, guarded code rather than a
# string literal so it stays syntax-checked and does not overwrite IPython's
# `_` last-output variable.
RUN_CAT_SEARCH = False

if RUN_CAT_SEARCH:
    # Integer grid, so the winning value can be passed straight back to the model.
    n_estimators = range(100, 1001, 100)
    learning_rates = np.logspace(-1, -4, 10)  # 1e-1 down to 1e-4, log-spaced
    best_model = (None, None)  # (n_estimators, learning_rate) of the best run so far
    min_error = inf            # lowest validation RMSE seen so far
    for n in n_estimators:
        print('estimators:', n)
        for lr in learning_rates:
            print('    learning_rate:', lr)
            # verbose=0 silences CatBoost's per-iteration training log.
            cat = CatBoostRegressor(n_estimators=n, learning_rate=lr,
                                    verbose=0)
            cat.fit(X_train, y_train)
            pred = cat.predict(X_val)
            error = sqrt(mean_squared_error(y_val, pred))
            if error < min_error:
                best_model = (n, lr)
                min_error = error
            print('        Error:', error)

In [None]:
# Disabled report: the print statements are parked inside a string literal
# bound to `_`, so running this cell outputs nothing. If restored as code, it
# needs `min_error` and `best_model` to have been set by a completed search.
_="""print('Best error:', min_error)
print('Best n_estimator:', best_model[0])
print('Best learning rate:', best_model[1])"""

In [None]:
# Fresh 80/20 split with a different seed (1) from the earlier split (0).
# It rebinds X_train / X_val / y_train / y_val, so the ensemble cells that
# follow are scored on this hold-out set.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    targets,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Stacking experiment: four base regressors feed a random-forest
# meta-learner, scored by RMSE on the validation split. It is skipped unless
# the flag below is set to True. It is kept as real, guarded code rather than a
# string literal so it stays syntax-checked and does not overwrite IPython's
# `_` last-output variable.
RUN_STACKING = False

if RUN_STACKING:
    stacking = StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(n_estimators=400, max_features='sqrt',
                                         max_depth=5)),
            ('xgb', XGBRegressor(n_estimators=200, learning_rate=0.021544346900318846)),
            ('lgbm', LGBMRegressor(n_estimators=100, learning_rate=0.004641588833612782)),
            ('cat', CatBoostRegressor(n_estimators=300, learning_rate=0.004641588833612782,
                                      verbose=0)),
        ],
        final_estimator=RandomForestRegressor(n_estimators=500),
    )
    stacking.fit(X_train, y_train)
    pred = stacking.predict(X_val)
    print('Error:', sqrt(mean_squared_error(y_val, pred)))

In [None]:
# One seed shared by every base learner, so the validation error printed here
# and the later refits of `voting` give the same numbers after a kernel
# restart. Without it the random forest, at least, differs from run to run.
SEED = 0

# Voting ensemble over four base regressors.
voting = VotingRegressor(
    estimators=[
        ('rf', RandomForestRegressor(n_estimators=400, max_features='sqrt',
                                     max_depth=5, random_state=SEED)),
        ('xgb', XGBRegressor(n_estimators=200, learning_rate=0.021544346900318846,
                             random_state=SEED)),
        ('lgbm', LGBMRegressor(n_estimators=100, learning_rate=0.004641588833612782,
                               random_state=SEED)),
        ('cat', CatBoostRegressor(n_estimators=300, learning_rate=0.004641588833612782,
                                  random_seed=SEED, verbose=0)),
    ]
)
voting.fit(X_train, y_train)
pred = voting.predict(X_val)
# RMSE on the hold-out rows.
print('Error:', sqrt(mean_squared_error(y_val, pred)))

In [None]:
# Pseudo-labelling trial on the hold-out split: predict the validation rows,
# add them to the training data with those predicted labels, refit, and
# re-score against the true validation labels.
voting.fit(X_train, y_train)
pseudo_labels = pd.DataFrame(voting.predict(X_val), index=y_val.index,
                             columns=['Pawpularity'])

# Series.append / DataFrame.append were removed in pandas 2.0; pd.concat does
# the same row-wise concatenation.
# NOTE: X_train / y_train are rebound here, so running this cell twice adds
# the validation rows a second time -- re-run the split cell first.
y_train = pd.concat([y_train, pseudo_labels['Pawpularity']])
X_train = pd.concat([X_train, X_val])

voting.fit(X_train, y_train)
pred = voting.predict(X_val)
# NOTE(review): the validation rows are now part of the training data (with
# predicted, not true, labels), so this RMSE is likely optimistic compared
# with the untouched hold-out score.
error = sqrt(mean_squared_error(y_val, pred))
print('Error:', error)

In [None]:
# Pseudo-label the test rows: fit on all labelled data, predict the test set,
# then fold those rows into `train` / `targets` with the predicted labels.
voting.fit(train, targets)
# Index the predictions by the rows that were actually predicted (`test`),
# rather than assuming the submission file lists the Ids in the same order.
pseudo_labels = pd.DataFrame(voting.predict(test), index=test.index,
                             columns=['Pawpularity'])

# Series.append / DataFrame.append were removed in pandas 2.0; pd.concat does
# the same row-wise concatenation.
# NOTE: `train` / `targets` are rebound here, so running this cell twice adds
# the test rows a second time -- reload the data before re-running.
targets = pd.concat([targets, pseudo_labels['Pawpularity']])
train = pd.concat([train, test])

In [None]:
# Final refit on the current `train` / `targets`, then score the test rows.
voting.fit(train, targets)
final_scores = voting.predict(test)

# Store the predictions in the submission frame and write it to disk; the
# 'Id' index is written as the first column.
preds['Pawpularity'] = final_scores
preds.to_csv('submission.csv')