In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn import model_selection
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.svm import SVR

try:
    # Legacy home of KFold; the module was removed in scikit-learn 0.20.
    from sklearn.cross_validation import KFold
except ImportError:
    from sklearn.model_selection import KFold


In [None]:
# Read the two input CSVs: the labelled training rows and the rows to predict.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/geek.csv', '../input/geektest.csv')
)

In [None]:
# Notebook-wide configuration.
SEED = 0            # seed handed to the K-fold splitter and the model wrappers
NFOLDS = 4          # number of folds used for the out-of-fold predictions
TARGET = 'Upvotes'  # name of the label column
ID = 'ID'           # name of the row-identifier column

In [None]:
# One-hot encode the `Tag` column: append one indicator column per distinct
# tag value, then drop the original column.
tag_indicators = pd.get_dummies(train['Tag'])
one_hot_data = pd.concat([train, tag_indicators], axis=1).drop('Tag', axis=1)
train = one_hot_data.copy()

# Preview the first 10 encoded rows. A notebook only renders a bare expression
# when it is the LAST statement of the cell, so the preview lives here.
train.head(10)

In [None]:
# One-hot encode the `Tag` column of the test rows the same way as for train.
one_hot_data = pd.concat([test, pd.get_dummies(test['Tag'])], axis=1)
one_hot_data = one_hot_data.drop('Tag', axis=1)

# The test frame is encoded independently of train, so a tag value present in
# only one of the two files would give the frames different columns. Align the
# test columns (names and order) to the already-encoded training frame, minus
# the label; indicator columns the test file never produced are filled with 0.
expected_columns = [col for col in train.columns if col != TARGET]
test = one_hot_data.reindex(columns=expected_columns, fill_value=0)

# Preview the first 10 encoded rows (must be the cell's last expression to render).
test.head(10)

In [None]:
# Drop the row-identifier column, then split the training frame into the label
# vector `y` and the feature frame `X`. Column names come from the ID / TARGET
# constants so they are defined in exactly one place.
train = train.drop(columns=[ID])
y = train[TARGET]
X = train.drop(columns=[TARGET])

In [None]:
# Drop the row-identifier column from the test frame as well (name taken from
# the ID constant rather than a repeated string literal).
test = test.drop(columns=[ID])

In [None]:
# Row counts of both parts, kept so the combined frame can be split again later.
ntrain, ntest = len(X), len(test)

# Stack train features on top of the test rows under a fresh 0..n-1 index.
train_test = pd.concat([X, test], ignore_index=True)

In [None]:
# Column labels of the training frame after the ID column was dropped.
# Note that `train` (unlike `X`) still contains the target column.
features = train.columns

In [None]:
# Integer-encode every column whose name contains the substring 'cat'.
# Codes are assigned over train and test together, in sorted order of the values.
cats = list(filter(lambda name: 'cat' in name, features))
for feat in cats:
    codes, uniques = pd.factorize(train_test[feat], sort=True)
    train_test[feat] = codes

print(train_test.head())

In [None]:
# Split the combined frame back into its train / test parts as plain arrays.
x_train = np.array(train_test.iloc[:ntrain, :])
x_test = np.array(train_test.iloc[ntrain:, :])

# The legacy `KFold(n, n_folds=...)` constructor does not exist in current
# scikit-learn releases. Build the folds with the `model_selection` API and
# materialise them as a list of (train_indices, validation_indices) pairs, so
# the splits can be iterated repeatedly and every model sees the same folds.
kf = list(
    model_selection.KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train)
)

In [None]:
class SklearnHelper(object):
    """Adapter around an estimator class that is built without a seed.

    Exposes the `train` / `predict` interface used by the out-of-fold routine,
    plus plain `fit` and a feature-importance printer.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) called as ``clf(**params)``.
    seed : int, optional
        Accepted for signature parity with the other wrappers but NOT passed
        to the estimator.
        NOTE(review): presumably because this helper is used for SVR, which
        takes no ``random_state`` argument — confirm.
    params : dict or None, optional
        Keyword arguments for the estimator; ``None`` means "no arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # `params` defaults to None, which `**` cannot unpack; fall back to {}.
        self.clf = clf(**(params or {}))

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and labels."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for `x`."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its `fit` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Fit on (x, y) and print the fitted object's `feature_importances_`."""
        print(self.clf.fit(x, y).feature_importances_)
class LGBWrapper(object):
    """Adapter giving an estimator class the `train` / `predict` interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) called as ``clf(**params)``.
    seed : int, optional
        Value passed to the estimator as ``random_state``.
    params : dict or None, optional
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: injecting the seed into the caller's dict would leak
        # 'random_state' into a hyper-parameter dict that may be reused.
        settings = dict(params or {})
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y):
        """Fit the wrapped estimator on the given features and labels."""
        self.clf.fit(x_train, y)

    def predict(self, x):
        """Return the wrapped estimator's predictions for `x`."""
        return self.clf.predict(x)
    
class SklearnWrapper(object):
    """Adapter giving a seeded estimator class the `train` / `predict` interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory) called as ``clf(**params)``.
    seed : int, optional
        Value passed to the estimator as ``random_state``.
    params : dict or None, optional
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before injecting the seed so the caller's hyper-parameter dict
        # does not silently gain a 'random_state' entry.
        settings = dict(params or {})
        settings['random_state'] = seed
        self.clf = clf(**settings)

    def train(self, x_train, y):
        """Fit the wrapped estimator on the given features and labels."""
        self.clf.fit(x_train, y)

    def predict(self, x):
        """Return the wrapped estimator's predictions for `x`."""
        return self.clf.predict(x)


class XgbWrapper(object):
    """Adapter giving the native XGBoost API the `train` / `predict` interface.

    Parameters
    ----------
    seed : int, optional
        Stored in the booster parameters under the key ``'seed'``.
    params : dict or None, optional
        Booster parameters. An optional ``'nrounds'`` entry is taken out and
        used as the number of boosting rounds (default 250). The dict is
        copied, so the caller's object is never modified.

    Attributes
    ----------
    param : dict
        Booster parameters actually passed to ``xgb.train``.
    nrounds : int
        Number of boosting rounds.
    gbdt
        Trained booster; only set once `train` has been called.
    """

    def __init__(self, seed=0, params=None):
        # Copy first: both the seed injection and the `pop` below would
        # otherwise modify the dict the caller handed in.
        self.param = dict(params or {})
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y):
        """Train a booster for `nrounds` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for `x`."""
        return self.gbdt.predict(xgb.DMatrix(x))


In [None]:
def get_oof(clf, x_tr_all=None, y_all=None, x_te_all=None, folds=None):
    """Compute out-of-fold predictions for one model.

    For every fold the model is trained on the fold's training rows, then used
    to predict (a) the fold's held-out rows and (b) all test rows. The test
    predictions are averaged over the folds.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_tr_all, y_all, x_te_all : array-like, optional
        Training features, training labels and test features. Each defaults
        to the notebook globals ``x_train``, ``y`` and ``x_test``.
    folds : iterable of (train_indices, validation_indices), optional
        Positional row indices per fold. Defaults to the notebook global ``kf``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)``.
    """
    x_tr_all = np.asarray(x_train if x_tr_all is None else x_tr_all)
    x_te_all = np.asarray(x_test if x_te_all is None else x_te_all)
    # Fold indices are positions, not labels: convert the labels to a plain
    # array so a pandas Series with a non-default index is not looked up by label.
    y_values = np.asarray(y if y_all is None else y_all)
    folds = list(kf if folds is None else folds)

    oof_train = np.zeros((x_tr_all.shape[0],))
    # One row of test predictions per fold; sized from the folds actually used.
    oof_test_skf = np.empty((len(folds), x_te_all.shape[0]))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_tr_all[train_index], y_values[train_index])

        oof_train[test_index] = clf.predict(x_tr_all[test_index])
        oof_test_skf[i, :] = clf.predict(x_te_all)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Hyper-parameters for each base model of the stack. Each dict is handed to the
# matching wrapper class when the models are constructed.

# ExtraTreesRegressor
et_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

# RandomForestRegressor
rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
}

# XGBoost booster parameters
xgb_params = {
    'min_child_weight': 2.477811397259722,
    'colsample_bytree': 0.5366763711556178,
    'max_depth': 5,
    'subsample': 0.6189058730703388,
    'gamma': 5.6461161593168505,
    'alpha': 5.861337956829854,
}

# AdaBoostRegressor
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75,
}

# SVR
svr_params = {
    'kernel': 'rbf',
    'C': 0.025,
}

# GradientBoostingRegressor
gb_params = {
    'n_estimators': 500,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0,
}

# LGBMRegressor. `params` is kept as a second name for the same dict because
# the original cell defined it.
lgb_params = params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'nthread': -1,
    'verbose': 0,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'max_depth': -1,
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    # Was misspelled 'reg_aplha', so the L1 penalty was never recognised.
    'reg_alpha': 1,
    'reg_lambda': 0.001,
    'metric': 'rmse',
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    'scale_pos_weight': 1,
}

In [None]:
# XGBoost base model. `xgb_params` has no 'nrounds' entry, so the wrapper's
# default of 250 boosting rounds applies.
xg = XgbWrapper(seed=SEED, params=xgb_params)

In [None]:
# LightGBM base model.
# NOTE(review): this rebinds the name `lgb`, which the import cell bound to the
# `lightgbm` module; the module is no longer reachable under that name afterwards.
lgb = LGBWrapper(clf=LGBMRegressor,seed=SEED,params=lgb_params)

In [None]:
# AdaBoost base model, seeded through `random_state` by the wrapper.
ada = SklearnWrapper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)

In [None]:
# Gradient-boosting base model, seeded through `random_state` by the wrapper.
# (The SVR model is constructed in its own cell.)
gbt = SklearnWrapper(clf=GradientBoostingRegressor,seed=SEED,params = gb_params)

In [None]:
# SVR base model. Built through SklearnHelper, which does not pass the seed on
# to the estimator.
svr = SklearnHelper(clf=SVR,seed=SEED,params = svr_params)

In [None]:
# Extra-trees base model, seeded through `random_state` by the wrapper.
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)

In [None]:
# Random-forest base model, seeded through `random_state` by the wrapper.
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)

In [None]:
# Out-of-fold predictions of the LightGBM model: one column vector for the
# training rows and one (averaged over the folds) for the test rows.
lgb_oof_train,lgb_oof_test = get_oof(lgb)

In [None]:
# Out-of-fold predictions of the XGBoost model (train column, test column).
xg_oof_train, xg_oof_test = get_oof(xg)

In [None]:
# Out-of-fold predictions of the AdaBoost model (train column, test column).
ada_oof_train , ada_oof_test = get_oof(ada)

In [None]:
# Out-of-fold predictions of the SVR model (train column, test column).
# NOTE(review): these two arrays are not among the columns stacked for the
# second-level model later in the notebook — confirm whether that is intended.
svr_oof_train , svr_oof_test = get_oof(svr)

In [None]:
# Out-of-fold predictions of the extra-trees model (train column, test column).
et_oof_train, et_oof_test = get_oof(et)


In [None]:
# Out-of-fold predictions of the random-forest model (train column, test column).
rf_oof_train, rf_oof_test = get_oof(rf)

In [None]:
# Out-of-fold predictions of the gradient-boosting model (train column, test column).
# NOTE(review): the second name is spelled `gb_oob_test` ("oob", not "oof").
# The stacking cell uses the same spelling, so rename both places together.
gb_oof_train , gb_oob_test = get_oof(gbt)

In [None]:
# Second-level feature matrices: one column per base model, holding that
# model's out-of-fold predictions. The column order must be the same for the
# train and the test matrix.
train_columns = [xg_oof_train, et_oof_train, rf_oof_train,
                 ada_oof_train, gb_oof_train, lgb_oof_train]
test_columns = [xg_oof_test, et_oof_test, rf_oof_test,
                ada_oof_test, gb_oob_test, lgb_oof_test]

x_train = np.hstack(train_columns)
x_test = np.hstack(test_columns)

In [None]:
# Sanity check: shapes of the stacked train and test matrices.
stacked_shapes = (x_train.shape, x_test.shape)
print("{},{}".format(*stacked_shapes))


In [None]:
# Wrap the stacked matrices in XGBoost's data container; only the training
# matrix carries labels.
dtrain = xgb.DMatrix(data=x_train, label=y)
dtest = xgb.DMatrix(data=x_test)

In [None]:
# Booster parameters for the second-level XGBoost model.
xgb_params = dict(
    min_child_weight=2.477811397259722,
    colsample_bytree=0.5366763711556178,
    max_depth=5,
    subsample=0.6189058730703388,
    gamma=5.6461161593168505,
    alpha=5.861337956829854,
)

In [None]:
# Cross-validate the second-level model with early stopping; progress is
# reported every 10 rounds.
cv_options = dict(
    num_boost_round=500,
    nfold=4,
    seed=SEED,
    stratified=False,
    early_stopping_rounds=25,
    verbose_eval=10,
    show_stdv=True,
)
res = xgb.cv(xgb_params, dtrain, **cv_options)

In [None]:
# With early stopping, the history returned by xgb.cv ends at the best
# iteration, so its row count IS the number of boosting rounds to train.
# (Subtracting 1 trained one round too few, and zero rounds for a 1-row history.)
best_nrounds = res.shape[0]

# Look the held-out metric up by column name rather than by position, so the
# reported score is the test metric regardless of how the columns are ordered.
test_mean_col = [col for col in res.columns
                 if col.startswith('test-') and col.endswith('-mean')][0]
test_std_col = [col for col in res.columns
                if col.startswith('test-') and col.endswith('-std')][0]
cv_mean = res[test_mean_col].iloc[-1]
cv_std = res[test_std_col].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))


In [None]:
# Fit the final second-level model on all stacked training rows, using the
# round count chosen from the cross-validation run.
gbdt = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=best_nrounds)


In [None]:
# The ID column was dropped from `test` earlier, so re-read it for the
# submission. Only that column is used, so only that column is loaded.
a = pd.read_csv('../input/geektest.csv', usecols=['ID'])

In [None]:
# Predictions of the second-level model for the stacked test matrix.
y_pred = gbdt.predict(dtest)

In [None]:
# Pair every test-row ID with its predicted value and write the submission file.
submission = pd.DataFrame({"ID": a["ID"], "Upvotes": y_pred})
submission.to_csv('ensemble1.csv', index=False)

submission.head()
