In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
plt.style.use('ggplot')
%matplotlib inline

In [16]:
from scipy import stats
from scipy.stats import norm, skew

In [17]:
train_raw = pd.read_csv('../data/d_train.csv', index_col='id', encoding='gbk')
test_raw = pd.read_csv('../data/d_test_A_20180102.csv', index_col='id', encoding='gbk')

In [18]:
new_columns = ['F{0}'.format(i+1) for i in range(train_raw.shape[1])]
new_columns[-1] = 'blood_sugar'
feature_map = pd.Series(index=new_columns, data=train_raw.columns)
train_raw.columns = new_columns
test_raw.columns = new_columns[:-1]

In [19]:
all_data = pd.concat([train_raw.iloc[:, :-1], test_raw], axis=0, join='outer').reset_index(drop=True)
# 2是缺失值
sex_dict = {u'男':'male', u'女':'female'}
all_data.F1 = all_data.F1.apply(lambda x: sex_dict[x] if x in sex_dict.keys() else 'other')

In [20]:
n_train = train_raw.shape[0]
n_test = test_raw.shape[0]

train_raw.blood_sugar = np.log1p(train_raw.blood_sugar)
y_train = train_raw.blood_sugar.values

In [21]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing_Ratio' :all_data_na})

In [22]:
# F3: 日期
drop_feature_list = ['F3', 'F19', 'F20', 'F21', 'F22', 'F23']
all_data = all_data.drop(drop_feature_list, axis=1)

In [23]:
all_data.iloc[:, 1:] = all_data.iloc[:, 1:].apply(lambda x: x.fillna(x.mean()))

In [24]:
# Check the skew of all numerical features
skewed_feats = all_data.iloc[:, 1:].apply(lambda x: skew(x)).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_feats})

In [25]:
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)

There are 33 skewed numerical features to Box Cox transform


In [26]:
all_data = pd.get_dummies(all_data)

In [27]:
train = all_data[:n_train]
test = all_data[n_train:]

In [28]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [29]:
n_folds = 5
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=0).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train, scoring='neg_mean_squared_error', cv=kf))
    return rmse

In [30]:
ridge = make_pipeline(RobustScaler(), Ridge(alpha=2.08, random_state=11))

In [31]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.5, random_state=2))

In [32]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=0.25)

In [33]:
GBoost = GradientBoostingRegressor(n_estimators=500, 
                                   learning_rate=0.05,
                                   max_depth=4, 
                                   max_features='sqrt',
                                   min_samples_leaf=15, 
                                   min_samples_split=10, 
                                   loss='huber', 
                                   random_state =3)

In [34]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, 
                             gamma=0.0468, 
                             learning_rate=0.05, 
                             max_depth=3, 
                             min_child_weight=1.7817, 
                             n_estimators=2200,
                             reg_alpha=0.4640, 
                             reg_lambda=0.8571,
                             subsample=0.5213, 
                             silent=1,
                             seed=4, 
                             nthread = -1)

In [35]:
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=5,
                              learning_rate=0.05, 
                              n_estimators=720,
                              max_bin = 55, 
                              bagging_fraction = 0.8,
                              bagging_freq = 5, 
                              feature_fraction = 0.2319,
                              feature_fraction_seed=9, 
                              bagging_seed=9,
                              min_data_in_leaf =6, 
                              min_sum_hessian_in_leaf = 11)

In [36]:
score = rmsle_cv(ridge)
print("\nRidge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Ridge score: 0.1546 (0.0091)



In [37]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1546 (0.0091)



In [38]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.1523 (0.0093)



In [62]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.0499 (0.0029)



In [47]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1562 (0.0098)



In [46]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

LGBM score: 0.1532 (0.0094)



In [31]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [32]:
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, ridge))
score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.0492 (0.0028)



In [39]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models =base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        
        kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)
        
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, hold_out_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[hold_out_index])
                out_of_fold_predictions[hold_out_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([
                model.predict(X) 
                for model in base_models]).mean(axis=1)
                    for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [40]:
stacked_averaged_models = StackingAveragedModels(base_models = (model_xgb, model_lgb, ENet, GBoost, KRR),
                                                 meta_model = ridge)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 0.1511 (0.0094)


In [33]:
stacked_averaged_models = StackingAveragedModels(base_models = (model_xgb, model_lgb, ENet, GBoost, KRR),
                                                 meta_model = ridge)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

Stacking Averaged models score: 0.1511 (0.0094)


In [63]:
def rmsle(y, y_pred):
#     y_pred = map(lambda x: float('%0.3f' % x), y_pred)
    return np.sqrt(mean_squared_error(y, y_pred))

In [42]:
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))

0.131713596106


In [64]:
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))

0.0933179568238


In [65]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))

0.123489867926


In [50]:
print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.70 +
            xgb_train_pred*0.15 + 
            lgb_train_pred*0.15 ))

RMSLE score on train data:
0.123476369234


In [59]:
# ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15
ensemble = stacked_pred
ensemble = map(lambda x: '%0.3f' % x, ensemble)

In [60]:
sub = pd.DataFrame()
sub['blood_sugar'] = ensemble
sub.to_csv('../data/submission.csv',index=False, header=None)