In [1]:
%matplotlib inline

from datetime import datetime

from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import skew
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from scipy.special import inv_boxcox1p
from mlxtend.regressor import StackingRegressor
from sklearn.preprocessing import scale
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from mlxtend.regressor import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectPercentile, f_classif
from scipy.stats import zscore


import warnings
warnings.filterwarnings('ignore')


# 用于正常显示中文
plt.rcParams['font.sans-serif'] = ['SimHei']
# 用来正常显示负号
plt.rcParams['axes.unicode_minus'] = False
# 设置图片清晰度
%config InlineBackend.figure_format = 'retina'  

In [2]:
# Read the raw training and test tables from CSV (relative paths).
train = pd.read_csv('../original_data/train_dataset.csv')
test = pd.read_csv('../original_data/test_dataset.csv')

In [3]:
# Stack train and test into a single frame (fresh 0..n-1 index) so that the
# feature engineering below is applied to both at once.
#
# sort=True pins the column order. The following cell renames the columns
# purely by position, and pandas only sorted the columns of differently-shaped
# inputs by default in older releases; pandas >= 1.0 keeps them unsorted unless
# asked, which would silently attach the wrong names.
# NOTE(review): assumes the positional name list in the next cell was written
# against the sorted column order -- confirm against the raw CSV headers.
data = pd.concat([train, test], sort=True, ignore_index=True)

In [4]:
# Replace the raw column headers with short English identifiers.
# NOTE(review): this is a purely positional rename -- each name is attached to
# whatever column currently sits at that position, so it is only correct if the
# concatenated frame's columns are in exactly this order. Verify against the
# raw CSV headers before trusting any feature name below.
data.columns = [
    'score', 'tour_app_count', 'sport_flag', 'sam_flag', 'tour_flag',
    'movie_flag', 'wanda_flag', 'train_count', 'express_count', 
    'online_shopping_count', 'video_app_count', 'connect_num', 'finance_app_count',
    'flight_count', '4g_unhealth_flag', 'uni_student_flag', 'freq_shopping_flag', 'blk_list_flag',
    'true_name_flag', 'age', 'curr_month_balance', 'top_up_month_diff', 'uid',
    'net_age_till_now', 'cost_sensitivity', 'total_account_fee', 'recent_6month_avg_use', 'curr_overdue_flag',
    'top_up_amount', 'recent_3month_shopping_count'
]

### 对网龄进行年份化

In [5]:
# Convert network age to years by dividing by 12.
# Note: re-running this cell divides again; run it once per fresh `data`.
data['net_age_till_now'] = data['net_age_till_now'].div(12.0)

### 对缴费方式进行提取

In [6]:
# Flag rows whose top-up amount is a non-zero multiple of 10
# (presumably a round amount indicates an offline top-up -- TODO confirm).
#
# Both comparisons are parenthesised on purpose: `&` binds tighter than `!=`,
# so `(a % 10 == 0) & a != 0` evaluates `((a % 10 == 0) & a) != 0` rather than
# the intended conjunction. Building the column in one assignment also avoids
# the chained `data[col][mask] = 1` write, which is not guaranteed to reach
# `data`.
data['top_up_amount_offline'] = (
    (data['top_up_amount'] % 10 == 0) & (data['top_up_amount'] != 0)
).astype('int64')

### 对数据进行惩罚修改
1. 网龄（年）≥ 年龄 的记录视为年龄异常，将年龄置为 0（与年龄本身为 0 的记录同等对待）
2. 修正“是否经常逛商场”标记：标记为经常逛商场、但近三个月逛商场次数低于该群体 15% 分位数的用户改为 0；标记为不经常逛商场、但次数高于经常逛商场群体 85% 分位数的用户改为 1

In [7]:
def compare_age_by_network_age(row):
    """Return the row's age, or 0.0 when the network age is at least the age.

    Args:
        row: Any mapping-like object indexable by 'net_age_till_now' and 'age'
            (a DataFrame row when used with ``DataFrame.apply(axis=1)``).

    Returns:
        0.0 if ``row['net_age_till_now'] >= row['age']``, otherwise
        ``row['age']`` unchanged. If the comparison is False (including when a
        value is NaN), the age is returned as-is.
    """
    age_is_plausible = not (row['net_age_till_now'] >= row['age'])
    return row['age'] if age_is_plausible else 0.0
    

# Apply the age sanity check to every row (axis='columns' passes one row at a
# time to the function) and overwrite the age column with the result.
data['age'] = data.apply(compare_age_by_network_age, axis='columns')

In [8]:
# Step 1: clear the flag for flagged frequent shoppers whose 3-month shopping
# count is below the 15% quantile of the flagged group.
flagged = data['freq_shopping_flag'] == 1
low_cutoff = data.loc[flagged, 'recent_3month_shopping_count'].quantile(0.15)
to_clear = flagged & (data['recent_3month_shopping_count'] < low_cutoff)
data.loc[to_clear, 'freq_shopping_flag'] = 0

# Step 2: set the flag for unflagged rows whose count exceeds the 85% quantile
# of the flagged group. The cutoff is deliberately computed after step 1, so it
# is based on the already-corrected flagged group.
still_flagged = data['freq_shopping_flag'] == 1
high_cutoff = data.loc[still_flagged, 'recent_3month_shopping_count'].quantile(0.85)
to_set = (data['freq_shopping_flag'] == 0) & (
    data['recent_3month_shopping_count'] > high_cutoff)
data.loc[to_set, 'freq_shopping_flag'] = 1

In [9]:
# Replace the value 0 in cost_sensitivity with the column's most frequent value
# (presumably 0 encodes "unknown" -- TODO confirm).
#
# The previous form passed the mode() Series itself as `to_replace`; pandas
# treats a Series given there without a `value` as an {index: value} mapping,
# i.e. {0: mode}, so it only worked by coincidence of the Series' index and
# would add unintended mappings ({1: second mode, ...}) on ties. It also relied
# on an in-place edit of a column selection reaching `data`. Both are made
# explicit here; on ties the first (smallest) mode is used.
cost_sensitivity_mode = data['cost_sensitivity'].mode().iloc[0]
data['cost_sensitivity'] = data['cost_sensitivity'].replace(0, cost_sensitivity_mode)

### 特征交叉

In [10]:
# Cross features built from the current bill and the 6-month average spend.
account_fee = data['total_account_fee']
avg_use_6m = data['recent_6month_avg_use']

# Ratio of the current bill to the 6-month average; the 0.001 offset keeps the
# divisor away from zero when the average is 0.
data['current_fee_stability'] = account_fee / (avg_use_6m + 0.001)

# Six-month total minus the current bill, spread over five months
# (presumably the average of the other five months -- TODO confirm).
data['recent_5month_avg_use'] = (avg_use_6m * 6 - account_fee) / 5

In [11]:
# Remove columns that are not passed on to the model. The drop is done in place,
# so re-running this cell on the same frame raises a KeyError.
unused_columns = [
    'uni_student_flag', 'true_name_flag', 'flight_count', 'train_count',
    'top_up_month_diff', 'wanda_flag', 'sam_flag',
]
data.drop(columns=unused_columns, inplace=True)

### 模型训练

In [12]:
import h2o
from h2o.automl import H2OAutoML
# Connect to an H2O instance, starting a local server if none is running (see
# the log output below). nthreads=-1 is presumably "use all available cores";
# the log reports 4 of 4 cores allowed.
h2o.init(nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from D:\Users\Administrator\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\ADMINI~1\AppData\Local\Temp\tmp_lt32qs1
  JVM stdout: C:\Users\ADMINI~1\AppData\Local\Temp\tmp_lt32qs1\h2o_Administrator_started_from_python.out
  JVM stderr: C:\Users\ADMINI~1\AppData\Local\Temp\tmp_lt32qs1\h2o_Administrator_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,04 secs
H2O cluster timezone:,Asia/Shanghai
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.5
H2O cluster version age:,5 days
H2O cluster name:,H2O_from_python_Administrator_kbxb39
H2O cluster total nodes:,1
H2O cluster free memory:,1.762 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [13]:
# def evaluation(data):
#     n_folds = 5
#     train = data[:50000]
#     test = data[50000:]
#     lgb_mae = {
#         'learning_rate': 0.01,
#         'boosting_type': 'gbdt',
#         'objective': 'regression_l1',
#         'metric': 'mae',
#         'feature_fraction': 0.5,
#         'bagging_fraction': 0.7,
#         'bagging_freq': 1,
#         'num_leaves': 30,
#         'verbose': -1,
#         'max_depth': 5,
#         'lambda_l2': 5,
#         'lambda_l1': 1,
#         'min_data_in_leaf': 30,
#         'nthread': 4,
#     }
#     train_y = train['score']
#     train_x = train.drop(['score', 'uid'], axis=1)
#     test_x = test.drop(['score', 'uid'], axis=1)
#     kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=86)
#     kf = kfold.split(train_x, train_y)
#     valid_best_l2_all = 0
#     test_name = test[['uid']]
#     cv_pred = np.zeros(test.shape[0])
#     count = 0
#     global feature_importance_df
#     for i, (train_fold, validate) in enumerate(kf):
#         print('fold: ',i, ' training')
#         # Split the data into this fold's training and validation parts
#         X_train, X_test, y_train, y_test = train_x.iloc[train_fold, :], train_x.iloc[validate, :], train_y[train_fold], train_y[validate]

#         # Load the fold into LightGBM datasets
#         dtrain = lgb.Dataset(X_train, y_train)
#         dvalid = lgb.Dataset(X_test, y_test, reference=dtrain)

#         bst = lgb.train(lgb_mae, dtrain, num_boost_round=100000, valid_sets=dvalid, 
#                         verbose_eval=-1, early_stopping_rounds=100)
#         # Predict on the validation fold and accumulate test-set predictions
#         y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)
#         cv_pred += bst.predict(test_x, num_iteration=bst.best_iteration)
#         valid_best_l2_all += bst.best_score['valid_0']['l1']

#         count += 1
#     valid_best_l2_all /= n_folds
#     cv_pred /= n_folds
#     result = test_name.copy()
#     result.columns = ['id']
#     result['score'] = cv_pred
#     result['score'] = result['score'].map(round)
#     result.to_csv('../result/result_mae.csv', index=None)
#     print('cv score for valid is: ', 1/(1+valid_best_l2_all))

In [17]:
def get_score(y_true, y_pred):
    """Turn mean absolute error into a score: ``1 / (1 + MAE)``.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A float in (0, 1]; 1.0 means a perfect prediction (MAE of 0).
    """
    return 1.0 / (1 + mean_absolute_error(y_true, y_pred))


def evaluation(data, n_train=50000, max_models=320, max_runtime_secs=12800, seed=2019):
    """Train H2O AutoML on the labelled rows and predict the remaining rows.

    Args:
        data: DataFrame holding the training rows first, followed by the rows
            to predict. Must contain 'uid' and 'score' columns; every other
            column is used as a feature.
        n_train: Number of leading rows that form the training set
            (default 50000, the value previously hard-coded).
        max_models: Upper bound on the number of models AutoML builds.
        max_runtime_secs: Upper bound on the AutoML run time, in seconds.
        seed: Seed passed to AutoML.

    Returns:
        A new DataFrame with the 'uid' of each predicted row and its predicted
        'score', indexed like the corresponding rows of ``data``.
    """
    train = data[:n_train]
    test = data[n_train:]

    excluded_columns = ('uid', 'score')
    features_columns = [column for column in train.columns
                        if column not in excluded_columns]

    # H2O takes the target as a column of the training frame, so only 'uid'
    # is removed here; 'score' stays in.
    train_frame = h2o.H2OFrame(train.drop(['uid'], axis=1))

    aml = H2OAutoML(max_models=max_models, seed=seed,
                    max_runtime_secs=max_runtime_secs)
    aml.train(x=features_columns, y='score', training_frame=train_frame)

    test_frame = h2o.H2OFrame(test.drop(['score', 'uid'], axis=1))
    y_pred = aml.predict(test_frame).as_data_frame().values.flatten()

    # Copy before adding a column: the selection is derived from `data`, and
    # writing into it directly is a SettingWithCopy-style chained write.
    result = test[['uid']].copy()
    result['score'] = y_pred

    return result

In [18]:
# Run AutoML on the engineered frame; `result` holds uid + predicted score for
# the test rows. This cell is long-running (AutoML is given a 12800 s budget).
result = evaluation(data)

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
stackedensemble prediction progress: |████████████████████████████████████| 100%


In [19]:
# Display the predictions (uid and predicted score per test row).
result

Unnamed: 0,uid,score
50000,7171737d49b143d1b38883a39e4a5730,601.878841
50001,3af0a449d5424488912e8fb2bf4b9faa,531.419246
50002,eb2cf02e0d5c4d1294dd73e776dbb441,670.045097
50003,9c0f780ecb254670a11aa9e3f10777c5,678.396684
50004,d794eed46c1e44f785a575f18b3023a5,661.549182
50005,18f6a7d824a1421b9da3e5f10854c3df,612.693601
50006,e7b63888a36f499a88811c0936bb12df,639.857407
50007,f995ef4d96fc426191a104421b0f5b20,566.187332
50008,11a3ce45ce234f7db4e91e5f152dc8b8,672.994952
50009,86ebeac087284c87bbcb1281d95ef9eb,583.787336


In [21]:
# Write the predictions to CSV without the DataFrame index column.
result.to_csv('../result/auto_ml.csv', index=False)