In [1]:
%matplotlib inline

from datetime import datetime

from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import skew
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from scipy.special import inv_boxcox1p
from mlxtend.regressor import StackingRegressor
from sklearn.preprocessing import scale
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from mlxtend.regressor import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectPercentile, f_classif
from scipy.stats import zscore


import warnings
warnings.filterwarnings('ignore')


# 用于正常显示中文
plt.rcParams['font.sans-serif'] = ['SimHei']
# 用来正常显示负号
plt.rcParams['axes.unicode_minus'] = False
# 设置图片清晰度
%config InlineBackend.figure_format = 'retina'  

In [2]:
train = pd.read_csv('../original_data/train_dataset.csv')
test = pd.read_csv('../original_data/test_dataset.csv')

In [3]:
data = pd.concat([train, test]).reset_index(drop=True)

### 对网龄进行年份化

In [4]:
def conversion_month_year(month):
    try:
        return month / 12.0
    except Exception:
        return 0.0
    
    
data['用户网龄（年）'] = data['用户网龄（月）'].map(conversion_month_year)
data.drop(['用户网龄（月）'], axis=1, inplace=True)

### 对缴费方式进行提取

In [5]:
data['充值方式'] = 0
data['充值方式'][(data['缴费用户最近一次缴费金额（元）'] % 10 == 0) & 
             data['缴费用户最近一次缴费金额（元）'] != 0] = 1

### 对数据进行惩罚修改
1. 网龄 > 年龄 或者 年龄 为0
2. 经常逛商场的人的逛商场的次数 < 不经常逛商场的人的次数

In [6]:
def compare_age_by_network_age(row):
    if row['用户网龄（年）'] >= row['用户年龄']:
        return 0.0
    else:
        return row['用户年龄']
    

data['用户年龄'] = data.apply(compare_age_by_network_age, axis=1)

In [7]:
data.loc[data[(data['是否经常逛商场的人'] == 1) & (
        data['近三个月月均商场出现次数'] < data[(data['是否经常逛商场的人'] == 1)][
    '近三个月月均商场出现次数'].quantile(0.15))].index, "是否经常逛商场的人"] = 0
data.loc[data[(data['是否经常逛商场的人'] == 0) & (
        data['近三个月月均商场出现次数'] > data[(data['是否经常逛商场的人'] == 1)][
    '近三个月月均商场出现次数'].quantile(0.85))].index, "是否经常逛商场的人"] = 1

In [8]:
data['用户话费敏感度'].replace(data['用户话费敏感度'].mode(), inplace=True)

### 特征交叉

In [9]:
data['最近账单稳定性'] = data['用户账单当月总费用（元）'] / (data['用户近6个月平均消费值（元）'] + 0.001)
data['用户近5个月平均消费值（元）'] = (data['用户近6个月平均消费值（元）']*6-data['用户账单当月总费用（元）'])/5

In [10]:
data.drop(['是否大学生客户', '用户实名制是否通过核实', '用户最近一次缴费距今时长（月）', 
           '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '当月火车类应用使用次数',
           '当月飞机类应用使用次数'], axis=1, inplace=True)

### 模型训练

In [11]:
feature_importance_df = pd.DataFrame()

In [12]:
def evaluation(data):
    n_folds = 7
    train = data[:50000]
    test = data[50000:]
    lgb_mae = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mae',
        'feature_fraction': 0.6,
        'bagging_fraction': 0.8,
        'bagging_freq': 2,
        'num_leaves': 28,
        'verbose': -1,
        'max_depth': 5,
        'lambda_l2': 10,
        'lambda_l1': 4,
        'min_data_in_leaf': 35,
        'nthread': 4,
    }
    lgb_mse = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'regression_l2',
        'metric': 'mae',
        'feature_fraction': 0.5,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'num_leaves': 43,  # 45
        'verbose': -1,
        'max_depth': 6,
        'lambda_l2': 5,
        'lambda_l1': 2,
        'min_data_in_leaf': 30,
        'nthread': 4,
    }
    train_y = train['信用分']
    train_x = train.drop(['信用分', '用户编码'], axis=1)
    test_x = test.drop(['信用分', '用户编码'], axis=1)
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=89)
    kf = kfold.split(train_x, train_y)
    valid_best_l2_all = 0
    test_name = test[['用户编码']]
    cv_pred = np.zeros(test.shape[0])
    offline_pred = np.zeros(train.shape[0])
    count = 0
    global feature_importance_df
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ',i, ' training')
        # 切分数据
        X_train, X_test, y_train, y_test = train_x.iloc[train_fold, :], train_x.iloc[validate, :], train_y[train_fold], train_y[validate]

        # 载入数据集
        dtrain = lgb.Dataset(X_train, y_train)
        dvalid = lgb.Dataset(X_test, y_test, reference=dtrain)

        bst = lgb.train(lgb_mae, dtrain, num_boost_round=100000, valid_sets=dvalid, 
                        verbose_eval=-1, early_stopping_rounds=100)
        # 训练
        offline_pred[validate] = bst.predict(X_test, num_iteration=bst.best_iteration)
        cv_pred += bst.predict(test_x, num_iteration=bst.best_iteration)
        valid_best_l2_all += bst.best_score['valid_0']['l1']
        
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = list(X_train.columns)
        fold_importance_df["importance"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)
        fold_importance_df["fold"] = count + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        count += 1
    valid_best_l2_all /= n_folds
    cv_pred /= n_folds
    result = test_name.copy()
    result.columns = ['id']
    result['score'] = cv_pred
#     result['score'] = result['score'].map(round)
    result.to_csv('../result/result_mae6391.csv', index=None)
    print('cv score for valid is: ', 1/(1+valid_best_l2_all))
    return offline_pred
    # 0.06392 mae
    # 0.06381 mse
    # 0.06398101910912465

In [13]:
offline_pred = evaluation(data)

fold:  0  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3474]	valid_0's l1: 14.6997
fold:  1  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2430]	valid_0's l1: 14.7592
fold:  2  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[2761]	valid_0's l1: 14.6436
fold:  3  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3453]	valid_0's l1: 14.8064
fold:  4  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3476]	valid_0's l1: 14.4598
fold:  5  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3073]	valid_0's l1: 14.6095
fold:  6  training
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[3265]	

In [14]:
feature_importance_df[feature_importance_df['fold'] == 1].sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance,fold
7,当月通话交往圈人数,738722.945263,1
20,用户网龄（年）,473035.685519,1
23,用户近5个月平均消费值（元）,198918.894537,1
16,用户近6个月平均消费值（元）,168560.298453,1
12,用户年龄,115154.507431,1
15,用户账单当月总费用（元）,57997.2395,1
8,当月金融理财类应用使用总次数,53561.434864,1
22,最近账单稳定性,43753.880514,1
6,当月视频播放类应用使用次数,42216.903593,1
18,缴费用户最近一次缴费金额（元）,35624.749925,1


In [15]:
data_tmp = data.copy()