In [15]:
%matplotlib inline

from datetime import datetime

from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import skew
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from mlxtend.regressor import StackingRegressor
from sklearn.preprocessing import scale
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from mlxtend.regressor import StackingRegressor
from catboost import CatBoostRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_classif
from scipy.stats import zscore


import warnings
warnings.filterwarnings('ignore')


# 用于正常显示中文
plt.rcParams['font.sans-serif'] = ['SimHei']
# 用来正常显示负号
plt.rcParams['axes.unicode_minus'] = False
# 设置图片清晰度
%config InlineBackend.figure_format = 'retina'  

In [16]:
train = pd.read_csv('../original_data/train_dataset.csv')
test = pd.read_csv('../original_data/test_dataset.csv')

In [17]:
data = pd.concat([train, test]).reset_index(drop=True)

### 对数据进行截断

In [4]:
columns = ['当月视频播放类应用使用次数', '用户当月账户余额（元）', '当月物流快递类应用使用次数', '当月飞机类应用使用次数']
for col in columns:
    up = np.percentile(data[col].values, 99.99)
    down = np.percentile(data[col].values,0.01)
    data.loc[data[col]>up, col] = up
    data.loc[data[col]<down, col] = down

### 对网龄进行年份化

In [18]:
data['用户网龄（年）'] = data['用户网龄（月）'] / 12.0
data.drop(['用户网龄（月）'], axis=1, inplace=True)

### 对缴费方式进行提取

In [19]:
def extract_way(s):
    if s == 0:
        return -1
    elif s % 10 == 0:
        return 1
    else:
        return 0

data['充值方式'] = data['缴费用户最近一次缴费金额（元）'].map(extract_way)

### 对数据进行惩罚修改
1. 网龄 > 年龄
2. 经常逛商场的人的逛商场的次数 < 不经常逛商场的人的次数

In [20]:
def compare_age_by_network_age(row):
    if row['用户网龄（年）'] >= row['用户年龄']:
        return 0.0
    else:
        return row['用户年龄']
    

data['用户年龄'] = data.apply(compare_age_by_network_age, axis=1)

In [21]:
data.loc[data[(data['是否经常逛商场的人'] == 1) & (
        data['近三个月月均商场出现次数'] < data[(data['是否经常逛商场的人'] == 1)][
    '近三个月月均商场出现次数'].quantile(0.15))].index, "是否经常逛商场的人"] = 0
data.loc[data[(data['是否经常逛商场的人'] == 0) & (
        data['近三个月月均商场出现次数'] > data[(data['是否经常逛商场的人'] == 1)][
    '近三个月月均商场出现次数'].quantile(0.85))].index, "是否经常逛商场的人"] = 1

### 特征交叉

In [22]:
data['当月账单超过平均消费'] = data['用户账单当月总费用（元）'] - data['用户近6个月平均消费值（元）']
data['用户近5个月平均消费值（元）'] = data['用户近6个月平均消费值（元）']*6-data['用户账单当月总费用（元）']

In [23]:
data['交通APP'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']     

### 丢弃列

In [24]:
data.drop(['是否大学生客户', '用户实名制是否通过核实', '用户最近一次缴费距今时长（月）', 
           '当月是否到过福州山姆会员店', '当月是否逛过福州仓山万达', '当月火车类应用使用次数',
           '当月飞机类应用使用次数'], axis=1, inplace=True)

### 数据放缩

In [25]:
log_feats = ['当月通话交往圈人数', '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '交通APP'] 
for col in log_feats:
    data[col] = data[col].map(lambda x: np.log1p(x))

### 模型训练

In [29]:
feature_importance_df = pd.DataFrame()

In [55]:
def evaluation(data):
    n_folds = 5
    train = data[:50000]
    test = data[50000:]
    cat_params = {
        'n_estimators': 10000,
        'learning_rate': 0.01,
        'random_seed': 4590,
        'reg_lambda': 5,
        'subsample': 0.7,
        'bootstrap_type': 'Bernoulli',
        'boosting_type': 'Plain',
        'one_hot_max_size': 10,
        'rsm': 0.5,
        'leaf_estimation_iterations': 5,
        'use_best_model': True,
        'max_depth': 6,
        'verbose': -1,
        'thread_count': 4
    }
    train_y = train['信用分']
    train_x = train.drop(['信用分', '用户编码'], axis=1)
    
    test_x = test.drop(['信用分', '用户编码'], axis=1)
    model = CatBoostRegressor(**cat_params)
    kfold = KFold(n_splits=10, shuffle=True, random_state=89)
    kf = kfold.split(train_x, train_y)
    valid_best_l2_all = 0
    test_name = test[['用户编码']]
    cv_pred = np.zeros(test.shape[0])
    oof = np.zeros(train.shape[0])
    count = 0
    global feature_importance_df
    for i, (train_fold, validate) in enumerate(kf):
        print('fold: ',i, ' training')
        # 切分数据
        X_train, X_test, y_train, y_test = train_x.iloc[train_fold, :], train_x.iloc[validate, :], train_y[train_fold], train_y[validate]


        bst = model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)],
                          early_stopping_rounds=200, verbose=False)
        iteration_kwargs = {'ntree_end': bst.best_iteration_}
        cv_pred += bst.predict(test_x, **iteration_kwargs)
        oof[validate] = bst.predict(X_test, **iteration_kwargs)
       
    mae = mean_absolute_error(train_y, oof)
    print("mae:", 1/(1+mae))
    cv_pred /= n_folds
    result = test_name.copy()
#     result.columns = ['id']
#     result['score'] = cv_pred
#     result['score'] = result['score'].map(round)
#     result.to_csv('../result/result_mse.csv', index=None)
    return cv_pred
    # 0.06392  10折
    # 0.06381  5折

In [56]:
cv_pred = evaluation(data)

fold:  0  training
fold:  1  training
fold:  2  training
fold:  3  training
fold:  4  training
fold:  5  training
fold:  6  training
fold:  7  training
fold:  8  training
fold:  9  training
mae: 0.06400876754020883


In [70]:
test_name = test[['用户编码']]

In [71]:
result = test_name.copy()
result.columns = ['id']

In [72]:
result['score'] = cv_pred
# result['score'] = result['score'].map(round)

In [73]:
result.to_csv('../result/result_cat63799.csv', index=None)