In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
import time
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

#特征重要性
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()
 
#聚合求count
def feature_count(data, features=[]):
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i.replace('add_', '')
    try:
        del data[new_feature]
    except:
        pass
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    new_num_features.append(new_feature)
    return data    

In [2]:
train = pd.read_csv('../input/train_dataset.csv')
test = pd.read_csv('../input/test_dataset.csv')

#删除id
test_id = test['用户编码'].copy()

train.drop("用户编码", axis = 1, inplace = True)
test.drop("用户编码", axis = 1, inplace = True)

label = train['信用分'].copy()
train.drop(['信用分'], axis=1, inplace=True)


#特征工程
data = pd.concat([train, test])

#原始的类别和数值特征
ori_cat_features = ['用户实名制是否通过核实','是否大学生客户','是否黑名单客户','是否4G不健康客户','缴费用户当前是否欠费缴费','是否经常逛商场的人','当月是否逛过福州仓山万达',
                    '当月是否到过福州山姆会员店','当月是否看电影','当月是否景点游览','当月是否体育场馆消费']
ori_num_features = ['用户年龄','用户网龄（月）','用户最近一次缴费距今时长（月）','缴费用户最近一次缴费金额（元）','用户近6个月平均消费值（元）','用户账单当月总费用（元）',
                    '用户当月账户余额（元）','用户话费敏感度','当月通话交往圈人数','近三个月月均商场出现次数','当月网购类应用使用次数','当月物流快递类应用使用次数',
                    '当月金融理财类应用使用总次数','当月视频播放类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数','当月旅游资讯类应用使用次数']
ori_col = data.columns.tolist()
                    
#对年龄异常值取众数填充
data.loc[data['用户年龄']==0, '用户年龄'] = data['用户年龄'].mode()

#年龄特征
data['网龄/年龄'] = data['用户网龄（月）'] / data['用户年龄']
data['网龄年龄差'] = data['用户年龄'] - data['用户网龄（月）']/12

#对金额相关特征做组合
data['缴费金额是否能覆盖当月账单'] = data['缴费用户最近一次缴费金额（元）'] - data['用户账单当月总费用（元）']
data['最近一次交费是否超过平均消费额'] = data['缴费用户最近一次缴费金额（元）'] - data['用户近6个月平均消费值（元）']
data['当月账单是否超过平均消费额'] = data['用户账单当月总费用（元）'] - data['用户近6个月平均消费值（元）']
data['缴费习惯'] = data['缴费用户最近一次缴费金额（元）'] / (data['用户近6个月平均消费值（元）'] + 0.001)
data['通话人均花费'] = data['用户账单当月总费用（元）'] / (data['当月通话交往圈人数']+1)
data['近半年账单'] = data['用户近6个月平均消费值（元）']*6 + data['用户账单当月总费用（元）']
data['最近账单稳定性'] = data['用户账单当月总费用（元）'] / (data['用户近6个月平均消费值（元）'] + 0.001)
data['费用/余额'] = data['用户账单当月总费用（元）'] / (data['缴费用户最近一次缴费金额（元）'] + 0.001)
data['账户余额利用率'] = data['用户账单当月总费用（元）'] / (data['用户当月账户余额（元）'] + 0.001)

#对次数特征做组合
data['交通类应用使用次数'] = data['当月飞机类应用使用次数'] + data['当月火车类应用使用次数']

new_num_features = [i for i in data.columns.tolist() if i not in ori_col]

#充值金额是整数，和小数，应该对应不同的充值途径
def top_up_amount_method(s):
    
    if(s == 0):
        return 0
    elif(s % 10 == 0):
        return 1
    elif((s / 0.998) % 10 ==0):
        return 2
    else:
        return 3
data['充值方式1'] = data['缴费用户最近一次缴费金额（元）'].apply(top_up_amount_method)

def real_top_up_amount(s):
    if((s / 0.998) % 10 ==0):
        return s/0.998
    else:
        return s
data["充值方式2"] = data['缴费用户最近一次缴费金额（元）'].apply(real_top_up_amount)

#对类别特征进行组合，是否可以得出更好的结果
data['是否去过高档商场'] = data['当月是否逛过福州仓山万达'] + data['当月是否到过福州山姆会员店']
data['是否去过高档商场'] = data['是否去过高档商场'].map(lambda x:1 if x>=1 else 0)
data['是否_商场_电影'] = data['是否去过高档商场'] * data['当月是否看电影']
data['是否_商场_旅游'] = data['是否去过高档商场'] * data['当月是否景点游览']
data['是否_商场_体育馆'] = data['是否去过高档商场'] * data['当月是否体育场馆消费']
data['是否_电影_体育馆'] = data['当月是否看电影'] * data['当月是否体育场馆消费']
data['是否_电影_旅游'] = data['当月是否看电影'] * data['当月是否景点游览']
data['是否_旅游_体育馆'] = data['当月是否景点游览'] * data['当月是否体育场馆消费']
data['是否_商场_旅游_体育馆'] = data['是否去过高档商场'] * data['当月是否景点游览'] * data['当月是否体育场馆消费']
data['是否_商场_电影_体育馆'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否体育场馆消费']
data['是否_商场_电影_旅游'] = data['是否去过高档商场'] * data['当月是否看电影'] * data['当月是否景点游览']
data['是否_体育馆_电影_旅游'] = data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']
data['是否_商场_体育馆_电影_旅游'] = data['是否去过高档商场'] * data['当月是否体育场馆消费'] * data['当月是否看电影'] * data['当月是否景点游览']

new_cat_features = [i for i in data.columns.tolist() if i not in ori_col and i not in new_num_features]

#对一些特征分段
discretize_features=['交通类应用使用次数','当月物流快递类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数','当月旅游资讯类应用使用次数']
def map_discretize(x):
        if x==0:
            return 0
        elif x<=5:
            return 1
        elif x<=15:
            return 2
        elif x<=50:
            return 3
        elif x<=100:
            return 4
        else:
            return 5
        
for col in discretize_features:
    data[col]=data[col].map(lambda x:map_discretize(x))

#离散化
transform_value_feature=['用户年龄','用户网龄（月）','当月通话交往圈人数','近三个月月均商场出现次数','当月网购类应用使用次数','当月物流快递类应用使用次数'
                            ,'当月金融理财类应用使用总次数','当月视频播放类应用使用次数','当月飞机类应用使用次数','当月火车类应用使用次数','当月旅游资讯类应用使用次数']
user_fea=['缴费用户最近一次缴费金额（元）','用户近6个月平均消费值（元）','用户账单当月总费用（元）','用户当月账户余额（元）']
log_features=['当月网购类应用使用次数','当月金融理财类应用使用总次数','当月物流快递类应用使用次数','当月视频播放类应用使用次数']

for col in transform_value_feature+log_features:
    #取出最高99.9%值
    ulimit=np.percentile(data[col].values,99.9)
    #取出最低0.1%值
    llimit=np.percentile(data[col].values,0.1)
    data.loc[data[col]>ulimit,col]=ulimit
    data.loc[data[col]<llimit,col]=llimit
    
for col in user_fea+transform_value_feature+log_features:
    data[col]=data[col].map(lambda x:np.log1p(x))
    
#聚合特征
# data = feature_count(data, ['用户年龄'])
# data = feature_count(data, ['用户网龄（月）'])
# data = feature_count(data, ['用户最近一次缴费距今时长（月）'])
# data = feature_count(data, ['缴费用户最近一次缴费金额（元）'])
# data = feature_count(data, ['用户近6个月平均消费值（元）'])
# data = feature_count(data, ['用户账单当月总费用（元）'])
# data = feature_count(data, ['用户话费敏感度'])
# data = feature_count(data, ['当月通话交往圈人数'])
# data = feature_count(data, ['近三个月月均商场出现次数'])
# data = feature_count(data, ['最近一次交费是否超过平均消费额'])
# data = feature_count(data, ['当月账单是否超过平均消费额'])

# data = feature_count(data, ['用户话费敏感度','用户年龄'])
# data = feature_count(data, ['用户话费敏感度','用户网龄（月）'])
# data = feature_count(data, ['用户话费敏感度','用户最近一次缴费距今时长（月）'])
# data = feature_count(data, ['用户话费敏感度','缴费用户最近一次缴费金额（元）'])
# data = feature_count(data, ['用户话费敏感度','用户近6个月平均消费值（元）'])
# data = feature_count(data, ['用户话费敏感度','用户账单当月总费用（元）'])
# data = feature_count(data, ['用户话费敏感度','当月通话交往圈人数'])
# data = feature_count(data, ['用户话费敏感度','近三个月月均商场出现次数'])
# data = feature_count(data, ['用户话费敏感度','最近一次交费是否超过平均消费额'])
# data = feature_count(data, ['用户话费敏感度','当月账单是否超过平均消费额'])

# #聚合其他列的特征
# sparse_feature = ['用户年龄','用户网龄（月）','用户最近一次缴费距今时长（月）','用户话费敏感度']
# dense_feature = ['缴费用户最近一次缴费金额（元）','用户近6个月平均消费值（元）','用户账单当月总费用（元）',
#                     '用户当月账户余额（元）']

# def get_new_columns(name,aggs):
#     l=[]
#     for k in aggs.keys():
#         for agg in aggs[k]:
#             if str(type(agg))=="<class 'function'>":
#                 l.append(name + '_' + k + '_' + 'other')
#             else:
#                 l.append(name + '_' + k + '_' + agg)
#     return l
# for d in tqdm(sparse_feature):
#     aggs={}
#     for s in sparse_feature:
#         aggs[s]=['count','nunique']
#     for den in dense_feature:
#         aggs[den]=['mean','max','min','std']
#     aggs.pop(d)
#     temp=data.groupby(d).agg(aggs).reset_index()
#     temp.columns=[d]+get_new_columns(d,aggs)
#     new_num_features.append(get_new_columns(d,aggs))
#     data=pd.merge(data,temp,on=d,how='left')

#记录特征
cat_features = new_cat_features + ori_cat_features
num_features = new_num_features + ori_num_features

for i in cat_features:
    data[i] = data[i].astype('category')
for i in num_features:
    data[i] = data[i].astype('float')

# #类别特征做one-hot    
# for feature in cat_features:
#     try:
#         data[feature] = LabelEncoder().fit_transform(data[feature].apply(int))
#     except:
#         data[feature] = LabelEncoder().fit_transform(data[feature])    
 

train = data[:train.shape[0]]
test = data[train.shape[0]:]    


# train_x=train[num_features]
# test_x=test[num_features]
# enc = OneHotEncoder()
# for feature in cat_features:
#     enc.fit(data[feature].values.reshape(-1, 1))
#     train_a= enc.transform(train[feature].values.reshape(-1, 1))
#     test_a = enc.transform(test[feature].values.reshape(-1, 1))
#     train= sparse.hstack((train_x, train_a), 'csr')
#     test = sparse.hstack((test_x, test_a), 'csr')
    
    
# #CountVectorizer()特征,在本题中没有合适的量
# vector_feature = []
# # for i in vector_feature:
# #     data[i] = data[i].astype('str')
# #     train[i] = train[i].astype('str')
# #     test[i] = test[i].astype('str')

    

# cv=CountVectorizer()
# for feature in vector_feature:
#     cv.fit(data[feature])
#     train_a = cv.transform(train[feature])
#     test_a = cv.transform(test[feature])
#     train = sparse.hstack((train_x, train_a), 'csr')
#     test = sparse.hstack((test_x, test_a), 'csr')


#开始训练
# kf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=False)
# training_time = 0 
# feature_importance_df = pd.DataFrame()
# best_score = []
# sub_list = []

# clf = lgb.LGBMRegressor(
#           boosting_type='gbdt', num_leaves=31, reg_alpha=2.2, reg_lambda=1.5,
#           max_depth=-1, n_estimators=2000,
#           subsample=0.8, colsample_bytree=0.7, subsample_freq=1,
#           learning_rate=0.03, random_state=2019, n_jobs=-1)

# for i, (train_index, val_index) in enumerate(kf.split(train, label)):
#      t0 = time.time()
#      X_train, y_train = train.loc[train_index,:], label[train_index]
#      X_val, y_val     = train.loc[val_index,:],   label[val_index]
#      #X_train, y_train = train[train_index], label[train_index]
#      #X_val,    y_val   = train[val_index],   label[val_index]
#      #clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],\
#       #        eval_metric='mae', early_stopping_rounds=200, verbose=200, categorical_feature=cat_features)
#      clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)],\
#               eval_metric='mae', early_stopping_rounds=200, verbose=200)
#      pred_val = clf.predict(X_val, num_iteration=clf.best_iteration_)
#      vali_mae = mean_absolute_error(y_val, np.round(pred_val))
#      best_score.append(1/(1+vali_mae))
#      pred_test = clf.predict(test,num_iteration=clf.best_iteration_)
     
#      fold_importance_df = pd.DataFrame()
#      fold_importance_df["feature"] = list(X_train.columns)
#      fold_importance_df["importance"] = clf.feature_importances_
#      fold_importance_df["fold"] = i + 1
#      feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
     
#      sub_list.append(pred_test)
#      t = (time.time() - t0) / 60
#      training_time += t
     
#      print("This round cost time:{:.2f} minutes, lgb scor:{:.8f},\n".format(t, 1/(1+vali_mae)))
        
# pred_test = np.mean(np.array(sub_list), axis=0)
# print(best_score, '\n', np.mean(best_score), np.std(best_score))
# print("Total training time cost:{:.2f} minutes".format(training_time))  


In [3]:
for i in cat_features:
    train[i] = train[i].astype('int')
    test[i]  = test[i].astype('int')
    
train = train.fillna(0)
test = test.fillna(0)

In [4]:
from datetime import datetime
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [5]:
X = train.copy()
y =label

kfolds = KFold(n_splits=5, shuffle=True, random_state=42)

def mae(y, y_pred):
    return mean_absolute_error(y, y_pred)

def cv_mae(model, X=X):
    mae = -cross_val_score(model, X, y, scoring="neg_mean_absolute_error", cv=kfolds)
    return (mae)

In [6]:
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [7]:
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=1e7, alphas=alphas2, random_state=42, cv=kfolds))
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=1e7, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio))                                
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

In [8]:
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =42)

In [9]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       )

In [10]:
xgboost = XGBRegressor(learning_rate=0.01,n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)

In [11]:
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

In [12]:
score = cv_mae(ridge , X)
print("Ridge: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

Ridge: 16.0686 (0.0712)
 2019-03-21 06:39:18.573581


In [13]:
score = cv_mae(lasso , X)
print("LASSO: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

LASSO: 16.0692 (0.0711)
 2019-03-21 06:40:15.861821


In [14]:
score = cv_mae(elasticnet)
print("elastic net: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

elastic net: 16.0687 (0.0713)
 2019-03-21 06:42:20.960286


In [15]:
score = cv_mae(svr)
print("SVR: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

SVR: 16.2203 (0.0821)
 2019-03-21 07:07:07.757559


In [16]:
score = cv_mae(gbr)
print("gbr: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

gbr: 14.7810 (0.0802)
 2019-03-21 07:16:09.629010


In [17]:
score = cv_mae(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

lightgbm: 14.8736 (0.0766)
 2019-03-21 07:16:52.641360


In [18]:
score = cv_mae(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()), datetime.now(), )

xgboost: 14.7975 (0.0756)
 2019-03-21 07:24:22.152527


In [19]:
print('START Fit')

print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)

print('Lasso')
lasso_model_full_data = lasso.fit(X, y)

print('Ridge')
ridge_model_full_data = ridge.fit(X, y)

print('Svr')
svr_model_full_data = svr.fit(X, y)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)

print('xgboost')
xgb_model_full_data = xgboost.fit(X, y)

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)

START Fit
stack_gen
elasticnet
Lasso
Ridge
Svr
GradientBoosting
xgboost
lightgbm


In [20]:
def blend_models_predict(X):
    return ((0.05 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.05 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.15 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.15 * lgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))

In [21]:
print('MAE score on train data:')
print(mae(y, blend_models_predict(X)))
predictions = blend_models_predict(test)

MAE score on train data:
13.972378983793371


In [22]:
test_data_sub1 = pd.DataFrame()
test_data_sub1['id'] = test_id
test_data_sub1['score'] =  predictions
test_data_sub1.columns = ['id','score']

test_data_sub1['score'] = test_data_sub1['score'].apply(lambda x: int(np.round(x)))
test_data_sub1[['id','score']].to_csv('lgb_xgb_stacking.csv', index=False)