In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 100
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy import stats
# from funset import funcs
import seaborn as sns
import sys
from sklearn import tree
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error


In [45]:
# 1. read data
class model_lgb:
    def __init__(self, path):
        self.data = pd.read_csv(path)
        self.predictors = [x for x in self.data.columns if x not in ['用户编码', '信用分']]
        self.X_train_val, self.X_test, self.y_train_val, self.y_test = 0, 0, 0, 0
        
    def get_train_val(self):
        self.X_train_val = self.data.loc[self.data['信用分'] != -1, self.predictors]
        self.y_train_val = self.data.loc[self.data['信用分'] != -1, '信用分']
        self.X_test = self.data.loc[self.data['信用分'] == -1, self.predictors]
        self.y_test = self.data.loc[self.data['信用分'] == -1, '信用分']
        
    def split_train_test(self):
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.X_train_val, self.y_train_val, train_size = 0.7)
        
    def lgb_modeling(self):
        # 2. modeling
        # (1) create datasets for lightgbm
        lgb_train = lgb.Dataset(self.X_train, self.y_train)
        lgb_eval = lgb.Dataset(self.X_val, self.y_val, reference=lgb_train)

        # (2) specify your configuration as a dict
        params = {
            'max_depth' : 4,
            'boosting_type' : 'gbdt',
            'objective' : 'regression_l1',
            'num_leaves' : 31,
            'metric' : {'l1', 'mae'},
            'learning_rate' : 0.05,
            'feature_fraction' : 0.9,
            'bagging_fraction' : 0.8,
            'bagging_freq' : 5,
            'verbose' : 1
        }

        # (3) train & save
        print("Starting training ...")
        gbm = lgb.train(params, lgb_train, num_boost_round=1000, valid_sets=lgb_eval, early_stopping_rounds=500, verbose_eval=0)
        print("Saving Model ...")
        gbm.save_model('../model/lgb_model.txt')

        # (4) predict
        y_train_pre = gbm.predict(self.X_train, num_iteration=gbm.best_iteration)
        y_val_pre = gbm.predict(self.X_val, num_iteration=gbm.best_iteration)

        print("The Train MAE of prediction is: %s" % mean_absolute_error(self.y_train, y_train_pre))
        print("The Test MAE of prediction is: %s" % mean_absolute_error(self.y_val, y_val_pre))
        y_test_pre = gbm.predict(self.X_test)
        
        return self.y_val.values, y_val_pre
    
def case(file):
    path = "../data/all_data_fea%s.csv" % file
    r1 = model_lgb(path)
    r1.get_train_val()
    r1.split_train_test()
    return r1.lgb_modeling()
    

def stacking_linear():
    y_val_ave, y_val_pre_ave = 0.0, 0.0
    weights = [.55,.2,.2,.05]
    for no, file in enumerate([1,2,3,4]):
        temp_y_val, temp_y_val_pre = case(file)    
        y_val_ave += temp_y_val * weights[no]/4
        y_val_pre_ave += temp_y_val_pre * weights[no]/4
    print()
    print("The Test MAE of prediction by Linear Stacking is: %s" % mean_absolute_error(y_val_ave, y_val_pre_ave))  

    return y_val_ave, y_val_pre_ave   

In [46]:
path = "../data/all_data_fea4.csv" 
r1 = model_lgb(path)
r1.get_train_val()
r1.split_train_test()
y_val_ave, y_val_pre_ave = r1.lgb_modeling()



Starting training ...
Saving Model ...
The Train MAE of prediction is: 13.186372066853146
The Test MAE of prediction is: 14.698851424284522


In [47]:
y_val_ave, y_val_pre_ave = stacking_linear()



Starting training ...
Saving Model ...
The Train MAE of prediction is: 13.27721804895076
The Test MAE of prediction is: 14.766324180098056
Starting training ...
Saving Model ...
The Train MAE of prediction is: 13.325513749470767
The Test MAE of prediction is: 14.665039355085192
Starting training ...
Saving Model ...
The Train MAE of prediction is: 13.385056731980264
The Test MAE of prediction is: 14.67741551021617
Starting training ...
Saving Model ...
The Train MAE of prediction is: 13.19699260185436
The Test MAE of prediction is: 14.836995472797213

The Test MAE of prediction by Linear Stacking is: 2.3112380857350883
