对数据建模：划分数据并进行5折交叉验证，对LGB模型进行训练，并计算MSE评估模型性能

In [8]:
# Imports used by this cell, gathered in one place.
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Load the tab-separated train/test tables.
train_data2 = pd.read_csv('zhengqi_train.txt', sep='\t')
test_data2 = pd.read_csv('zhengqi_test.txt', sep='\t')
# Feature matrix: the train columns that also appear in the test table; label: 'target'.
train_data2_f = train_data2[test_data2.columns].values
train_data2_target = train_data2['target'].values

# 5-fold cross-validation
Folds = 5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2019)
# Per-fold MSE on the fitting rows ('train_mse') and on the held-out rows ('test_mse')
MSE_DICT = {'train_mse': [], 'test_mse': []}
for fold_no, (fit_idx, holdout_idx) in enumerate(kf.split(train_data2_f), start=1):
    # Slice this fold's fitting and held-out rows.
    X_fit, X_holdout = train_data2_f[fit_idx], train_data2_f[holdout_idx]
    y_fit, y_holdout = train_data2_target[fit_idx], train_data2_target[holdout_idx]

    # A fresh regressor per fold so no state carries over between folds.
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )
    # Fit while monitoring both splits; stop once the monitored score has not
    # improved for 100 rounds, logging every 50 rounds.
    lgb_reg.fit(
        X=X_fit,
        y=y_fit,
        eval_set=[(X_fit, y_fit), (X_holdout, y_holdout)],
        eval_names=['Train', 'Test'],
        early_stopping_rounds=100,
        eval_metric='MSE',
        verbose=50,
    )

    # Predict both splits with the best iteration found during fitting.
    best_iter = lgb_reg.best_iteration_
    fit_pred = lgb_reg.predict(X_fit, num_iteration=best_iter)
    holdout_pred = lgb_reg.predict(X_holdout, num_iteration=best_iter)

    print('第{}折'.format(fold_no))
    train_mse = mean_squared_error(fit_pred, y_fit)
    print('train_loss', train_mse, sep='\n')
    test_mse = mean_squared_error(holdout_pred, y_holdout)
    print('test_loss', test_mse, sep='\n')
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

# Average the per-fold scores.
print('train MSE', np.mean(MSE_DICT['train_mse']), sep='\n')
print('test MSE', np.mean(MSE_DICT['test_mse']), sep='\n')

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.428458	Test's l2: 0.466965
[100]	Train's l2: 0.221807	Test's l2: 0.263431
[150]	Train's l2: 0.135301	Test's l2: 0.183515
[200]	Train's l2: 0.0958737	Test's l2: 0.148825
[250]	Train's l2: 0.075642	Test's l2: 0.133187
[300]	Train's l2: 0.0629311	Test's l2: 0.125237
[350]	Train's l2: 0.054088	Test's l2: 0.121321
[400]	Train's l2: 0.0473551	Test's l2: 0.118307
[450]	Train's l2: 0.0418952	Test's l2: 0.116414
[500]	Train's l2: 0.0375119	Test's l2: 0.114863
[550]	Train's l2: 0.0337062	Test's l2: 0.114002
[600]	Train's l2: 0.0303868	Test's l2: 0.112781
[650]	Train's l2: 0.02756	Test's l2: 0.11207
[700]	Train's l2: 0.0250506	Test's l2: 0.111287
[750]	Train's l2: 0.0228758	Test's l2: 0.110742
[800]	Train's l2: 0.0209081	Test's l2: 0.1103
[850]	Train's l2: 0.0191684	Test's l2: 0.109946
[900]	Train's l2: 0.0176066	Test's l2: 0.109759
[950]	Train's l2: 0.0162243	Test's l2: 0.109423
[1000]	Train's l2: 0.014984	Test's l2

[1100]	Train's l2: 0.0128394	Test's l2: 0.0996427
[1150]	Train's l2: 0.0118688	Test's l2: 0.0996074
[1200]	Train's l2: 0.0110126	Test's l2: 0.0995783
[1250]	Train's l2: 0.0102254	Test's l2: 0.0995361
[1300]	Train's l2: 0.00951201	Test's l2: 0.0993949
[1350]	Train's l2: 0.00884199	Test's l2: 0.0992683
[1400]	Train's l2: 0.00822777	Test's l2: 0.0993064
[1450]	Train's l2: 0.00767529	Test's l2: 0.0992335
[1500]	Train's l2: 0.00712999	Test's l2: 0.0992797
Early stopping, best iteration is:
[1423]	Train's l2: 0.00797142	Test's l2: 0.0992124
第5折
train_loss
0.007971416523938925
test_loss
0.09921237825179019
train MSE
0.009311447531759892
test MSE
0.10904966011207792


经过特征优化后的数据建模

In [9]:
import pandas as pd

# Reload the tab-separated train/test tables for the feature-construction experiment.
train_data, test_data = (
    pd.read_csv(path, sep='\t')
    for path in ('zhengqi_train.txt', 'zhengqi_test.txt')
)

In [10]:
# Building blocks for feature construction.
# Small offset added to the divisor inside the 'div' operation.
e = 1e-5
# Pairwise (cross-feature) operations; each takes two operands and returns their combination.
func_dict = dict(
    add=lambda x, y: x + y,
    mins=lambda x, y: x - y,
    div=lambda x, y: x / (y + e),
    multi=lambda x, y: x * y,
)
# Build pairwise cross features
def auto_features_make(train_data,test_data,func_dict,col_list):
    """Append pairwise cross features to copies of the train and test frames.

    For every ordered pair ``(col_i, col_j)`` taken from ``col_list`` (including
    ``col_i == col_j``) and every ``(func_name, func)`` in ``func_dict``, a column
    named ``'<col_i>-<func_name>-<col_j>'`` holding ``func(data[col_i], data[col_j])``
    is added. Columns are generated in the order: ``col_i``, then ``col_j``, then
    ``func_dict`` order.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Input frames; neither is modified.
    func_dict : dict
        Maps an operation name to a two-argument callable.
    col_list : iterable of str
        Column names to combine; each must exist in both frames.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New train and test frames containing the original columns followed by the
        generated ones.
    """
    # Materialise once so that a one-shot iterable can be traversed repeatedly.
    col_list = list(col_list)

    augmented = []
    for data in (train_data, test_data):
        # Collect every generated column first and attach them in a single step:
        # inserting thousands of columns one by one fragments the frame.
        generated = {}
        for col_i in col_list:
            for col_j in col_list:
                for func_name, func in func_dict.items():
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    generated[col_func_features] = func(data[col_i], data[col_j])

        result = data.copy()
        if generated:
            generated = pd.DataFrame(generated, index=data.index)
            # A generated name that already exists overwrites that column in place,
            # exactly as plain column assignment would.
            clash = generated.columns.intersection(result.columns)
            if len(clash) > 0:
                result[clash] = generated[clash]
                generated = generated.drop(columns=clash)
            result = pd.concat([result, generated], axis=1)
        augmented.append(result)

    return augmented[0], augmented[1]
# Generate cross features for every column present in the test table
# (the train-only 'target' column is therefore not combined).
train_data2, test_data2 = auto_features_make(
    train_data=train_data,
    test_data=test_data,
    func_dict=func_dict,
    col_list=test_data.columns,
)

In [11]:
# Preview the first five rows of the augmented training table.
train_data2.head(n=5)

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V37-div-V35,V37-multi-V35,V37-add-V36,V37-mins-V36,V37-div-V36,V37-multi-V36,V37-add-V37,V37-mins-V37,V37-div-V37,V37-multi-V37
0,0.566,0.016,-0.143,0.407,0.452,-0.901,-1.812,-2.36,-0.436,-2.114,...,0.68771,17.894308,-6.116,-0.9,1.345097,9.148864,-7.016,0.0,1.000003,12.306064
1,0.968,0.437,0.066,0.566,0.194,-0.893,-1.566,-2.36,0.332,-2.114,...,-2.005439,-0.26572,-1.065,-0.395,2.17917,0.24455,-1.46,0.0,1.000014,0.5329
2,1.013,0.568,0.235,0.37,0.112,-0.797,-1.367,-2.36,0.396,-2.114,...,-1.618087,-0.214396,0.176,-1.354,-0.769925,-0.450585,-1.178,0.0,1.000017,0.346921
3,0.733,0.368,0.283,0.165,0.599,-0.679,-1.2,-2.086,0.403,-2.114,...,-0.307684,-0.040768,0.221,-0.445,-0.336326,-0.037296,-0.224,0.0,1.000089,0.012544
4,0.684,0.638,0.26,0.209,0.337,-0.454,-1.073,-2.086,0.314,-2.114,...,-0.076921,-0.010192,-0.308,0.252,0.100004,0.00784,-0.056,0.0,1.000357,0.000784


In [12]:
# Preview the first five rows of the augmented test table.
test_data2.head(n=5)

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V37-div-V35,V37-multi-V35,V37-add-V36,V37-mins-V36,V37-div-V36,V37-multi-V36,V37-add-V37,V37-mins-V37,V37-div-V37,V37-multi-V37
0,0.368,0.38,-0.225,-0.049,0.379,0.092,0.55,0.551,0.244,0.904,...,-2.042213,-0.07372,-0.179,0.955,-0.684315,-0.219996,0.776,0.0,0.999974,0.150544
1,0.148,0.489,-0.247,-0.049,0.122,-0.201,0.487,0.493,-0.127,0.904,...,12.98377,0.000832,-0.19,0.398,-0.353754,-0.030576,0.208,0.0,0.999904,0.010816
2,-0.166,-0.062,-0.311,0.046,-0.055,0.063,0.485,0.493,-0.227,0.904,...,71.036205,0.004552,0.942,0.196,1.525428,0.212237,1.138,0.0,0.999982,0.323761
3,0.102,0.294,-0.259,0.051,-0.183,0.148,0.474,0.504,0.01,0.904,...,48.813983,0.003128,-0.275,1.057,-0.587096,-0.260406,0.782,0.0,0.999974,0.152881
4,0.3,0.428,0.208,0.051,-0.033,0.116,0.408,0.497,0.155,0.904,...,-62.047441,-0.003976,-0.637,-0.357,3.550254,0.06958,-0.994,0.0,1.00002,0.247009


In [27]:
from sklearn.decomposition import PCA
# PCA dimensionality reduction
pca=PCA(n_components=500)
# Fit on the training features only. Features are selected by name: the generated
# columns are appended after 'target', so 'target' is not the last column and a
# positional slice such as iloc[:, 0:-1] would keep the label and drop a feature.
train_data2_pca=pca.fit_transform(train_data2[test_data2.columns])
# Project the test set onto the components learned from the training set; refitting
# on the test set would put the two tables in different component spaces.
test_data2_pca=pca.transform(test_data2)
train_data2_pca=pd.DataFrame(train_data2_pca)
test_data2_pca=pd.DataFrame(test_data2_pca)
# Re-attach the label to the reduced training table.
train_data2_pca['target']=train_data2['target']
# Un-reduced feature matrix / test table / label used by the modelling cell.
X_train2=train_data2[test_data2.columns].values
X_test2=test_data2
y_train=train_data2['target']

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5-fold cross-validation
Folds=5
kf=KFold(n_splits=Folds,shuffle=True,random_state=2019)
# Per-fold train/validation MSE, plus each fold model's predictions for the test table
MSE_DICT={'train_mse':[],'test_mse':[],'result':[]}
# Split on X_train2, the same matrix the folds are sliced from below.
for i,(train_index,test_index) in enumerate(kf.split(X_train2)):
    lgb_reg=lgb.LGBMRegressor(learning_rate=0.01,
                             max_depth=-1,
                             n_estimators=5000,
                             boosting_type='gbdt',
                             random_state=2019,
                             objective='regression',)
    X_train_KFold,X_test_KFold=X_train2[train_index],X_train2[test_index]
    # KFold yields positional row numbers, so slice the target by position as well.
    y_train_KFold,y_test_KFold=y_train.iloc[train_index],y_train.iloc[test_index]
    # Train the model, monitoring both splits; stop after 100 rounds without
    # improvement and log every 50 rounds.
    # NOTE(review): passing early_stopping_rounds/verbose to fit() is only accepted by
    # older LightGBM releases; newer ones expect callbacks instead -- confirm the
    # installed version before re-running.
    lgb_reg.fit(X=X_train_KFold,
               y=y_train_KFold,
               eval_set=[(X_train_KFold,y_train_KFold),
                        (X_test_KFold,y_test_KFold)],
               eval_names=['Train','Test'],
               early_stopping_rounds=100,
               eval_metric='MSE',
               verbose=50)
    # Predict the fitting rows and the held-out rows with the best iteration
    y_train_KFold_predict=lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict=lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_)
    print('第{}折'.format(i+1))
    train_mse=mean_squared_error(y_train_KFold_predict,y_train_KFold)
    print('train_loss')
    print(train_mse)
    test_mse=mean_squared_error(y_test_KFold_predict,y_test_KFold)
    print('test_loss')
    print(test_mse)
    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)
    # Keep this fold's predictions for the real test table; they are averaged later.
    test_result=lgb_reg.predict(test_data2,num_iteration=lgb_reg.best_iteration_)
    MSE_DICT['result'].append(test_result)
# Average the per-fold scores.
print('train MSE')
print(np.mean(MSE_DICT['train_mse']))
print('test MSE')
print(np.mean(MSE_DICT['test_mse']))

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.413128	Test's l2: 0.455026
[100]	Train's l2: 0.198017	Test's l2: 0.245523
[150]	Train's l2: 0.108839	Test's l2: 0.164432
[200]	Train's l2: 0.0683439	Test's l2: 0.132008
[250]	Train's l2: 0.0478354	Test's l2: 0.117217
[300]	Train's l2: 0.035836	Test's l2: 0.110572
[350]	Train's l2: 0.0279916	Test's l2: 0.10673
[400]	Train's l2: 0.0225218	Test's l2: 0.104686
[450]	Train's l2: 0.0183733	Test's l2: 0.103133
[500]	Train's l2: 0.0151476	Test's l2: 0.102168
[550]	Train's l2: 0.012598	Test's l2: 0.101216
[600]	Train's l2: 0.0105448	Test's l2: 0.100722
[650]	Train's l2: 0.00886925	Test's l2: 0.100606
[700]	Train's l2: 0.00751108	Test's l2: 0.100288
[750]	Train's l2: 0.00639588	Test's l2: 0.100224
[800]	Train's l2: 0.00547284	Test's l2: 0.100142
[850]	Train's l2: 0.00469886	Test's l2: 0.0999705
[900]	Train's l2: 0.00405206	Test's l2: 0.0997473
[950]	Train's l2: 0.00350702	Test's l2: 0.0997148
[1000]	Train's l2: 0.00

In [20]:
# Show the per-fold prediction arrays, how many folds there are,
# and how many predictions the first fold holds.
print(
    MSE_DICT['result'],
    len(MSE_DICT['result']),
    len(MSE_DICT['result'][0]),
    sep='\n',
)

[array([ 0.42571059,  0.47386542,  0.11604891, ..., -2.4635752 ,
       -2.4070427 , -2.43662651]), array([ 0.28754625,  0.23362962,  0.08347083, ..., -2.52658076,
       -2.58703227, -2.56597587]), array([ 0.4473345 ,  0.43181061,  0.12843418, ..., -2.48398929,
       -2.45004632, -2.42548234]), array([ 0.44225069,  0.39121929, -0.02195631, ..., -2.56523083,
       -2.62722776, -2.60273653]), array([ 0.42012632,  0.34648743,  0.13855812, ..., -2.37656089,
       -2.38777441, -2.47070014])]
5
1925


In [23]:
# Average the fold models' predictions element-wise (across folds).
a = np.asarray(MSE_DICT['result']).mean(axis=0)
print(a.shape[0])

1925


In [25]:
# Write the averaged predictions to disk, one value per row, without index or header.
result_pd = pd.DataFrame(data=a)
result_pd.to_csv(path_or_buf="result.txt", header=False, index=False)