In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from hyperopt import tpe,fmin,Trials,hp,rand,anneal,space_eval

import pandas as pd
import numpy as np
# 统计notavailable的情况
import matplotlib.pyplot as plt
plt.style.use("seaborn")
plt.rcParams['font.sans-serif']=['SimHei']   # 用黑体显示中文
plt.rcParams['axes.unicode_minus']=False  
import seaborn as sns
# 使用LightGBM进行模型预训练，并检测异常值
from lightgbm import LGBMRegressor

In [2]:
train=pd.read_csv("train_all_features_noPCA-2.csv",index_col=0)

test=pd.read_csv("test_all_features_noPCA-2.csv",index_col=0)
def onehot(name,train=train,test=test):
    from sklearn.preprocessing import OneHotEncoder
    clf=OneHotEncoder(sparse=False)
    clf.fit(train.loc[:,name].to_numpy().reshape(-1,1))
    res_train=clf.transform(train.loc[:,name].to_numpy().reshape(-1,1))   
    res_test=clf.transform(test.loc[:,name].to_numpy().reshape(-1,1))
    new_columns=[name+str(i) for i in range(0,res_train.shape[1])]
    train.loc[:,new_columns]=res_train
    test.loc[:,new_columns]=res_test
    train=train.drop(name,axis=1)
    test=test.drop(name,axis=1)
    return (train,test)
def drop_multiline(train,test):
    """删掉强相关变量"""
    train_corr=train.corr()
    test_corr=test.corr()
    be_droped=[];
    for i,index in enumerate(train_corr.index):
        for j,column in enumerate(train_corr.columns):
            if  column not in be_droped and i<j and train_corr.loc[index,column]>0.9 and test_corr.loc[index,column]>0.9:
                train=train.drop(column,axis=1)
                test=test.drop(column,axis=1)
                # print(column,"被删除")
                be_droped.append(column)
                # return drop_multiline(train,test)
    print(be_droped,"被删除")
    return (train,test)

train,test=onehot("brand")
train,test=onehot("product_category")
# test=test.dropna()
train,test=drop_multiline(train,test)
X_columns=np.array([i for i in train.columns[:] 
                    if i!="product_id" and i!="price" and i!="face_x" and i!="faces_y" and i!="face_size" ])
X_train=train.loc[:,X_columns].to_numpy()
X_test=test.loc[:,X_columns].to_numpy()
y_train=train.loc[:,"price"].to_numpy()
y_test=test.loc[:,"price"].to_numpy()
print(train.shape)
print(test.shape)

['face_x', 'faces_y', 'face_size', 'h3', 's3', 'v3', '333', 'product_category1', 'product_category2'] 被删除
(7835, 535)
(2000, 535)


In [3]:
class Machine(object):
    """定义一个Machine的类。
    本Machine类的思路如下：
    任何一个特殊的模型如SVC都可以用来建立Machine类
    通过调用self.headquater方法可以完成从hyperopt参数寻优一直到模型报告的过程
    如果想象征性的跑一跑代码看看能不能跑通，把HyperoptTrain(self,max_evals=50)中的50改为较小的数字即可（比如3）"""
    def __init__(self,clf,params:dict,
                 X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test):
        self.X_train=X_train
        self.y_train=y_train
        self.X_test=X_test
        self.y_test=y_test
        self.clf=clf
        self.params=params
    def Report(self):
        """报告分类性能的函数,同时画出分类结果ROC函数"""
        y_train_predict=self.clf.predict(self.X_train)
        y_predict=self.clf.predict(self.X_test)
        train_mae=mean_absolute_error(y_true=self.y_train,y_pred=y_train_predict)
        train_mse=mean_squared_error(y_true=self.y_train,y_pred=y_train_predict)
        train_r2=r2_score(y_true=self.y_train,y_pred=y_train_predict)
        test_mae=mean_absolute_error(y_true=self.y_test,y_pred=y_predict)
        test_mse=mean_squared_error(y_true=self.y_test,y_pred=y_predict)
        test_r2=r2_score(y_true=self.y_test,y_pred=y_predict)
        
        print("="*60)    
        print("train数据集上模型精度指标(MAE,MSE,R2):",[train_mae,train_mse,train_r2])
        print("test数据集上模型精度指标(MAE,MSE,R2):",[test_mae,test_mse,test_r2])
        print("="*60)    
    def objective(self,params):
        self.clf.set_params(**params)
        self.clf.fit(self.X_train,self.y_train)
        y_pred=self.clf.predict(self.X_test)
        res=r2_score(y_true=self.y_test,y_pred=y_pred)
        return -res
    def HyperoptTrain(self,max_evals=50):
        """使用tpe.suggest寻找最优参数"""
        trials=Trials()
        if self.model_name!="SVR" and self.model_name!="Isotonic":
            self.params["random_state"]=hp.choice("random_state",[0]) #加一个random_state操作
        best_params=fmin(fn=self.objective,space=self.params,
                         algo=tpe.suggest,max_evals=max_evals,trials=trials)
        best_params=space_eval(self.params, best_params)
        # np.save(file_path+self.model_name+"best_params",best_params)
        print("best params:\n",best_params)
        self.clf.set_params(**best_params)
        self.clf.fit(self.X_train,self.y_train)
        return self.clf
    def headquarter(self,model_name,max_evals=50):
        """中心调度器，完成从模型训练直到模型报告的所有工作"""
        self.model_name=model_name
        self.clf=self.HyperoptTrain(max_evals)
        print(model_name,"模型","训练完成，下面是模型报告：")
        self.Report()
        return self.clf


In [4]:
from lightgbm import LGBMRegressor
clf_LGBM=LGBMRegressor()
params_LGBM={
        "boosting_type":hp.choice("boosting_type",["gbdt"]),
        'max_depth': hp.choice('max_depth', [0,1,2,3]),
        'num_leaves':  hp.choice('num_leaves', np.arange(50,150,10, dtype=int)),
        'learning_rate': hp.choice('learning_rate',np.arange(0.01,0.5,0.01)),
        'n_estimators': hp.choice("n_estimators",np.arange(10,150,10,dtype=int)),
        "min_split_gain":hp.uniform("min_split_gain",0,0.08),
        "reg_alpha":hp.uniform("reg_alpha",0.01,1),
        "reg_lambda": hp.uniform("reg_lambda",0.01,1)
    }

LGBM_machine=Machine(clf_LGBM,params_LGBM,X_train=X_train,y_train=y_train,X_test=X_test,y_test=y_test)
clf_LGBM=LGBM_machine.headquarter(model_name="LGBM",max_evals=150)

100%|████████████████████████████████████████████| 150/150 [06:49<00:00,  2.73s/trial, best loss: -0.04538051103088703]
best params:
 {'boosting_type': 'gbdt', 'learning_rate': 0.21000000000000002, 'max_depth': 3, 'min_split_gain': 0.05420612259948168, 'n_estimators': 30, 'num_leaves': 60, 'random_state': 0, 'reg_alpha': 0.8659578194486246, 'reg_lambda': 0.04989104834225522}
LGBM 模型 训练完成，下面是模型报告：
train数据集上模型精度指标(MAE,MSE,R2): [1280.3183371330729, 20227337.771141175, 0.4852873616890675]
test数据集上模型精度指标(MAE,MSE,R2): [1795.8083622930308, 217559320.245632, 0.04538051103088703]


In [5]:
feature_importance=clf_LGBM.feature_importances_
sort_index=np.argsort(feature_importance)
print("重要性:",feature_importance[sort_index[::-1]])
print("特征名称：",X_columns[sort_index[::-1]])


重要性: [13  6  6  5  5  5  4  4  4  4  4  4  4  3  3  3  3  3  3  3  3  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 

In [6]:
from sklearn.neighbors import LocalOutlierFactor
clf=LocalOutlierFactor(n_neighbors=20,novelty=True)
clf.fit(feature_importance*X_train)
outliers=clf.predict(feature_importance*X_train)
outliers_test=clf.predict(feature_importance*X_test)
print("离群点个数：",len(outliers[outliers==-1]),len(outliers_test[outliers_test==-1]))
print(np.mean(y_train[outliers==-1]),np.mean(y_train[outliers!=-1]))
y_train_no_outlier=y_train[outliers!=-1]
X_train_no_outlier=X_train[outliers!=-1]
X_test_no_outlier=X_test[outliers_test!=-1]
y_test_no_outlier=y_test[outliers_test!=-1]
X_train_no_outlier=X_train_no_outlier[y_train_no_outlier<np.mean(y_train[outliers==-1]),:]
y_train_no_outlier=y_train_no_outlier[y_train_no_outlier<np.mean(y_train[outliers==-1])]
X_test_no_outlier=X_test_no_outlier[y_test_no_outlier<np.mean(y_train[outliers==-1]),:]
y_test_no_outlier=y_test_no_outlier[y_test_no_outlier<np.mean(y_train[outliers==-1])]
X_train_no_outlier=np.append(X_train_no_outlier,X_train[outliers==-1],axis=0)
y_train_no_outlier=np.append(y_train_no_outlier,y_train[outliers==-1])
X_test_no_outlier=np.append(X_test_no_outlier,X_test[outliers_test==-1],axis=0)
y_test_no_outlier=np.append(y_test_no_outlier,y_test[outliers_test==-1])
print(X_train_no_outlier.shape,y_train_no_outlier.shape,X_test_no_outlier.shape,y_test_no_outlier.shape)

离群点个数： 477 389
3397.724838944403 1725.027650835941
(6844, 533) (6844,) (1795, 533) (1795,)


In [7]:
data_last=pd.read_csv("train_all_features_noPCA-2.csv")
test_last=pd.read_csv("test_all_features_noPCA-2.csv")
print(data_last.shape,test_last.shape)
be_droped=['face_x', 'faces_y', 'face_size', 'h3', 's3', 'v3', '67', '88', '133','158', '218', '278',
           '366', '400', '416', '452', '484', '314', '478', '433', '475']
for i in be_droped:
    data_last=data_last.drop(i,axis=1)
    test_last=test_last.drop(i,axis=1)
data_last_no_outlier=data_last.loc[outliers!=-1,:].copy()
data_last_no_outlier=data_last_no_outlier.loc[data_last_no_outlier.loc[:,"price"]<np.mean(data_last.loc[outliers==-1,"price"]),:]
data_last_no_outlier=data_last_no_outlier.append(data_last.loc[outliers==-1,:])
test_last_no_outlier=test_last.loc[outliers_test!=-1,:].copy()
test_last_no_outlier=test_last_no_outlier.loc[test_last_no_outlier.loc[:,"price"]<np.mean(data_last.loc[outliers==-1,"price"]),:]
test_last_no_outlier=test_last_no_outlier.append(test_last.loc[outliers_test==-1,:])
print(data_last_no_outlier.shape,test_last_no_outlier.shape)
data_last_no_outlier.to_csv("train_all_features_noPCA-3.csv",index=False)
test_last_no_outlier.to_csv("test_all_features_noPCA-3.csv",index=False)

(7835, 537) (2000, 537)
(6844, 516) (1795, 516)
