## 机器学习部分
* 用一个GBTmodel
* 用一个一般线性回归
* 用一个RandomForestRegressor

In [1]:
import numpy as np
import pandas as pd
import time
import cv2 
import torch
from torchvision import datasets,models,transforms,utils
from torch.utils.data import DataLoader,Dataset
from torchsummary import summary
import torch.nn as nn
import matplotlib.pyplot as plt
from PIL import Image
import os
import PIL
PIL.Image.MAX_IMAGE_PIXELS = 933120000
from pyspark.ml.linalg import DenseVector
import pyspark.ml.evaluation as ev
import pyspark.ml.regression as reg
from pyspark.ml import Pipeline
import pyspark.ml.tuning as tune
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType,FloatType,IntegerType,StructType
spark=SparkSession.builder.master("local").appName("大数据大作业-机器学习").getOrCreate()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from pyspark.ml import feature as ft
# Enable Arrow support.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "64")
# 搭一个spark的环境
sc=spark.sparkContext
sc

In [2]:
def Read_features(filename):
    all_features=pd.read_csv(filename,index_col=0)
    df_all_features=spark.createDataFrame(all_features.values.tolist(),schema=all_features.columns.tolist())
    for column in df_all_features.columns[3:]:
        df_all_features=df_all_features.withColumn(column,df_all_features[column].cast(FloatType()))
    for column in df_all_features.columns[:3]:
        df_all_features=df_all_features.withColumn(column,df_all_features[column].cast(IntegerType()))
    print("读取",filename,"完成")
    # df_all_features.printSchema()
    return df_all_features
df_data_all_features=Read_features("train_all_features_noPCA-3.csv")
df_test_all_features=Read_features("test_all_features_noPCA-3.csv")

读取 train_all_features_noPCA-3.csv 完成
读取 test_all_features_noPCA-3.csv 完成


### Regression

In [3]:
class Machine(object):
    """通过Machine类完成模型调参的工作，不同的回归器建立一个不同的Machine类"""
    def __init__(self,clf,clf_params,df_data_all_features=df_data_all_features,df_test_all_features=df_test_all_features):
        self.data=df_data_all_features
        self.test=df_test_all_features
        features = [column for column in df_data_all_features.columns if column!="product_id" and column!="price" 
                    and column!="product_category"  and column!="brand"
                   ]
        features.append("categorys")
        features.append("brand_category")
        cate_creator=ft.OneHotEncoderEstimator(inputCols=["product_category"],outputCols=["categorys"])
        brands_creator=ft.OneHotEncoderEstimator(inputCols=["brand"],outputCols=["brand_category"])
        featuresCreator = ft.VectorAssembler(
            inputCols=[col for col in features[1:]], 
            outputCol='features'
            )
        # 设定评价指标列表，默认第一个指标是用来参数寻优的
        self.evaluators=[ev.RegressionEvaluator(predictionCol="prediction", labelCol='price',metricName="rmse"),
                         ev.RegressionEvaluator(predictionCol="prediction", labelCol='price',metricName="r2"),
                        ev.RegressionEvaluator(predictionCol="prediction", labelCol='price',metricName="mae")]
        self.evaluator=self.evaluators[0]
        # 建立pipeline并设定参数
        self.pipeline=Pipeline(stages=[
            cate_creator,
            brands_creator,
            featuresCreator,
            # ft.VarianceThresholdSelector(varianceThreshold=0.0001, outputCol="features"),
            clf])
        self.grid=tune.ParamGridBuilder()
        for key,value in zip(clf_params.keys(),clf_params.values()):
            self.grid=self.grid.addGrid(eval("self.pipeline.getStages()[3]."+key),value)
        self.grid=self.grid.build()
        # 建立cv
        self.cv = CrossValidator(estimator=self.pipeline, estimatorParamMaps=self.grid, evaluator=self.evaluator,
                            parallelism=5,seed=0)
    def GridSearch(self):
        """参数寻优"""
        self.cvModel = self.cv.fit(self.data)
        self.bestModel=self.cvModel.bestModel
    def Report(self):
        """返回模型报告"""
        print("="*70)
        results = [([{key.name: paramValue} for key, paramValue in zip(params.keys(),params.values())], metric)
                   for params, metric in zip(self.cvModel.getEstimatorParamMaps(),self.cvModel.avgMetrics)]
        print("最优模型参数：",sorted(results,key=lambda el: el[1],reverse=True)[0][0])
        print("在训练集上，rmse,r2,mae分别是：",[evaluator.evaluate(self.bestModel.transform(self.data)) for evaluator in self.evaluators])
        print("在测试集上，rmse,r2,mae分别是：",[evaluator.evaluate(self.bestModel.transform(self.test)) for evaluator in self.evaluators])     
        print("="*70)
    def Header(self,model_name):
        """中心调度器"""
        print("模型",model_name,"正在训练...")
        self.GridSearch()
        print("训练完成，下面是报告")
        self.Report()
        return self.bestModel

In [4]:
clf_GBT = reg.GBTRegressor(labelCol='price',seed=0)
params_GBT={"maxIter":[10,15,30],"maxDepth":[0,1,2],"minInfoGain":[0.0005,0.001,0.01]}
GBT=Machine(clf=clf_GBT,clf_params=params_GBT)
bestModel_GBT=GBT.Header(model_name="GBT")

模型 GBT 正在训练...
训练完成，下面是报告
最优模型参数： [{'maxIter': 30}, {'maxDepth': 2}, {'minInfoGain': 0.0005}]
在训练集上，rmse,r2,mae分别是： [4187.700616831333, 0.06809774725140827, 626.6017157449023]
在测试集上，rmse,r2,mae分别是： [2838.347473938381, 0.13038898609913763, 772.7639506435714]


In [5]:
clf_RandomForest=reg.RandomForestRegressor(labelCol='price',seed=0)
params_RandomForest={"maxDepth":[0,2,4],"numTrees":[10,15,30,60]}
RandomForest=Machine(clf=clf_RandomForest,clf_params=params_RandomForest)
bestModel_RandomForest=RandomForest.Header("RandomForest")

模型 RandomForest 正在训练...
训练完成，下面是报告
最优模型参数： [{'maxDepth': 4}, {'numTrees': 10}]
在训练集上，rmse,r2,mae分别是： [3878.181302105531, 0.20076346919495835, 588.1694615335601]
在测试集上，rmse,r2,mae分别是： [2904.3169980010457, 0.08949582321458305, 746.75009315821]


In [6]:
clf_GeneralizedLinear=reg.GeneralizedLinearRegression(labelCol='price')
params_GeneralizedLinear={"family":["gaussian","poisson","gamma","tweedie"],
                          "regParam":[0.001,0.01,0.05]}
GeneralizedLinear=Machine(clf=clf_GeneralizedLinear,clf_params=params_GeneralizedLinear)
bestModel_GeneralizedLinear=GeneralizedLinear.Header("GeneralizedLinear")

模型 GeneralizedLinear 正在训练...
训练完成，下面是报告
最优模型参数： [{'family': 'gamma'}, {'regParam': 0.001}]
在训练集上，rmse,r2,mae分别是： [4234.0269587588655, 0.04736540799572131, 631.6811833377358]
在测试集上，rmse,r2,mae分别是： [2942.413681499895, 0.06545251807499497, 782.9988383034854]


* 还是刚刚那个模型，展示特征重要性

In [15]:
bestModel_GBT.stages[3].featureImportances

SparseVector(519, {0: 0.0667, 1: 0.0667, 2: 0.0333, 11: 0.0667, 40: 0.0333, 103: 0.0667, 117: 0.0333, 153: 0.0333, 203: 0.0333, 269: 0.0333, 287: 0.0333, 296: 0.0333, 305: 0.0333, 343: 0.0333, 346: 0.0333, 380: 0.0333, 396: 0.0667, 408: 0.0333, 455: 0.1, 459: 0.0333, 472: 0.0333, 508: 0.0333, 514: 0.0333})

In [16]:
bestModel_RandomForest.stages[3].featureImportances

SparseVector(519, {0: 0.1517, 2: 0.0268, 31: 0.0294, 32: 0.006, 61: 0.0005, 82: 0.002, 88: 0.0029, 96: 0.0733, 98: 0.0105, 116: 0.0127, 141: 0.0037, 160: 0.0088, 176: 0.0243, 205: 0.0032, 218: 0.0031, 223: 0.0031, 250: 0.0313, 265: 0.0013, 278: 0.0176, 323: 0.0993, 368: 0.0178, 396: 0.0062, 419: 0.0065, 452: 0.0253, 455: 0.0019, 477: 0.0577, 479: 0.1152, 490: 0.0051, 500: 0.0146, 503: 0.0064, 512: 0.1059, 514: 0.0203, 515: 0.1055})