In [1]:
# !pip3 install matplotlib

In [2]:
# !pip3 install seaborn

In [3]:
# !pip3 install scikit-learn

In [4]:
from pyspark.sql import SparkSession, Row, DataFrame, Column

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [6]:
# Display the NumPy and pandas versions in use, as a (numpy, pandas) tuple.
np.__version__, pd.__version__

('1.18.1', '1.0.1')

In [7]:
# Create (or reuse) the SparkSession for this notebook: YARN master,
# 3 executors with 2 cores and 8g of memory each, application name "ss_001".
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("ss_001")
    .config('spark.executor.cores', '2')
    .config('spark.executor.instances', '3')
    .config("spark.executor.memory", '8g')
    .getOrCreate()
)
# Optional package coordinate, currently disabled:
# .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1")

In [8]:
# Display the active SparkSession object.
spark

In [None]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [31]:
# Intentionally disabled: stopping the session at this point breaks every later
# cell on a top-to-bottom run (Restart & Run All), because the data-loading and
# training cells below still need `spark`. The session is stopped by the
# `spark.stop()` cell that follows the training/output section instead.
# spark.stop()

---

### Making Parquet files

Run this section only once per dataset.

In [None]:
# df = spark.read.csv('/user/ss/datasets/ss_fteng_G3V3_ohe_recip_20200218a.csv', \
#                     sep = ',', header = True, inferSchema = True)

In [None]:
# df.printSchema()

In [None]:
# df.columns

In [None]:
# df

In [None]:
# # DataFrames can be saved as Parquet files, maintaining the schema information.
# df.write.parquet('/user/ss/datasets/ss_fteng_G3V3_ohe_recip_20200218a.parquet')

---

### Read parquet files

In [9]:
# Load the full dataset from its Parquet copy.
parquet_path = '/user/ss/datasets/ss_fteng_G3V3_ohe_recip_20200218a.parquet'
df = spark.read.parquet(parquet_path)

In [None]:
# final_df = final_df.repartition(4)

In [None]:
# df.printSchema()

In [None]:
# df

---

In [10]:
# Rows that carry a TARGET value form the training set;
# rows whose TARGET is null are the ones to be scored.
train_df = df.where(df['TARGET'].isNotNull())
test_df = df.where(df['TARGET'].isNull())

In [19]:
# assert df.count() == (train_df.count() + test_df.count())

In [20]:
# # Spark ML requires numeric DataFrame columns: cast the original string columns to double and replace nulls
# for col, t in app_train.dtypes:
#     if t == "string":
#         app_train = app_train.withColumn(col, app_train[col].cast("double"))

# app_train = app_train.withColumn("TARGET", app_train["TARGET"].cast("int"))
# app_train = app_train.fillna(999999)

In [24]:
# train_df.columns

In [25]:
# Unlike single-machine training, Spark ML needs all feature columns to be packed
# into one vector column by VectorAssembler before a model can be trained on them.
import pyspark.ml.feature as ft

# Columns that must not be fed to the model:
#   TARGET     - the label itself
#   SK_ID_CURR - the row identifier (only used to key the prediction output)
NON_FEATURE_COLS = {"TARGET", "SK_ID_CURR"}

featuresCreator = ft.VectorAssembler(
    inputCols=[name for name in train_df.columns if name not in NON_FEATURE_COLS],
    outputCol='features'
    )

---

##### info function

In [None]:
# # function
# def info(slef):
#     for i in spark.sparkContext._conf.getAll():
#         if i[0] in ['spark.executor.instances','spark.executor.cores','spark.executor.memory']:
#             print(f'{i[0]} : {i[1]}')
#     print(f'Partitions : {app_train.rdd.getNumPartitions()}')

---

In [28]:
# Instantiate a LightGBM regressor. Its parameters are similar, but not identical,
# to the single-machine LightGBM API; the documentation is at:
# https://mmlspark.azureedge.net/docs/pyspark/LightGBMRegressor.html
from mmlspark.lightgbm import LightGBMRegressor

# Hyper-parameters gathered in one place so they are easy to scan and tweak.
lgbm_params = dict(
    labelCol="TARGET",
    objective='binary',
    boostingType="goss",
    numIterations=100,
    learningRate=0.1,
    numLeaves=31,
    maxDepth=12,
    lambdaL1=0.8,
    lambdaL2=0.8,
    minSumHessianInLeaf=0.03,
    # Currently not set (library defaults apply):
    #     baggingSeed=50,
    #     baggingFraction=0.87,
    #     featureFraction=0.66,
)

lgbm = LightGBMRegressor(**lgbm_params)

In [29]:
# Build a pipeline so that training is a single fit() call.
from pyspark.ml import Pipeline

# Stage order matters: assemble the feature vector first, then fit the model on it.
pipeline_stages = [featuresCreator, lgbm]
pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# # Split the training data into training and validation sets to test the model's predictive performance (OK)
# import pyspark.ml.evaluation as ev
# from pyspark.ml.evaluation import RegressionEvaluator

# tr, tv = train_df.randomSplit([0.7, 0.3], seed = 924)

# vmodel = pipeline.fit(tr)
# t_prediction = vmodel.transform(tv)
# evaluator = ev.BinaryClassificationEvaluator(
#      rawPredictionCol='prediction',
#      labelCol='TARGET')
# print(evaluator.evaluate(t_prediction, {evaluator.metricName: 'areaUnderROC'}))



In [None]:
# Cross-validation
import pyspark.ml.evaluation as ev
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

paramGrid = ParamGridBuilder()\
    .addGrid(lgbm.learningRate, [0.1])\
    .build()

# AUC is a binary-classification metric, so it needs BinaryClassificationEvaluator.
# RegressionEvaluator has no `rawPredictionCol` parameter and no 'areaUnderROC' metric.
evaluator = ev.BinaryClassificationEvaluator(
     labelCol='TARGET',
     rawPredictionCol='prediction',
     metricName='areaUnderROC')

cv = CrossValidator(
    estimator = pipeline,
    estimatorParamMaps = paramGrid,
    evaluator = evaluator,
    numFolds = 2)

# cv_pipeline = Pipeline(stages=[featuresCreator,cv])

cv_model = cv.fit(train_df)

# Report the cross-validated AUC measured on the labeled folds of train_df.
# avgMetrics holds one fold-averaged value per entry of paramGrid.
# test_df cannot be used for this: its TARGET is null by construction.
print(f'CV AUC {max(cv_model.avgMetrics)}')

# Score the unlabeled rows with the best model found by cross-validation.
cv_prediction = cv_model.transform(test_df)

# Preview a few predictions. Select the model output rather than the (null)
# TARGET column, and avoid pulling the whole result set onto the driver.
selected = cv_prediction.select("SK_ID_CURR", "prediction")
for row in selected.take(20):
    print(row)




In [None]:
# Preview a few scored rows. test_df rows have a null TARGET, so show the model's
# `prediction` column instead, and fetch only a small sample to the driver.
selected = cv_prediction.select("SK_ID_CURR", "prediction")
for row in selected.take(20):
    print(row)

In [None]:
# # Find the best model
# from pyspark.ml.tuning import ParamGridBuilder
# from pyspark.ml.tuning import TrainValidationSplit

# paramGrid = ParamGridBuilder()\
#     .addGrid(lgbm.numLeaves, [10,20,30])\
#     .addGrid(lgbm.numIterations, [100,160,200])\
#     .addGrid(lgbm.baggingSeed, [25,50,75])\
#     .build()
# tvs = TrainValidationSplit( estimator=lgbm,
#     estimatorParamMaps=paramGrid,
#     evaluator=evaluator,
#     trainRatio = 0.8)
# # Best model
# tvs_pipeline = Pipeline(stages=[featuresCreator,tvs])

# tvs_pipelineModel = tvs_pipeline.fit(train_df)

# prediction = tvs_pipelineModel.transform(test_df)
# print(f'Full AUC {evaluator.evaluate(prediction)}')


In [None]:
# Fit the whole pipeline (feature assembly + LightGBM) on all labeled rows.
model = pipeline.fit(train_df)

In [None]:
# Score the unlabeled rows with the fitted pipeline.
prediction = model.transform(test_df)

In [None]:
# Test-set output: keep the row id and expose the model's prediction under the
# TARGET column name (the result is meant to be downloaded from Hadoop afterwards).
res = (
    prediction
    .select("SK_ID_CURR", "prediction")
    .withColumnRenamed("prediction", "TARGET")
)

In [None]:
# Reduce the result to a single partition, then write it as CSV with a header row.
# NOTE(review): no save mode is given, so a re-run presumably fails if the path already exists — confirm.
res.coalesce(1).write.csv("./cluster_lgbm.csv", header='true')

In [None]:
%%time
# Time a full row count of the dataset.
df.count()

In [None]:
# Peek at the first row. head must be called; the bare attribute is only the method object.
df.head()

In [None]:
# Stop the Spark session.
# NOTE(review): the reference cells further down use `spark` again and would need a new session.
spark.stop()

---

### Reference

https://zhuanlan.zhihu.com/p/67828512

In [None]:
from mmlspark import LightGBMRegressor
import pyspark.ml.feature as ft
from pyspark.ml import Pipeline
import pyspark.ml.evaluation as ev
import pyspark.sql.types as typ

# If you run this through spark-submit you must instantiate a SparkSession first;
# in the pyspark shell a session object is already created by default.
# from pyspark import SparkConf, SparkContext
# from pyspark.sql import SparkSession
# conf = SparkConf().setMaster("spark://master:7077").setAppName("MMLSPARK")
# sc = SparkContext(conf = conf)
# spark = SparkSession \
#         .builder \
#         .appName("MMLSPARK") \
#         .enableHiveSupport() \
#         .getOrCreate()


# Read the CSV data; the file was uploaded to Hadoop beforehand with `hadoop fs -put`.
app_train = spark.read.csv("/homecredit/train_all3.csv", header='true', inferSchema='true')

# Preprocessing: cast string columns that should be numeric to double, then replace nulls.
for col, t in app_train.dtypes:
    if t == "string":
        app_train = app_train.withColumn(col, app_train[col].cast("double"))

app_train = app_train.withColumn("TARGET", app_train["TARGET"].cast("int"))
# 999999 is used as the fill value for every remaining null.
app_train = app_train.fillna(999999)

# Unlike single-machine training, Spark needs all feature columns to be converted into a
# feature vector by VectorAssembler before they can be used for training.
# The first column and TARGET are left out of the features.
featuresCreator = ft.VectorAssembler(
    inputCols=[col for col in app_train.columns[1:] if col != "TARGET"],
    outputCol='features'
    )


# Instantiate a LightGBM regressor; its parameters are similar, but not identical, to the
# single-machine version. The documentation is at:
# https://mmlspark.azureedge.net/docs/pyspark/LightGBMRegressor.html
lgbm = LightGBMRegressor(numIterations=120, objective='binary',
        learningRate=0.007, baggingSeed=50,
        boostingType="goss", lambdaL1=0.4, lambdaL2=0.4,
        baggingFraction=0.87, minSumHessianInLeaf=0.003,
        maxDepth=9, featureFraction=0.66, numLeaves=47,
        labelCol="TARGET"
                          )

# Build a pipeline to simplify the training steps.
pipeline = Pipeline(stages=[
                # feature assembly
                featuresCreator,
                # model
                    lgbm])

# Split the data into a training set and a validation set to check the model's predictive performance.
tr, te = app_train.randomSplit([0.7, 0.3], seed=666)

vmodel = pipeline.fit(tr)
t_model = vmodel.transform(te)
# Area under ROC on the held-out 30%, using the `prediction` column as the raw score.
evaluator = ev.BinaryClassificationEvaluator(
     rawPredictionCol='prediction',
     labelCol='TARGET')
print(evaluator.evaluate(t_model,
 {evaluator.metricName: 'areaUnderROC'}))

# Actual training run, on the full training data.
model = pipeline.fit(app_train)

# Preprocess the test set the same way (string -> double, fill nulls) and score it with the fitted model.
app_test = spark.read.csv("/homecredit/test_all3.csv", header='true', inferSchema='true')
for col, t in app_test.dtypes:
    if t == "string":
        app_test = app_test.withColumn(col, app_test[col].cast("double"))
app_test = app_test.fillna(999999)
prediction = model.transform(app_test)

# Test-set result output; the prediction data is then to be downloaded from Hadoop to the local machine.
res = prediction.select("SK_ID_CURR", "prediction")
res = res.withColumn("TARGET", res["prediction"])
res = res.select("SK_ID_CURR", "TARGET")
res.coalesce(1).write.csv("/homecredit/cluster_lgbm.csv", header='true')

---

In [None]:
%%time

# 交叉驗證

paramGrid = ParamGridBuilder()\
    .addGrid(lgbm.learningRate, [0.095,0.1,0.105])\
    .build()

cv = CrossValidator(
    estimator=lgbm,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds = 5)

cv_pipeline = Pipeline(stages=[featuresCreator,cv])

cv_model = cv_pipeline.fit(train_df)
cv_prediction = cv_model.transform(test_df)

print(f'Full AUC {evaluator.evaluate(cv_prediction)}')

In [None]:
%%time

# 找出最佳模型

from pyspark.ml.tuning import TrainValidationSplit
paramGrid = ParamGridBuilder()\
    .addGrid(lgbm.numLeaves, [10,20,30])\
    .addGrid(lgbm.numIterations, [100,160,200])\
    .addGrid(lgbm.baggingSeed, [25,50,75])\
    .build()
tvs = TrainValidationSplit( estimator=lgbm,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    trainRatio = 0.8)
# 最佳模型
tvs_pipeline = Pipeline(stages=[featuresCreator,tvs])

tvs_pipelineModel = tvs_pipeline.fit(train_df)

prediction = tvs_pipelineModel.transform(test_df)
print(f'Full AUC {evaluator.evaluate(prediction)}')


In [None]:
%%time

# 找出最佳模型+交叉驗證

from pyspark.ml.tuning import TrainValidationSplit
paramGrid = ParamGridBuilder()\
    .addGrid(lgbm.numLeaves, [10,20,30])\
    .addGrid(lgbm.numIterations, [100,200,300])\
    .addGrid(lgbm.baggingSeed, [25,50,75])\
    .build()

cv = CrossValidator(
    estimator=lgbm,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds = 10)

tvs_pipeline = Pipeline(stages=[featuresCreator,cv])

tvs_pipelineModel = tvs_pipeline.fit(train_df)

prediction = tvs_pipelineModel.transform(test_df)
print(f'Full AUC {evaluator.evaluate(prediction)}')


---

In [None]:
# https://spark.apache.org/docs/latest/ml-tuning.html
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Labeled training documents as (id, text, label) rows.
training_rows = [
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0),
    (8, "e spark program", 1.0),
    (9, "a e c l", 0.0),
    (10, "spark compile", 1.0),
    (11, "hadoop software", 0.0),
]
training = spark.createDataFrame(training_rows, ["id", "text", "label"])

# Three-stage ML pipeline: tokenizer -> hashingTF -> lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# The whole Pipeline is the Estimator handed to CrossValidator, so parameters of
# every stage can be tuned jointly. The grid below has 3 values for
# hashingTF.numFeatures and 2 for lr.regParam: 3 x 2 = 6 settings to choose from.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(hashingTF.numFeatures, [10, 100, 1000])
grid_builder = grid_builder.addGrid(lr.regParam, [0.1, 0.01])
paramGrid = grid_builder.build()

# A CrossValidator needs an Estimator, a set of Estimator ParamMaps, and an Evaluator.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=2,  # use 3+ folds in practice
)

# Run cross-validation and keep the best parameter setting.
cvModel = crossval.fit(training)

# Unlabeled test documents as (id, text) rows.
test_rows = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop"),
]
test = spark.createDataFrame(test_rows, ["id", "text"])

# Predict on the test documents; cvModel applies the best model it found (lrModel).
prediction = cvModel.transform(test)
selected = prediction.select("id", "text", "probability", "prediction")
for scored_row in selected.collect():
    print(scored_row)