In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
import datetime
from pyspark.ml import Pipeline

spark = SparkSession.builder.master('local[1]').appName('learn_ml').getOrCreate()



In [2]:
import matplotlib.pyplot as plt
import numpy as np
# 载入数据
df0 = spark.read.csv('BIA 678 dataset.csv', header=True, inferSchema=True, encoding='utf-8')
# 查看是否有缺失值
# df0.toPandas().isna().sum()
# df0.toPandas().isna().values.any()
# False 没有缺失值

starttime = datetime.datetime.now()

# 先使用StringIndexer将字符转化为数值，然后将特征整合到一起
from pyspark.ml.feature import StringIndexer, VectorAssembler
old_columns_names = df0.columns
new_columns_names = [name+'-new' for name in old_columns_names]
for i in range(len(old_columns_names)):
    indexer = StringIndexer(inputCol=old_columns_names[i], outputCol=new_columns_names[i])
    df0 = indexer.fit(df0).transform(df0)
vecAss = VectorAssembler(inputCols=new_columns_names[3:11], outputCol='features')
df0 = vecAss.transform(df0)
# 更换label列名
df0 = df0.withColumnRenamed(new_columns_names[-1], 'label')

# 创建新的只有label和features的表
dfi = df0.select(['label', 'features'])

# 数据概观
# dfi.show(5, truncate=0)

train_data, test_data = dfi.randomSplit([0.9, 0.1], seed=2019)


from pyspark.ml.classification import LogisticRegression

from pyspark.ml.classification import NaiveBayes
lr = LogisticRegression()
lrModel = lr.fit(train_data)
result = lrModel.transform(test_data)

#accuracy
a = result.filter(result.label == result.prediction).count()/result.count()
print('\nLogistic Regression: ', a)

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=5, maxBins=35)
dtModel = dt.fit(train_data)
result = dtModel.transform(test_data)

# accuracy
b = result.filter(result.label == result.prediction).count()/result.count()
print('\nDecisionTree: ', b)

from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=10, maxDepth=5, maxBins=35)
rfModel = rf.fit(train_data)
result = rfModel.transform(test_data)

# accuracy
c = result.filter(result.label == result.prediction).count()/result.count()
print('\nRandomForest: ', c)

from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(maxDepth=5, maxBins=35)
gbtModel = gbt.fit(train_data)
result = gbtModel.transform(test_data)

# accuracy
d = result.filter(result.label == result.prediction).count()/result.count()
print('\nGBT: ', d)

endtime = datetime.datetime.now()
print('\n', endtime - starttime)



Logistic Regression:  0.6104355056646918

DecisionTree:  0.6567404635692755

RandomForest:  0.6458788067768423

GBT:  0.6746180230745245

 0:01:14.525367
