# 1.数据导入与划分

In [1]:
from pyspark.sql import SparkSession # SparkSession 是Spark 2.0版本的新入口
spark = SparkSession.builder.master('local').getOrCreate()

In [3]:
raw_data = spark.read.csv(path="hdfs://localhost:9000/user/bdlab/lab2/SUSY.csv.gz",header=False,inferSchema=True)

In [29]:
raw_data.count()

5000000

In [4]:
train,test = raw_data.randomSplit([0.8,0.2])

In [45]:
raw_data.groupBy(raw_data[0]).count().collect()

[Row(_c0=0.0, count=2712173), Row(_c0=1.0, count=2287827)]

0/1 基本概率

In [3]:
prob_0 = 2712173/5000000
prob_1 = 1-prob_0

In [5]:
# 训练集0/1分布
train.groupBy(raw_data[0]).count().collect()

[Row(_c0=0.0, count=2170505), Row(_c0=1.0, count=1829609)]

In [6]:
# 测试集0/1分布
test.groupBy(raw_data[0]).count().collect()

[Row(_c0=0.0, count=541668), Row(_c0=1.0, count=458218)]

In [None]:
# 划分后写入hdfs
test.write.csv(path="hdfs://localhost:9000/user/bdlab/lab2/SUSY_test.csv.gzip",compression="gzip")
train.write.csv(path="hdfs://localhost:9000/user/bdlab/lab2/SUSY_train.csv.gzip",compression="gzip")

In [4]:
# 加载train
train = spark.read.csv(path="hdfs://localhost:9000/user/bdlab/lab2/SUSY_train.csv.gzip",header=False,inferSchema=True)

测试集处理

In [141]:
test_data = test.select(test.columns[1:])

In [142]:
test_label = test.select(test.columns[0])

# 2.Naive-Bayes

In [3]:
import pandas as pd
import numpy as np

In [4]:
cal_mean_func = { _:"mean" for _ in train.columns[1:] }
cal_var_func = { _:"variance" for _ in train.columns[1:]}

In [5]:
mean_train = train.groupBy(train[0]).agg(cal_mean_func).collect()
var_train = train.groupBy(train[0]).agg(cal_var_func).collect()

In [9]:
mean_0 = [ mean_train[0]["avg(_c"+str(i)+str(")")] for i in range(1,19)]
mean_1 = [ mean_train[1]["avg(_c"+str(i)+str(")")] for i in range(1,19)]
var_0 = [ var_train[0]["variance(_c"+str(i)+str(")")] for i in range(1,19)]
var_1 = [ var_train[0]["variance(_c"+str(i)+str(")")] for i in range(1,19)]

In [13]:
# 保存统计信息
np.savetxt("nb_sta",[mean_0,mean_1,var_0,var_1])

In [6]:
# 读取统计信息
mean_0,mean_1,var_0,var_1 = np.loadtxt("nb_sta")

In [29]:
const_pi = 1/ np.sqrt(2*np.pi)
def GaussianProb(x,miu,sigmaq):
    left = const_pi / np.sqrt(sigmaq)
    right = np.exp(- (x - miu)**2 / (2 * sigmaq))
    return left*right

def NB_classifier(x):
    """
    input:
        x: a vec with len of 18
    ouput:
        result: 0 or 1
    """
    if len(x) != 18:
        raise ValueError
        
    # 计算 0 概率
    result_0 = prob_0
    for i in range(18):
        result_0 = result_0 * GaussianProb(x[i],mean_0[i],var_0[i])
        
    # 计算 1 概率
    result_1 = prob_1
    for i in range(18):
        result_1 = result_1 * GaussianProb(x[i],mean_1[i],var_1[i])
    return 0 if result_0>result_1 else 1

def NB_predict(testset):
    return [ NB_classifier(_) for _ in testset ]

### 测试

In [54]:
from sklearn.metrics import accuracy_score,recall_score,classification_report

def metric(true,pre):
    print("Accuracy: {:.4f}, Recall: {:.4f} ".format(accuracy_score(true,pre),recall_score(true,pre)))
    print()
    print(classification_report(true,pre))

In [156]:
np.save("test_data_arr",test_data_arr)
np.save("test_label_arr",test_label_arr)

In [52]:
test_data_arr = np.load("test_data_arr.npy")
test_label_arr = np.load("test_label_arr.npy")

In [30]:
test_pre_nb = NB_predict(test_data_arr)

In [45]:
metric(test_label_arr,test_pre_nb)

Accuracy: 0.7360, Recall: 0.6047 

             precision    recall  f1-score   support

        0.0       0.72      0.85      0.78    541668
        1.0       0.77      0.60      0.68    458218

avg / total       0.74      0.74      0.73    999886



# 3.Logistic-Regression

In [23]:
sample_train = train.sample(0.01)
sample_train_list = sample_train.collect()

In [52]:
testrow = sample_train_list[1]

In [76]:
np.save("sample_train_list",sample_train_list)

In [5]:
sample_train_list = np.load("sample_train_list.npy")
sample_train = spark.sparkContext.parallelize(sample_train_list)

In [34]:
# n 特征数
n = 18
theta = np.zeros(n+1) # 在LR_fit 中更新

def sigmod(x):
    return 1/(1+np.exp( - np.matmul(x,theta)))

# 对数损失函数
def cost_mapper(x):
    h = sigmod(x[0])
    L = 0
    if x[1]==0:
        L = np.log(1-h)
    else:
        L = np.log(h)
    return -L 

# 计算梯度 (h(x)-y)*x
def grad_mapper(x):
    return (sigmod(x[0]) - x[1])*x[0]

def sum_reducer(x,y):
    # (grad,cost)
    return (x[0]+y[0],x[1]+y[1])

In [55]:
import time
def LR_fit(data,max_iter=30,alpha=0.1,tol=0.001,penalty=0.1):
    """
    input:
        data: rdd [x0,x1,...,xn], x0为label
        max_iter : 最大迭代次数
        alpha : 学习率
        tol: 收敛容忍度
        penalty: 惩罚系数
    output:
        theta: 
    """
    iter_cnt = 0
    m = data.count() # 样本数
    # x => ((1,x1,x2,...,xn),y)
    data = data.map(lambda x:(np.asarray((1,)+x[1:]),x[0]) )
    cost_pre = 0
    time_start = time.time()
    while iter_cnt<max_iter:
        grad,cost = data.map(lambda x:(grad_mapper(x),cost_mapper(x))).reduce(sum_reducer)
        # 计算代价
        cost = cost/m + penalty*np.sum(np.square(theta))/(2*m)
        # 更新 Θ
        theta[0] = theta[0] - alpha*grad[0] /m
        theta[1:] = theta[1:] - alpha*(grad[1:]+penalty*theta[1:]) /m
        
        cost_del = cost_pre - cost
        cost_del = -cost_del if cost_pre==0 else cost_del 
        time_end = time.time()
        print("iter {}, cost={:.8f}, △cost={:.8f}, △time={:.2f}".format(iter_cnt,cost,cost_del,time_end-time_start))
        
        if cost_del<tol:
            print("收敛,算法结束")
        cost_pre = cost
        time_start = time_end
        iter_cnt +=1
        
    print("迭代上限,算法结束")
    return theta

In [51]:
# 分类器
def LR_classifier(x):
    h = sigmod(np.insert(x,0,1))
    return 0 if h<0.5 else 1

def LR_predict(testset):
    return [ LR_classifier(_) for _ in testset ]

In [40]:
theta = LR_fit(data=sample_train.rdd)

iter 0, cost=0.6931471805593518, △cost=0.6931471805593518, △time=52.38
iter 1, cost=0.6882139385136494, △cost=0.004933242045702335, △time=53.38
iter 2, cost=0.6841936322932476, △cost=0.004020306220401837, △time=52.25
iter 3, cost=0.6806242263182065, △cost=0.0035694059750410734, △time=54.65
iter 4, cost=0.6773068522325246, △cost=0.00331737408568189, △time=54.45
iter 5, cost=0.6741542321075942, △cost=0.0031526201249304497, △time=52.48
iter 6, cost=0.6711263829311682, △cost=0.0030278491764259563, △time=53.21
iter 7, cost=0.6682034940627589, △cost=0.0029228888684093013, △time=52.35
iter 8, cost=0.6653744769248472, △cost=0.002829017137911727, △time=52.59
iter 9, cost=0.6626321154981288, △cost=0.002742361426718354, △time=52.37
iter 10, cost=0.6599710022961874, △cost=0.0026611132019414008, △time=51.16
iter 11, cost=0.6573866527245563, △cost=0.002584349571631117, △time=51.38
iter 12, cost=0.6548751190997462, △cost=0.0025115336248101583, △time=51.96
iter 13, cost=0.6524328169981621, △cost=0.002

KeyboardInterrupt: 

In [41]:
theta

array([-0.09697159,  0.08209373, -0.00150128, -0.002786  , -0.01411221,
        0.00256429,  0.00105199,  0.15344194, -0.00166682,  0.0619158 ,
        0.04945456,  0.01390256,  0.06590398, -0.06362982, -0.05282876,
        0.0079296 ,  0.01298479, -0.09001692,  0.01508648])

In [53]:
test_pre_lr = LR_predict(test_data_arr)

In [56]:
metric(test_label_arr,test_pre_lr)

Accuracy: 0.7379, Recall: 0.8077 

             precision    recall  f1-score   support

        0.0       0.81      0.68      0.74    541668
        1.0       0.68      0.81      0.74    458218

avg / total       0.75      0.74      0.74    999886



np.array 关于向量乘数值的运算速度测试

In [31]:
%%time
def grad_mapper(x):
    return (sigmod(x[0]) - x[1])*x[0]
grad,cost = sample_train.rdd.map(lambda x:(np.asarray((1,)+x[1:]),x[0]) ).map(lambda x:(grad_mapper(x),cost_mapper(x))).reduce(sum_reducer)

CPU times: user 15.5 ms, sys: 16.6 ms, total: 32.2 ms
Wall time: 52.9 s


In [32]:
%%time
def grad_mapper(x):
    h = (sigmod(x[0]) - x[1])
    return [ h*_ for _ in x[0]]
grad,cost = sample_train.rdd.map(lambda x:(np.asarray((1,)+x[1:]),x[0]) ).map(lambda x:(grad_mapper(x),cost_mapper(x))).reduce(sum_reducer)

CPU times: user 231 ms, sys: 78.7 ms, total: 309 ms
Wall time: 3min 3s


In [33]:
%%time
def grad_mapper(x):
    h = (sigmod(x[0]) - x[1])
    return [ h*_ for _ in x[0]]
grad,cost = sample_train.rdd.map(lambda x:((1,)+x[1:],x[0]) ).map(lambda x:(grad_mapper(x),cost_mapper(x))).reduce(sum_reducer)

CPU times: user 204 ms, sys: 41.4 ms, total: 246 ms
Wall time: 2min 50s


In [25]:
len(sample_train_list)

40230