# StumbleUpon

데이터 : https://www.kaggle.com/c/stumbleupon/data
 - 데이터는 다운받아서 사용

### 데이터 변수 특징

![](dataSchema.png)

In [1]:
// Project root; the training TSV is read from data/stumbleupon/ below it.
val PATH = "/home/paulkim/workspace/Spark/semi_project"
val rawData = sc.textFile(s"$PATH/data/stumbleupon/train.tsv")
// Split every line on tabs and strip surrounding whitespace from each field.
val records_full = rawData.map { line =>
    line.split("\t").map(_.trim)
}
// The first row is taken as the header; rows whose first field matches the header's are dropped.
val header = records_full.first
val records = records_full.filter(fields => fields(0) != header(0))

In [2]:
// Peek at the first two parsed rows (arrays of trimmed string fields).
records.take(2)

Array(Array("http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html", "4042", "{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machin...

## 1. Pre-processing
1. 텍스트 데이터 활용 X
2. 데이터 타입을 모두 Double형으로 변경함

In [3]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors

// Build LabeledPoints from the numeric columns only: the last field is the label,
// fields 4 .. (last - 1) are the features, and a missing value ("?") becomes 0.0.
val data = records.map { row =>
    val fields = row.map(_.replaceAll("\"", ""))
    val labelIdx = fields.length - 1
    val features = fields.slice(4, labelIdx).map {
        case "?" => 0.0
        case value => value.toDouble
    }
    LabeledPoint(fields(labelIdx).toInt, Vectors.dense(features))
}
data.take(1)

Array((0.0,[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]))

In [4]:
// Mark the RDD for caching; it is reused by every training and evaluation cell below.
data.cache()

MapPartitionsRDD[4] at map at <console>:32

## 2. EDA 따위는 없다

In [5]:
// Total number of labeled points; later cells divide by it to obtain accuracy.
val numData = data.count
println(s"number of distances : $numData")

number of distances : 7395


In [6]:
// Same parsing as `data`, but negative feature values are replaced with 0.0
// because the data is intended for the Naive Bayes model.
val nbdata = records.map { row =>
    val fields = row.map(_.replaceAll("\"", ""))
    val labelIdx = fields.length - 1
    val features = fields.slice(4, labelIdx).map {
        case "?" => 0.0
        case raw =>
            val value = raw.toDouble
            if (value < 0) 0.0 else value
    }
    LabeledPoint(fields(labelIdx).toInt, Vectors.dense(features))
}

In [7]:
// Inspect the first LabeledPoint of the Naive Bayes data set.
nbdata.take(1)

Array((0.0,[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]))

## 3. Training Model & Evaluation(accuracy)
그냥 깡으로 때려넣음

- 로지스틱회귀
- SVM
- 나이브베이즈
- 의사결정나무

## 3.1 데이터 전처리 없는 학습모형

In [8]:
import org.apache.spark.mllib.classification.LogisticRegressionWithSGD
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.configuration.Algo
import org.apache.spark.mllib.tree.impurity.Entropy

In [9]:
// Shared hyper-parameters: SGD iteration count and decision-tree depth.
val numIters = 10
val maxTreeDepth = 5
// Train the four baseline classifiers on the unscaled features.
val LR_Model = LogisticRegressionWithSGD.train(data, numIters)
val SVM_Model = SVMWithSGD.train(data, numIters)
// Naive Bayes is trained on nbdata, where negative feature values were replaced with 0.0.
val NB_Model = NaiveBayes.train(nbdata)
val DT_Model = DecisionTree.train(data, Algo.Classification, Entropy, maxTreeDepth)

### 3.1.1 LogisticRegressionWithSGD

In [10]:
// Sanity check: compare the model's prediction with the true label of the first record.
val dataPoint = data.first
val trueLabel = dataPoint.label

val prediction = LR_Model.predict(dataPoint.features)
println(f"real label : $trueLabel%f")
println(f"predicted label : $prediction%f")

real label : 0.000000
predicted label : 1.000000


In [11]:
// Batch prediction: run the logistic regression model over every feature vector in `data`.
val predictions = LR_Model.predict(data.map(_.features))

In [12]:
// Accuracy on the training data: points whose prediction equals the label, divided by numData.
val LR_TotalCorrect = data
    .filter(point => LR_Model.predict(point.features) == point.label)
    .count
    .toDouble
val LR_Accuracy = LR_TotalCorrect / numData
println(f"LogisticRegressionWithSGD Model's Accuacy : $LR_Accuracy%f")

LogisticRegressionWithSGD Model's Accuacy : 0.514672


### 3.1.2 SVMWithSGD

In [13]:
// Accuracy of the SVM on the training data (correct predictions / numData).
val SVM_TotalCorrect = data
    .filter(point => SVM_Model.predict(point.features) == point.label)
    .count
    .toDouble

val SVM_Accuracy = SVM_TotalCorrect / numData
println(f"SVMWithSGD Model's Accuracy : $SVM_Accuracy%f")

SVMWithSGD Model's Accuracy : 0.514672


### 3.1.3 NaiveBayes

In [14]:
// Accuracy of Naive Bayes, evaluated on nbdata (the set it was trained on).
val NB_TotalCorrect = nbdata
    .filter(point => NB_Model.predict(point.features) == point.label)
    .count
    .toDouble

val NB_Accuracy = NB_TotalCorrect / numData
println(f"NaiveBeyes Model's Accuracy : $NB_Accuracy%f")

NaiveBeyes Model's Accuracy : 0.580392


### 3.1.4 DecisionTree

In [15]:
// Accuracy of the decision tree on the training data (correct predictions / numData).
val DT_TotalCorrect = data
    .filter(point => DT_Model.predict(point.features) == point.label)
    .count
    .toDouble

val DT_Accuracy = DT_TotalCorrect / numData
println(f"DecisionTree Model's Accuracy : $DT_Accuracy%f")

DecisionTree Model's Accuracy : 0.648276


### 3.1.5 evaluation
**Precision & Recall**

In [16]:
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
// Area under the PR and ROC curves for every baseline model, measured on the training data.
// NOTE(review): the output of predict() is used as the score here — confirm whether raw
// scores (rather than predicted classes) were intended for the curve computation.
val metrics = Seq(LR_Model, SVM_Model).map { classifier =>
    val predictedAndActual = data.map(point => (classifier.predict(point.features), point.label))
    val evaluator = new BinaryClassificationMetrics(predictedAndActual)
    (classifier.getClass.getSimpleName, evaluator.areaUnderPR, evaluator.areaUnderROC)
}

// Naive Bayes is evaluated on nbdata; its prediction is binarized at 0.5 before evaluation.
val nbMetrics = Seq(NB_Model).map { classifier =>
    val predictedAndActual = nbdata.map { point =>
        val binarized = if (classifier.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val evaluator = new BinaryClassificationMetrics(predictedAndActual)
    (classifier.getClass.getSimpleName, evaluator.areaUnderPR, evaluator.areaUnderROC)
}

// The decision tree prediction is binarized at 0.5 in the same way.
val dtMetrics = Seq(DT_Model).map { classifier =>
    val predictedAndActual = data.map { point =>
        val binarized = if (classifier.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val evaluator = new BinaryClassificationMetrics(predictedAndActual)
    (classifier.getClass.getSimpleName, evaluator.areaUnderPR, evaluator.areaUnderROC)
}

val allMetrics = metrics ++ nbMetrics ++ dtMetrics
for ((modelName, pr, roc) <- allMetrics) {
    println(f"$modelName, Area under PR: ${pr * 100.0}%2.4f%%, Area under ROC: ${roc * 100.0}%2.4f%%")
}

LogisticRegressionModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
SVMModel, Area under PR: 75.6759%, Area under ROC: 50.1418%
NaiveBayesModel, Area under PR: 68.0851%, Area under ROC: 58.3559%
DecisionTreeModel, Area under PR: 74.3081%, Area under ROC: 64.8837%


## 3.2 데이터 전처리 수행 및 모형 학습

### 3.2.1 기술통계량 확인
**mllib.linalg.distributed.RowMatrix** : 행(row) 벡터들의 RDD로 구성된 분산 행렬. 열(변수)별 기술통계량을 계산할 수 있음

In [17]:
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// Wrap the feature vectors in a RowMatrix and compute per-column summary statistics.
val vectors = data.map(_.features)
val matrix = new RowMatrix(vectors)
val matrixSummary = matrix.computeColumnSummaryStatistics()

In [18]:
// Print the descriptive statistics of the features: mean, min, max and variance, one per line.
Seq(
    matrixSummary.mean,
    matrixSummary.min,
    matrixSummary.max,
    matrixSummary.variance
).foreach(println)

[0.4122580529952672,2.761823191986608,0.4682304732861389,0.21407992638350232,0.09206236071899916,0.04926216043908053,2.255103452212041,-0.10375042752143335,0.0,0.05642274498417851,0.02123056118999324,0.23377817665490194,0.2757090373659236,0.615551048005409,0.6603110209601082,30.07707910750513,0.03975659229208925,5716.598242055447,178.75456389452353,4.960649087221096,0.17286405047031742,0.10122079189276552]
[0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.045564223,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]
[0.999426,363.0,1.0,1.0,0.980392157,0.980392157,21.0,0.25,0.0,0.444444444,1.0,0.716883117,113.3333333,1.0,1.0,100.0,1.0,207952.0,4997.0,22.0,1.0,1.0]
[0.10974244167559001,74.30082476809639,0.04126316989120241,0.02153343633200108,0.009211817450882448,0.005274933469767946,32.53918714591821,0.09396988697611545,0.0,0.0017177410346628928,0.020782634824610638,0.0027548394224293036,3.683788919674426,0.2366799607085986,0.22433071201674218,415.8785589543846,0.03818116876739597,7.877330081138463

### 3.2.2 연속형 데이터 표준화(Standardization)

In [19]:
import org.apache.spark.mllib.feature.StandardScaler

In [20]:
// Fit a scaler that subtracts the mean and divides by the standard deviation of each column,
// then apply it to the features of every labeled point (labels are left untouched).
val scaler = {
    val standardizer = new StandardScaler(withMean = true, withStd = true)
    standardizer.fit(vectors)
}
val scaledData = data.map { lp =>
    lp.copy(features = scaler.transform(lp.features))
}

// Show the first feature vector before and after scaling.
println(data.first.features)
println(scaledData.first.features)

[0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]
[1.137647336497678,-0.08193557169294771,1.0251398128933331,-0.05586356442541689,-0.4688932531289357,-0.3543053263079386,-0.3175352172363148,0.3384507982396541,0.0,0.828822173315322,-0.14726894334628504,0.22963982357813484,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642257,-0.2034625779299476,-0.03296720969690391,-0.04878112975579913,0.9400699751165439,-0.10869848852526258,-0.2788207823137022]


### 3.2.3 정규화 데이터를  로지스틱에 학습

In [21]:
// Train logistic regression on the standardized features and evaluate it on the same data.
val LR_ModelScaled = LogisticRegressionWithSGD.train(scaledData, numIters)
// (prediction, label) pairs; used for both accuracy and the curve-based metrics.
val LR_PredictVSTrue = scaledData.map(point => (LR_ModelScaled.predict(point.features), point.label))
val LR_TotalCorrectScaled = LR_PredictVSTrue
    .filter { case (predicted, actual) => predicted == actual }
    .count
    .toDouble
val LR_AccuracyScaled = LR_TotalCorrectScaled / numData
val LR_MetricsScaled = new BinaryClassificationMetrics(LR_PredictVSTrue)
val LR_PR = LR_MetricsScaled.areaUnderPR
val LR_ROC = LR_MetricsScaled.areaUnderROC

In [22]:
// Report model name, accuracy, area under PR and area under ROC (percentages, four decimals).
println(
    Seq(
        LR_ModelScaled.getClass.getSimpleName,
        f"Accuracy : ${LR_AccuracyScaled * 100}%2.4f%%",
        f"AreaUnder PR: ${LR_PR * 100.0}%2.4f%%",
        f"Area under ROC : ${LR_ROC * 100.0}%2.4f%%"
    ).mkString("\n")
)

LogisticRegressionModel
Accuracy : 62.0419%
AreaUnder PR: 72.7254%
Area under ROC : 61.9663%


### 3.2.4 범주형 원핫인코딩 및 데이터 정규화
범주형 데이터 원핫인코딩으로 변환

In [23]:
// Map every distinct value of column 3 (used as the categorical variable) to an integer index.
// NOTE(review): indices follow the order returned by distinct.collect — confirm that order is
// stable across runs if the encoding needs to be reproducible.
val categories = records
    .map(fields => fields(3))
    .distinct
    .collect
    .zipWithIndex
    .toMap
val numCategories = categories.size
println(numCategories)

14


In [24]:
// LabeledPoints whose features are the one-hot encoded category followed by the numeric columns.
val dataCategories = records.map { row =>
    val fields = row.map(_.replaceAll("\"", ""))
    val labelIdx = fields.length - 1
    // The lookup uses the raw (un-stripped) field because `categories` was built from raw fields.
    val hot = categories(row(3))
    val oneHot = Array.tabulate[Double](numCategories)(i => if (i == hot) 1.0 else 0.0)
    val numeric = fields.slice(4, labelIdx).map {
        case "?" => 0.0
        case value => value.toDouble
    }
    LabeledPoint(fields(labelIdx).toInt, Vectors.dense(oneHot ++ numeric))
}

In [25]:
// Show the first point: the one-hot category columns come first, then the numeric features.
println(dataCategories.first)

(0.0,[0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575])


In [26]:
// Standardize the combined (one-hot + numeric) feature vectors with mean and std scaling.
val scalerCats = {
    val categoryFeatureVectors = dataCategories.map(_.features)
    new StandardScaler(withMean = true, withStd = true).fit(categoryFeatureVectors)
}
val scaledDataCats = dataCategories.map { lp =>
    lp.copy(features = scalerCats.transform(lp.features))
}
// Show the first feature vector before and after scaling.
println(dataCategories.first.features)
println(scaledDataCats.first.features)

[0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176,0.079129575]
[-0.02326210589837061,2.7207366564548514,-0.4464212047941535,-0.22052688457880879,-0.028494000387023734,-0.2709990696925828,-0.23272797709480803,-0.2016540523193296,-0.09914991930875496,-0.38181322324318134,-0.06487757239262681,-0.6807527904251456,-0.20418221057887365,-0.10189469097220732,1.137647336497678,-0.08193557169294771,1.0251398128933331,-0.05586356442541689,-0.4688932531289357,-0.3543053263079386,-0.3175352172363148,0.3384507982396541,0.0,0.828822173315322,-0.14726894334628504,0.22963982357813484,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642257,-0.2034625779299476,-0.03296720969690391,-0.04878112975579913,0.9400699751165439,-0.10869848852526258,-0.2788207823137022]


### 3.2.5 로지스틱 모형 학습 및 평가

In [27]:
// Train logistic regression on the standardized one-hot + numeric features and evaluate it
// on the same data.
val LR_ModelScaledCats = LogisticRegressionWithSGD.train(scaledDataCats, numIters)
// Both branches are Double so the element type is not left to implicit widening.
val LR_TotalCorrectScaledCats = scaledDataCats.map { point =>
    if (LR_ModelScaledCats.predict(point.features) == point.label) 1.0 else 0.0
}.sum
val LR_AccuracyScaledCats = LR_TotalCorrectScaledCats / numData
val LR_PredictVSTrueCats = scaledDataCats.map { point =>
    (LR_ModelScaledCats.predict(point.features), point.label)
}
// Named with the Cats suffix so the LR_MetricsScaled value from section 3.2.3 is not re-bound.
val LR_MetricsScaledCats = new BinaryClassificationMetrics(LR_PredictVSTrueCats)
val LR_PRCats = LR_MetricsScaledCats.areaUnderPR
val LR_ROCCats = LR_MetricsScaledCats.areaUnderROC
println(f"${LR_ModelScaledCats.getClass.getSimpleName}\nAccuracy : ${LR_AccuracyScaledCats * 100}%2.4f%%\nAreaUnder PR: ${LR_PRCats * 100.0}%2.4f%%\nArea under ROC : ${LR_ROCCats * 100.0}%2.4f%%")

LogisticRegressionModel
Accuracy : 66.5720%
AreaUnder PR: 75.7964%
Area under ROC : 66.5483%


### 3.2.6 NaiveBayes 학습 및 평가
- 범주형 변수만 사용한 모형

In [28]:
// Naive Bayes data set that uses only the one-hot encoded category as features.
val dataNB = records.map { row =>
    val fields = row.map(_.replaceAll("\"", ""))
    val hot = categories(row(3))
    val oneHot = Array.tabulate[Double](numCategories)(i => if (i == hot) 1.0 else 0.0)
    LabeledPoint(fields(fields.length - 1).toInt, Vectors.dense(oneHot))
}
val NB_ModelCats = NaiveBayes.train(dataNB)
// (prediction, label) pairs; used for both accuracy and the curve-based metrics.
val NB_PredictVSTrueCats = dataNB.map(point => (NB_ModelCats.predict(point.features), point.label))
val NB_TotalCorrectCats = NB_PredictVSTrueCats
    .filter { case (predicted, actual) => predicted == actual }
    .count
    .toDouble
val NB_AccuracyCats = NB_TotalCorrectCats / numData
val NB_MetricsCats = new BinaryClassificationMetrics(NB_PredictVSTrueCats)
val NB_PRCats = NB_MetricsCats.areaUnderPR
val NB_ROCCats = NB_MetricsCats.areaUnderROC
println(
    Seq(
        NB_ModelCats.getClass.getSimpleName,
        f"Accuracy: ${NB_AccuracyCats * 100}%2.4f%%",
        f"Area under PR: ${NB_PRCats * 100.0}%2.4f%%",
        f"Area under ROC: ${NB_ROCCats * 100.0}%2.4f%%"
    ).mkString("\n")
)

NaiveBayesModel
Accuracy: 60.9601%
Area under PR: 74.0522%
Area under ROC: 60.5138%


## 3.3 모형의 하이퍼파라미터 튜닝
### 3.3.1 Logistic

In [29]:
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.optimization.Updater
import org.apache.spark.mllib.optimization.SimpleUpdater
import org.apache.spark.mllib.optimization.L1Updater
import org.apache.spark.mllib.optimization.SquaredL2Updater
import org.apache.spark.mllib.classification.ClassificationModel

In [30]:
/**
 * Trains a logistic regression model with SGD using the given optimizer settings.
 *
 * @param input         labeled training points
 * @param regParam      regularization parameter passed to the optimizer
 * @param numIterations number of SGD iterations
 * @param updater       updater (regularization scheme) used by the optimizer
 * @param stepSize      SGD step size
 * @return the model produced by running the configured algorithm on `input`
 */
def trainWithParams(input: RDD[LabeledPoint], regParam: Double, numIterations: Int,
updater: Updater, stepSize: Double) = {
    val trainer = new LogisticRegressionWithSGD
    val sgd = trainer.optimizer
    sgd.setNumIterations(numIterations)
    sgd.setUpdater(updater)
    sgd.setRegParam(regParam)
    sgd.setStepSize(stepSize)
    trainer.run(input)
}

/**
 * Evaluates `model` on `data` and returns the areas under the PR and ROC curves.
 *
 * @param label text identifying the run; returned unchanged as the first tuple element
 * @param data  labeled points to evaluate on
 * @param model classifier whose predictions are compared with the labels
 * @return (label, area under PR, area under ROC)
 */
def createMetrics(label: String, data: RDD[LabeledPoint], model: ClassificationModel) = {
    val predictedAndActual = data.map(point => (model.predict(point.features), point.label))
    val evaluator = new BinaryClassificationMetrics(predictedAndActual)
    (label, evaluator.areaUnderPR, evaluator.areaUnderROC)
}

In [31]:
// Mark the RDD for caching; the tuning cells below train on it repeatedly.
scaledDataCats.cache()

MapPartitionsRDD[236] at map at <console>:48

### 3.3.1.1 numIteration

In [32]:
// Vary the number of SGD iterations (no regularization, step size 1.0) and record PR / ROC areas.
val iterResults = for (iterations <- Seq(1, 5, 10, 50, 100)) yield {
    val model = trainWithParams(scaledDataCats, 0.0, iterations, new SimpleUpdater, 1.0)
    createMetrics(s"$iterations iterations", scaledDataCats, model)
}
for ((label, pr, auc) <- iterResults) {
    println(f"$label, PR =${pr * 100}%2.2f%% , AUC = ${auc * 100}%2.2f%%")
}

1 iterations, PR =74.59% , AUC = 64.95%
5 iterations, PR =75.80% , AUC = 66.62%
10 iterations, PR =75.80% , AUC = 66.55%
50 iterations, PR =76.09% , AUC = 66.81%
100 iterations, PR =76.08% , AUC = 66.75%


### 3.3.1.2 stepSize(learning_rate)

In [33]:
// Vary the SGD step size (no regularization, numIters iterations) and record PR / ROC areas.
val stepResults = for (step <- Seq(0.001, 0.01, 0.1, 1.0, 10.0, 100.0)) yield {
    val model = trainWithParams(scaledDataCats, 0.0, numIters, new SimpleUpdater, step)
    createMetrics(s"$step step size", scaledDataCats, model)
}
for ((label, pr, auc) <- stepResults) {
    println(f"$label, PR =${pr * 100}%2.2f%%, AUC = ${auc * 100}%2.2f%%")
}

0.001 step size, PR =74.60%, AUC = 64.97%
0.01 step size, PR =74.60%, AUC = 64.96%
0.1 step size, PR =75.00%, AUC = 65.52%
1.0 step size, PR =75.80%, AUC = 66.55%
10.0 step size, PR =72.57%, AUC = 61.92%
100.0 step size, PR =64.97%, AUC = 52.06%


### 3.3.1.3 regularization
- **SimpleUpdater** : regularization을 적용하지 않는 것과 같음. 로지스틱 회귀의 기본값
- **SquaredL2Updater** : 가중치 벡터의 L2 노름 제곱에 기반한 regularization(L2). SVM 모델의 기본값
- **L1Updater** : 가중치 벡터의 절대값 합(L1 노름)에 기반한 regularization(L1)
---
참고
- Regularization : https://en.wikipedia.org/wiki/Regularization_(mathematics)
- L2 norm : https://en.wikipedia.org/wiki/Tikhonov_regularization
- Overfitting vs Underfitting : https://en.wikipedia.org/wiki/Overfitting

In [34]:
// Vary the L2 regularization parameter (SquaredL2Updater, step size 1.0) and record PR / ROC areas.
val regResults = for (reg <- Seq(0.001, 0.01, 0.1, 1.0, 10.0)) yield {
    val model = trainWithParams(scaledDataCats, reg, numIters, new SquaredL2Updater, 1.0)
    createMetrics(s"$reg L2 regularization parameter", scaledDataCats, model)
}
for ((label, pr, auc) <- regResults) {
    println(f"$label, PR =${pr * 100}%2.2f%%, AUC = ${auc * 100}%2.2f%%")
}

0.001 L2 regularization parameter, PR =75.80%, AUC = 66.55%
0.01 L2 regularization parameter, PR =75.80%, AUC = 66.55%
0.1 L2 regularization parameter, PR =75.84%, AUC = 66.63%
1.0 L2 regularization parameter, PR =75.37%, AUC = 66.04%
10.0 L2 regularization parameter, PR =52.48%, AUC = 35.33%


In [35]:
// Vary the L1 regularization parameter (L1Updater, step size 1.0) and record PR / ROC areas.
// The label says "L1" so the printed results are not confused with the L2 run in the previous cell.
val regResults = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
    val model = trainWithParams(scaledDataCats, param, numIters, new L1Updater, 1.0)
    createMetrics(s"$param L1 regularization parameter", scaledDataCats, model)
}
regResults.foreach { case (param, pr, auc) => println(f"$param, PR =${pr * 100}%2.2f%%, AUC = ${auc * 100}%2.2f%%") }

0.001 L2 regularization parameter, PR =75.79%, AUC = 66.53%
0.01 L2 regularization parameter, PR =75.82%, AUC = 66.47%
0.1 L2 regularization parameter, PR =75.67%, AUC = 50.00%
1.0 L2 regularization parameter, PR =75.67%, AUC = 50.00%
10.0 L2 regularization parameter, PR =75.67%, AUC = 50.00%


### 3.3.2 DecisionTree

In [36]:
import org.apache.spark.mllib.tree.impurity.Impurity
import org.apache.spark.mllib.tree.impurity.Entropy
import org.apache.spark.mllib.tree.impurity.Gini

### 3.3.2.1 Entropy

In [37]:
/**
 * Trains a classification decision tree.
 *
 * @param input    labeled training points
 * @param maxDepth maximum depth of the tree
 * @param impurity impurity measure used for splitting (e.g. Entropy or Gini)
 * @return the trained decision tree model
 */
def trainDTWithParams(input: RDD[LabeledPoint], maxDepth: Int, impurity:Impurity) =
    DecisionTree.train(input, Algo.Classification, impurity, maxDepth)

In [38]:
// data: the original (unscaled) data set.
// 30 is the largest maxDepth value (per the original author's note).
val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(data, depth, Entropy)
    val predictedAndActual = data.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}

for ((label, auc) <- dtResultsEntropy) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.68%
3 tree depth, AUC = 62.61%
4 tree depth, AUC = 63.63%
5 tree depth, AUC = 64.88%
10 tree depth, AUC = 76.26%
20 tree depth, AUC = 98.45%
30 tree depth, AUC = 99.95%


In [39]:
// scaledData: data set in which only the continuous variables were standardized.
val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(scaledData, depth, Entropy)
    val predictedAndActual = scaledData.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}

for ((label, auc) <- dtResultsEntropy) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.68%
3 tree depth, AUC = 62.61%
4 tree depth, AUC = 63.63%
5 tree depth, AUC = 64.88%
10 tree depth, AUC = 76.26%
20 tree depth, AUC = 98.45%
30 tree depth, AUC = 99.95%


In [40]:
// scaledDataCats: standardized data set containing both continuous and one-hot categorical features.
val dtResultsEntropy = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(scaledDataCats, depth, Entropy)
    val predictedAndActual = scaledDataCats.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}

for ((label, auc) <- dtResultsEntropy) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 59.33%
3 tree depth, AUC = 61.83%
4 tree depth, AUC = 62.15%
5 tree depth, AUC = 66.50%
10 tree depth, AUC = 75.91%
20 tree depth, AUC = 96.43%
30 tree depth, AUC = 99.74%


### 3.3.2.2 Gini

In [41]:
// Gini impurity on the original (unscaled) data: area under ROC for each tree depth.
val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(data, depth, Gini)
    val predictedAndActual = data.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}
for ((label, auc) <- dtResultsGini) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.68%
3 tree depth, AUC = 62.61%
4 tree depth, AUC = 63.63%
5 tree depth, AUC = 64.89%
10 tree depth, AUC = 78.37%
20 tree depth, AUC = 98.87%
30 tree depth, AUC = 99.95%


In [42]:
// Gini impurity on scaledData: area under ROC for each tree depth.
val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(scaledData, depth, Gini)
    val predictedAndActual = scaledData.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}
for ((label, auc) <- dtResultsGini) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.68%
3 tree depth, AUC = 62.61%
4 tree depth, AUC = 63.63%
5 tree depth, AUC = 64.89%
10 tree depth, AUC = 78.37%
20 tree depth, AUC = 98.87%
30 tree depth, AUC = 99.95%


In [43]:
// Gini impurity on scaledDataCats: area under ROC for each tree depth.
val dtResultsGini = Seq(1, 2, 3, 4, 5, 10, 20, 30).map { depth =>
    val tree = trainDTWithParams(scaledDataCats, depth, Gini)
    val predictedAndActual = scaledDataCats.map { point =>
        val binarized = if (tree.predict(point.features) > 0.5) 1.0 else 0.0
        (binarized, point.label)
    }
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$depth tree depth", auc)
}
for ((label, auc) <- dtResultsGini) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

1 tree depth, AUC = 59.33%
2 tree depth, AUC = 61.61%
3 tree depth, AUC = 61.83%
4 tree depth, AUC = 62.04%
5 tree depth, AUC = 66.45%
10 tree depth, AUC = 76.90%
20 tree depth, AUC = 98.35%
30 tree depth, AUC = 99.93%


### 3.3.3 NaiveBayes

In [44]:
/**
 * Trains a Naive Bayes model with the given lambda setting.
 *
 * @param input  labeled training points
 * @param lambda value passed to NaiveBayes.setLambda
 * @return the trained Naive Bayes model
 */
def trainNBWithParams(input: RDD[LabeledPoint], lambda: Double) =
    new NaiveBayes().setLambda(lambda).run(input)

In [45]:
// Vary lambda for the category-only Naive Bayes model and record the area under ROC.
val nbResults = for (lambda <- Seq(0.001, 0.01, 0.1, 1.0, 10.0, 100.0)) yield {
    val model = trainNBWithParams(dataNB, lambda)
    val predictedAndActual = dataNB.map(point => (model.predict(point.features), point.label))
    val auc = new BinaryClassificationMetrics(predictedAndActual).areaUnderROC
    (s"$lambda lambda", auc)
}
for ((label, auc) <- nbResults) {
    println(f"$label, AUC = ${auc * 100}%2.2f%%")
}

0.001 lambda, AUC = 60.51%
0.01 lambda, AUC = 60.51%
0.1 lambda, AUC = 60.51%
1.0 lambda, AUC = 60.51%
10.0 lambda, AUC = 60.51%
100.0 lambda, AUC = 60.51%


## 3.4 CV (train/test 분할을 이용한 홀드아웃 검증)

In [46]:
// Hold-out split: 60% training / 40% test, with a fixed seed (123).
val trainTestSplit = scaledDataCats.randomSplit(Array(0.6, 0.4), 123)
val train = trainTestSplit(0)
val test = trainTestSplit(1)

// Train on the training split with several L2 regularization values; evaluate on the test split.
val regResultsTest = for (reg <- Seq(0.0, 0.001, 0.0025, 0.005, 0.01)) yield {
    val model = trainWithParams(train, reg, numIters, new SquaredL2Updater, 1.0)
    createMetrics(s"$reg L2 regularization parameter", test, model)
}
for ((label, pr, auc) <- regResultsTest) {
    println(f"$label, PR = ${pr * 100}%2.6f%%, AUC = ${auc * 100}%2.6f%%")
}

0.0 L2 regularization parameter, PR = 74.769566%, AUC = 66.126842%
0.001 L2 regularization parameter, PR = 74.769566%, AUC = 66.126842%
0.0025 L2 regularization parameter, PR = 74.769566%, AUC = 66.126842%
0.005 L2 regularization parameter, PR = 74.769566%, AUC = 66.126842%
0.01 L2 regularization parameter, PR = 74.741549%, AUC = 66.093195%


In [47]:
// Same regularization sweep as the test-split cell, but evaluated on the training split
// so that training and test metrics can be compared.
val regResultsTrain = Seq(0.0, 0.001, 0.0025, 0.005, 0.01).map { param =>
    val model = trainWithParams(train, param, numIters, new SquaredL2Updater, 1.0)
    createMetrics(s"$param L2 regularization parameter", train, model)
}
// PR is formatted with the same %2.6f%% specifier as AUC, matching the test-split report.
regResultsTrain.foreach { case (param, pr, auc) => println(f"$param, PR = ${pr * 100}%2.6f%%, AUC = ${auc * 100}%2.6f%%") }

0.0 L2 regularization parameter, PR = 76.02586670823597 ,AUC = 66.233459%
0.001 L2 regularization parameter, PR = 76.02586670823597 ,AUC = 66.233459%
0.0025 L2 regularization parameter, PR = 76.02586670823597 ,AUC = 66.233459%
0.005 L2 regularization parameter, PR = 76.04103125345007 ,AUC = 66.257100%
0.01 L2 regularization parameter, PR = 76.0584542877353 ,AUC = 66.278745%
