# The goal of this exercise is to better understand the map, zip, flatMap, reduce, and fold methods in Scala using a simple data set.

## Recreate Pearson Correlation Matrix
* Practice map, fold, zip, etc. again without relying too much on Sim's solution

In [1]:
// Generate two sequences of random numbers to work with
import scala.util.Random
// nextGaussian is declared with an empty parameter list, so it is called with explicit
// parentheses (omitting them is deprecated in Scala 2.13 and an error in Scala 3).
// Seq.fill re-evaluates its second argument for each of the 100 elements.
val seq1: Seq[Double] = Seq.fill(100)(Random.nextGaussian())
val seq2: Seq[Double] = Seq.fill(100)(Random.nextGaussian())

[32mimport [39m[36mscala.util.Random
[39m
[36mseq1[39m: [32mSeq[39m[[32mDouble[39m] = [33mList[39m(
  [32m-0.02621587199930509[39m,
  [32m-0.5988565582169093[39m,
  [32m2.0153334167677848[39m,
  [32m-1.5079830388457967[39m,
  [32m0.3882963547739467[39m,
  [32m-0.2012493088900946[39m,
  [32m0.7519417767175839[39m,
  [32m-0.2486259751106985[39m,
  [32m-0.31826274453428394[39m,
  [32m-0.507560149407706[39m,
  [32m-1.693129457185314[39m,
[33m...[39m
[36mseq2[39m: [32mSeq[39m[[32mDouble[39m] = [33mList[39m(
  [32m0.20584426469701886[39m,
  [32m-0.9088442180418637[39m,
  [32m1.3062387983612476[39m,
  [32m0.2628230642989509[39m,
  [32m-1.0063039569431265[39m,
  [32m-0.3720657588005266[39m,
  [32m-0.13207037158409038[39m,
  [32m0.8457036132549793[39m,
  [32m-0.210805008668713[39m,
  [32m-0.9049437775532733[39m,
  [32m-0.9663508903440864[39m,
[33m...[39m

In [35]:
// Functions for creating Pearson correlation matrix
/** Adds up every element of `x`; an empty sequence sums to 0.0. */
def sum(x: Seq[Double]): Double = x.sum
/**
 * Arithmetic mean of `x`: the total returned by `sum` divided by the element count.
 * An empty sequence yields NaN, because 0.0 is divided by a zero count.
 */
def mean(x: Seq[Double]): Double = {
    val total = sum(x)
    total / x.length
}
/**
 * Sum of the squares of the elements of `x` (0.0 for an empty sequence).
 *
 * Uses `foldLeft` rather than `fold`: the step function treats its first argument as the
 * running total and its second as the next element, so it is not associative, and `fold`
 * does not guarantee the order or grouping in which it applies its operator.
 */
def sumOfSquares(x: Seq[Double]): Double = {
    x.foldLeft(0.0)((total, value) => total + value * value)
}
/**
 * Population variance of `x`, computed as mean(x^2) - mean(x)^2.
 *
 * Floating-point rounding can push that difference slightly below zero when the values
 * are (nearly) constant, which would make `stddev` return NaN. The result is therefore
 * clamped at 0.0. A NaN result (e.g. from an empty sequence) is passed through unchanged,
 * because `math.max` returns NaN when either argument is NaN.
 */
def variance(x: Seq[Double]): Double = {
    val meanx = mean(x)
    val raw = sumOfSquares(x) / x.length - meanx * meanx
    math.max(0.0, raw)
}
/** Population standard deviation of `x`: the square root of `variance(x)`. */
def stddev(x: Seq[Double]): Double = math.sqrt(variance(x))
/**
 * Standardizes `x`: each element has the mean of `x` subtracted and is then divided by
 * the standard deviation of `x`. The result has the same length and order as `x`.
 */
def zscore(x: Seq[Double]): Seq[Double] = {
    val center = mean(x)
    val scale = stddev(x)
    for (value <- x) yield (value - center) / scale
}
/**
 * Pearson correlation coefficient of `x` and `y`, computed as the mean of the pairwise
 * products of their z-scores.
 *
 * @param x first sample
 * @param y second sample; must have the same number of elements as `x`
 * @return the correlation coefficient
 * @throws IllegalArgumentException if the two sequences differ in length. `zip` would
 *         otherwise silently drop the surplus elements while the sum is still divided
 *         by `x.length`, producing a wrong value.
 */
def pearsonCorrelation(x: Seq[Double], y: Seq[Double]): Double = {
    require(
        x.length == y.length,
        s"sequences must have the same length (got ${x.length} and ${y.length})"
    )
    val productSum = zscore(x).zip(zscore(y)).foldLeft(0.0) {
        case (total, (zx, zy)) => total + zx * zy
    }
    productSum / x.length
}

defined [32mfunction[39m [36msum[39m
defined [32mfunction[39m [36mmean[39m
defined [32mfunction[39m [36msumOfSquares[39m
defined [32mfunction[39m [36mvariance[39m
defined [32mfunction[39m [36mstddev[39m
defined [32mfunction[39m [36mzscore[39m
defined [32mfunction[39m [36mpearsonCorrelation[39m

In [36]:
// Sanity checks: a sequence correlated with itself should give 1.0 up to floating-point rounding.
pearsonCorrelation(seq1, seq1)
pearsonCorrelation(seq2, seq2)
// Correlation between the two separately generated random sequences.
pearsonCorrelation(seq1, seq2)

[36mres35_0[39m: [32mDouble[39m = [32m1.0[39m
[36mres35_1[39m: [32mDouble[39m = [32m0.9999999999999993[39m
[36mres35_2[39m: [32mDouble[39m = [32m0.27675173576163115[39m

## Read in Iris Data
* Read through lines in csv file
* Store each column as a separate array

In [77]:
// csv file (the path is machine-specific; point it at a local copy of iris.csv)
val csvSource = io.Source.fromFile("/Users/Kim/Documents/data/Test Data Sets/iris.csv")
// initialize output arrays for each column
import scala.collection.mutable.ArrayBuffer
val sepal_length = ArrayBuffer[Double]()
val sepal_width = ArrayBuffer[Double]()
val petal_length = ArrayBuffer[Double]()
val petal_width = ArrayBuffer[Double]()
val iris_class = ArrayBuffer[String]()
try {
    // drop(1) skips the first line (presumably the header row - confirm against the file).
    // Blank lines are skipped so that a trailing empty line does not fail in toDouble.
    for (line <- csvSource.getLines().drop(1) if line.trim.nonEmpty) {
        val cols = line.split(",").map(_.trim)
        // Columns 0-3 are parsed as numbers; column 4 is kept as the class label text.
        sepal_length += cols(0).toDouble
        sepal_width += cols(1).toDouble
        petal_length += cols(2).toDouble
        petal_width += cols(3).toDouble
        iris_class += cols(4)
    }
} finally {
    // Always release the file handle, even if a row fails to parse.
    csvSource.close()
}
// Class labels with every "Iris-" occurrence removed.
val iris_class2 = iris_class.map(_.replace("Iris-", ""))

[36mcsvSource[39m: [32mio[39m.[32mBufferedSource[39m = empty iterator
[32mimport [39m[36mscala.collection.mutable.ArrayBuffer
[39m
[36msepal_length[39m: [32mArrayBuffer[39m[[32mDouble[39m] = [33mArrayBuffer[39m(
  [32m5.1[39m,
  [32m4.9[39m,
  [32m4.7[39m,
  [32m4.6[39m,
  [32m5.0[39m,
  [32m5.4[39m,
  [32m4.6[39m,
  [32m5.0[39m,
  [32m4.4[39m,
  [32m4.9[39m,
  [32m5.4[39m,
[33m...[39m
[36msepal_width[39m: [32mArrayBuffer[39m[[32mDouble[39m] = [33mArrayBuffer[39m(
  [32m3.5[39m,
  [32m3.0[39m,
  [32m3.2[39m,
  [32m3.1[39m,
  [32m3.6[39m,
  [32m3.9[39m,
  [32m3.4[39m,
  [32m3.4[39m,
  [32m2.9[39m,
  [32m3.1[39m,
  [32m3.7[39m,
[33m...[39m
[36mpetal_length[39m: [32mArrayBuffer[39m[[32mDouble[39m] = [33mArrayBuffer[39m(
  [32m1.4[39m,
  [32m1.4[39m,
  [32m1.3[39m,
  [32m1.5[39m,
  [32m1.4[39m,
  [32m1.7[39m,
  [32m1.4[39m,
  [32m1.5[39m,
  [32m1.4[39m,
  [32m1.5[39m,
  [32m1.5[39m,
[33

## Example Statistics

### 1: Compute average Euclidean distance between two arrays

In [87]:
/**
 * Arithmetic mean of the column `x`: the sum of its elements divided by their count.
 * An empty column yields NaN, because 0.0 is divided by a zero count.
 */
def colMean (x: ArrayBuffer[Double]): Double = {
    val total = x.sum
    total / x.length
}
/**
 * For each element of `x`, the square of its deviation from the column mean.
 * Returns a new buffer of the same length and order; `x` itself is not modified.
 */
def squaredMeanDiff (x: ArrayBuffer[Double]): ArrayBuffer[Double] = {
    val center = colMean(x)
    x.map { value =>
        val deviation = value - center
        deviation * deviation
    }
}
/**
 * Average, over all positions i, of sqrt((x_i - mean(x))^2 + (y_i - mean(y))^2), i.e. the
 * mean Euclidean distance of the points (x_i, y_i) from the point (mean(x), mean(y)).
 *
 * @param x first column
 * @param y second column; must have the same number of elements as `x`
 * @return the average distance
 * @throws IllegalArgumentException if the two columns differ in length. `zip` would
 *         otherwise silently drop the surplus elements while the total is still divided
 *         by `x.length`, producing a wrong value.
 */
def avgEuclideanDistance (x: ArrayBuffer[Double], y: ArrayBuffer[Double]): Double = {
    require(
        x.length == y.length,
        s"columns must have the same length (got ${x.length} and ${y.length})"
    )
    val sumDist = squaredMeanDiff(x).zip(squaredMeanDiff(y)).foldLeft(0.0) {
        case (total, (dx2, dy2)) => total + math.sqrt(dx2 + dy2)
    }
    sumDist / x.length
}

defined [32mfunction[39m [36mcolMean[39m
defined [32mfunction[39m [36msquaredMeanDiff[39m
defined [32mfunction[39m [36mavgEuclideanDistance[39m

In [89]:
// Apply the statistic to the sepal length and sepal width columns read from the iris CSV.
avgEuclideanDistance(sepal_length, sepal_width)

[36mres88[39m: [32mDouble[39m = [32m0.8349878501506484[39m