## Spark Notebook Setup

In [33]:
import $ivy.`org.apache.spark::spark-sql:2.4.0`
import org.apache.spark.sql._
import org.apache.spark.{SparkConf, SparkContext}

[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36morg.apache.spark.sql._[39m
[32mimport [39m[36morg.apache.spark.{SparkConf, SparkContext}[39m

In [34]:
import org.apache.log4j.{Level, Logger}
Logger.getLogger("org").setLevel(Level.OFF)

[32mimport [39m[36morg.apache.log4j.{Level, Logger}[39m

In [35]:
val spark = {
  NotebookSparkSession.builder()
    .master("local[*]")
    .getOrCreate()
}

[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@39ff85d5

## Preprocessing Data

In [36]:
val moviePath = "notebooks/data/movies.csv"
val stopWordsPath = "notebooks/data/stopwords.txt"
val lemmatizationPath = "notebooks/data/lemmatization.txt"

[36mmoviePath[39m: [32mString[39m = [32m"notebooks/data/movies.csv"[39m
[36mstopWordsPath[39m: [32mString[39m = [32m"notebooks/data/stopwords.txt"[39m
[36mlemmatizationPath[39m: [32mString[39m = [32m"notebooks/data/lemmatization.txt"[39m

In [37]:
val moviesRaw = spark.read
    .option("header", "true")
    .option("multiline", "true")
    .option("escape", "\"")
    .csv(moviePath)
    .rdd
    .zipWithIndex
    .map(x => (x._2, x._1.getString(0), x._1.getString(1), x._1.getString(7))) // (id, title, genres)

moviesRaw.take(5).foreach(println)

(0,1901,Kansas Saloon Smashers,A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1])
(1,1901,Love by the Light of the Moon,The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything

[36mmoviesRaw[39m: [32mRDD[39m[([32mLong[39m, [32mString[39m, [32mString[39m, [32mString[39m)] = MapPartitionsRDD[105] at map at cmd37.sc:8

In [38]:
val stopWords = spark.read
    .textFile(stopWordsPath)
    .rdd
    .map(_.trim.toLowerCase.replaceAll("[^a-z]", ""))
    .collect()
    .toSet

[36mstopWords[39m: [32mSet[39m[[32mString[39m] = [33mSet[39m(
  [32m"quotient"[39m,
  [32m"snow"[39m,
  [32m"liquid"[39m,
  [32m"eye"[39m,
  [32m"down"[39m,
  [32m"wash"[39m,
  [32m"rub"[39m,
  [32m"side"[39m,
  [32m"measure"[39m,
  [32m"please"[39m,
  [32m"trouble"[39m,
  [32m"finger"[39m,
  [32m"read"[39m,
  [32m"number"[39m,
  [32m"mass"[39m,
  [32m"able"[39m,
  [32m"behind"[39m,
  [32m"mother"[39m,
  [32m"chief"[39m,
  [32m"hurry"[39m,
  [32m"for"[39m,
  [32m"drive"[39m,
  [32m"magnet"[39m,
  [32m"tire"[39m,
  [32m"find"[39m,
  [32m"neighbor"[39m,
  [32m"wild"[39m,
  [32m"support"[39m,
  [32m"produce"[39m,
  [32m"school"[39m,
  [32m"rock"[39m,
  [32m"soldier"[39m,
  [32m"weight"[39m,
  [32m"question"[39m,
  [32m"art"[39m,
  [32m"parent"[39m,
  [32m"shop"[39m,
  [32m"method"[39m,
...

In [39]:
val lemmatization = spark.read
    .textFile(lemmatizationPath)
    .rdd
    .map(_.split("\\s+"))
    .map(x => (x(1), x(0)))
    .collect()
    .toMap

lemmatization.take(5).foreach(println)

(professed,profess)
(pathogens,pathogen)
(purifies,purify)
(phosphates,phosphate)
(buns,bun)


[36mlemmatization[39m: [32mMap[39m[[32mString[39m, [32mString[39m] = [33mMap[39m(
  [32m"professed"[39m -> [32m"profess"[39m,
  [32m"pathogens"[39m -> [32m"pathogen"[39m,
  [32m"purifies"[39m -> [32m"purify"[39m,
  [32m"phosphates"[39m -> [32m"phosphate"[39m,
  [32m"buns"[39m -> [32m"bun"[39m,
  [32m"fathering"[39m -> [32m"father"[39m,
  [32m"soapiest"[39m -> [32m"soapy"[39m,
  [32m"basils"[39m -> [32m"basil"[39m,
  [32m"showdowns"[39m -> [32m"showdown"[39m,
  [32m"redcurrants"[39m -> [32m"redcurrant"[39m,
  [32m"night-watchmen"[39m -> [32m"night-watchman"[39m,
  [32m"regularizing"[39m -> [32m"regularize"[39m,
  [32m"boutiques"[39m -> [32m"boutique"[39m,
  [32m"satsumas"[39m -> [32m"satsuma"[39m,
  [32m"healings"[39m -> [32m"healing"[39m,
  [32m"breaks"[39m -> [32m"break"[39m,
  [32m"cut-backs"[39m -> [32m"cut-back"[39m,
  [32m"sneezed"[39m -> [32m"sneeze"[39m,
  [32m"forgotten"[39m -> [32m"forget"[39

In [40]:
val movies = moviesRaw.map { case (id, year, title, plot) =>
    val processedPlot = plot.toLowerCase
        .replaceAll("[^a-z ]", " ")
        .split(" ")
        .filterNot(stopWords.contains)
        .filterNot(_.isEmpty)
        .map(token => lemmatization.getOrElse(token, token))

    (id, year, title, processedPlot)
}

movies.take(5).foreach(println)

(0,1901,Kansas Saloon Smashers,[Ljava.lang.String;@60ecfa8e)
(1,1901,Love by the Light of the Moon,[Ljava.lang.String;@7e792983)
(2,1901,The Martyred Presidents,[Ljava.lang.String;@51e4c3dd)
(3,1901,Terrible Teddy, the Grizzly King,[Ljava.lang.String;@5dbb21cd)
(4,1902,Jack and the Beanstalk,[Ljava.lang.String;@2ef9067b)


[36mmovies[39m: [32mRDD[39m[([32mLong[39m, [32mString[39m, [32mString[39m, [32mArray[39m[[32mString[39m])] = MapPartitionsRDD[117] at map at cmd40.sc:1

In [41]:
val vocabulary = movies 
    .flatMap { case (_, _, _, cleanedPlot) => cleanedPlot }
    .distinct

vocabulary.take(10).foreach(println)

mistretta
shh
sammee
bone
gerven
nothin
rostom
mislabel
fred
bresac


[36mvocabulary[39m: [32mRDD[39m[[32mString[39m] = MapPartitionsRDD[121] at distinct at cmd41.sc:2

In [42]:
val vocabularyIndexed = vocabulary
    .zipWithIndex
    .map(_.swap)

vocabularyIndexed.take(10).foreach(println)

(0,mistretta)
(1,shh)
(2,sammee)
(3,bone)
(4,gerven)
(5,nothin)
(6,rostom)
(7,mislabel)
(8,fred)
(9,bresac)


[36mvocabularyIndexed[39m: [32mRDD[39m[([32mLong[39m, [32mString[39m)] = MapPartitionsRDD[123] at map at cmd42.sc:3

In [43]:
def dfToIdf(df: Int, totalDocs: Long): Double = {
    math.log(totalDocs.toDouble / df)
}

defined [32mfunction[39m [36mdfToIdf[39m

In [44]:
val numDocs = movies.count
val vocabularySize = vocabularyIndexed.count.toInt

[36mnumDocs[39m: [32mLong[39m = [32m34886L[39m
[36mvocabularySize[39m: [32mInt[39m = [32m112805[39m

In [45]:
val vocabIdfs = movies
    .flatMap { case (_, _, _, cleanedPlot) => cleanedPlot }
    .map { word => (word, 1) }
    .reduceByKey(_ + _)
    .zipWithIndex
    .map { case ((word, count), id) => (word, id, count) }
    .map { case (word, id, count) => (word, id, dfToIdf(count, numDocs)) }

vocabIdfs.take(10).foreach(println)

(mistretta,0,8.513930732522766)
(shh,1,10.45984088157808)
(sammee,2,10.45984088157808)
(bone,3,5.250354728736659)
(gerven,4,10.45984088157808)
(nothin,5,9.36122859290997)
(rostom,6,8.061945608779709)
(mislabel,7,9.36122859290997)
(fred,8,3.370597726550566)
(bresac,9,10.45984088157808)


[36mvocabIdfs[39m: [32mRDD[39m[([32mString[39m, [32mLong[39m, [32mDouble[39m)] = MapPartitionsRDD[129] at map at cmd45.sc:7

In [46]:
import org.apache.spark.rdd.RDD
import $ivy.`org.apache.spark::spark-mllib:2.4.0`
import org.apache.spark.mllib.linalg.{Vector, Vectors}

[32mimport [39m[36morg.apache.spark.rdd.RDD[39m
[32mimport [39m[36m$ivy.$[39m
[32mimport [39m[36morg.apache.spark.mllib.linalg.{Vector, Vectors}[39m

In [47]:
def tokenListToTfIdf(tokenList: List[String], vocabulary: RDD[(String, Long, Double)]): List[Double] = {
    val tfs = spark.sparkContext.parallelize(tokenList)
        .map((_, 1))
        .reduceByKey(_ + _)

    val idfs = vocabulary 
        .map { case (word, id, idf) => (word, (id, idf)) }

    val tfidfs = tfs.rightOuterJoin(idfs)
        .map { case (word, (tfOpt, (id, idf))) => (word, tfOpt.getOrElse(0), idf, id) }
        .map { case (word, tf, idf, id) => (id, tf * idf) }
        .sortBy(_._1)
        .collect
        .map(_._2)
        .toList

    tfidfs
}

defined [32mfunction[39m [36mtokenListToTfIdf[39m

In [47]:
// val tokenList = List("apple", "banana", "orange", "apple")
// val vocabulary = spark.sparkContext.parallelize(Seq(("apple", 0, 2), ("banana", 1, 2), ("orange", 2, 1)))

In [47]:
// val vocabIdfs = vocabulary
//     .map { case (word, id, df) => (word, id, dfToIdf(df, 3)) }
// 
// val tf = tokenListToTfIdf(tokenList, vocabIdfs)
// println("test", tf)

In [48]:
val firstEntry = movies.first()

[36mfirstEntry[39m: ([32mLong[39m, [32mString[39m, [32mString[39m, [32mArray[39m[[32mString[39m]) = (
  [32m0L[39m,
  [32m"1901"[39m,
  [32m"Kansas Saloon Smashers"[39m,
  [33mArray[39m(
    [32m"bartender"[39m,
    [32m"work"[39m,
    [32m"saloon"[39m,
    [32m"serve"[39m,
    [32m"drink"[39m,
    [32m"customer"[39m,
    [32m"fill"[39m,
    [32m"stereotypically"[39m,
    [32m"irish"[39m,
    [32m"s"[39m,
    [32m"bucket"[39m,
    [32m"beer"[39m,
    [32m"carrie"[39m,
    [32m"follower"[39m,
    [32m"burst"[39m,
    [32m"inside"[39m,
    [32m"assault"[39m,
    [32m"irish"[39m,
    [32m"pull"[39m,
    [32m"eye"[39m,
    [32m"dump"[39m,
    [32m"beer"[39m,
    [32m"wreck"[39m,
    [32m"smash"[39m,
    [32m"fixture"[39m,
    [32m"mirror"[39m,
    [32m"break"[39m,
    [32m"cash"[39m,
    [32m"register"[39m,
    [32m"bartender"[39m,
    [32m"spray"[39m,
    [32m"seltzer"[39m,
    [32m"s"[39m,
    [32m"pol

In [49]:
vocabIdfs.take(10).foreach(println)

(mistretta,0,8.513930732522766)
(shh,1,10.45984088157808)
(sammee,2,10.45984088157808)
(bone,3,5.250354728736659)
(gerven,4,10.45984088157808)
(nothin,5,9.36122859290997)
(rostom,6,8.061945608779709)
(mislabel,7,9.36122859290997)
(fred,8,3.370597726550566)
(bresac,9,10.45984088157808)


In [50]:
val firstEntryTfIdf = tokenListToTfIdf(firstEntry._4.toList, vocabIdfs)
println(firstEntryTfIdf)

List(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

[36mfirstEntryTfIdf[39m: [32mList[39m[[32mDouble[39m] = [33mList[39m(
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
  [32m0.0[39m,
...

In [51]:
println(firstEntryTfIdf.sum)

136.8393980256314


In [52]:
def countTokens(tokens: List[String]): Map[String, Int] = {
    tokens.groupBy(identity).mapValues(_.size)
}

defined [32mfunction[39m [36mcountTokens[39m

In [53]:
val tokenCounts = movies
    .flatMap { case (id, title, genres, plotTokens) => {
        val tfs = plotTokens
            .groupBy(identity)
            .mapValues(_.size)
            .map { case (token, count) => (token, id, count) }
            .toList
        tfs
    }}

tokenCounts.take(10).foreach(println)

(eye,0,1)
(register,0,1)
(s,0,2)
(wreck,0,1)
(break,0,1)
(serve,0,1)
(burst,0,1)
(everybody,0,1)
(carrie,0,1)
(dump,0,1)


[36mtokenCounts[39m: [32mRDD[39m[([32mString[39m, [32mLong[39m, [32mInt[39m)] = MapPartitionsRDD[144] at flatMap at cmd53.sc:2

In [54]:
vocabIdfs.take(10).foreach(println)

(mistretta,0,8.513930732522766)
(shh,1,10.45984088157808)
(sammee,2,10.45984088157808)
(bone,3,5.250354728736659)
(gerven,4,10.45984088157808)
(nothin,5,9.36122859290997)
(rostom,6,8.061945608779709)
(mislabel,7,9.36122859290997)
(fred,8,3.370597726550566)
(bresac,9,10.45984088157808)


In [64]:
import scala.collection.immutable.Map

[32mimport [39m[36mscala.collection.immutable.Map[39m

In [65]:
def calc_magnitude(v: List[Double]) = {
  math.sqrt(v.map(x => x * x).sum)
}

def calc_cosign(doc_map_1: Map[Long, Double], doc_map_2: Map[Long, Double]) = {
  //This is a wierd way of doing the dot product, but it works.
  //So, for every value in map_1, we get the corresponding value in map_2
  //If the value isn't in map_2, include it as a 0
  //If the value is in map_2 and not map_1, then don't include it
  //Not including a value be the same as putting a zero
  //then, simply sum the list
  val dot_product = doc_map_1
    .map({case (k, v) => (k, (v, doc_map_2.get(k)))})
    .map({
      case (k, (v1, Some(v2))) => (v1 * v2)
      case (k, (v1, None)) => (0)
    }).sum

  val v1 = doc_map_1.map({case (k, v) => v}).toList
  val v2 = doc_map_2.map({case (k, v) => v}).toList

  dot_product / (calc_magnitude(v1) * calc_magnitude(v2))
}


defined [32mfunction[39m [36mcalc_magnitude[39m
defined [32mfunction[39m [36mcalc_cosign[39m

In [62]:
val tfIdfs = tokenCounts
    .map { case (token, docId, tf) => (token, (docId, tf)) }
    .join(vocabIdfs.map { case (word, wordId, idf) => (word, (wordId, idf)) })
    .map { case (token, ((docId, tf), (wordId, idf))) => (docId, (wordId, tf * idf)) }
    .groupByKey
    .mapValues(_.toMap)

tfIdfs.take(10).foreach(println)

(19021,Map(42181 -> 4.038218613771562, 84168 -> 1.878359200279481, 110851 -> 40.78627894292018, 92548 -> 6.103132054888489, 108663 -> 4.051312090518582, 89888 -> 2.3993008350394405, 13110 -> 3.0653477743590423, 105116 -> 0.6733928911433047, 84915 -> 3.328142371111169, 4739 -> 4.37990768648249, 54027 -> 3.5641581838302123, 32823 -> 4.286054777676143, 75016 -> 3.3107092830206737, 25639 -> 1.4977054915114785, 68266 -> 2.2426722858120103, 44880 -> 5.71490875321483, 99973 -> 2.1842098270000703, 11913 -> 2.7465029927062097, 77305 -> 5.107982748102014, 23204 -> 3.8371045576282405, 109115 -> 0.7377553087765231, 10982 -> 9.552522228478797, 12924 -> 3.4131936037293245, 84393 -> 3.2828221156681803, 32484 -> 2.1345348518254985, 15505 -> 4.448573707173918, 24006 -> 5.329942166655006, 79331 -> 3.1972122806038388, 55608 -> 3.123555221556783, 56794 -> 2.138176074443079, 14291 -> 4.119481577850328, 1836 -> 5.161098792991619, 89564 -> 4.793414193465648, 111124 -> 2.5981138037540994, 650 -> 7.11936888382

[36mtfIdfs[39m: [32mRDD[39m[([32mLong[39m, [32mMap[39m[[32mLong[39m, [32mDouble[39m])] = MapPartitionsRDD[178] at mapValues at cmd62.sc:6

In [66]:
def cosineBetweenDocs(doc1: Int, doc2: Int, tfIdfs: RDD[(Long, Map[Long, Double])]): Double = {
    val doc1TfIdfs = tfIdfs.filter(_._1 == doc1).collect.head._2
    val doc2TfIdfs = tfIdfs.filter(_._1 == doc2).collect.head._2

    calc_cosign(doc1TfIdfs, doc2TfIdfs)
}

defined [32mfunction[39m [36mcosineBetweenDocs[39m

In [67]:
cosineBetweenDocs(0, 1, tfIdfs)

[36mres67[39m: [32mDouble[39m = [32m0.01807862983103303[39m

In [68]:
cosineBetweenDocs(4, 4, tfIdfs)