# Scala Spark Wordcount Example (using DataFrame)

In [None]:
// `spark` is not defined in this notebook: it is the session object already
// present in the kernel environment (the `.read` API used below belongs to
// SparkSession, not SparkContext).
// The schema is given as a DDL string: a single string column named col1.
val schema = "col1 String"

// Read the CSV data at the given path with the explicit schema, then preview 3 rows.
val df_inp = spark.read
  .schema(schema)
  .csv("/user/s0m0158/data/test01")
df_inp.show(3)

In [8]:
// Build the input path relative to the JVM working directory ("user.dir").
// The "/../.." segments are kept verbatim; the path is not normalised here.
val input_file = s"${System.getProperty("user.dir")}/../../data/wordcount-input.txt"
// print(input_file)

input_file = /home/jovyan/jupyter/scala/../../data/wordcount-input.txt


/home/jovyan/jupyter/scala/../../data/wordcount-input.txt

In [15]:
// Load the text file as a one-column DataFrame (column "col1", type string).
// The CSV reader is used with "|" as the delimiter instead of the default comma
// (presumably so that commas inside a line do not split it into several fields;
// this relies on "|" not occurring in the input -- TODO confirm).
val schema = "col1 string"
val input_df = spark.read
  .schema(schema)
  .option("delimiter", "|")
  .csv(input_file)

// Preview the first 3 rows.
input_df.show(3)

+--------------------+
|                col1|
+--------------------+
|In 2011, Marc And...|
|For example, imag...|
|Now, let’s take t...|
+--------------------+



schema = col1 string
input_df = [col1: string]


[col1: string]

In [17]:
// All of these packages are pre-imported in the kernel; they are listed here only for reference
import org.apache.spark.sql.SparkSession
//import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Step 1: tokenise -- add an array column holding each line split on a single space.
val df_line_split = input_df.withColumn("split_line", split(col("col1"), " "))

// Step 2: flatten -- emit one row per array element, keeping only the "word" column.
val df_word_explode = df_line_split.select(explode(col("split_line")).alias("word"))

df_word_explode.show(2)

// Step 3: count occurrences per word. The constant "dummy" column gives every
// row the same key, so the window below spans the whole result set.
val df_word_count = df_word_explode
  .groupBy(col("word"))
  .agg(count(col("word")).as("cnt"))
  .withColumn("dummy", lit("1"))

df_word_count.show(5)

// Step 4: a window over the single "dummy" partition, highest count first.
val windw = Window.partitionBy(col("dummy")).orderBy(col("cnt").desc)

// Step 5: rank the words by count (rank() gives tied counts the same rank),
// then drop the helper column.
val df_word_rank = df_word_count
  .withColumn("rank", rank().over(windw))
  .drop("dummy")

df_word_rank.show(3)

// Show every word whose rank is exactly 5.
df_word_rank.where(col("rank") === 5).show()

+-----+
| word|
+-----+
|   In|
|2011,|
+-----+
only showing top 2 rows

+-------+---+-----+
|   word|cnt|dummy|
+-------+---+-----+
| online|  3|    1|
|    few|  1|    1|
|   some|  1|    1|
| upsale|  1|    1|
|world,”|  1|    1|
+-------+---+-----+
only showing top 5 rows

+----+---+----+
|word|cnt|rank|
+----+---+----+
| the| 15|   1|
|   a| 12|   2|
|  of|  8|   3|
+----+---+----+
only showing top 3 rows

+----+---+----+
|word|cnt|rank|
+----+---+----+
| and|  6|   5|
|  we|  6|   5|
+----+---+----+



df_line_split = [col1: string, split_line: array<string>]
df_word_explode = [word: string]
df_word_count = [word: string, cnt: bigint ... 1 more field]
windw = org.apache.spark.sql.expressions.WindowSpec@252a72b
df_word_rank = [word: string, cnt: bigint ... 1 more field]


[word: string, cnt: bigint ... 1 more field]