# Scala Spark Wordcount Example (using DataFrame)

In [None]:
// Uses the `spark` session object that the notebook environment already provides.
// Reads the CSV data at the given path with a one-column schema ("col1", string)
// and previews the first 3 rows.
val schema = "col1 String"
val test_data_path = "/user/sxxx/data/test01"

val df_inp = spark.read
  .schema(schema)
  .csv(test_data_path)
df_inp.show(3)

In [None]:
// Build the input path relative to the JVM working directory (the "user.dir" property).
val working_dir = System.getProperty("user.dir")
val input_file = s"$working_dir/../../data/wordcount-input.txt"
// Uncomment to inspect the resolved path:
// print(input_file)

In [None]:
// Load the word-count input file as a DataFrame with a single string column, "col1".
// NOTE(review): the "|" delimiter is presumably chosen so that each whole line lands
// in col1 (i.e. the text contains no "|") — confirm against the input file.
val schema = "col1 string"
val pipe_delimited_reader = spark.read.schema(schema).option("delimiter", "|")
val input_df = pipe_delimited_reader.csv(input_file)
// Preview the first 3 rows.
input_df.show(3)

In [None]:
// These packages are already pre-imported in the kernel; they are listed here for reference
import org.apache.spark.sql.SparkSession
//import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Split each line into an array of tokens on the single-space character.
val df_line_split = input_df.withColumn("split_line", split(input_df("col1"), " "))

// Explode each token array into one row per token and keep only the "word" column.
// Splitting on " " yields empty-string tokens for consecutive, leading, or trailing
// spaces; drop them so that "" is not counted as a word.
val df_word_explode = df_line_split.withColumn("word", explode(df_line_split("split_line")))
                                        .drop("col1", "split_line")
                                        .filter(col("word") =!= "")

df_word_explode.show(2)

// Count the occurrences of each distinct word.
// The constant "dummy" column gives every row the same partition key, so the
// window defined below ranks across the whole result instead of per group.
val df_word_count = df_word_explode.groupBy("word").agg(count("word").alias("cnt"))
                                    .withColumn("dummy", lit("1"))

df_word_count.show(5)

// Window over the single "dummy" partition, ordered by descending count.
val windw = Window.partitionBy("dummy").orderBy(desc("cnt"))

// Attach each word's rank by count, then drop the helper column.
val df_word_rank = df_word_count.withColumn("rank", rank().over(windw)).drop("dummy")

df_word_rank.show(3)

// Show the word(s) whose rank is exactly 5.
// NOTE(review): rank() leaves gaps after ties, so this result can be empty even when
// there are more than five distinct words; dense_rank() would be needed if the intent
// is "words with the 5th highest count" — confirm.
df_word_rank.filter(df_word_rank("rank") === 5).show()