'''



    @Author: Shivraj Yelave
    @Date: 02-09-24
    @Last modified by: Shivraj Yelave
    @Last modified time: 
    @Title: Wordcount using pyspark (sparkContext)



'''

In [19]:
# Stop the SparkContext currently bound to `sc` before a new one is created
# further down in this file.
# NOTE(review): `sc` is not defined anywhere above this line in this file --
# presumably it is left over from an earlier notebook cell; confirm before
# running this as a standalone script, where this line would raise NameError.
sc.stop()


In [20]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.sql.functions import col

# Word count over the 'text' column of a JSON file, using the RDD API.
#
# Steps: create a SparkContext, read the JSON file into a DataFrame, keep the
# non-null 'text' values, split them into lowercase words, count each word
# with reduceByKey, and print the counts sorted alphabetically.

# Initialize SparkConf and SparkContext
conf = SparkConf().setAppName("WordCount")
sc = SparkContext(conf=conf)

# Everything that needs the context runs inside try/finally so the context is
# always stopped, even when reading or processing fails. Without this, an
# exception would leave the context running and a later
# SparkContext(conf=conf) call would be rejected because one already exists.
try:
    # NOTE(review): SQLContext is kept because it is what this file imports;
    # newer PySpark releases mark it deprecated in favour of SparkSession.
    sqlContext = SQLContext(sc)

    # Path to your JSON file
    json_file_path = "file:///C:/Users/Admin/Documents/pyspark/PYSPARK/text.json"

    # Read JSON file into DataFrame using SQLContext
    df = sqlContext.read.json(json_file_path)

    # Keep only the 'text' column and drop rows where it is null, so the
    # split below never receives None
    df = df.select(col('text')).na.drop()

    # Show the DataFrame to understand its structure
    df.show(truncate=False)

    # Split each 'text' value into words and flatten into one RDD of words.
    # split() with no argument splits on any run of whitespace and discards
    # leading/trailing whitespace, so repeated spaces, tabs or newlines do
    # not produce empty-string "words" (split(" ") would count those).
    words_rdd = df.rdd.flatMap(lambda row: row['text'].split())

    # Convert words to lowercase so "Spark" and "spark" are counted together
    words_rdd = words_rdd.map(lambda word: word.lower())

    # Map each word to a (word, 1) pair
    word_pairs_rdd = words_rdd.map(lambda word: (word, 1))

    # Reduce by key to sum the 1s, giving one (word, count) pair per word
    word_count_rdd = word_pairs_rdd.reduceByKey(lambda a, b: a + b)

    # Bring the (word, count) pairs back to the driver as a list
    word_counts = word_count_rdd.collect()

    # Print the results, sorted by word for better readability
    for word, count in sorted(word_counts):
        print(f"{word}: {count}")
finally:
    # Stop SparkContext
    sc.stop()


+----------------------------------+
|text                              |
+----------------------------------+
|Spark is amazing                  |
|PySpark makes data processing easy|
|I love Spark                      |
+----------------------------------+

amazing: 1
data: 1
easy: 1
i: 1
is: 1
love: 1
makes: 1
processing: 1
pyspark: 1
spark: 2
