In [None]:
!pip install pyspark



In [None]:
import pyspark
# Record the installed PySpark version in the cell output for this run.
print(pyspark.__version__)

3.5.4


In [12]:
from pyspark.sql import SparkSession

# Build the "WordCount" session, or reuse one that already exists in this kernel.
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)

# Handle to the session's SparkContext, used by the RDD-based cells below.
sc = spark.sparkContext


In [13]:
# Get SparkContext (repeats the assignment already made in the session-creation
# cell above; it only re-binds sc to the session's context).
sc = spark.sparkContext

In [14]:
from google.colab import drive
# Mount Google Drive at /content/drive so the input file can be read through a
# regular filesystem path. NOTE: google.colab is Colab-specific, so this cell is
# not expected to run in other Jupyter environments.
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Sanity check: list the input README on the mounted Drive so a wrong path is
# visible here, before the file is handed to Spark.
!ls /content/drive/MyDrive/spark-1/README.md

/content/drive/MyDrive/spark-1/README.md


In [16]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

In [18]:
# Absolute path of the input file on the mounted Drive (Colab-specific location).
file_path = "/content/drive/MyDrive/spark-1/README.md"
# RDD over the file's lines; the word count below is built on this RDD.
rdd = sc.textFile(file_path)

In [19]:
# Read file into RDD
# NOTE(review): this creates a second RDD over the same path as `rdd`; no later
# cell visible here references text_rdd, so it looks redundant. Confirm before removing.
text_rdd = spark.sparkContext.textFile(file_path)

In [20]:
# Time the whole word-count job inside ONE cell. Starting and stopping the clock
# in separate cells also counts the idle time between running those cells, and
# time.perf_counter() is the clock meant for measuring elapsed intervals
# (time.time() is wall-clock time and can jump if the system clock is adjusted).
start_time = time.perf_counter()

# Perform word count. These transformations only describe the job; the work
# runs when collect() is called, so the timer has to span both steps.
word_counts = (
    rdd.flatMap(lambda line: line.split())  # Split each line into whitespace-separated words
       .map(lambda word: (word, 1))         # Pair every word with an initial count of 1
       .reduceByKey(lambda a, b: a + b)     # Sum the counts per distinct word
)

# Collect results: bring every (word, count) pair back to the driver as a list
word_counts_collected = word_counts.collect()

# Stop the timer; execution_time is the elapsed time in seconds
end_time = time.perf_counter()
execution_time = end_time - start_time

In [24]:
# Convert the collected (word, count) pairs into a dict keyed by word, so the
# analysis cells below can look counts up directly.
word_counts_dict = dict(word_counts_collected)

In [25]:
# Get Hadoop count: occurrences of the exact, case-sensitive token "Hadoop".
# Words come from whitespace splitting, so variants such as "Hadoop," or
# "hadoop" are separate keys and are not included. Falls back to 0 when the
# token never appears.
hadoop_count = word_counts_dict.get("Hadoop", 0)

In [26]:
# Find the most common word: take the (word, count) pair with the largest count.
# On ties, max() keeps the first such pair in dict iteration order.
most_common_word, most_common_count = max(
    word_counts_dict.items(), key=lambda pair: pair[1]
)

In [27]:
# Find the least common words.
# The minimum is computed once up front: calling min() inside the comprehension
# would rescan every count for every word (quadratic in the vocabulary size).
least_common_count = min(word_counts_dict.values())
# Several words can share the minimum, so keep all of them (in dict order).
least_common_words = [
    word
    for word, count in word_counts_dict.items()
    if count == least_common_count
]

In [28]:
# Display results: one summary line per metric, written by a single print call
# that joins the lines with newlines.
print(
    f"Execution Time: {execution_time:.2f} seconds",
    f'Hadoop Count: {hadoop_count}',
    f'Most Common Word: "{most_common_word}" appears {most_common_count} times',
    f'Least Common Word(s): {least_common_words}',
    f'Least Common Word Count: {least_common_count}',
    sep="\n",
)


Execution Time: 11.99 seconds
Hadoop Count: 3
Most Common Word: "the" appears 23 times
Least Common Word(s): ['#', 'analytics', 'Scala,', 'optimized', 'analysis.', 'higher-level', 'workloads,', 'MLlib', 'machine', 'learning,', 'GraphX', 'graph', 'processing,', '<https://spark.apache.org/>', '[![AppVeyor', '[![PySpark', '[![PyPI', 'Documentation', 'find', 'latest', '[project', 'web', 'README', 'file', 'basic', 'setup', 'Building', 'built', 'run:', './build/mvn', '-DskipTests', 'clean', 'this', 'available', 'from', 'site,', '["Building', 'Spark"](https://spark.apache.org/docs/latest/building-spark.html).', 'IDE,', '["Useful', 'Developer', 'easiest', 'start', '```scala', 'scala>', 'comes', 'sample', 'directory.', 'them,', '`./bin/run-example', 'will', 'MASTER', 'environment', 'running', 'submit', 'cluster.', 'mesos://', 'spark://', 'YARN,', '"local"', 'thread,', '"local[N]"', 'threads.', 'name', 'instance:', 'MASTER=spark://host:7077', 'usage', 'help', 'no', 'params', 'are', 'Running', 'T

In [None]:
# Show job execution details.
# Ask the running SparkContext for its UI address instead of printing a
# placeholder host: the UI is not guaranteed to be on port 4040 (Spark tries the
# next port when 4040 is already in use), and uiWebUrl is None when the UI is
# disabled.
# NOTE(review): on Colab this address is presumably local to the runtime VM and
# may need a proxy/tunnel to open from a browser. Confirm.
ui_url = spark.sparkContext.uiWebUrl
if ui_url:
    print(f"\nCheck the Spark UI at: {ui_url}/jobs")
else:
    print("\nSpark UI is not enabled for this session")


Check the Spark UI at: http://<your_colab_ip>:4040/jobs
