In [9]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RDD VS DataFrame").getOrCreate()

In [10]:
sc = spark.sparkContext

## RDD vs Data Frame

# RDD Version (Low-level approach)

In [12]:
data = [("John", 2000), ("Mary", 3000)]

rdd = sc.parallelize(data)

result = rdd.map(lambda x: (x[0], x[1] * 10))
print(result.collect())


[('John', 20000), ('Mary', 30000)]


# DataFrame Version (High-level & optimized)

In [13]:
df = spark.createDataFrame(data, ["name", "salary"])

df2 = df.withColumn("salary", df.salary * 10)
df2.show()


+----+------+
|name|salary|
+----+------+
|John| 20000|
|Mary| 30000|
+----+------+



# Filtering Operation

In [14]:
nums = sc.parallelize([1,2,3,4,5,6])
evens = nums.filter(lambda x: x % 2 == 0)
print(evens.collect())


[2, 4, 6]


In [15]:
df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,), (6,)], ["num"])
df.filter(df.num % 2 == 0).show()


+---+
|num|
+---+
|  2|
|  4|
|  6|
+---+



# Word Count (RDD vs DataFrame)

In [16]:
rdd = sc.textFile("README.md")

result = (
    rdd.flatMap(lambda x: x.split(" "))
       .map(lambda x: (x, 1))
       .reduceByKey(lambda a, b: a + b)
)

print(result.collect())


[('#', 3), ('SQL', 2), ('query', 1), ('from', 1), ('excel', 1), ('&amp;', 1), ('##', 1), ('', 40), ('Streamlit-based', 1), ('for', 2), ('using', 1), ('natural', 1), ('language', 1), ('and', 1), ('LLMs.', 1), ('---', 2), ('###', 5), ('‚úÖ', 1), ('Prerequisites', 1), ('-', 3), ('Python', 1), ('`pip`', 1), ('`conda`', 1), ('management', 1), ('Recommended:', 1), ('use', 1), ('environment', 1), ('(`venv`', 1), ('`conda`)', 1), ('üõ†Ô∏è', 1), ('1.', 1), ('**Clone', 1), ('https://github.com/ramoji4b5/GenAI_Usecase_OpenAI', 1), ('python', 1), ('-m', 1), ('venv', 1), ('Mac/Linux', 1), ('.venv\\Scripts\\activate', 1), ('```', 6), ('bash', 1), ('pip', 1), ('install', 1), ('requirements.txt', 1), ('Run', 1), ('streamlit', 1), ('base', 1), ('URL', 1), ('Breakdown:', 1), ('postgres:', 1), ('admin:', 1), ('localhost:', 1), ('host', 1), ('IP', 1), ('domain)', 1), ('name', 1), ('GenAI_Usecase_OpenAI', 1), ('database', 3), ('üöÄ', 1), ('Getting', 1), ('Started', 1), ('This', 1), ('project', 1), ('is',

In [18]:
from pyspark.sql.functions import explode, split

df = spark.read.text("README.md")

df2 = df.select(explode(split(df.value, " ")).alias("word")) \
        .groupBy("word") \
        .count()

df2.show(100)

+--------------------+-----+
|                word|count|
+--------------------+-----+
|                port|    1|
|    requirements.txt|    1|
|          management|    1|
|             ```bash|    2|
|                Data|    1|
|                  -r|    1|
|            password|    1|
|             (`venv`|    1|
|                name|    1|
|https://github.co...|    1|
|                   ‚úÖ|    1|
|              format|    1|
|               using|    1|
|             **Clone|    1|
|                  -m|    1|
|                 for|    2|
|                (can|    1|
|            `conda`)|    1|
|                 URL|    1|
|               query|    1|
|          localhost:|    1|
|           postgres:|    1|
|               excel|    1|
|                  be|    1|
|               .venv|    1|
|            database|    3|
|              admin:|    1|
|          PostgreSQL|    3|
|                your|    3|
|             package|    1|
|  sql_chatbot/app.py|    1|
|         sq