<a href="https://colab.research.google.com/github/sirishaallarapu/PySpark/blob/main/RDD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Start (or reuse) the Spark session and take its SparkContext for RDD work.
spark = SparkSession.builder.appName("Spark Examples").getOrCreate()
sc = spark.sparkContext

# Sample (name, age) records, distributed as an RDD.
data = [("Nagendra", 28), ("Sirisha", 24)]
rdd = sc.parallelize(data)

# RDD transformation: keep the name and add 5 to the age of every record.
mapped_rdd = rdd.map(lambda person: (person[0], person[1] + 5))
print(mapped_rdd.collect())

# Turn the original RDD into a DataFrame; only column names are supplied,
# so the column types are inferred from the data.
df = spark.createDataFrame(rdd, schema=["Name", "Age"])
df.show()
# Neither sample age exceeds 30, so this prints an empty table.
df.where(df["Age"] > 30).show()

# The same rows again, this time with an explicit schema instead of inference.
schema = StructType([
    StructField(name="Name", dataType=StringType(), nullable=True),
    StructField(name="Age", dataType=IntegerType(), nullable=True),
])

df_with_schema = spark.createDataFrame(data, schema=schema)
df_with_schema.printSchema()
df_with_schema.show()
# Project a single column.
df_with_schema.select(df_with_schema["Name"]).show()



[('Nagendra', 33), ('Sirisha', 29)]
+--------+---+
|    Name|Age|
+--------+---+
|Nagendra| 28|
| Sirisha| 24|
+--------+---+

+----+---+
|Name|Age|
+----+---+
+----+---+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+--------+---+
|    Name|Age|
+--------+---+
|Nagendra| 28|
| Sirisha| 24|
+--------+---+

+--------+
|    Name|
+--------+
|Nagendra|
| Sirisha|
+--------+



In [None]:
# Rebuild the DataFrame from the RDD, naming its two columns.
# RDD.toDF is shorthand for spark.createDataFrame(rdd, schema).
df = rdd.toDF(["Name", "Age"])
df.show()

+--------+---+
|    Name|Age|
+--------+---+
|Nagendra| 28|
| Sirisha| 24|
+--------+---+



In [None]:
# Derive a NewAge column (Age + 5) and display the widened frame;
# `df` itself is left unchanged because withColumn returns a new DataFrame.
age_plus_five = df["Age"] + 5
df.withColumn("NewAge", age_plus_five).show()

+--------+---+------+
|    Name|Age|NewAge|
+--------+---+------+
|Nagendra| 28|    33|
| Sirisha| 24|    29|
+--------+---+------+



In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema: a nullable string Name and a nullable integer Age.
schema = StructType([
    StructField(name="Name", dataType=StringType(), nullable=True),
    StructField(name="Age", dataType=IntegerType(), nullable=True),
])

# Build the DataFrame directly from local rows using that schema.
data = [("Nagendra", 28), ("Sirisha", 32), ("Kiran", 35)]
df = spark.createDataFrame(data, schema)

df.printSchema()
df.show()

# Highest age first.
df.sort(df["Age"].desc()).show()
# Average age over the whole frame (an aggregation with no grouping keys).
df.groupBy().agg({"Age": "avg"}).show()


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)

+--------+---+
|    Name|Age|
+--------+---+
|Nagendra| 28|
| Sirisha| 32|
|   Kiran| 35|
+--------+---+

+--------+---+
|    Name|Age|
+--------+---+
|   Kiran| 35|
| Sirisha| 32|
|Nagendra| 28|
+--------+---+

+------------------+
|          avg(Age)|
+------------------+
|31.666666666666668|
+------------------+



In [None]:
# Register the DataFrame under a name that SQL statements can reference.
df.createOrReplaceTempView("people")

# Express the "older than 30" filter as SQL against the view and print the rows.
result = spark.sql(
    "SELECT Name, Age FROM people WHERE Age > 30"
)
result.show()

+-------+---+
|   Name|Age|
+-------+---+
|Sirisha| 32|
|  Kiran| 35|
+-------+---+



In [None]:
# Write the DataFrame out as JSON. Spark's default save mode raises an error
# when the target path already exists, so re-running this cell (or the whole
# notebook) would fail on the second pass; "overwrite" replaces the previous
# output and keeps the cell re-runnable.
df.write.mode("overwrite").json("people.json")

# Read the data back. No schema is supplied, so it is inferred from the JSON
# content; the column and row order of the result may differ from `df`.
df_json = spark.read.json("people.json")
df_json.show()


+---+--------+
|Age|    Name|
+---+--------+
| 32| Sirisha|
| 35|   Kiran|
| 28|Nagendra|
+---+--------+



In [None]:
# (customer, category, amount) purchase records.
sales_data = [
    ("Nagendra", "Electronics", 200),
    ("Sirisha", "Clothing", 150),
    ("Nagendra", "Groceries", 100),
    ("Sirisha", "Electronics", 300),
    ("Nagendra", "Clothing", 250)
]

# A dedicated name is used instead of rebinding `rdd`: the DataFrame cell
# earlier in the notebook builds its frame from `rdd`, and re-running it after
# this cell would otherwise pick up the sales tuples instead of the people data.
sales_rdd = sc.parallelize(sales_data)

# Reduce each record to (customer, amount), then sum the amounts per customer.
total_sales = (
    sales_rdd
    .map(lambda sale: (sale[0], sale[2]))
    .reduceByKey(lambda left, right: left + right)
)
print("Total sales per customer:", total_sales.collect())


Total sales per customer: [('Nagendra', 550), ('Sirisha', 450)]


In [None]:
# (author, tweet text) records.
tweets_data = [
    ("Nagendra", "Learning #Spark is amazing!"),
    ("Sirisha", "Big data with #Hadoop and #Spark"),
    ("Nagendra", "Exploring #AI and #MachineLearning"),
    ("Sirisha", "Data Science with #Python and #Spark")
]


def extract_hashtags(text):
    """Return the hashtags found in ``text``.

    Words are split on any run of whitespace, so repeated spaces, tabs and
    newlines do not produce empty tokens. Trailing punctuation is stripped so
    that "#Spark!" and "#Spark" are counted as the same tag, and a bare "#"
    with nothing after it is ignored.
    """
    tags = []
    for word in text.split():
        if not word.startswith("#"):
            continue
        tag = word.rstrip(".,!?;:")
        if len(tag) > 1:
            tags.append(tag)
    return tags


# A dedicated name is used instead of rebinding `rdd`: the DataFrame cell
# earlier in the notebook builds its frame from `rdd`, and re-running it after
# this cell would otherwise pick up the tweet tuples instead of the people data.
tweets_rdd = sc.parallelize(tweets_data)

# Pull the hashtags out of every tweet text, then count occurrences per tag.
hashtags = tweets_rdd.flatMap(lambda tweet: extract_hashtags(tweet[1]))
hashtag_counts = hashtags.map(lambda tag: (tag, 1)).reduceByKey(lambda a, b: a + b)

print("Hashtag counts:", hashtag_counts.collect())


Hashtag counts: [('#Spark', 3), ('#Hadoop', 1), ('#AI', 1), ('#MachineLearning', 1), ('#Python', 1)]


In [None]:
# (name, department, salary) records, distributed as an RDD.
employee_records = [
    ("Nagendra", "IT", 70000),
    ("Sirisha", "HR", 65000),
    ("Kiran", "Finance", 80000),
    ("Nagendra", "IT", 75000),
    ("Sirisha", "HR", 70000),
]
employees_rdd = sc.parallelize(employee_records)

# Keep only records whose salary is strictly above 70000
# (a salary of exactly 70000 does not qualify).
high_salary_rdd = employees_rdd.filter(lambda employee: employee[2] > 70000)
print("High Salary Employees:", high_salary_rdd.collect())


High Salary Employees: [('Kiran', 'Finance', 80000), ('Nagendra', 'IT', 75000)]
