<a href="https://colab.research.google.com/github/sirishaallarapu/PySpark/blob/main/Spark_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ColabSpark").getOrCreate()
print(spark)


<pyspark.sql.session.SparkSession object at 0x7a34fae94610>


In [None]:
from pyspark.sql import Row

# One (id, name, age) tuple per person; each is converted into a Row so that
# createDataFrame can take the column names from the Row fields.
people = [
    (1, "Nagendra", 27),
    (2, "Sirisha", 23),
    (3, "Lishi", 5),
]
data = [
    Row(id=person_id, name=person_name, age=person_age)
    for person_id, person_name, person_age in people
]

df = spark.createDataFrame(data)
df.show()


+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Nagendra| 27|
|  2| Sirisha| 23|
|  3|   Lishi|  5|
+---+--------+---+



In [None]:
# Projection: keep only the name and age columns.
name_and_age = df.select("name", "age")
name_and_age.show()

# Row filter: keep only the rows whose age is strictly greater than 22.
older_than_22 = df.filter(df["age"] > 22)
older_than_22.show()


+--------+---+
|    name|age|
+--------+---+
|Nagendra| 27|
| Sirisha| 23|
|   Lishi|  5|
+--------+---+

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Nagendra| 27|
|  2| Sirisha| 23|
+---+--------+---+



In [None]:
# Count how many rows share each distinct age value.
rows_per_age = df.groupBy("age").count()
rows_per_age.show()


+---+-----+
|age|count|
+---+-----+
| 27|    1|
|  5|    1|
| 23|    1|
+---+-----+



In [None]:
# Classic word count over three short lines of text.
lines = ["hello world", "hello PySpark", "hello Spark"]
rdd = spark.sparkContext.parallelize(lines)

# Break each line on single spaces, flattening the pieces into one RDD of words.
words = rdd.flatMap(lambda text: text.split(" "))

# Emit (word, 1) for every occurrence, then add up the ones for each word.
word_pairs = words.map(lambda token: (token, 1))
word_counts = word_pairs.reduceByKey(lambda left, right: left + right)
word_counts.collect()


[('hello', 3), ('world', 1), ('PySpark', 1), ('Spark', 1)]

In [None]:
sc = spark.sparkContext
# defaultParallelism is the number of partitions Spark uses by default when the
# caller does not ask for a specific count; it is not the number of executors,
# so the label says what the value actually is.
print("Default Parallelism:", sc.defaultParallelism)



Number of Executors: 2


In [None]:
# Distribute five integers across exactly two partitions.
numbers = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(numbers, numSlices=2)

# map() is lazy; nothing runs until collect() asks for the results.
squared_rdd = rdd.map(lambda value: value * value)

squares = squared_rdd.collect()
print(squares)


[1, 4, 9, 16, 25]


In [None]:
# Show the master URL the SparkContext is connected to.
master_url = sc.master
print(master_url)



local[*]


In [None]:
# cache() only marks the DataFrame for caching and returns it; running count()
# on the result triggers a job so the data is actually computed.
df.cache().count()

cached_flag = df.is_cached
print("Is Cached?", cached_flag)


Is Cached? True


In [None]:
from pyspark.sql.functions import col

# Student records as (id, name, subject) tuples, with the column names given
# separately. Note that `data` and `df` from the earlier cells are rebound here.
column_names = ["id", "name", "subject"]
data = [
    (1, "Alice", "Math"),
    (2, "Bob", "Science"),
    (3, "Charlie", "Math"),
    (4, "David", "Science"),
]
df = spark.createDataFrame(data, column_names)

# Redistribute the rows over two partitions before aggregating.
df_repartitioned = df.repartition(2)

# Count the students enrolled in each subject.
students_per_subject = df_repartitioned.groupBy("subject").count()
students_per_subject.show()



+-------+-----+
|subject|count|
+-------+-----+
|Science|    2|
|   Math|    2|
+-------+-----+



In [None]:
# Report how many partitions back `rdd` (bound by an earlier cell).
num_partitions = rdd.getNumPartitions()
print("Number of Partitions:", num_partitions)

def print_partition(index, iterator):
    """Pair a partition's label with the elements it contains.

    Args:
        index: Partition index; rendered via ``str`` into the label.
        iterator: Iterator over that partition's elements; fully consumed.

    Returns:
        A one-element list holding ``("Partition: <index>", [elements...])``.
    """
    label = f"Partition: {index!s}"
    elements = [element for element in iterator]
    return [(label, elements)]

# Run print_partition once per partition (it receives the partition index and an
# iterator over its elements), then bring the labelled chunks back to the driver.
partition_rdd = rdd.mapPartitionsWithIndex(print_partition)
partition_layout = partition_rdd.collect()
print(partition_layout)


Number of Partitions: 2
[('Partition: 0', [1, 2]), ('Partition: 1', [3, 4, 5])]
