# Spark session 22 February _Puranjay Kwatra

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

# Creating a spark session named spark

In [None]:
# Initialise findspark so that pyspark can be imported
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or fetch) a local-mode session on all cores and bind it to `spark`
session_builder = SparkSession.builder.master("local[*]")
spark = session_builder.getOrCreate()
spark

# Reading the data from the text file (country code Data)

In [None]:
# Read the pipe-delimited file; the first line supplies the column names.
# The option must be spelled `inferSchema` exactly: an unrecognised option name
# is silently ignored, which leaves every column typed as string.
df = spark.read.options(delimiter='|', inferSchema=True, header=True).csv("/content/file.csv.txt")
# Print the rows and then the resulting column types
df.show()
df.printSchema()

+------------+------------+----+-------+
|Country Name|Country Code|Year|Value  |
+------------+------------+----+-------+
|       India|        In01|2000|     90|
|         USA|        US03|2001|     18|
|       China|        ch07|1999|     78|
|       Japan|       jap82|2005|     45|
|       Saudi|       sau81|2003|     56|
+------------+------------+----+-------+

root
 |-- Country Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Value  : string (nullable = true)



## Sorting and order

In [None]:
# Project only the country-code and value columns, then print them
code_and_value = df.select("Country Code", "Value  ")
code_and_value.show()

+------------+-------+
|Country Code|Value  |
+------------+-------+
|        In01|     90|
|        US03|     18|
|        ch07|     78|
|       jap82|     45|
|       sau81|     56|
+------------+-------+



### using orderBy

In [None]:
# Show country and value, largest value first.
# At this point in the notebook the frame still carries the original
# "Country Name" column (it is only renamed to "Country" in a later cell), so
# select it by its real name and alias it for display. This keeps the cell
# runnable when the notebook is executed top to bottom on a fresh kernel.
df.select(df['Country Name'].alias('Country'), 'Value  ').orderBy('Value  ', ascending=False).show()

+-------+-------+
|Country|Value  |
+-------+-------+
|  India|     90|
|  China|     78|
|  Saudi|     56|
|  Japan|     45|
|    USA|     18|
+-------+-------+



# Condition for year in 2000 OR 2005

In [None]:
from pyspark.sql.functions import col

# Keep the rows whose Year is either 2000 or 2005
year_matches = col('Year').isin(2000, 2005)
filtered_dataframe = df.filter(year_matches)
filtered_dataframe.show()


+------------+------------+----+-------+
|Country Name|Country Code|Year|Value  |
+------------+------------+----+-------+
|       India|        In01|2000|     90|
|       Japan|       jap82|2005|     45|
+------------+------------+----+-------+



# Condition for year in 2000 and 2005

In [None]:
from pyspark.sql.functions import col

# Both comparisons have to hold for the same row. A single Year value cannot
# equal 2000 and 2005 at once, so no row matches and the result is empty.
is_2000 = col('Year') == 2000
is_2005 = col('Year') == 2005
filtered_dataframe = df.filter(is_2000 & is_2005)
filtered_dataframe.show()

+------------+------------+----+-------+
|Country Name|Country Code|Year|Value  |
+------------+------------+----+-------+
+------------+------------+----+-------+



# Filter in Pyspark

In [None]:
from pyspark.sql.functions import col

# Keep only the rows whose "Country Name" equals 'India'
is_india = col('Country Name') == 'India'
filtered_dataframe = df.filter(is_india)
filtered_dataframe.show()


+------------+------------+----+-------+
|Country Name|Country Code|Year|Value  |
+------------+------------+----+-------+
|       India|        In01|2000|     90|
+------------+------------+----+-------+



# Column renamed function


In [None]:
# Rename "Country Name" to "Country" and rebind `df` to the renamed frame,
# so every later cell sees the new column name
df = df.withColumnRenamed(existing="Country Name", new="Country")
df.show()


+-------+------------+----+-------+
|Country|Country Code|Year|Value  |
+-------+------------+----+-------+
|  India|        In01|2000|     90|
|    USA|        US03|2001|     18|
|  China|        ch07|1999|     78|
|  Japan|       jap82|2005|     45|
|  Saudi|       sau81|2003|     56|
+-------+------------+----+-------+



# Filter, groupBy and aggregation all in one line

In [None]:
# Rows for year 2000 -> grouped by value -> number of values in each group
rows_for_2000 = df.filter(col('Year') == 2000)
new_data_frame = rows_for_2000.groupby("Value  ").agg({'Value  ': 'count'})
new_data_frame.show()

+-------+--------------+
|Value  |count(Value  )|
+-------+--------------+
|     90|             1|
+-------+--------------+



## Changing names of Multiple Columns

In [None]:
# Old name -> new name; any column missing from the mapping keeps its name
rename = {"Country": "Country Name", "Country Code": "Code"}
renamed_columns = [rename.get(name, name) for name in df.columns]
# toDF takes the complete list of column names, in positional order
df.toDF(*renamed_columns).show()

+------------+-----+----+-------+
|Country Name| Code|Year|Value  |
+------------+-----+----+-------+
|       India| In01|2000|     90|
|         USA| US03|2001|     18|
|       China| ch07|1999|     78|
|       Japan|jap82|2005|     45|
|       Saudi|sau81|2003|     56|
+------------+-----+----+-------+



# Partitioning, re-partitioning and shuffling

Shuffling: the Spark SQL shuffle is a mechanism for redistributing (re-partitioning) data so that it is grouped differently across partitions. Depending on your data size, you may need to reduce or increase the number of partitions of an RDD/DataFrame, either through the `spark.sql.shuffle.partitions` configuration or in code.


In [None]:

# Import required modules
from pyspark.sql import SparkSession

# Create (or fetch) a SparkSession. The app name is a plain identifier:
# the stray backslash ("hash\_partitioning") was an invalid Python escape
# sequence and put a literal backslash in the name.
spark = SparkSession.builder.appName("hash_partitioning").getOrCreate()

# Create a small sample DataFrame (note: this rebinds `df`)
df = spark.createDataFrame([
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Charlie", 35),
    (4, "Dave", 40),
    (5, "Eve", 45),
    (6, "Frank", 50)
], ["id", "name", "age"])

# Print the DataFrame
df.show()

# Repartition into 4 partitions keyed on the "id" column
df = df.repartition(4, "id")

# glom() gathers the rows of each partition into a list, so the printed value
# is one list of rows per partition (empty partitions show up as [])
print(df.rdd.glom().collect())

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  3|Charlie| 35|
|  4|   Dave| 40|
|  5|    Eve| 45|
|  6|  Frank| 50|
+---+-------+---+

[[Row(id=2, name='Bob', age=30), Row(id=4, name='Dave', age=40), Row(id=5, name='Eve', age=45)], [Row(id=1, name='Alice', age=25), Row(id=6, name='Frank', age=50)], [], [Row(id=3, name='Charlie', age=35)]]
