In [207]:
# import OS package
import os

In [208]:
# Set the environment variables PySpark needs before a session is created.
# Raw strings keep the Windows backslashes literal; in a normal string,
# sequences such as "\P", "\J" and "\s" are invalid escapes and trigger
# warnings on recent Python versions.
os.environ['JAVA_HOME'] = r"C:\Program Files\Java\jdk-22"
os.environ['SPARK_HOME'] = r"E:\spark-3.5.1-bin-hadoop3"
os.environ['PYSPARK_DRIVER_PYTHON'] = "jupyter"
# The variable Spark's launcher reads is ..._OPTS; the previous "_OPS" name was a typo.
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = "notebook"
os.environ['PYSPARK_PYTHON'] = "python"

In [209]:
#Import Pyspark 
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

In [210]:
# Build (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("PySpark-App-Started")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    # Raw string: "\j" and "\p" are invalid escape sequences in a normal string.
    # This jar supplies the PostgreSQL JDBC driver used by the JDBC read below.
    .config("spark.jars", r"E:\jdbc-driver\postgresql-42.7.3.jar")
    .getOrCreate()
)
# .config("spark.driver.extraClassPath", r"E:\jdbc-driver\*")

# Keep a handle to the underlying SparkContext for the RDD examples,
# and display it as the cell output.
sc = spark.sparkContext
sc

In [211]:
# Column names and sample person records used to build a DataFrame
columns = ["id", "name", "age", "gender"]
data = [
    (1, "James", 30, "M"),
    (2, "Ann", 40, "F"),
    (3, "Jeff", 41, "M"),
    (4, "Jennifer", 20, "F"),
]

In [212]:
# Distribute the sample records as an RDD, then label the fields with `columns`
sample_rdd = spark.sparkContext.parallelize(data)
sampleDF = sample_rdd.toDF(columns)

In [213]:
# Display the contents of sampleDF as a text table
sampleDF.show()

+---+--------+---+------+
| id|    name|age|gender|
+---+--------+---+------+
|  1|   James| 30|     M|
|  2|     Ann| 40|     F|
|  3|    Jeff| 41|     M|
|  4|Jennifer| 20|     F|
+---+--------+---+------+



In [214]:
# Smoke test of the setup: build a small DataFrame directly from Python tuples
data1 = [
    ("Alice", 25),
    ("Bob", 36),
    ("Charlee", 80),
]
df1 = spark.createDataFrame(data1, schema=["Name", "Age"])
df1.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 36|
|Charlee| 80|
+-------+---+



### Database Connection to PostgreSQL Using PySpark

In [215]:
# Read the `employee` table from the local PostgreSQL database `mldb` over JDBC.
jdbc_url = 'jdbc:postgresql://localhost:5432/mldb'

# Credentials can be supplied through the PGUSER / PGPASSWORD environment
# variables so they need not be edited into the notebook; the fallbacks keep
# the previous local-development values.
props = {
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'postgres'),
    'driver': 'org.postgresql.Driver',
}
sql_df = spark.read.jdbc(url=jdbc_url, table='employee', properties=props)

In [216]:
# Display the rows read from the employee table
sql_df.show()

+---+-----------------+---------------+---+--------------------+--------+
| id|             name|        company|age|             address|  salary|
+---+-----------------+---------------+---+--------------------+--------+
|101|    Krishna Rajan|      Accenture| 35|#Banglaore sasadr...|120000.0|
|102|     Surendra Rai|            IBM| 40|#Banglaore, singa...|180000.0|
|103|       Goopy Nath|Mphasis Limited| 35|#Banglaore, RT Na...| 90000.0|
|104|     Manish Thapa|Mantree Limited| 35|#Banglaore Bamana...|102000.0|
|105|Bina Kumar jhakri|            IBM| 35|#Banglaore HSR La...|150000.0|
|106|   Roshan Chetrry|       Indexcel| 35|#Banglaore, Korma...|110000.0|
+---+-----------------+---------------+---+--------------------+--------+



RDDs (Resilient Distributed Datasets)

- Backbone of data processing in Spark
- Distributed, fault-tolerant, and parallelizable data structure
- Efficiently processes large datasets across a cluster
- Key characteristics: immutable, distributed, resilient, lazily evaluated, with fault-tolerant operations (map, filter, reduce, collect, count, save)

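Transformations and actions: transformations (e.g. map, filter) lazily describe a new RDD, while actions (e.g. collect, count) trigger the computation and return a result.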




In [217]:
# A plain Python list of integers
list_number = [2, 5, 6, 7, 8, 9]

# Distribute the list as an RDD through the SparkContext
rdd = sc.parallelize(list_number)

In [218]:
# collect() action: returns every element of the RDD as a Python list
rdd.collect()

[2, 5, 6, 7, 8, 9]

In [219]:
# (name, age) tuples that will back the rdd1 examples
list_data = [
    ("Alice", 26),
    ("Nishant", 30),
    ("Minika", 35),
    ("Abrain", 25),
    ("Bishnu", 40),
    ("surendra", 42),
    ("tripty", 24),
]

In [220]:
# Echo the raw Python list for inspection
list_data

[('Alice', 26),
 ('Nishant', 30),
 ('Minika', 35),
 ('Abrain', 25),
 ('Bishnu', 40),
 ('surendra', 42),
 ('tripty', 24)]

In [221]:
# Distribute the list of (name, age) tuples as an RDD
rdd1 = sc.parallelize(list_data)

In [222]:
# collect() action: fetch every element of rdd1, then print them
rdd1_elements = rdd1.collect()
print("All the Elment of RDD: ", rdd1_elements)

All the Elment of RDD:  [('Alice', 26), ('Nishant', 30), ('Minika', 35), ('Abrain', 25), ('Bishnu', 40), ('surendra', 42), ('tripty', 24)]


### RDD Operations and Actions for Data Computation in a Distributed System

In [223]:
# count() action: returns the number of elements in the RDD
count = rdd1.count()
print("Total Number RDD Elment in rdd1 Object:", count)

Total Number RDD Elment in rdd1 Object: 7


In [224]:
# first() action: retrieves the first element of the RDD
first_elm = rdd1.first()
print("Fist Elemtnt of rdd1 is :", first_elm)

Fist Elemtnt of rdd1 is : ('Alice', 26)


In [225]:
# take(n) action: retrieves the first n elements of the RDD as a list.
# Issue the three calls in order (n = 2, 4, 5) and unpack the results.
two_elm, four_elm, five_elm = (rdd1.take(n) for n in (2, 4, 5))

print("two Elemtnt of rdd1 is :", two_elm)
print("four Elemtnt of rdd1 is :", four_elm)
print("fize Elemtnt of rdd1 is :", five_elm)


two Elemtnt of rdd1 is : [('Alice', 26), ('Nishant', 30)]
four Elemtnt of rdd1 is : [('Alice', 26), ('Nishant', 30), ('Minika', 35), ('Abrain', 25)]
fize Elemtnt of rdd1 is : [('Alice', 26), ('Nishant', 30), ('Minika', 35), ('Abrain', 25), ('Bishnu', 40)]


In [226]:
def show_rdd(item):
    """Print a single RDD element."""
    print(item)


# foreach() is an action that applies the function to every element inside the
# Spark worker processes. Anything printed there goes to the workers' stdout,
# not to this notebook, which is why no output appeared under this cell.
rdd1.foreach(show_rdd)

# To see the elements in the notebook, bring them back to the driver and
# print them here. toLocalIterator() streams the elements to the driver.
for item in rdd1.toLocalIterator():
    show_rdd(item)


### RDD Operations: Transformations Using Spark Methods (map, filter, reduceByKey, sortBy)

In [227]:
# map() transformation: upper-case each name, leaving the age untouched
map_rdd = rdd1.map(lambda person: (person[0].upper(), person[1]))
result = map_rdd.collect()
print("All the Name Upper Case: ", result)

# Same shape of transformation, but title-casing the name instead
map_rdd1 = rdd1.map(lambda person: (person[0].title(), person[1]))
result_title = map_rdd1.collect()
print("All the Name Title Case: ", result_title)


All the Name Upper Case:  [('ALICE', 26), ('NISHANT', 30), ('MINIKA', 35), ('ABRAIN', 25), ('BISHNU', 40), ('SURENDRA', 42), ('TRIPTY', 24)]
All the Name Title Case:  [('Alice', 26), ('Nishant', 30), ('Minika', 35), ('Abrain', 25), ('Bishnu', 40), ('Surendra', 42), ('Tripty', 24)]


In [228]:
# filter() transformation: keep only the records whose age is greater than 30
filter_rdd = rdd1.filter(lambda person: person[1] > 30)
filter_rdd.collect()

[('Minika', 35), ('Bishnu', 40), ('surendra', 42)]

In [229]:
# reduceByKey() transformation: sum the ages that share the same name (key).
# Every name in rdd1 occurs once, so each key keeps its single age.
redurce_rdd = rdd1.reduceByKey(lambda total, age: total + age)
redurce_rdd.collect()

[('tripty', 24),
 ('Bishnu', 40),
 ('surendra', 42),
 ('Minika', 35),
 ('Abrain', 25),
 ('Nishant', 30),
 ('Alice', 26)]

In [230]:
# sortBy() transformation: order the records by age, largest first
sortby_rdd = rdd1.sortBy(lambda person: person[1], ascending=False)
sortby_rdd.collect()

[('surendra', 42),
 ('Bishnu', 40),
 ('Minika', 35),
 ('Nishant', 30),
 ('Alice', 26),
 ('Abrain', 25),
 ('tripty', 24)]

In [231]:
# flatMap() transformation: split every phrase on spaces and flatten the
# resulting word lists into a single RDD of words
words_rdd = sc.parallelize([
    "scala",
    "java",
    "javaScript",
    "HTML",
    "Python",
    "go languge",
    "hadoop",
    "spark",
    "akka",
    "spark vs hadoop",
    "pyspark",
    "pyspark and spark",
])
words_rdd.flatMap(lambda phrase: phrase.split(" ")).collect()

['scala',
 'java',
 'javaScript',
 'HTML',
 'Python',
 'go',
 'languge',
 'hadoop',
 'spark',
 'akka',
 'spark',
 'vs',
 'hadoop',
 'pyspark',
 'pyspark',
 'and',
 'spark']

In [232]:
# getNumPartitions() — reports how many partitions words_rdd currently has.
# (.repartition(n) would redistribute the RDD into n partitions; it is not called in this cell.)
print("Number of Partitions Before: ",words_rdd.getNumPartitions())

Number of Partitions Before:  8


In [233]:
# Show the original phrases held by words_rdd
words_rdd.collect()

['scala',
 'java',
 'javaScript',
 'HTML',
 'Python',
 'go languge',
 'hadoop',
 'spark',
 'akka',
 'spark vs hadoop',
 'pyspark',
 'pyspark and spark']

In [234]:
# Build an RDD from a flat list that mixes strings and integers, then collect it
my_rdd = spark.sparkContext.parallelize(["chandu",1,"rohith",2,"minu",3,"karthik",4])
my_rdd.collect()

['chandu', 1, 'rohith', 2, 'minu', 3, 'karthik', 4]

In [235]:
# take(4): the first four elements of my_rdd
my_rdd.take(4)

['chandu', 1, 'rohith', 2]

### Reading and Writing an RDD to a Text File

In [236]:
# saveAsTextFile(): persist rdd1 as text so the next cell has something to read.
# Writing to a path that already exists fails, so only write when the output is
# missing. This keeps the notebook runnable both on a fresh checkout and on re-runs.
# NOTE(review): the existence check assumes the relative path resolves on the
# local filesystem (local-mode Spark) — confirm if a different default FS is used.
if not os.path.exists('data/rdd1.txt'):
    rdd1.saveAsTextFile('data/rdd1.txt')

In [237]:
# textFile(): read the saved text back as an RDD of lines.
# Each element comes back as a plain string (e.g. "('Alice', 26)"), not a tuple.
text_file_rdd  = sc.textFile('data/rdd1.txt')
text_file_rdd.collect()

["('Alice', 26)",
 "('Nishant', 30)",
 "('Minika', 35)",
 "('Abrain', 25)",
 "('Bishnu', 40)",
 "('surendra', 42)",
 "('tripty', 24)"]

In [238]:
# getNumPartitions() — returns the number of partitions of the RDD
text_file_rdd.getNumPartitions()

7

In [239]:
# Load the Amazon sales report CSV into a DataFrame: the first row supplies the
# column names and column types are inferred from the data
csv_reader = spark.read.options(header=True, inferSchema=True)
df_pyspark = csv_reader.csv("data/Amazon_Sale_Report.csv")

In [240]:
# Preview the loaded sales report as a text table
df_pyspark.show()

+-----+-------------------+--------+--------------------+----------+--------------+------------------+--------+-------------------+-------------+----+----------+--------------+---+--------+------+-----------+--------------+----------------+------------+--------------------+-----+------------+-----------+
|index|           Order ID|    Date|              Status|Fulfilment|Sales Channel |ship-service-level|   Style|                SKU|     Category|Size|      ASIN|Courier Status|Qty|currency|Amount|  ship-city|    ship-state|ship-postal-code|ship-country|       promotion-ids|  B2B|fulfilled-by|Unnamed: 22|
+-----+-------------------+--------+--------------------+----------+--------------+------------------+--------+-------------------+-------------+----+----------+--------------+---+--------+------+-----------+--------------+----------------+------------+--------------------+-----+------------+-----------+
|    0|405-8078784-5731545|04-30-22|           Cancelled|  Merchant|     Amazon.in

In [241]:
# Print the schema: every column with its data type and nullability
df_pyspark.printSchema()

root
 |-- index: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Fulfilment: string (nullable = true)
 |-- Sales Channel : string (nullable = true)
 |-- ship-service-level: string (nullable = true)
 |-- Style: string (nullable = true)
 |-- SKU: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- ASIN: string (nullable = true)
 |-- Courier Status: string (nullable = true)
 |-- Qty: integer (nullable = true)
 |-- currency: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- ship-city: string (nullable = true)
 |-- ship-state: string (nullable = true)
 |-- ship-postal-code: double (nullable = true)
 |-- ship-country: string (nullable = true)
 |-- promotion-ids: string (nullable = true)
 |-- B2B: boolean (nullable = true)
 |-- fulfilled-by: string (nullable = true)
 |-- Unnamed: 22: boolean (nullable = true)



In [242]:
# count(): the number of rows in the DataFrame
df_pyspark.count()

128975

In [243]:
print(df_pyspark.columns)   # lists all columns in DataFrame
print("Column No:",len(df_pyspark.columns))   # number of columns in the DataFrame

['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ', 'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN', 'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids', 'B2B', 'fulfilled-by', 'Unnamed: 22']
Column No: 24


In [244]:
# Check the data types: dtypes is a list of (column name, type name) pairs
print(df_pyspark.dtypes)

[('index', 'int'), ('Order ID', 'string'), ('Date', 'string'), ('Status', 'string'), ('Fulfilment', 'string'), ('Sales Channel ', 'string'), ('ship-service-level', 'string'), ('Style', 'string'), ('SKU', 'string'), ('Category', 'string'), ('Size', 'string'), ('ASIN', 'string'), ('Courier Status', 'string'), ('Qty', 'int'), ('currency', 'string'), ('Amount', 'double'), ('ship-city', 'string'), ('ship-state', 'string'), ('ship-postal-code', 'double'), ('ship-country', 'string'), ('promotion-ids', 'string'), ('B2B', 'boolean'), ('fulfilled-by', 'string'), ('Unnamed: 22', 'boolean')]


In [245]:
# describe(): summary statistics for the DataFrame's columns (left disabled here)
#df_pyspark.describe().show()

In [246]:
# Project a handful of columns and preview them
df_pyspark.select("Category", "Size", "Qty", "Amount").show()

+-------------+----+---+------+
|     Category|Size|Qty|Amount|
+-------------+----+---+------+
|          Set|   S|  0|647.62|
|        kurta| 3XL|  1| 406.0|
|        kurta|  XL|  1| 329.0|
|Western Dress|   L|  0|753.33|
|          Top| 3XL|  1| 574.0|
|          Set|  XL|  1| 824.0|
|          Set|   L|  1| 653.0|
|        kurta|   S|  1| 399.0|
|          Set| 3XL|  0|  NULL|
|        kurta| XXL|  1| 363.0|
|        kurta|   S|  1| 685.0|
|        kurta|  XS|  1| 364.0|
|        kurta|  XS|  1| 399.0|
|          Set|  XS|  1| 657.0|
|          Set|   L|  1| 771.0|
|        kurta| 6XL|  1| 544.0|
|        kurta| XXL|  1| 329.0|
|        kurta|  XL|  1| 399.0|
|        kurta| XXL|  1| 458.0|
|          Set|  XS|  1| 886.0|
+-------------+----+---+------+
only showing top 20 rows



In [247]:
# Distinct product categories: show them, then report how many there are
distinct_categories = df_pyspark.select("Category").distinct()
distinct_categories.show()
print("Total District product No:", distinct_categories.count())

+-------------+
|     Category|
+-------------+
|        kurta|
| Ethnic Dress|
|        Saree|
|       Blouse|
|          Top|
|       Bottom|
|          Set|
|Western Dress|
|      Dupatta|
+-------------+

Total District product No: 9


## RDD to DataFrame and Vice Versa

In [248]:
# Convert an RDD to a DataFrame with toDF(); with no names supplied the
# columns get the default names _1 and _2
rdd_df = rdd1.toDF()
rdd_df.show()

+--------+---+
|      _1| _2|
+--------+---+
|   Alice| 26|
| Nishant| 30|
|  Minika| 35|
|  Abrain| 25|
|  Bishnu| 40|
|surendra| 42|
|  tripty| 24|
+--------+---+



In [249]:
# Convert an RDD to a DataFrame with createDataFrame(), then rename the
# default columns to "Name" and "Age"
unnamed_df = spark.createDataFrame(rdd1)
rdd_df2 = unnamed_df.toDF("Name", "Age")
rdd_df2.show()

+--------+---+
|    Name|Age|
+--------+---+
|   Alice| 26|
| Nishant| 30|
|  Minika| 35|
|  Abrain| 25|
|  Bishnu| 40|
|surendra| 42|
|  tripty| 24|
+--------+---+



In [250]:
# Convert a DataFrame back to an RDD through its .rdd attribute;
# each element of the resulting RDD is a Row
rdd_from_df = rdd_df2.rdd 

In [251]:
# Collect the Row objects of the converted RDD
rdd_from_df.collect()

[Row(Name='Alice', Age=26),
 Row(Name='Nishant', Age=30),
 Row(Name='Minika', Age=35),
 Row(Name='Abrain', Age=25),
 Row(Name='Bishnu', Age=40),
 Row(Name='surendra', Age=42),
 Row(Name='tripty', Age=24)]

In [252]:
# Stop the Spark session when finished (left disabled here)
# spark.stop()

#### Reference
https://sparkbyexamples.com/pyspark-rdd/

https://www.youtube.com/watch?v=EB8lfdxpirM

https://pub.towardsai.net/pyspark-for-beginners-part-1-introduction-638fb16c5092

https://medium.com/codex/pyspark-for-beginners-part-4-pyspark-rdd-7b5587347b4c

https://medium.com/codex/pyspark-for-begineers-part-2-pyspark-dataframe-60008da53e30

https://blog.devgenius.io/pyspark-for-begineers-part-3-pyspark-dataframe-db02f0fcd275