In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the session for the partitioning demo, with YARN as master.
spark = (
    SparkSession.builder
    .appName("Partitioning_Demo")
    .master("yarn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/19 03:28:10 INFO SparkEnv: Registering MapOutputTracker
25/01/19 03:28:10 INFO SparkEnv: Registering BlockManagerMaster
25/01/19 03:28:10 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/01/19 03:28:10 INFO SparkEnv: Registering OutputCommitCoordinator


In [3]:
spark

In [8]:
# Four short lines of space-separated names used as sample input.
data = [
    "Goku Vegeta Gohan",
    "Goku Frieza Goku",
    "Vegeta Goku Gohan Frieza",
    "Gohan Frieza Goku Goku",
]

In [9]:
# Distribute the sample lines as an RDD with exactly one partition.
rdd = spark.sparkContext.parallelize(data, numSlices=1)

In [10]:
# Should report 1, the numSlices value passed to parallelize().
rdd.getNumPartitions()

1

In [None]:
# Assuming 128 MB blocks, one partition is created per block:
# 1 GB data   -> 8 blocks -> 8 partitions
# 512 MB data -> 4 blocks -> 4 partitions
# 100 MB data -> 1 block  -> 1 partition

In [4]:
# Input file path (presumably on HDFS, going by the variable name — confirm
# against the cluster's default filesystem).
hdfs_path = "/tmp/input.txt"

# Read the file as an RDD of lines; no minPartitions is passed, so Spark
# decides how many partitions to create.
rdd1 = spark.sparkContext.textFile(name=hdfs_path)

In [5]:
# Partition count Spark chose for the file; textFile() was called without
# minPartitions.
rdd1.getNumPartitions()

2

In [6]:
# Default parallelism: the partition count Spark falls back to when an
# operation such as parallelize() is not given an explicit number.
print(f"Default parallelism: {spark.sparkContext.defaultParallelism}")

Default parallelism: 2


In [13]:
# Redistribute the lines across 200 partitions. Note that rdd1 is rebound to
# the repartitioned RDD, so the original handle is no longer reachable.
rdd1 = rdd1.repartition(numPartitions=200)

In [14]:
# Should now report 200 after the repartition.
rdd1.getNumPartitions()

200

In [15]:
# Stop this session; the next section creates a fresh one after a kernel restart.
spark.stop()

# Restart kernel

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the session for the RDD operations section, with YARN as master.
spark = (
    SparkSession.builder
    .appName("RDD_Operations")
    .master("yarn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/19 04:00:03 INFO SparkEnv: Registering MapOutputTracker
25/01/19 04:00:03 INFO SparkEnv: Registering BlockManagerMaster
25/01/19 04:00:04 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/01/19 04:00:04 INFO SparkEnv: Registering OutputCommitCoordinator


In [7]:
# Sample customer records as raw CSV lines; the first entry is the header row.
customers_data = [
    "customer_id,name,city,state,country,registration_date,is_active",
    "0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True",
    "1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True",
    "2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True",
    "3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False",
    "4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False",
    "5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False",
]


In [8]:
# Distribute the sample lines as an RDD; no numSlices is given, so Spark
# picks the partition count.
data_rdd = spark.sparkContext.parallelize(customers_data)

In [9]:
# collect() is an action: it brings every element back to the driver as a
# Python list. Fine for this seven-line sample.
data_rdd.collect()

                                                                                

['customer_id,name,city,state,country,registration_date,is_active',
 '0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True',
 '1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True',
 '2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True',
 '3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False',
 '4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False',
 '5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False']

In [10]:
# first() -> Action: runs a job and returns the first element, which here is
# the CSV header line.

header = data_rdd.first()
print(header)

customer_id,name,city,state,country,registration_date,is_active


In [11]:
# filter() -> Transformation: keep every line except the header.
# NOTE: data_rdd is rebound to the filtered RDD here, so re-running the
# header cell after this one would pick the first customer line as `header`.

data_rdd = data_rdd.filter(lambda line: line != header)
print(data_rdd.collect())

['0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True', '1,Customer_1,Hyderabad,Delhi,India,2023-08-26,True', '2,Customer_2,Ahmedabad,West Bengal,India,2023-06-23,True', '3,Customer_3,Bangalore,Tamil Nadu,India,2023-03-24,False', '4,Customer_4,Bangalore,Gujarat,India,2023-06-06,False', '5,Customer_5,Delhi,Maharashtra,India,2023-04-19,False']


# Map

In [12]:
# Try the split on a single sample line before writing the parser.
test_data = "0,Customer_0,Bangalore,Karnataka,India,2023-11-11,True"
print(test_data.split(","))

['0', 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-11', 'True']


In [14]:
def parse_data(row):
    """Parse one comma-separated customer line into a typed tuple.

    Args:
        row: A line of the form
            ``customer_id,name,city,state,country,registration_date,is_active``.

    Returns:
        A 7-tuple ``(customer_id, name, city, state, country,
        registration_date, is_active)`` where ``customer_id`` is an ``int``,
        ``is_active`` is a ``bool`` (True only for the literal text ``True``)
        and the remaining values are strings.

    Raises:
        ValueError: If ``customer_id`` is not an integer.
        IndexError: If the row has fewer than seven fields.
    """
    # Strip each field so a trailing newline or padding around the commas
    # cannot silently turn is_active into False.
    fields = [field.strip() for field in row.split(',')]

    return (
        int(fields[0]),
        fields[1],
        fields[2],
        fields[3],
        fields[4],
        fields[5],
        fields[6] == 'True',
    )

In [15]:
# map() is a transformation: parse_data is applied to each row only when an
# action runs.
parsed_rdd = data_rdd.map(parse_data)

In [16]:
# Action: bring the parsed tuples to the driver to check the types
# (int customer_id, bool is_active).
parsed_rdd.collect()

[(0, 'Customer_0', 'Bangalore', 'Karnataka', 'India', '2023-11-11', True),
 (1, 'Customer_1', 'Hyderabad', 'Delhi', 'India', '2023-08-26', True),
 (2, 'Customer_2', 'Ahmedabad', 'West Bengal', 'India', '2023-06-23', True),
 (3, 'Customer_3', 'Bangalore', 'Tamil Nadu', 'India', '2023-03-24', False),
 (4, 'Customer_4', 'Bangalore', 'Gujarat', 'India', '2023-06-06', False),
 (5, 'Customer_5', 'Delhi', 'Maharashtra', 'India', '2023-04-19', False)]

In [17]:
# Project each parsed record down to a (name, city) pair.
name_city_rdd = parsed_rdd.map(lambda record: (record[1], record[2]))
name_city_rdd.collect()

[('Customer_0', 'Bangalore'),
 ('Customer_1', 'Hyderabad'),
 ('Customer_2', 'Ahmedabad'),
 ('Customer_3', 'Bangalore'),
 ('Customer_4', 'Bangalore'),
 ('Customer_5', 'Delhi')]

In [19]:
# Distinct city names; take(3) returns at most three of them to the driver.
cities_rdd = (
    parsed_rdd
    .map(lambda record: record[2])
    .distinct()
)
cities_rdd.take(3)


['Hyderabad', 'Ahmedabad', 'Delhi']

In [20]:
# reduceByKey: emit (city, 1) for every customer, then sum the ones per city.
# Despite its name, customers_rdd holds (city, customer_count) pairs.

customers_rdd = (
    parsed_rdd
    .map(lambda record: (record[2], 1))
    .reduceByKey(lambda left, right: left + right)
)
print(customers_rdd.collect())

[('Hyderabad', 1), ('Ahmedabad', 1), ('Delhi', 1), ('Bangalore', 3)]


In [21]:
# Full list of distinct cities (the earlier take(3) call returned only three
# of the four).
cities_rdd.collect()

['Hyderabad', 'Ahmedabad', 'Delhi', 'Bangalore']

In [22]:
# countByValue() is an action: it returns the per-city counts to the driver
# as a dict-like object, with no separate collect() needed.
parsed_rdd.map(lambda record: record[2]).countByValue()

defaultdict(int, {'Bangalore': 3, 'Hyderabad': 1, 'Ahmedabad': 1, 'Delhi': 1})

In [23]:
# No action is called here, so the cell only shows the lazy RDD object rather
# than the per-city counts.
(
    parsed_rdd
    .map(lambda record: (record[2], 1))
    .reduceByKey(lambda left, right: left + right)
)

PythonRDD[30] at RDD at PythonRDD.scala:53

In [None]:
# Find cities with at least one active customer

In [24]:
# row[6] is the is_active flag, already a bool after parse_data, so it is used
# directly as the filter predicate instead of being compared with `== True`.
active_cities = (
    parsed_rdd
    .filter(lambda row: row[6])
    .map(lambda row: row[2])
    .distinct()
)
active_cities.collect()

['Hyderabad', 'Ahmedabad', 'Bangalore']

In [None]:
# Count Customer By State - Homework

In [None]:
# saveAsTextFile

In [25]:
# Stop the session. RDDs created from it can no longer run actions afterwards.
spark.stop()

In [27]:
# The session has already been stopped, so running an action on an RDD that
# belongs to it fails. Catch the error and print it, so this demonstration
# does not abort a Restart & Run All.
try:
    collected = active_cities.collect()
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")
else:
    print(collected)

AttributeError: 'NoneType' object has no attribute 'setCallSite'

In [None]:
# Transformation - Narrow vs Wide