In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Test").getOrCreate()

print(spark.version)


3.5.4


In [1]:
from pyspark import SparkContext

In [3]:
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 21:41:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 50039)
Traceback (most recent call last):
  File "/opt/anaconda3/envs/pyspark_env/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/anaconda3/envs/pyspark_env/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/anaconda3/envs/pyspark_env/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/anaconda3/envs/pyspark_env/lib/python3.9/socketserver.py", line 747, in __init__
    self.handl

In [9]:
%%writefile example.txt
first line
second line
third line
forth line

Writing example.txt


In [11]:
# Create RDD using a textFile method
textFile=sc.textFile('example.txt')

## RDD Actions

In [14]:
textFile.count()

                                                                                

4

In [16]:
textFile.first()

'first line'

## RDD Transformation

In [19]:
# step 1 transformation
secfind=textFile.filter(lambda line:'line' in line)

In [21]:
# step 2 action
secfind.collect()

['first line', 'second line', 'third line', 'forth line']

In [23]:
secfind.count()

4

In [1]:
%%writefile example2.txt
first 
second line
the third line
then a fourth line

Writing example2.txt


In [3]:
from pyspark import SparkContext

In [5]:
sc = SparkContext()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/09 23:35:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Show RDD
sc.textFile('example2.txt')

example2.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [9]:
# Save a reference to this RDD
text_rdd = sc.textFile('example2.txt')

In [13]:
# Map a function (or lambda expression) to each line
# Then collect the results.
text_rdd.map(lambda line: line.split()).collect()

[['first'],
 ['second', 'line'],
 ['the', 'third', 'line'],
 ['then', 'a', 'fourth', 'line']]

In [15]:
# Collect everything as a single flat map
text_rdd.flatMap(lambda line: line.split()).collect()

['first',
 'second',
 'line',
 'the',
 'third',
 'line',
 'then',
 'a',
 'fourth',
 'line']

## RDDs and Key Value Pairs


In [18]:
%%writefile services.txt
#EventId    Timestamp    Customer   State    ServiceID    Amount
201       10/13/2017      100       NY       131          100.00
204       10/18/2017      700       TX       129          450.00
202       10/15/2017      203       CA       121          200.00
206       10/19/2017      202       CA       131          500.00
203       10/17/2017      101       NY       173          750.00
205       10/19/2017      202       TX       121          200.00

Writing services.txt


In [21]:
services = sc.textFile('services.txt')

In [25]:
# First 2 elements of RDD
services.take(2)

['#EventId    Timestamp    Customer   State    ServiceID    Amount',
 '201       10/13/2017      100       NY       131          100.00']

In [27]:
services.map(lambda x: x.split())

PythonRDD[11] at RDD at PythonRDD.scala:53

In [31]:
services.map(lambda x: x.split()).collect()

[['#EventId', 'Timestamp', 'Customer', 'State', 'ServiceID', 'Amount'],
 ['201', '10/13/2017', '100', 'NY', '131', '100.00'],
 ['204', '10/18/2017', '700', 'TX', '129', '450.00'],
 ['202', '10/15/2017', '203', 'CA', '121', '200.00'],
 ['206', '10/19/2017', '202', 'CA', '131', '500.00'],
 ['203', '10/17/2017', '101', 'NY', '173', '750.00'],
 ['205', '10/19/2017', '202', 'TX', '121', '200.00']]

In [29]:
services.map(lambda x: x.split()).take(3)

[['#EventId', 'Timestamp', 'Customer', 'State', 'ServiceID', 'Amount'],
 ['201', '10/13/2017', '100', 'NY', '131', '100.00'],
 ['204', '10/18/2017', '700', 'TX', '129', '450.00']]

In [35]:
# Removing hash tag
services.map(lambda x: x[1:] if x[0]=='#' else x).collect()

['EventId    Timestamp    Customer   State    ServiceID    Amount',
 '201       10/13/2017      100       NY       131          100.00',
 '204       10/18/2017      700       TX       129          450.00',
 '202       10/15/2017      203       CA       121          200.00',
 '206       10/19/2017      202       CA       131          500.00',
 '203       10/17/2017      101       NY       173          750.00',
 '205       10/19/2017      202       TX       121          200.00']

In [37]:
services.map(lambda x: x[1:] if x[0]=='#' else x).map(lambda x: x.split()).collect()

[['EventId', 'Timestamp', 'Customer', 'State', 'ServiceID', 'Amount'],
 ['201', '10/13/2017', '100', 'NY', '131', '100.00'],
 ['204', '10/18/2017', '700', 'TX', '129', '450.00'],
 ['202', '10/15/2017', '203', 'CA', '121', '200.00'],
 ['206', '10/19/2017', '202', 'CA', '131', '500.00'],
 ['203', '10/17/2017', '101', 'NY', '173', '750.00'],
 ['205', '10/19/2017', '202', 'TX', '121', '200.00']]

Here's your **Jupyter Notebook Markdown tutorial** for **RDD Actions** in PySpark. It includes a **pretext table** explaining each action and the corresponding **Python code** in proper markdown formatting.

---

### **RDD Actions in PySpark**  

#### **Action Description Table**
| Action             | Description |
|--------------------|------------|
| `count()`         | Counts total rows |
| `first()`         | Retrieves first row |
| `collect()`       | Converts RDD to list |
| `take(n)`        | Fetches first `n` rows |
| `distinct()`      | Returns distinct rows |
| `reduce()`        | Applies a function to aggregate values |
| `max()`           | Finds the maximum value |
| `min()`           | Finds the minimum value |
| `countByKey()`    | Counts occurrences of each key |
| `keys()`          | Retrieves all keys in an RDD |
| `values()`        | Retrieves all values in an RDD |
| `isEmpty()`       | Checks if RDD is empty |
| `takeSample(False, n)` | Takes `n` random samples |

---

### Load Data into an RDD**
```python
# Import PySpark
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("RDD Actions Tutorial").getOrCreate()
sc = spark.sparkContext

# Create a text file (only needed if running locally)
%%writefile data.txt
#EventId    Timestamp    Customer   State    ServiceID    Amount
201       10/13/2017      100       NY       131          100.00
204       10/18/2017      700       TX       129          450.00
202       10/15/2017      203       CA       121          200.00
205       10/20/2017      404       FL       131          300.00
203       10/17/2017      305       TX       122          150.00

# Load data.txt into an RDD (excluding header)
rdd = sc.textFile("data.txt").filter(lambda line: not line.startswith("#"))
```

---


---

### **Numeric Operations**
```python
# Extract the "Amount" column (last column) and convert to float
amount_rdd = rdd.map(lambda line: float(line.split()[-1]))

# Find the total sum of Amount
total_amount = amount_rdd.reduce(lambda x, y: x + y)
print("Total Amount:", total_amount)

# Find the maximum transaction amount
max_amount = amount_rdd.max()
print("Max Amount:", max_amount)

# Find the minimum transaction amount
min_amount = amount_rdd.min()
print("Min Amount:", min_amount)

# Compute average amount
avg_amount = total_amount / amount_rdd.count()
print("Average Amount:", avg_amount)
```

---

### **Other Useful Actions**
```python
# Get the first 2 records as key-value pairs (ServiceID, Amount)
kv_rdd = rdd.map(lambda line: (line.split()[4], float(line.split()[-1])))
print("Key-Value pairs:", kv_rdd.take(2))

# Count by key (ServiceID)
service_count = kv_rdd.countByKey()
print("Count per ServiceID:", dict(service_count))

# Fetch all keys (ServiceIDs)
print("All ServiceIDs:", kv_rdd.keys().collect())

# Fetch all values (Amounts)
print("All Amounts:", kv_rdd.values().collect())

# Check if RDD is empty
print("Is RDD empty?", rdd.isEmpty())

# Take a sample (without replacement)
print("Sample Records:", rdd.takeSample(False, 2))
```

---

### **🎯 Summary**
- This tutorial covers **RDD Actions** in PySpark.
- You learned **how to load data, manipulate RDDs, and apply various actions**.
- Actions are operations that **trigger computation and return values to the driver**.

Would you like me to add **RDD Transformations** (`map`, `filter`, `groupByKey`, etc.) as well? 🚀

In [39]:
%%writefile data.txt
#ID    Name    Age    City    Score
1      John    25     NY      85.5
2      Jane    30     TX      92.0
3      Alice   27     CA      88.0
4      Bob     24     FL      79.5
5      Eve     29     WA      95.0


Writing data.txt


In [43]:
# To check the saved file, run:
import os
print(os.getcwd())  # Shows current working directory
print(os.listdir())  # Lists files in the directory


/Users/apple/Documents/GitHub/Python/BigData
['services.txt', '.DS_Store', 'example2.txt', 'lambda_expressions.ipynb', 'example.txt', '.ipynb_checkpoints', 'pyspark.ipynb', 'data.txt']


In [None]:
# Load data.txt into an RDD (excluding header)
rdd = sc.textFile("data.txt").filter(lambda line: not line.startswith("#"))

# Display first few lines
rdd.take(3)