## Downloading data as json

In [5]:
from decouple import config, AutoConfig
from datetime import datetime, timedelta
import os
import requests
from pathlib import Path

In [6]:
config = AutoConfig(search_path='.env') # <-- .env file located next to manage.py

In [7]:
# download the json by supplying the api token in the header
def get_json(endpoint, headers):
    """download the json by supplying the api token in the header"""
    headers['Accept'] = 'application/json' # csv?
    # pull_date = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%dT%H:%M:%S") # year, month, day, hour, minute, seconds, microseconds
    combined = []
    offset, counter = 0, 1
    error = False
    params = f"""$query=SELECT:*,* ORDER BY :id LIMIT 2000"""
    # response has two parts .json() and .headers https://www.w3schools.com/python/ref_requests_response.asp
    response = requests.get(endpoint, headers=headers, params=params)
    captured = response.json()
    combined.extend(captured)
    print('get_json complete')
    return combined

In [8]:
# Sodu API Credentials
API_TOKEN = config("API_TOKEN")
API_KEY_ID = config("API_KEY_ID")
API_KEY_SECRET = config("API_KEY_SECRET")

source_path_json = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/26/api_raw_eviction_2023-03-26.json'
data_dir = Path('/home/sanyashireen/sf_eviction/data_eviction/2023/3/26')
data_dir.mkdir(parents=True, exist_ok=True)

SODA_url = 'https://data.sfgov.org/resource/5cei-gny5'
SODA_headers = {
    'keyId': API_KEY_ID,
    'keySecret': API_KEY_SECRET
}
content = get_json(SODA_url, SODA_headers)


get_json complete


In [3]:
source_path_json = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/26/api_raw_eviction_2023-03-26.json'

In [6]:
type(content)

list

In [138]:
print(type(content[1]))

<class 'dict'>


In [18]:
# jsdon.dumps() returns a json str
type(json.dumps(content, indent=4))

str

## TESTING: json.dump() instead of json.dumps()

In [9]:
# testing writing the list of dictionaries (response.json()) returned by the API to a file locally as json file
# using json.dump
import json
out_file = open(source_path_json,"w", encoding='utf8')
json.dump(content, out_file, indent=4)
out_file.close()


In [17]:
# Creating and testing script to clean and transform csv data using pyspark
# spark related packages

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]") \
    .appName('test') \
    .getOrCreate()


In [10]:
source_path_json = '/home/sanyashireen/sf_eviction/data_eviction/2023/3/26/raw_eviction_2023-03-26.json'
df = spark.read.option("multiline","true").json(str(source_path_json))

                                                                                

In [11]:
df.count()

                                                                                

177578

In [12]:
df.printSchema()

root
 |-- :@computed_region_26cr_cadq: string (nullable = true)
 |-- :@computed_region_6ezc_tdp2: string (nullable = true)
 |-- :@computed_region_6pnf_4xz7: string (nullable = true)
 |-- :@computed_region_6qbp_sg9q: string (nullable = true)
 |-- :@computed_region_9jxd_iqea: string (nullable = true)
 |-- :@computed_region_ajp5_b2md: string (nullable = true)
 |-- :@computed_region_bh8s_q3mv: string (nullable = true)
 |-- :@computed_region_fyvs_ahh9: string (nullable = true)
 |-- :@computed_region_h4ep_8xdi: string (nullable = true)
 |-- :@computed_region_jwn9_ihcz: string (nullable = true)
 |-- :@computed_region_p5aj_wyqh: string (nullable = true)
 |-- :@computed_region_pigm_ib2e: string (nullable = true)
 |-- :@computed_region_qgnn_b9vv: string (nullable = true)
 |-- :@computed_region_rxqg_mtj9: string (nullable = true)
 |-- :@computed_region_yftq_j783: string (nullable = true)
 |-- :created_at: string (nullable = true)
 |-- :id: string (nullable = true)
 |-- :updated_at: string (nullab

In [13]:
df.head(3)

23/03/27 14:34:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

[Row(:@computed_region_26cr_cadq='2', :@computed_region_6ezc_tdp2=None, :@computed_region_6pnf_4xz7='2', :@computed_region_6qbp_sg9q='53', :@computed_region_9jxd_iqea=None, :@computed_region_ajp5_b2md='20', :@computed_region_bh8s_q3mv='28859', :@computed_region_fyvs_ahh9='19', :@computed_region_h4ep_8xdi=None, :@computed_region_jwn9_ihcz='53', :@computed_region_p5aj_wyqh='4', :@computed_region_pigm_ib2e=None, :@computed_region_qgnn_b9vv='3', :@computed_region_rxqg_mtj9='7', :@computed_region_yftq_j783='2', :created_at='2023-01-23T23:46:16.858Z', :id='row-ufzj_22gk~drek', :updated_at='2023-01-23T23:46:33.925Z', :version='rv-wryr~aezj~xte7', access_denial=False, address='2500 Block Of Folsom  Street', breach=False, capital_improvement=False, city='San Francisco', client_location=Row(human_address='{"address": "", "city": "", "state": "", "zip": ""}', latitude='37.75649855484188', longitude='-122.41446453496935'), condo_conversion=False, constraints_date=None, demolition=False, developmen

In [156]:
df.show()

[Stage 9:>                                                          (0 + 1) / 1]

+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+---------------------------+--------------------+------------------+--------------------+-----------------+-------------+--------------------+------+-------------------+-------------+--------------------+----------------+----------------+----------+-----------+--------------------+-----------+-----------------------+--------------------+-------------------+-----------+-------------+----------------+--------------------+-----------+--------+-----------+-------------+------------------+--------------------+-----+-----------------+-------------------+--------------------+-----+
|:@computed_region_26cr_c

                                                                                

In [31]:
df.columns

[':@computed_region_26cr_cadq',
 ':@computed_region_6ezc_tdp2',
 ':@computed_region_6pnf_4xz7',
 ':@computed_region_6qbp_sg9q',
 ':@computed_region_9jxd_iqea',
 ':@computed_region_ajp5_b2md',
 ':@computed_region_bh8s_q3mv',
 ':@computed_region_fyvs_ahh9',
 ':@computed_region_h4ep_8xdi',
 ':@computed_region_jwn9_ihcz',
 ':@computed_region_p5aj_wyqh',
 ':@computed_region_pigm_ib2e',
 ':@computed_region_qgnn_b9vv',
 ':@computed_region_rxqg_mtj9',
 ':@computed_region_yftq_j783',
 ':created_at',
 ':id',
 ':updated_at',
 ':version',
 'access_denial',
 'address',
 'breach',
 'capital_improvement',
 'city',
 'client_location',
 'condo_conversion',
 'constraints_date',
 'demolition',
 'development',
 'ellis_act_withdrawal',
 'eviction_id',
 'failure_to_sign_renewal',
 'file_date',
 'good_samaritan_ends',
 'illegal_use',
 'late_payments',
 'lead_remediation',
 'neighborhood',
 'non_payment',
 'nuisance',
 'other_cause',
 'owner_move_in',
 'roommate_same_unit',
 'shape',
 'state',
 'substantial_r

In [14]:
len(df.columns)

49

In [15]:
print(f'The pyspark df is a list of records where each record is of type {type(df.head(1)[0])} \n\n {df.head(1)[0]}')

[Stage 6:>                                                          (0 + 1) / 1]

The pyspark df is a list of records where each record is of type <class 'pyspark.sql.types.Row'> 

 Row(:@computed_region_26cr_cadq='2', :@computed_region_6ezc_tdp2=None, :@computed_region_6pnf_4xz7='2', :@computed_region_6qbp_sg9q='53', :@computed_region_9jxd_iqea=None, :@computed_region_ajp5_b2md='20', :@computed_region_bh8s_q3mv='28859', :@computed_region_fyvs_ahh9='19', :@computed_region_h4ep_8xdi=None, :@computed_region_jwn9_ihcz='53', :@computed_region_p5aj_wyqh='4', :@computed_region_pigm_ib2e=None, :@computed_region_qgnn_b9vv='3', :@computed_region_rxqg_mtj9='7', :@computed_region_yftq_j783='2', :created_at='2023-01-23T23:46:16.858Z', :id='row-ufzj_22gk~drek', :updated_at='2023-01-23T23:46:33.925Z', :version='rv-wryr~aezj~xte7', access_denial=False, address='2500 Block Of Folsom  Street', breach=False, capital_improvement=False, city='San Francisco', client_location=Row(human_address='{"address": "", "city": "", "state": "", "zip": ""}', latitude='37.75649855484188', longitude=

                                                                                

In [157]:
# Result of json.dumps - to compare
# print(f'The pyspark df is a list of records where each record is of type {type(df.head(1)[0])} \n\n {df.head(1)[0]}')

[Stage 11:>                                                         (0 + 1) / 1]

The pyspark df is a list of records where each record is of type <class 'pyspark.sql.types.Row'> 

 Row(:@computed_region_26cr_cadq='2', :@computed_region_6ezc_tdp2=None, :@computed_region_6pnf_4xz7='2', :@computed_region_6qbp_sg9q='53', :@computed_region_9jxd_iqea=None, :@computed_region_ajp5_b2md='20', :@computed_region_bh8s_q3mv='28859', :@computed_region_fyvs_ahh9='19', :@computed_region_h4ep_8xdi=None, :@computed_region_jwn9_ihcz='53', :@computed_region_p5aj_wyqh='4', :@computed_region_pigm_ib2e=None, :@computed_region_qgnn_b9vv='3', :@computed_region_rxqg_mtj9='7', :@computed_region_yftq_j783='2', :created_at='2023-01-23T23:46:16.858Z', :id='row-ufzj_22gk~drek', :updated_at='2023-01-23T23:46:33.925Z', :version='rv-wryr~aezj~xte7', access_denial=False, address='2500 Block Of Folsom  Street', breach=False, capital_improvement=False, city='San Francisco', client_location=Row(human_address='{"address": "", "city": "", "state": "", "zip": ""}', latitude='37.75649855484188', longitude=

                                                                                

## TESTING: If Pyspark can read the json data directly from json string
* Not working
* Ref here for later
    - https://ch-nabarun.medium.com/read-json-using-pyspark-f792bda95741
    - the above blog uses SparkContext (to convert json to RDD) and SparkSession (to convert RDD to DF)
    - Cant figure out how to 
    - https://stackoverflow.com/questions/39818368/convert-lines-of-json-in-rdd-to-dataframe-in-apache-spark
    

In [26]:
json_string = json.dumps(content, indent=4)
#json_string = json.dumps(content, indent=4, ensure_ascii=False).encode('utf8')

In [28]:
# Creating and testing script to clean and transform csv data using pyspark
# spark related packages

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]") \
    .appName('test') \
    .getOrCreate()


In [27]:
#spark.stop()

In [29]:
# testing if pyspark can directly read the json data

# Step 1 convert the json data to rdd using sparkContext
# convert into RDD
rdd = spark.sparkContext.parallelize(json_string)


In [32]:
type(rdd)

pyspark.rdd.RDD

In [33]:
rdd.take(1)

23/03/27 15:27:18 WARN TaskSetManager: Stage 2 contains a task of very large size (1993 KiB). The maximum recommended task size is 1000 KiB.
23/03/27 15:27:18 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 8)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIt

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 8) (de-zoomcamp.us-central1-c.c.blissful-flames-375219.internal executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:166)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2278)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [31]:
# create a Dataframe
df = spark.read.option("multiline","true").json(rdd)

23/03/27 15:26:14 WARN TaskSetManager: Stage 1 contains a task of very large size (1993 KiB). The maximum recommended task size is 1000 KiB.
23/03/27 15:26:14 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 4)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIt

Py4JJavaError: An error occurred while calling o248.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1.0 failed 1 times, most recent failure: Lost task 0.0 in stage 1.0 (TID 4) (de-zoomcamp.us-central1-c.c.blissful-flames-375219.internal executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator.isEmpty(Iterator.scala:387)
	at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceLeftOption(TraversableOnce.scala:249)
	at scala.collection.TraversableOnce.reduceLeftOption$(TraversableOnce.scala:248)
	at scala.collection.AbstractIterator.reduceLeftOption(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceOption(TraversableOnce.scala:256)
	at scala.collection.TraversableOnce.reduceOption$(TraversableOnce.scala:256)
	at scala.collection.AbstractIterator.reduceOption(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$1(JsonInferSchema.scala:103)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2333)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.infer(JsonInferSchema.scala:116)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.$anonfun$inferFromDataset$5(JsonDataSource.scala:110)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.inferFromDataset(JsonDataSource.scala:110)
	at org.apache.spark.sql.DataFrameReader.$anonfun$json$6(DataFrameReader.scala:415)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:415)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:390)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:376)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/sanyashireen/spark/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 540, in main
    raise RuntimeError(
RuntimeError: Python in worker has different version 3.9 than that in driver 3.10, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator.isEmpty(Iterator.scala:387)
	at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceLeftOption(TraversableOnce.scala:249)
	at scala.collection.TraversableOnce.reduceLeftOption$(TraversableOnce.scala:248)
	at scala.collection.AbstractIterator.reduceLeftOption(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceOption(TraversableOnce.scala:256)
	at scala.collection.TraversableOnce.reduceOption$(TraversableOnce.scala:256)
	at scala.collection.AbstractIterator.reduceOption(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$1(JsonInferSchema.scala:103)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
