In [1]:
from pyspark.sql import SparkSession
import getpass
# OS account name of whoever runs the notebook; used only to build the warehouse path below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
# spark.ui.port is set to '0' -- presumably so Spark picks a free UI port; confirm for this cluster.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# DDL-style schema applied when reading the raw loans CSV.
# The pieces are joined with ", \n" so the resulting string matches the
# original multi-line literal (comma, space, newline between rows).
_loan_schema_rows = (
    "loan_id string, member_id string, loan_amount float",
    "funded_amount float, loan_term_months string, interest_rate float",
    "monthly_installment float, issue_date string, loan_status string",
    "loan_purpose string, loan_title string",
)
loan_schema = ", \n".join(_loan_schema_rows)

In [3]:
# Read the raw loans CSV with the explicit schema; the first line of each file is a header.
# NOTE(review): the path is hard-coded to one user's HDFS home rather than derived
# from `username` -- confirm this is intended.
loans_raw_df = (
    spark.read
    .format("csv")
    .schema(loan_schema)
    .option("header", True)
    .load("/user/itv015703/lendingclubproject/raw/loans_data_csv")
)

In [4]:
# Show the raw loans DataFrame as the cell output for a quick visual check.
loans_raw_df 

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
94761152,1ae08eb0c074de6fb...,26975.0,26975.0,60 months,30.84,886.71,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation
94093520,82ae44edab97e1c95...,28000.0,28000.0,60 months,7.99,567.61,Dec-2016,Current,credit_card,Credit card refin...
95126205,7a92464a7eef5fa98...,12000.0,12000.0,36 months,12.74,402.83,Dec-2016,Current,debt_consolidation,Debt consolidation
95217505,6fc2a824c46b88831...,9600.0,9600.0,36 months,13.99,328.06,Dec-2016,Late (31-120 days),debt_consolidation,Debt consolidation
95096173,d72283a720a1bb13c...,2700.0,2700.0,36 months,8.24,84.91,Dec-2016,Fully Paid,credit_card,Credit card refin...
95148408,0b0616b87c7476b53...,22400.0,22400.0,36 months,7.24,694.11,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation
93793227,b89d45166d32ba422...,12000.0,12000.0,36 months,15.99,421.83,Dec-2016,Fully Paid,credit_card,Credit card refin...
94566736,4f170572ef915ba17...,35000.0,35000.0,60 months,13.99,814.21,Dec-2016,Current,debt_consolidation,Debt consolidation
95256545,30ca62c00e346e480...,5600.0,5600.0,36 months,13.99,191.37,Dec-2016,Current,other,Other
95272111,64f06e51bc16579dd...,5000.0,5000.0,36 months,7.99,156.66,Dec-2016,Fully Paid,medical,Medical expenses


In [5]:
# Print the column names and types that the explicit schema produced.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [6]:
from pyspark.sql.functions import current_timestamp

# Keep every raw column and append an "ingest_date" column holding the current timestamp.
loans_ingested_df = loans_raw_df.select(
    "*",
    current_timestamp().alias("ingest_date"),
)

In [7]:
# Show the DataFrame with the added ingest_date column as the cell output.
loans_ingested_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
94761152,1ae08eb0c074de6fb...,26975.0,26975.0,60 months,30.84,886.71,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
94093520,82ae44edab97e1c95...,28000.0,28000.0,60 months,7.99,567.61,Dec-2016,Current,credit_card,Credit card refin...,2025-03-30 06:49:...
95126205,7a92464a7eef5fa98...,12000.0,12000.0,36 months,12.74,402.83,Dec-2016,Current,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95217505,6fc2a824c46b88831...,9600.0,9600.0,36 months,13.99,328.06,Dec-2016,Late (31-120 days),debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95096173,d72283a720a1bb13c...,2700.0,2700.0,36 months,8.24,84.91,Dec-2016,Fully Paid,credit_card,Credit card refin...,2025-03-30 06:49:...
95148408,0b0616b87c7476b53...,22400.0,22400.0,36 months,7.24,694.11,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
93793227,b89d45166d32ba422...,12000.0,12000.0,36 months,15.99,421.83,Dec-2016,Fully Paid,credit_card,Credit card refin...,2025-03-30 06:49:...
94566736,4f170572ef915ba17...,35000.0,35000.0,60 months,13.99,814.21,Dec-2016,Current,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95256545,30ca62c00e346e480...,5600.0,5600.0,36 months,13.99,191.37,Dec-2016,Current,other,Other,2025-03-30 06:49:...
95272111,64f06e51bc16579dd...,5000.0,5000.0,36 months,7.99,156.66,Dec-2016,Fully Paid,medical,Medical expenses,2025-03-30 06:49:...


In [8]:
# Register the ingested DataFrame as the temp view "loans" so it can be queried via spark.sql.
# NOTE(review): later cells re-register the same view name with other DataFrames, so what
# "loans" refers to depends on which registration cell ran last.
loans_ingested_df.createOrReplaceTempView("loans")

In [9]:
# Inspect rows of the "loans" view whose loan_amount is null.
# The recorded output shows rows where loan_id holds free text (e.g. "Total amount fund...")
# and the remaining loan columns are empty.
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Loans that do not...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2025-03-30 06:49:...


In [10]:
# Columns passed as the null-check subset when filtering the ingested loans.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [11]:
# Drop rows that have a null in any of the columns listed in columns_to_check.
loans_filtered_df = loans_ingested_df.dropna(subset=columns_to_check)

In [12]:
# Show the null-filtered loans DataFrame as the cell output.
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
94761152,1ae08eb0c074de6fb...,26975.0,26975.0,60 months,30.84,886.71,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
94093520,82ae44edab97e1c95...,28000.0,28000.0,60 months,7.99,567.61,Dec-2016,Current,credit_card,Credit card refin...,2025-03-30 06:49:...
95126205,7a92464a7eef5fa98...,12000.0,12000.0,36 months,12.74,402.83,Dec-2016,Current,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95217505,6fc2a824c46b88831...,9600.0,9600.0,36 months,13.99,328.06,Dec-2016,Late (31-120 days),debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95096173,d72283a720a1bb13c...,2700.0,2700.0,36 months,8.24,84.91,Dec-2016,Fully Paid,credit_card,Credit card refin...,2025-03-30 06:49:...
95148408,0b0616b87c7476b53...,22400.0,22400.0,36 months,7.24,694.11,Dec-2016,Fully Paid,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
93793227,b89d45166d32ba422...,12000.0,12000.0,36 months,15.99,421.83,Dec-2016,Fully Paid,credit_card,Credit card refin...,2025-03-30 06:49:...
94566736,4f170572ef915ba17...,35000.0,35000.0,60 months,13.99,814.21,Dec-2016,Current,debt_consolidation,Debt consolidation,2025-03-30 06:49:...
95256545,30ca62c00e346e480...,5600.0,5600.0,36 months,13.99,191.37,Dec-2016,Current,other,Other,2025-03-30 06:49:...
95272111,64f06e51bc16579dd...,5000.0,5000.0,36 months,7.99,156.66,Dec-2016,Fully Paid,medical,Medical expenses,2025-03-30 06:49:...


In [13]:
# Rebind the temp view "loans" to the null-filtered DataFrame, replacing the earlier registration.
loans_filtered_df.createOrReplaceTempView("loans")

In [14]:
from pyspark.sql.functions import regexp_replace, col
# Strip the " months" suffix and cast what is left to an integer month count.
term_in_months = regexp_replace(col("loan_term_months"), " months", "").cast("int")
# Divide by 12 and cast back to int to get the term in years.
term_in_years = (term_in_months / 12).cast("int")

# Overwrite the column in place (keeps its position), then rename it to reflect the new unit.
loans_terms_modified = (
    loans_filtered_df
    .withColumn("loan_term_months", term_in_years)
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [15]:
# Print the schema to check the term column; the recorded output lists loan_term_years as integer.
loans_terms_modified.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [16]:
# Rebind the temp view "loans" to the DataFrame that carries loan_term_years.
loans_terms_modified.createOrReplaceTempView("loans")

In [17]:
# List the distinct loan_purpose values in the "loans" view.
# The recorded output shows free-text fragments (e.g. "stocks", "brakes") among the values.
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
<br/><br/>Lending...
Bank of America c...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [18]:
# Count rows per loan_purpose in the "loans" view, most frequent first.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [19]:
# loan_purpose values treated as valid categories when normalising the column.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [20]:
from pyspark.sql.functions import when, col

# Keep loan_purpose when it is one of the lookup values; replace anything else with "other".
purpose_col = col("loan_purpose")
is_known_purpose = purpose_col.isin(loan_purpose_lookup)

loan_purpose_modified_df = loans_terms_modified.withColumn(
    "loan_purpose",
    when(is_known_purpose, purpose_col).otherwise("other"),
)

In [38]:
# Rebind the temp view "loans" to the DataFrame with the normalised loan_purpose column.
loan_purpose_modified_df.createOrReplaceTempView("loans")

In [39]:
# Re-run the per-purpose row count against the rebound "loans" view.
# NOTE(review): the recorded output for this cell is a Py4JJavaError caused by
# "Cannot call methods on a stopped SparkContext"; re-run the notebook from the
# session-creation cell on a fresh kernel to get the counts.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

Py4JJavaError: An error occurred while calling o242.showString.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange hashpartitioning(loan_purpose#773, 200), ENSURE_REQUIREMENTS, [id=#298]
+- *(1) HashAggregate(keys=[loan_purpose#773], functions=[partial_count(1)], output=[loan_purpose#773, count#810L])
   +- *(1) Project [CASE WHEN loan_purpose#9 INSET (home_improvement,medical,credit_card,other,moving,renewable_energy,vacation,small_business,house,debt_consolidation,educational,wedding,major_purchase,car) THEN loan_purpose#9 ELSE other END AS loan_purpose#773]
      +- *(1) Filter AtLeastNNulls(n, loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_installment#6,issue_date#7,loan_status#8,loan_purpose#9)
         +- FileScan csv [loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_installment#6,issue_date#7,loan_status#8,loan_purpose#9] Batched: false, DataFilters: [AtLeastNNulls(n, loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_instal..., Format: CSV, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/user/itv015703/lendingclubproject/raw/loans_data_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<loan_amount:float,funded_amount:float,loan_term_months:string,interest_rate:float,monthly_...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:163)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:187)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:748)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1506)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:407)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:398)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:485)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:151)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:149)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:166)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 42 more


Py4JJavaError: An error occurred while calling o242.getRowsToPython.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange hashpartitioning(loan_purpose#773, 200), ENSURE_REQUIREMENTS, [id=#328]
+- *(1) HashAggregate(keys=[loan_purpose#773], functions=[partial_count(1)], output=[loan_purpose#773, count#810L])
   +- *(1) Project [CASE WHEN loan_purpose#9 INSET (home_improvement,medical,credit_card,other,moving,renewable_energy,vacation,small_business,house,debt_consolidation,educational,wedding,major_purchase,car) THEN loan_purpose#9 ELSE other END AS loan_purpose#773]
      +- *(1) Filter AtLeastNNulls(n, loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_installment#6,issue_date#7,loan_status#8,loan_purpose#9)
         +- FileScan csv [loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_installment#6,issue_date#7,loan_status#8,loan_purpose#9] Batched: false, DataFilters: [AtLeastNNulls(n, loan_amount#2,funded_amount#3,loan_term_months#4,interest_rate#5,monthly_instal..., Format: CSV, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/user/itv015703/lendingclubproject/raw/loans_data_..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<loan_amount:float,funded_amount:float,loan_term_months:string,interest_rate:float,monthly_...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:163)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:187)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.getRowsToPython(Dataset.scala:3539)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.lang.reflect.Constructor.newInstance(Constructor.java:423)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.GatewayConnection.run(GatewayConnection.java:238)
java.lang.Thread.run(Thread.java:748)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:118)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1506)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:130)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:121)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:170)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:407)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:398)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:485)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:141)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:118)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:151)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:149)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:166)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 42 more
