# Spark LLM Assistant

## Initialization

In [1]:
from langchain.chat_models import ChatOpenAI
from spark_llm import SparkLLMAssistant

# Chat model handed to the assistant below.
llm = ChatOpenAI(model_name='gpt-4') # using gpt-4 can achieve better results
# verbose=True: presumably the source of the INFO lines in the cell outputs below — TODO confirm.
assistant=SparkLLMAssistant(llm=llm, verbose=True)
# Presumably this is what exposes the `.llm` accessor used on DataFrames in later cells — TODO confirm.
assistant.activate() # activate partial functions for Spark DataFrame

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/21 01:15:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Example 1: Auto sales by brand in US 2022

In [2]:
# Search and ingest web content into a DataFrame.
# The argument is a plain-language description of the data wanted; in the saved run the
# parsed URL and the ingestion SQL were logged to the cell output.
auto_df = assistant.create_df("2022 USA national auto sales by brand")
# Print the ingested rows (the saved output shows only the top 20 rows).
auto_df.show()

[92mINFO: [0mParsing URL: https://www.carpro.com/blog/full-year-2022-national-auto-sales-by-brand

[92mINFO: [0mSQL query for the ingestion:

[92mINFO: [0m[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mauto_sales_2022[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Toyota'[39;49;00m,[37m [39;49;00m[34m1849751[39;49;00m,[37m [39;49;00m-[34m9[39;49;00m),[37m[39;49;00m
([33m'Ford'[39;49;00m,[37m [39;49;00m[34m1767439[39;49;00m,[37m [39;49;00m-[34m2[39;49;00m),[37m[39;49;00m
([33m'Chevrolet'[39;49;00m,[37m [39;49;00m[34m1502389[39;49;00m,[37m [39;49;00m[34m6[39;49;00m),[37m[39;49;00m
([33m'Honda'[39;49;00m,[37m [39;49;00m[34m881201[39;49;00m,[37m [39;49;00m-[34m33[39;49;00m),

+-------------+-------------+-------+
|        brand|us_sales_2022|vs_2021|
+-------------+-------------+-------+
|       Toyota|      1849751|     -9|
|         Ford|      1767439|     -2|
|    Chevrolet|      1502389|      6|
|        Honda|       881201|    -33|
|      Hyundai|       724265|     -2|
|          Kia|       693549|     -1|
|         Jeep|       684612|    -12|
|       Nissan|       682731|    -25|
|       Subaru|       556581|     -5|
|   Ram Trucks|       545194|    -16|
|          GMC|       517649|      7|
|Mercedes-Benz|       350949|      7|
|          BMW|       332388|     -1|
|   Volkswagen|       301069|    -20|
|        Mazda|       294908|    -11|
|        Lexus|       258704|    -15|
|        Dodge|       190793|    -12|
|         Audi|       186875|     -5|
|     Cadillac|       134726|     14|
|     Chrysler|       112713|     -2|
+-------------+-------------+-------+
only showing top 20 rows



In [3]:
# Plot the DataFrame without giving any instructions; in the saved run the generated
# Plotly code was echoed to the cell output.
auto_df.llm.plot()

[92mINFO: [0m```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m
[34mfrom[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36msubplots[39;49;00m [34mimport[39;49;00m make_subplots

[37m# Assuming df is a valid PySpark DataFrame[39;49;00m
pdf = df.toPandas()

fig = make_subplots(specs=[[{[33m"[39;49;00m[33msecondary_y[39;49;00m[33m"[39;49;00m: [34mTrue[39;49;00m}]])

[37m# Add bars for US sales in 2022[39;49;00m
fig.add_trace(go.Bar(x=pdf[[33m"[39;49;00m[33mbrand[39;49;00m[33m"[39;49;00m], y=pdf[[33m"[39;49;00m[33mus_sales_2022[39;49;00m[33m"[39;49;00m], name=[33m"[39;49;00m[33mUS Sales in 2022[39;49;00m[33m"[39;49;00m), secondary_y=[34mFalse[39;49;00m)

[37m# Add line for comparison to 2021[39;49;00m
fig.add_trace(go.Scatter(x=pdf[[33m"[39;49;00m[33mbrand[39;49;00m[33m"[39;49;00m], y=pdf[[33m"[39;49;00m[33mvs_2021[39;4

In [None]:
# Plot again, this time steering the chart with a natural-language description.
auto_df.llm.plot("pie chart for top 5 brands and the others' market shares")

In [None]:
# Apply a transform, described in natural language, to a DataFrame.
# The result is bound to its own name so `auto_df` stays available unchanged.
auto_top_growth_df=auto_df.llm.transform("top brand with the highest growth")
auto_top_growth_df.show()

In [None]:
# Explain what a DataFrame is retrieving — here, the result of the transform in the previous cell.
auto_top_growth_df.llm.explain()

In [None]:
# Check the DataFrame against an expectation written in natural language.
# NOTE(review): how the outcome is reported is not shown here (the cell has no saved output).
auto_top_growth_df.llm.verify("expect sales change percentage to be between -100 to 100")

## Example 2: USA Presidents

In [None]:
# You can also specify the expected columns for the ingestion:
# the second argument is the list of column names wanted in the result.
df=assistant.create_df("USA presidents", ["president", "vice_president"])
df.show()

In [None]:
# Derive a new DataFrame from the presidents data using a natural-language transform description.
presidents_who_were_vp = df.llm.transform("presidents who were also vice presidents")
presidents_who_were_vp.show()

In [None]:
# Explain what the derived DataFrame is retrieving.
presidents_who_were_vp.llm.explain()

In [None]:
# Check the derived DataFrame against a natural-language expectation (here: no NULL values).
presidents_who_were_vp.llm.verify("expect no NULL values")

## Example 3: Top 10 tech companies

In [None]:
# Search and ingest web content into a DataFrame,
# again passing the expected column names as the second argument.
company_df=assistant.create_df("Top 10 tech companies by market cap", ['company', 'cap', 'country'])
company_df.show()

In [None]:
# Derive a new DataFrame from `company_df` using a natural-language transform description.
us_company_df=company_df.llm.transform("companies in United States")
us_company_df.show()

In [None]:
# Explain what the derived DataFrame is retrieving.
us_company_df.llm.explain()

In [None]:
# Check the derived DataFrame against a natural-language expectation (here: unique company names).
us_company_df.llm.verify("expect all company names to be unique")

## Example 4: Ingestion from a URL
Instead of searching for the web page, you can also ask the assistant to ingest from a URL.

In [None]:
# Pass a URL instead of a search description so the assistant ingests from that page;
# the expected column names are still given as the second argument.
best_albums_df = assistant.create_df('https://time.com/6235186/best-albums-2022/', ["album", "artist", "year"])
best_albums_df.show()

In [None]:
# Check the ingested DataFrame against a natural-language expectation (here: every year is 2022).
best_albums_df.llm.verify("expect each year to be 2022")

## Example 5: UDF Generation

You can also ask the assistant to generate code for a Spark UDF by providing a function signature and a docstring that describes what it should do.

In [None]:
# Only the signature and docstring are written by hand; the body is intentionally left as `...`.
# NOTE(review): the decorator presumably generates the implementation from the type hints and
# docstring, so treat the docstring text as the prompt — confirm before rewording it.
@assistant.udf
def convert_grades(grade_percent: float) -> str:
    """Convert the grade percent to a letter grade using standard cutoffs"""
    ...

In [None]:
from pyspark.sql import SparkSession

# Get the active Spark session (or create one) and register the UDF under a SQL-callable name.
spark = SparkSession.builder.getOrCreate()
spark.udf.register("convert_grades", convert_grades)

# Sample (student_id, grade_percent) rows. The frame is named `grades_df` rather than `df`
# so the presidents DataFrame `df` from Example 2 is not overwritten and those cells
# remain re-runnable after this one.
percent_grades = [(1, 97.8), (2, 72.3), (3, 81.2)]
grades_df = spark.createDataFrame(percent_grades, ["student_id", "grade_percent"])

# Call the registered UDF by name from a SQL expression.
grades_df.selectExpr("student_id", "convert_grades(grade_percent)").show()

## Cache
The SparkLLMAssistant supports a simple in-memory and persistent cache system. It keeps an in-memory staging cache, which gets updated for LLM and web search results. The staging cache can be persisted through the commit() method. Cache lookup is always performed on the persistent cache only.

In [None]:
# Persist the staging cache so later lookups (which read only the persistent cache) can hit it.
assistant.commit()