# Spark LLM Assistant

## Initialization

In [1]:
from langchain.chat_models import ChatOpenAI
from spark_llm import SparkLLMAssistant

# Chat model handed to the assistant below.
llm = ChatOpenAI(model_name='gpt-4') # using gpt-4 can achieve better results
# NOTE(review): verbose=True is presumably what makes the assistant print the
# parsed URL and generated SQL seen in the cell outputs below; confirm.
assistant=SparkLLMAssistant(llm=llm, verbose=True)
# Later cells call DataFrame.llm_transform() and DataFrame.llm_explain(),
# so this must run before any of them.
assistant.activate() # activate partial functions for Spark DataFrame

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/09 23:31:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Example 1: Auto sales by brand in US 2022

In [2]:
# Search and ingest web content into a DataFrame.
# The argument is a free-text description; the output below shows which URL
# was parsed and the SQL that was generated for the ingestion.
auto_df = assistant.create_df("2022 USA national auto sales by brand")
auto_df.show()

Parsing URL: https://www.carpro.com/blog/full-year-2022-national-auto-sales-by-brand

SQL query for the ingestion:
 CREATE OR REPLACE TEMP VIEW auto_sales_2022 AS SELECT * FROM VALUES
(1, 'Toyota', 1849751, -9),
(2, 'Ford', 1767439, -2),
(3, 'Chevrolet', 1502389, 6),
(4, 'Honda', 881201, -33),
(5, 'Hyundai', 724265, -2),
(6, 'Kia', 693549, -1),
(7, 'Jeep', 684612, -12),
(8, 'Nissan', 682731, -25),
(9, 'Subaru', 556581, -5),
(10, 'Ram Trucks', 545194, -16),
(11, 'GMC', 517649, 7),
(12, 'Mercedes-Benz', 350949, 7),
(13, 'BMW', 332388, -1),
(14, 'Volkswagen', 301069, -20),
(15, 'Mazda', 294908, -11),
(16, 'Lexus', 258704, -15),
(17, 'Dodge', 190793, -12),
(18, 'Audi', 186875, -5),
(19, 'Cadillac', 134726, 14),
(20, 'Chrysler', 112713, -2),
(21, 'Buick', 103519, -42),
(22, 'Acura', 102306, -35),
(23, 'Volvo', 102038, -16),
(24, 'Mitsubishi', 102037, -16),
(25, 'Lincoln', 83486, -4),
(26, 'Porsche', 70065, 0),
(27, 'Genesis', 56410, 14),
(28, 'INFINITI', 46619, -20),
(29, 'MINI', 29504, -1)

In [3]:
# Apply a transform to a DataFrame.
# The transform is described in plain English; the SQL generated for it is
# printed in the output before the resulting rows.
auto_top_growth_df=auto_df.llm_transform("top brand with the highest growth")
auto_top_growth_df.show()

SQL query for the transform:
SELECT brand, sales_change
FROM temp_view_for_transform
ORDER BY sales_change DESC
LIMIT 1
+--------+------------+
|   brand|sales_change|
+--------+------------+
|Cadillac|          14|
+--------+------------+



In [4]:
# Explain what a DataFrame is retrieving.
# The cell output is a natural-language summary of the DataFrame.
auto_top_growth_df.llm_explain()

'In summary, this dataframe is retrieving the brand with the highest sales change from the "auto_sales_2022" table. It presents the results sorted by sales change in descending order, and only returns the top result.'

## Example 2: USA Presidents

In [5]:
# You can also specify the expected columns for the ingestion.
# Here the second argument asks for two columns, "president" and
# "vice_president"; each ingested row in the output has exactly those two values.
df=assistant.create_df("USA presidents", ["president", "vice_president"])
df.show()

Parsing URL: https://www.loc.gov/rr/print/list/057_chron.html

SQL query for the ingestion:
 CREATE OR REPLACE TEMP VIEW presidents AS SELECT * FROM VALUES
('George Washington', 'John Adams'),
('John Adams', 'Thomas Jefferson'),
('Thomas Jefferson', 'Aaron Burr'),
('Thomas Jefferson', 'George Clinton'),
('James Madison', 'George Clinton'),
('James Madison', 'Elbridge Gerry'),
('James Monroe', 'Daniel D. Tompkins'),
('John Quincy Adams', 'John C. Calhoun'),
('Andrew Jackson', 'John C. Calhoun'),
('Andrew Jackson', 'Martin Van Buren'),
('Martin Van Buren', 'Richard M. Johnson'),
('William Henry Harrison', 'John Tyler'),
('John Tyler', NULL),
('James K. Polk', 'George M. Dallas'),
('Zachary Taylor', 'Millard Fillmore'),
('Millard Fillmore', NULL),
('Franklin Pierce', 'William R. King'),
('Franklin Pierce', NULL),
('James Buchanan', 'John C. Breckinridge'),
('Abraham Lincoln', 'Hannibal Hamlin'),
('Abraham Lincoln', 'Andrew Johnson'),
('Andrew Johnson', NULL),
('Ulysses S. Grant', 'Schuyle

In [6]:
# Keep only presidents whose name also appears in the vice_president column
# (see the generated SQL in the output: DISTINCT president ... IN (subquery)).
presidents_who_were_vp = df.llm_transform("presidents who were also vice presidents")
presidents_who_were_vp.show()

SQL query for the transform:
SELECT DISTINCT president
FROM temp_view_for_transform
WHERE president IN (SELECT DISTINCT vice_president FROM temp_view_for_transform)
+------------------+
|         president|
+------------------+
|        John Adams|
|  Thomas Jefferson|
|  Martin Van Buren|
|  Millard Fillmore|
|        John Tyler|
|    Andrew Johnson|
| Chester A. Arthur|
|Theodore Roosevelt|
|   Calvin Coolidge|
|   Harry S. Truman|
|    Gerald R. Ford|
| Lyndon B. Johnson|
|  Richard M. Nixon|
|       George Bush|
|   Joseph R. Biden|
+------------------+



In [7]:
# Ask for a plain-English summary of what presidents_who_were_vp retrieves.
presidents_who_were_vp.llm_explain()

'In summary, this dataframe is retrieving the distinct list of presidents that have also served as vice presidents.'

## Example 3: Top 10 tech companies

In [8]:
# Search and ingest web content into a DataFrame, naming the expected columns.
# The output shows the generated SQL aliasing the values as
# v1(company, cap, country) and storing them in a temp view.
company_df=assistant.create_df("Top 10 tech companies by market cap", ['company', 'cap', 'country'])
company_df.show()

Parsing URL: https://www.statista.com/statistics/1350976/leading-tech-companies-worldwide-by-market-cap/

SQL query for the ingestion:
 CREATE OR REPLACE TEMP VIEW top_tech_companies AS SELECT * FROM VALUES
('Apple', 2242, 'United States'),
('Microsoft', 1821, 'United States'),
('Alphabet (Google)', 1229, 'United States'),
('Amazon', 902.4, 'United States'),
('Tesla', 541.4, 'United States'),
('TSMC', 410.9, 'Taiwan'),
('NVIDIA', 401.7, 'United States'),
('Tencent', 377.8, 'China'),
('Meta Platforms (Facebook)', 302.1, 'United States'),
('Samsung', 301.7, 'South Korea')
AS v1(company, cap, country)

Storing data into temp view: top_tech_companies

+--------------------+------+-------------+
|             company|   cap|      country|
+--------------------+------+-------------+
|               Apple|2242.0|United States|
|           Microsoft|1821.0|United States|
|   Alphabet (Google)|1229.0|United States|
|              Amazon| 902.4|United States|
|               Tesla| 541.4|United 

In [9]:
# Filter the ingested companies down to those whose country is 'United States'
# (see the WHERE clause in the generated SQL below).
us_company_df=company_df.llm_transform("companies in United States")
# Long values are truncated in show()'s text table (e.g. 'Meta Platforms (F...').
us_company_df.show()

SQL query for the transform:
SELECT company, cap, country
FROM temp_view_for_transform
WHERE country = 'United States'
+--------------------+------+-------------+
|             company|   cap|      country|
+--------------------+------+-------------+
|               Apple|2242.0|United States|
|           Microsoft|1821.0|United States|
|   Alphabet (Google)|1229.0|United States|
|              Amazon| 902.4|United States|
|               Tesla| 541.4|United States|
|              NVIDIA| 401.7|United States|
|Meta Platforms (F...| 302.1|United States|
+--------------------+------+-------------+



In [10]:
# Ask for a plain-English summary of what us_company_df retrieves.
us_company_df.llm_explain()

'In summary, this dataframe is retrieving the company, market capitalization, and country of the top tech companies that are located in the United States.'

## Example 4: Ingestion from a URL
Instead of having the assistant search for a web page, you can ask it to ingest directly from a URL.

In [11]:
# Pass a URL instead of a search phrase: the output shows that exact URL being
# parsed. The resulting DataFrame is displayed but not bound to a variable.
assistant.create_df('https://time.com/6235186/best-albums-2022/').show()

Parsing URL: https://time.com/6235186/best-albums-2022/

SQL query for the ingestion:
 CREATE OR REPLACE TEMP VIEW best_albums_2022 AS SELECT * FROM VALUES
('Motomami', 'Rosalía'),
('You Can’t Kill Me', '070 Shake'),
('Mr. Morale & The Big Steppers', 'Kendrick Lamar'),
('Big Time', 'Angel Olsen'),
('Electricity', 'Ibibio Sound Machine'),
('It’s Almost Dry', 'Pusha T'),
('Chloe and the Next 20th Century', 'Father John Misty'),
('Renaissance', 'Beyoncé'),
('19 Masters', 'Saya Gray'),
('Un Verano Sin Ti', 'Bad Bunny')
AS v1(album, artist);

Storing data into temp view: best_albums_2022

+--------------------+--------------------+
|               album|              artist|
+--------------------+--------------------+
|            Motomami|             Rosalía|
|   You Can’t Kill Me|           070 Shake|
|Mr. Morale & The ...|      Kendrick Lamar|
|            Big Time|         Angel Olsen|
|         Electricity|Ibibio Sound Machine|
|     It’s Almost Dry|             Pusha T|
|Chloe and th