<a href="https://colab.research.google.com/github/sugarforever/LangChain-Tutorials/blob/main/LangChain_Spark_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connecting OpenAI with Apache Spark

Introduction of [pyspark-ai](https://github.com/databrickslabs/pyspark-ai)

Pyspark-AI takes English instructions and compile them into PySpark objects like DataFrames, to make Spark more user-friendly and accessible, allowing you to focus on extracting insights from your data.

In [1]:
!pip install --quiet --upgrade langchain openai pyspark-ai pyspark

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.0/143.0 kB[0m [3

In [42]:
import os
os.environ['OPENAI_API_KEY'] = 'your openai api key'

1. Initialize the Spark AI instance

In [43]:
from langchain.chat_models import ChatOpenAI
from pyspark_ai import SparkAI

# If 'gpt-4' is unavailable, use 'gpt-3.5-turbo' (might lower output quality)
llm = ChatOpenAI(model_name='gpt-4', temperature=0)

spark_ai = SparkAI(llm=llm, verbose=True)

# Activate partial functions for Spark DataFrame
spark_ai.activate()

2. Create a dataframe via a HTTP URL

In this case, we are fetching the share holders of Apple, one of the best performing stock in US market.

In [None]:
holders_dataframe = spark_ai.create_df("https://finance.yahoo.com/quote/AAPL/holders?p=AAPL")

[92mINFO: [0mParsing URL: https://finance.yahoo.com/quote/AAPL/holders?p=AAPL



INFO:spark_ai:Parsing URL: https://finance.yahoo.com/quote/AAPL/holders?p=AAPL



[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mapple_stock_holders[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Vanguard Group, Inc. (The)'[39;49;00m,[37m [39;49;00m[34m1309785362[39;49;00m,[37m [39;49;00m[33m'Mar 30, 2023'[39;49;00m,[37m [39;49;00m[34m8[39;49;00m.[34m33[39;49;00m,[37m [39;49;00m[34m254059068265[39;49;00m),[37m[39;49;00m
([33m'Blackrock Inc.'[39;49;00m,[37m [39;49;00m[34m1035008939[39;49;00m,[37m [39;49;00m[33m'Mar 30, 2023'[39;49;00m,[37m [39;49;00m[34m6[39;49;00m.[34m58[39;49;00m,[37m [39;49;00m[34m200760685161[39;49;00m),[37m[39;49;00m
([33m'Berkshire Hathaway, Inc'[39;49;00m,[37m [39;49;00m[34m915560382[39;

INFO:spark_ai:SQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mapple_stock_holders[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Vanguard Group, Inc. (The)'[39;49;00m,[37m [39;49;00m[34m1309785362[39;49;00m,[37m [39;49;00m[33m'Mar 30, 2023'[39;49;00m,[37m [39;49;00m[34m8[39;49;00m.[34m33[39;49;00m,[37m [39;49;00m[34m254059068265[39;49;00m),[37m[39;49;00m
([33m'Blackrock Inc.'[39;49;00m,[37m [39;49;00m[34m1035008939[39;49;00m,[37m [39;49;00m[33m'Mar 30, 2023'[39;49;00m,[37m [39;49;00m[34m6[39;49;00m.[34m58[39;49;00m,[37m [39;49;00m[34m200760685161[39;49;00m),[37m[39;49;00m
([33m'Berkshire Hathaway, Inc'[39;49;00m,[37m [39;49;00m[34m915560382[39;4

[92mINFO: [0mStoring data into temp view: apple_stock_holders



INFO:spark_ai:Storing data into temp view: apple_stock_holders



In [None]:
holders_dataframe.show(n=5)

+--------------------+----------+-------------+-----------+------------+
|         holder_name|    shares|date_reported|percent_out|       value|
+--------------------+----------+-------------+-----------+------------+
|Vanguard Group, I...|1309785362| Mar 30, 2023|       8.33|254059068265|
|      Blackrock Inc.|1035008939| Mar 30, 2023|       6.58|200760685161|
|Berkshire Hathawa...| 915560382| Mar 30, 2023|       5.82|177591248414|
|State Street Corp...| 576281774| Mar 30, 2023|       3.66|111781376406|
|            FMR, LLC| 311437576| Mar 30, 2023|       1.98| 60409546996|
+--------------------+----------+-------------+-----------+------------+
only showing top 5 rows



3. Plot

In [None]:
holders_dataframe.ai.plot()

[92mINFO: [0mHere is a Python code snippet that uses Plotly to visualize the data in the PySpark DataFrame `df`. This code assumes that you want to create a bar chart with `holder_name` on the x-axis and `shares` on the y-axis. 


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
[37m[39;49;00m
[37m# Convert the Spark DataFrame to a Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.select([33m"[39;49;00m[33m*[39;49;00m[33m"[39;49;00m).toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart[39;49;00m[37m[39;49;00m
fig = go.Figure(data=go.Bar(x=pandas_df[[33m'[39;49;00m[33mholder_name[39;49;00m[33m'[39;49;00m], y=pandas_df[[33m'[39;49;00m

INFO:spark_ai:Here is a Python code snippet that uses Plotly to visualize the data in the PySpark DataFrame `df`. This code assumes that you want to create a bar chart with `holder_name` on the x-axis and `shares` on the y-axis. 


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
[37m[39;49;00m
[37m# Convert the Spark DataFrame to a Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.select([33m"[39;49;00m[33m*[39;49;00m[33m"[39;49;00m).toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart[39;49;00m[37m[39;49;00m
fig = go.Figure(data=go.Bar(x=pandas_df[[33m'[39;49;00m[33mholder_name[39;49;00m[33m'[39;49;00m], y=pandas_df[[33m'[39;49;00m

In [None]:
holders_dataframe.ai.plot("Pie chart for Apple's top holders, show their name and share percentages")

[92mINFO: [0mHere is a Python code snippet that uses PySpark and Plotly to visualize the result of `df` as a pie chart. This code assumes that the `percent_out` column represents the share percentages of each holder.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m SparkSession[37m[39;49;00m
[37m[39;49;00m
[37m# Assuming that SparkSession is already initialized[39;49;00m[37m[39;49;00m
spark = SparkSession.builder.getOrCreate()[37m[39;49;00m
[37m[39;49;00m
[37m# Convert the Spark DataFrame to a Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a pie chart with Plotly[39;49;00m[37m[39;49;00m
fig = go.Figure(data=[go.Pie(labels=pandas_df[[33m'[39;49;00m[33mholder_n

INFO:spark_ai:Here is a Python code snippet that uses PySpark and Plotly to visualize the result of `df` as a pie chart. This code assumes that the `percent_out` column represents the share percentages of each holder.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m SparkSession[37m[39;49;00m
[37m[39;49;00m
[37m# Assuming that SparkSession is already initialized[39;49;00m[37m[39;49;00m
spark = SparkSession.builder.getOrCreate()[37m[39;49;00m
[37m[39;49;00m
[37m# Convert the Spark DataFrame to a Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a pie chart with Plotly[39;49;00m[37m[39;49;00m
fig = go.Figure(data=[go.Pie(labels=pandas_df[[33m'[39;49;00m[33mholder_na

In [None]:
top_holder_dataframe = holders_dataframe.ai.transform("name with the highest percentage, and its percentage")
top_holder_dataframe.show()

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00mholder_name,[37m [39;49;00m[34mMAX[39;49;00m(percent_out)[37m [39;49;00m[34mas[39;49;00m[37m [39;49;00mmax_percent[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00mtemp_view_for_transform[37m[39;49;00m
[34mGROUP[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mholder_name[37m[39;49;00m
[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mmax_percent[37m [39;49;00m[34mDESC[39;49;00m[37m[39;49;00m
[34mLIMIT[39;49;00m[37m [39;49;00m[34m1[39;49;00m[37m[39;49;00m



INFO:spark_ai:SQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00mholder_name,[37m [39;49;00m[34mMAX[39;49;00m(percent_out)[37m [39;49;00m[34mas[39;49;00m[37m [39;49;00mmax_percent[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00mtemp_view_for_transform[37m[39;49;00m
[34mGROUP[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mholder_name[37m[39;49;00m
[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mmax_percent[37m [39;49;00m[34mDESC[39;49;00m[37m[39;49;00m
[34mLIMIT[39;49;00m[37m [39;49;00m[34m1[39;49;00m[37m[39;49;00m



+--------------------+-----------+
|         holder_name|max_percent|
+--------------------+-----------+
|Vanguard Group, I...|       8.33|
+--------------------+-----------+



4. Explain what the AI did

In [None]:
top_holder_dataframe.ai.explain()

'In summary, this dataframe is retrieving the holder name with the highest percentage of Apple stocks out of all holders. It presents the results sorted by the percentage of stocks in descending order and limits the result to the top holder.'

5. Verify the dataframe attributes by giving the expectatoin in natural language

In [None]:
holders_dataframe.ai.verify("expect Apple's top holders have no more than 10% of shares")

[92mINFO: [0mGenerated code:
[34mdef[39;49;00m [32mcheck_apple_top_holders[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
[37m[39;49;00m
    [37m# Filter the DataFrame for rows where the holder_name is 'Apple'[39;49;00m[37m[39;49;00m
    apple_df = df.filter(col([33m'[39;49;00m[33mholder_name[39;49;00m[33m'[39;49;00m) == [33m'[39;49;00m[33mApple[39;49;00m[33m'[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
    [37m# Check if any row in the filtered DataFrame has a percent_out greater than 10[39;49;00m[37m[39;49;00m
    [34mif[39;49;00m apple_df.filter(col([33m'[39;49;00m[33mpercent_out[39;49;00m[33m'[39;49;00m) > [34m10[39;49;00m).count() > [34m0[39;49;00m:[37m[39;49;00m
        [34mreturn[39;49;00m [34mFalse[39;49;00m[37m[39;49;00m
    [34m

INFO:spark_ai:Generated code:
[34mdef[39;49;00m [32mcheck_apple_top_holders[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
[37m[39;49;00m
    [37m# Filter the DataFrame for rows where the holder_name is 'Apple'[39;49;00m[37m[39;49;00m
    apple_df = df.filter(col([33m'[39;49;00m[33mholder_name[39;49;00m[33m'[39;49;00m) == [33m'[39;49;00m[33mApple[39;49;00m[33m'[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
    [37m# Check if any row in the filtered DataFrame has a percent_out greater than 10[39;49;00m[37m[39;49;00m
    [34mif[39;49;00m apple_df.filter(col([33m'[39;49;00m[33mpercent_out[39;49;00m[33m'[39;49;00m) > [34m10[39;49;00m).count() > [34m0[39;49;00m:[37m[39;49;00m
        [34mreturn[39;49;00m [34mFalse[39;49;00m[37m[39;49;00m
    [34me

[92mINFO: [0m
Result: True


INFO:spark_ai:
Result: True


## More Exciting Use Cases

Let's see more use cases in which we will explore how we can use natual language in data processing and analysis more elegantly with pyspark-ai.

In [57]:
df = spark_ai.create_df("https://coinmarketcap.com/", ['name', 'price', 'market_cap', 'circulating_supply'])

[92mINFO: [0mParsing URL: https://coinmarketcap.com/



INFO:spark_ai:Parsing URL: https://coinmarketcap.com/



[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mcryptocurrencies[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Bitcoin'[39;49;00m,[37m [39;49;00m[33m'$31,050.33'[39;49;00m,[37m [39;49;00m[33m'$602.79B'[39;49;00m,[37m [39;49;00m[33m'19,419,081 BTC'[39;49;00m),[37m[39;49;00m
([33m'Ethereum'[39;49;00m,[37m [39;49;00m[33m'$1,957.54'[39;49;00m,[37m [39;49;00m[33m'$235.29B'[39;49;00m,[37m [39;49;00m[33m'120,219,234 ETH'[39;49;00m),[37m[39;49;00m
([33m'Tether'[39;49;00m,[37m [39;49;00m[33m'$1.00'[39;49;00m,[37m [39;49;00m[33m'$83.35B'[39;49;00m,[37m [39;49;00m[33m'83,341,708,027 USDT'[39;49;00m),[37m[39;49;00m
([33m'BNB'[39;49;00m,[37m

INFO:spark_ai:SQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mcryptocurrencies[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Bitcoin'[39;49;00m,[37m [39;49;00m[33m'$31,050.33'[39;49;00m,[37m [39;49;00m[33m'$602.79B'[39;49;00m,[37m [39;49;00m[33m'19,419,081 BTC'[39;49;00m),[37m[39;49;00m
([33m'Ethereum'[39;49;00m,[37m [39;49;00m[33m'$1,957.54'[39;49;00m,[37m [39;49;00m[33m'$235.29B'[39;49;00m,[37m [39;49;00m[33m'120,219,234 ETH'[39;49;00m),[37m[39;49;00m
([33m'Tether'[39;49;00m,[37m [39;49;00m[33m'$1.00'[39;49;00m,[37m [39;49;00m[33m'$83.35B'[39;49;00m,[37m [39;49;00m[33m'83,341,708,027 USDT'[39;49;00m),[37m[39;49;00m
([33m'BNB'[39;49;00m,[37m 

[92mINFO: [0mStoring data into temp view: cryptocurrencies



INFO:spark_ai:Storing data into temp view: cryptocurrencies



In [58]:
df.show(n=5)

+--------+----------+----------+-------------------+
|    name|     price|market_cap| circulating_supply|
+--------+----------+----------+-------------------+
| Bitcoin|$31,050.33|  $602.79B|     19,419,081 BTC|
|Ethereum| $1,957.54|  $235.29B|    120,219,234 ETH|
|  Tether|     $1.00|   $83.35B|83,341,708,027 USDT|
|     BNB|   $247.17|   $38.52B|    155,850,829 BNB|
|USD Coin|     $1.00|   $27.66B|27,653,660,244 USDC|
+--------+----------+----------+-------------------+
only showing top 5 rows



In [59]:
df.ai.plot('Bar chart with name and price')

[92mINFO: [0mHere is a Python code snippet that uses Plotly to visualize the result of `df` as a bar chart with 'name' and 'price'. This code assumes that the 'price' column contains numerical values stored as strings.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Convert price column to numeric[39;49;00m[37m[39;49;00m
pandas_df[[33m'[39;49;00m[33mprice[39;49;00m[33m'[39;49;00m] = pd.to_numeric(pandas_df[[33m'[39;49;00m[33mprice[39;49;00m[33m'[39;49;00m])[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart with 'name' and 'price'[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df

INFO:spark_ai:Here is a Python code snippet that uses Plotly to visualize the result of `df` as a bar chart with 'name' and 'price'. This code assumes that the 'price' column contains numerical values stored as strings.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Convert price column to numeric[39;49;00m[37m[39;49;00m
pandas_df[[33m'[39;49;00m[33mprice[39;49;00m[33m'[39;49;00m] = pd.to_numeric(pandas_df[[33m'[39;49;00m[33mprice[39;49;00m[33m'[39;49;00m])[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart with 'name' and 'price'[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df,

ValueError: ignored

In [60]:
transformed_df = df.ai.transform(
    """
    The price column is of string in the US currency format with comma separators, and denoted by US dollar sign.
    You must process correctly with such format, and add a custom column 'price_float' that transforms the price column to float type
    """
)

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00m*,[37m[39;49;00m
[37m    [39;49;00m[34mCAST[39;49;00m([34mREPLACE[39;49;00m([34mREPLACE[39;49;00m(price,[37m [39;49;00m[33m'$'[39;49;00m,[37m [39;49;00m[33m''[39;49;00m),[37m [39;49;00m[33m','[39;49;00m,[37m [39;49;00m[33m''[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[36mFLOAT[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mprice_float[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m    [39;49;00mtemp_view_for_transform[37m[39;49;00m



INFO:spark_ai:SQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00m*,[37m[39;49;00m
[37m    [39;49;00m[34mCAST[39;49;00m([34mREPLACE[39;49;00m([34mREPLACE[39;49;00m(price,[37m [39;49;00m[33m'$'[39;49;00m,[37m [39;49;00m[33m''[39;49;00m),[37m [39;49;00m[33m','[39;49;00m,[37m [39;49;00m[33m''[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[36mFLOAT[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mprice_float[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m    [39;49;00mtemp_view_for_transform[37m[39;49;00m



In [61]:
transformed_df.show(n=5)

+--------+----------+----------+-------------------+-----------+
|    name|     price|market_cap| circulating_supply|price_float|
+--------+----------+----------+-------------------+-----------+
| Bitcoin|$31,050.33|  $602.79B|     19,419,081 BTC|   31050.33|
|Ethereum| $1,957.54|  $235.29B|    120,219,234 ETH|    1957.54|
|  Tether|     $1.00|   $83.35B|83,341,708,027 USDT|        1.0|
|     BNB|   $247.17|   $38.52B|    155,850,829 BNB|     247.17|
|USD Coin|     $1.00|   $27.66B|27,653,660,244 USDC|        1.0|
+--------+----------+----------+-------------------+-----------+
only showing top 5 rows



In [62]:
transformed_df.ai.plot('Bar chart with name and float type price')

[92mINFO: [0mHere is the Python code to visualize the result of `df` using plotly. This code assumes that you have already created the `df` dataframe.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart with name and float type price[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df, x=[33m'[39;49;00m[33mname[39;49;00m[33m'[39;49;00m, y=[33m'[39;49;00m[33mprice_float[39;49;00m[33m'[39;49;00m, labels={[33m'[39;49;00m[33mx[39;49;00m[33m'[39;49;00m:[33m'[39;49;00m[33mCryptocurrency[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33my[39;49;00m[33m'[39;49;00m:[33m'[

INFO:spark_ai:Here is the Python code to visualize the result of `df` using plotly. This code assumes that you have already created the `df` dataframe.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart with name and float type price[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df, x=[33m'[39;49;00m[33mname[39;49;00m[33m'[39;49;00m, y=[33m'[39;49;00m[33mprice_float[39;49;00m[33m'[39;49;00m, labels={[33m'[39;49;00m[33mx[39;49;00m[33m'[39;49;00m:[33m'[39;49;00m[33mCryptocurrency[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33my[39;49;00m[33m'[39;49;00m:[33m'[3

In [63]:
transformed_df = transformed_df.ai.transform(
    """
    The circulating_supply column is of string with comma separators, and trailing coin name symbol.
    Add a custom column 'circulating_supply_long' that transforms the circulating_supply column to long type
    """
)

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00m*,[37m[39;49;00m
[37m    [39;49;00m[34mCAST[39;49;00m([34mREPLACE[39;49;00m(SUBSTRING_INDEX(circulating_supply,[37m [39;49;00m[33m' '[39;49;00m,[37m [39;49;00m[34m1[39;49;00m),[37m [39;49;00m[33m','[39;49;00m,[37m [39;49;00m[33m''[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mLONG)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mcirculating_supply_long[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m    [39;49;00mtemp_view_for_transform[37m[39;49;00m



INFO:spark_ai:SQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m
[37m    [39;49;00m*,[37m[39;49;00m
[37m    [39;49;00m[34mCAST[39;49;00m([34mREPLACE[39;49;00m(SUBSTRING_INDEX(circulating_supply,[37m [39;49;00m[33m' '[39;49;00m,[37m [39;49;00m[34m1[39;49;00m),[37m [39;49;00m[33m','[39;49;00m,[37m [39;49;00m[33m''[39;49;00m)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mLONG)[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00mcirculating_supply_long[37m[39;49;00m
[34mFROM[39;49;00m[37m [39;49;00m
[37m    [39;49;00mtemp_view_for_transform[37m[39;49;00m



In [64]:
transformed_df.show(n=5)

+--------+----------+----------+-------------------+-----------+-----------------------+
|    name|     price|market_cap| circulating_supply|price_float|circulating_supply_long|
+--------+----------+----------+-------------------+-----------+-----------------------+
| Bitcoin|$31,050.33|  $602.79B|     19,419,081 BTC|   31050.33|               19419081|
|Ethereum| $1,957.54|  $235.29B|    120,219,234 ETH|    1957.54|              120219234|
|  Tether|     $1.00|   $83.35B|83,341,708,027 USDT|        1.0|            83341708027|
|     BNB|   $247.17|   $38.52B|    155,850,829 BNB|     247.17|              155850829|
|USD Coin|     $1.00|   $27.66B|27,653,660,244 USDC|        1.0|            27653660244|
+--------+----------+----------+-------------------+-----------+-----------------------+
only showing top 5 rows



In [65]:
sorted_df = transformed_df.ai.transform('Sort by circulating_supply_long in a desc order')

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00mtemp_view_for_transform[37m [39;49;00m[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mcirculating_supply_long[37m [39;49;00m[34mDESC[39;49;00m[37m[39;49;00m



INFO:spark_ai:SQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00mtemp_view_for_transform[37m [39;49;00m[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mcirculating_supply_long[37m [39;49;00m[34mDESC[39;49;00m[37m[39;49;00m



In [66]:
sorted_df.show(n = 5)

+--------+--------+----------+--------------------+-----------+-----------------------+
|    name|   price|market_cap|  circulating_supply|price_float|circulating_supply_long|
+--------+--------+----------+--------------------+-----------+-----------------------+
|Dogecoin|$0.06809|    $9.53B|140,024,896,384 DOGE|    0.06809|           140024896384|
|  Tether|   $1.00|   $83.35B| 83,341,708,027 USDT|        1.0|            83341708027|
|     XRP| $0.4886|   $25.53B|  52,254,289,650 XRP|     0.4886|            52254289650|
| Cardano| $0.2963|   $10.36B|  34,953,231,640 ADA|     0.2963|            34953231640|
|USD Coin|   $1.00|   $27.66B| 27,653,660,244 USDC|        1.0|            27653660244|
+--------+--------+----------+--------------------+-----------+-----------------------+
only showing top 5 rows



In [67]:
sorted_df.ai.plot('Bar chart with name and long type circulating supply value')

[92mINFO: [0mHere is the Python code to visualize the result of `df` using plotly. This code will create a bar chart with the name of the cryptocurrency on the x-axis and the long type circulating supply value on the y-axis.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mio[39;49;00m [34mas[39;49;00m [04m[36mpio[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df, x=[33m'[39;49;00m[33mname[39;49;00m[33m'[39;49;00m, y=[33m'[39;49;00m[33mcirculating_supply_long[39;49;00m[33m'[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
[37m# Display the plot[39;49;00m[37m[39;49;00m
p

INFO:spark_ai:Here is the Python code to visualize the result of `df` using plotly. This code will create a bar chart with the name of the cryptocurrency on the x-axis and the long type circulating supply value on the y-axis.


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mexpress[39;49;00m [34mas[39;49;00m [04m[36mpx[39;49;00m[37m[39;49;00m
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mio[39;49;00m [34mas[39;49;00m [04m[36mpio[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart[39;49;00m[37m[39;49;00m
fig = px.bar(pandas_df, x=[33m'[39;49;00m[33mname[39;49;00m[33m'[39;49;00m, y=[33m'[39;49;00m[33mcirculating_supply_long[39;49;00m[33m'[39;49;00m)[37m[39;49;00m
[37m[39;49;00m
[37m# Display the plot[39;49;00m[37m[39;49;00m
pi