In [None]:
!pip install langchain google-cloud-aiplatform google-auth

In [None]:
from langchain.llms import VertexAI
from langchain import PromptTemplate, LLMChain

In [None]:
import os

# Name the service-account key file used for Google authentication.
# setdefault keeps a value that is already present in the environment, so a
# differently configured machine is not silently overridden by this path.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/workspace/datasets/lc_vertexai_exp/splendid-sector-350209-132a7f9ec3e2.json',
)

In [None]:
# Prompt text for the code-writing chain: a fixed preamble followed by the
# caller's request, substituted for the {request} placeholder.
template_pythonista = """You are an expert pythonista
who can write code based on the request below
{request}"""

# "request" is the only variable the template expects.
prompt_pythonista = PromptTemplate(
    input_variables=["request"],
    template=template_pythonista,
)

In [None]:
# Vertex AI LLM (temperature 0.1) shared by every chain and agent in this notebook.
llm = VertexAI(temperature=0.1)

In [None]:
# Quick check: send a plain question straight to the LLM, with no chain or prompt template.
llm("What is color of flamingo?")

In [None]:
# Chain that fills the pythonista prompt and sends it to the LLM.
llm_chain_profile = LLMChain(prompt=prompt_pythonista, llm=llm)

In [None]:
# Run the chain on a single request; the result is later indexed by its 'text' key.
python_code = llm_chain_profile("""Write code to automate youtube comments extraction""")

In [None]:
# Print the value stored under 'text' in the chain result.
print(python_code['text'])

In [None]:
!pip install pyspark

In [None]:
from langchain.agents import create_spark_sql_agent
from langchain.agents.agent_toolkits import SparkSQLToolkit
from langchain.utilities.spark_sql import SparkSQL

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Create (if needed) and switch to the database that will hold the example table.
db_name = "langchain_example"
spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name}")
spark.sql(f"USE {db_name}")

csv_file_path = "/workspace/datasets/lc_vertexai_exp/space_titanic.csv"
table = "space_titanic"

# Load the CSV (first row is the header, column types are inferred) and save
# it as a table. mode("overwrite") replaces a table left by an earlier run;
# with the default save mode, re-running this cell raises because the table
# already exists.
(
    spark.read
    .csv(csv_file_path, header=True, inferSchema=True)
    .write
    .mode("overwrite")
    .saveAsTable(table)
)

# Preview the first three rows of the saved table.
spark.table(table).show(3)

In [None]:
# Expose the database named by db_name to LangChain.
# Connecting through Spark Connect is also possible, for example:
# db = SparkSQL.from_uri("sc://localhost:15002", schema=schema)
spark_sql = SparkSQL(schema=db_name)

# Toolkit that pairs the Spark SQL wrapper with the LLM.
toolkit = SparkSQLToolkit(llm=llm, db=spark_sql)

# SQL agent built from the toolkit; verbose=True is passed so the agent
# reports what it is doing while it runs.
agent_executor = create_spark_sql_agent(verbose=True, toolkit=toolkit, llm=llm)

In [None]:
# Ask the Spark SQL agent which tables exist.
agent_executor.run("List the available tables")

In [None]:
# Ask the Spark SQL agent to describe the space_titanic table.
agent_executor.run("Describe space_titanic table")

In [None]:
# Aggregation question put to the Spark SQL agent: the most common home planet.
agent_executor.run("Which is the most frequent Home planet")

In [None]:
# Ask about last names directly, without telling the agent how to derive them.
agent_executor.run("Which last name is more frequent")

In [None]:
# Same last-name question, this time spelling out the step of splitting the Name column.
agent_executor.run("""Split the Name column into first and last name
then find which last name is more frequent""")

## Spark DataFrame

In [None]:
# Read the same CSV into a DataFrame: the first row supplies the column names
# and column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(csv_file_path)

# Preview the first three rows.
df.show(3)

In [None]:
from langchain.agents import create_spark_dataframe_agent
# NOTE(review): OpenAI and os are imported here but not used in this cell.
from langchain.llms import OpenAI
import os

# Agent that answers questions about the Spark DataFrame `df` using the
# shared LLM; verbose=True is passed so it reports what it is doing.
agent = create_spark_dataframe_agent(llm=llm, df=df, verbose=True)

In [None]:
# Ask the DataFrame agent to split the Name column and create a new table named table1.
agent.run("""Split the Name column into first and last name and 
create new table named table1""")

In [None]:
# Last-name frequency question, this time put to the DataFrame agent.
agent.run("""Split the Name column into first and last name then 
find the most frequent last name""")

In [None]:
# Food Court spending question for the DataFrame agent; the prompt itself
# carries a link to the PySpark DataFrame API reference as a hint.
agent.run("""
who spent the most in Food Court?
You can find all supported function types in https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html""")

In [None]:
# Put the Food Court question to the Spark SQL agent (agent_executor) as well.
agent_executor.run("who spent the most in Food Court?")

## Let's chat with PDF

In [None]:
!pip install faiss-cpu sentence-transformers nltk

In [None]:
# Create linux_play.txt in the current working directory if it does not exist.
# NOTE(review): the following cell reads
# /workspace/datasets/lc_vertexai_exp/linux_play.txt, which is a different file
# unless that directory is the working directory — confirm this is intended.
!touch linux_play.txt

In [None]:
# This is a long document we can split up.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (assumes the file is UTF-8 — TODO confirm).
with open('/workspace/datasets/lc_vertexai_exp/linux_play.txt', encoding='utf-8') as f:
    vimPlay = f.read()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Deliberately tiny chunks, just for demonstration: 300 characters each with
# 100 characters of overlap, where length is measured with the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,
    chunk_overlap=100,
    chunk_size=300,
)

In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): nothing else visible in this notebook references nltk — confirm it is needed.
import nltk
nltk.download('punkt')

In [None]:
# Split the loaded text into chunks using the splitter configured above.
docs = text_splitter.split_text(vimPlay)

In [None]:
# Inspect the first chunk.
docs[0]

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
# Embedding model used to vectorize the text chunks.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

In [None]:
# Display the model name held by the embeddings object.
embeddings.model_name

In [None]:
# Build a FAISS vector store from the text chunks using the embedding model.
db = FAISS.from_texts(docs, embeddings)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Question-answering chain: the FAISS store serves as the retriever and the
# shared LLM produces the answer, using the "stuff" chain type.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type="stuff",
)

In [None]:
# General question answered through the retrieval QA chain.
chain.run("Tell me about Linux?")

In [None]:
# Ask the retrieval QA chain about the tasks in the playbook.
chain.run("What are the tasks in Linux playbook?")

In [None]:
# Ask the retrieval QA chain where the Vim exercises take place.
chain.run("Where the Vim exercises needs to be done?")

In [None]:
# Query the vector store directly to see which chunks match this question,
# bypassing the QA chain.
db.similarity_search("is docker necessary to work on Vim Playbook?")

In [None]:
# Same question, now answered through the retrieval QA chain.
chain.run("is docker necessary to work on Vim Playbook?")

In [None]:
# Counting question put to the retrieval QA chain.
chain.run("How many are activities related to splits")