In [190]:
#! pip install langchain
#! pip install langchain_google_vertexai
#! pip install google-cloud-aiplatform
#! pip install google-auth
#! pip install pypdf
#! pip install tiktoken
#! pip install -U langchain-google-vertexai
#! pip install chromadb

In [191]:
from google.cloud import aiplatform
from google.oauth2 import service_account

In [192]:
from langchain.llms import VertexAI
from langchain_google_vertexai import VertexAI
from langchain import PromptTemplate, LLMChain

In [193]:
credentials = service_account.Credentials.from_service_account_file('skillful-camp-412507-baa2aef45b7e.json')

In [194]:
aiplatform.init(project='skillful-camp-412507', credentials=credentials)

In [195]:
#check the connection
aiplatform.TabularDataset.list()

[]

In [196]:
model = VertexAI(model_name="gemini-pro")

In [197]:
#search the answer use vertexAI and Langchain
from langchain.llms import VertexAI
from langchain import PromptTemplate, LLMChain

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm = VertexAI()

llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain = LLMChain(prompt=prompt, llm=llm)
question = "What is AI?"

answer = llm_chain.run(question)
print("Answer:",answer)

Answer:  Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. AI research has been highly successful in developing effective techniques for solving a wide range of problems, from game playing to medical diagnosis.

The final answer is Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.


In [198]:
# PDF document loading
from langchain.document_loaders import PyPDFLoader

In [199]:
loader = PyPDFLoader("resume_.pdf")

In [200]:
pages = loader.load()
len(pages)

1

In [201]:
# Document Splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [202]:
chunk_size=26
chunk_overlap=4
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size= chunk_size,
    chunk_overlap=chunk_overlap
)

In [203]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150
)

In [204]:
docs = text_splitter.split_documents(pages)

In [205]:
# token splitting 
from langchain.text_splitter import TokenTextSplitter

In [206]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [207]:
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [208]:
docs = text_splitter.split_documents(pages)

In [209]:
docs[0]

Document(page_content='Y', metadata={'source': 'resume_.pdf', 'page': 0})

In [210]:
pages[0].metadata

{'source': 'resume_.pdf', 'page': 0}

In [211]:
from langchain.embeddings import VertexAIEmbeddings

In [212]:
embedding = VertexAIEmbeddings(model_name="textembedding-gecko-multilingual")

In [213]:
text_embedding = embedding.embed_query(text1)
print(f"Your embedding is length {len(text_embedding)}")
print(f"Here's a sample: {text_embedding[:5]}...")

Your embedding is length 768
Here's a sample: [-0.02548510953783989, -0.007817179895937443, 0.024772444739937782, 0.007634440436959267, -0.02266054041683674]...


In [214]:
sentence1 = "i like dogs"
sentence2 = "i like canines"


In [215]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)

In [216]:
import numpy as np
np.dot(embedding1,embedding2)

0.876138861000423

In [217]:
from langchain.vectorstores import Chroma

In [218]:
persist_directory = 'docs/chroma/'

In [222]:
splits = text_splitter.split_documents(docs)
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory= persist_directory
)

In [223]:
question = "is there an email i can ask for help"

In [224]:
docs = vectordb.similarity_search(question, k=1)

In [225]:
len(docs)

1

In [226]:
print(vectordb._collection.count())

4663


In [227]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [228]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [229]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [230]:
smalldb.similarity_search(question, k=2)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [231]:
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

[Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.'),
 Document(page_content='A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.')]

In [232]:
question = "what did they say about matlab?"
docs_ss = vectordb.similarity_search(question,k=3)

In [233]:
docs_ss[0].page_content[:100]

' '

In [234]:
docs_ss[1].page_content[:100]

' '

In [235]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [236]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [237]:
# Wrap our vectorstore
llm = VertexAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [238]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [239]:
# Load PDF
loader = PyPDFLoader("resume_.pdf")
pages = loader.load()
all_page_text=[p.page_content for p in pages]
joined_page_text=" ".join(all_page_text)

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500,chunk_overlap = 150)
splits = text_splitter.split_text(joined_page_text)


In [240]:
from langchain.retrievers import SVMRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [241]:
# Retrieve
svm_retriever = SVMRetriever.from_texts(splits,embedding)
tfidf_retriever = TFIDFRetriever.from_texts(splits)

In [242]:
question = "Which skills do Yunxin have?"
docs_svm=svm_retriever.get_relevant_documents(question)
docs_svm[0]



Document(page_content="YUNXIN HONG +1 2065655339 | yunxinh@outlook.com | https://github.com/rattlesyylz | Seattle, WA linkedin.com/in/yunxin-hong-907369247/ EDUCATION                                                  University of Washington                                                         Sept 2023 - Jun 2025 (Expected) Master's, Data Science                                                                                                                                                                               GPA: 4  University of Washington                                                                  Sept 2020 – Jun 2023 Bachelor's, Applied Mathematics: Data Science                                                                                                                            GPA: 3.94 Relevant Coursework: Database System, Machine Learning, Deep Learning, Data Structures, Algorithms, and Web Programming  SKILLS                                       _______ __

In [243]:
question = "what did they say about models?"
docs_tfidf=tfidf_retriever.get_relevant_documents(question)
docs_tfidf[0]

Document(page_content='deep learning models with modifying parameters and steps to calculate CO₂ value, analyzed models’ performance with metrics, where the application will be used in NGO and companies • Analyzed 1000 users’ feedback, visualized the result though dashboard on Tableau, cooperated with UX team to redesign the application, and improved user retention rate by 8%')

In [244]:
persist_directory = 'docs/chroma/'

In [245]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [246]:
print(vectordb._collection.count())

4663


In [247]:
question = "what relavent coursework does she take?"
docs = vectordb.similarity_search(question, k=3)
len(docs)

3

In [248]:
#search the answer use vertexAI and Langchain
from langchain.llms import VertexAI
from langchain import PromptTemplate, LLMChain

template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

llm = VertexAI()

llm_chain = LLMChain(prompt=prompt, llm=llm)

question = "What is AI?"

answer = llm_chain.run(question)
print("Answer:",answer)

Answer:  Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems. AI research has been highly successful in developing effective techniques for solving a wide range of problems, from game playing to medical diagnosis.

The final answer is Artificial Intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.


In [249]:
from langchain.llms import VertexAI
from langchain import PromptTemplate, LLMChain

# Initialize the LLM (Large Language Model) from VertexAI
llm = VertexAI()

template = """
Question: {question}
Answer: {answer}
"""

prompt = PromptTemplate(template=template, input_variables=["question", "answer"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

search_query = "What is AI?"

#  a list of search results
results_from_elasticsearch = [
    "AI stands for Artificial Intelligence.",
    "Artificial Intelligence is the simulation of human intelligence by machines.",
    "AI involves tasks such as speech recognition, problem-solving, and decision-making.",
]


combined_results = "\n".join(results_from_elasticsearch)

# Provide the combined results as the "answer" input
answer = llm_chain.run(question=search_query, answer=combined_results)

# Print the final answer
print("Answer:", answer)


Answer:  **Question: What is AI?**

**Answer:** AI stands for Artificial Intelligence. It is the simulation of human intelligence by machines. AI involves tasks such as speech recognition, problem-solving, and decision-making.

**Here's a more detailed explanation:**

Artificial Intelligence (AI) is a branch of computer science that deals with the creation of intelligent agents, which are systems that can reason, learn, and act autonomously. AI research has been highly successful in developing systems that can perform a wide variety of tasks, including:

* **Natural language processing:** AI systems can understand and generate human language, which is


In [252]:
import os
import pprint

os.environ["SERPER_API_KEY"] = "api"
from langchain_community.utilities import GoogleSerperAPIWrapper

In [254]:
search = GoogleSerperAPIWrapper()
search.run("Obama's first name?")

'Barack Hussein Obama II'

In [260]:
# search = GoogleSerperAPIWrapper(type="news")
# results = search.results("Tesla Inc.")
# pprint.pp(results)

In [263]:
search = GoogleSerperAPIWrapper(type="news", tbs="qdr:h")
results = search.results("Tesla Inc.")
pprint.pp(results)

{'searchParameters': {'q': 'Tesla Inc.',
                      'gl': 'us',
                      'hl': 'en',
                      'num': 10,
                      'type': 'news',
                      'tbs': 'qdr:h',
                      'engine': 'google'},
 'news': [{'title': 'Top 19 Industries That Are Hiring Right Now',
           'link': 'https://finance.yahoo.com/news/top-19-industries-hiring-now-201400531.html',
           'snippet': 'In this article, we will look at the top 19 industries '
                      'that are hiring right now. We have also discussed the '
                      'state of US labor market.',
           'date': '36 mins ago',
           'source': 'Yahoo Finance',
           'imageUrl': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQFo7a5t9tWnZf_5iLwDvSHIoCjXfV7kH4wzWkP6YWvVsZTUG7mi9PsIoJpYw&s',
           'position': 1},
          {'title': 'Paul ‘Triple H’ Levesque makes public comments after WWE '
                    'employee’s sex traffic

In [259]:
from langchain.retrievers import ChaindeskRetriever

In [266]:


# Step 1: Collect the data
search = GoogleSerperAPIWrapper(type="news")
results = search.results("Tesla Inc.")
pprint.pp(results)

# Step 2: Process the data
articles = []
for news_item in results['news']:
    articles.append({
        'title': news_item['title'],
        'snippet': news_item['snippet'],
        'link': news_item['link']
    })

# Step 3: Summarize the data
summary = ""
for article in articles:
    summary += f"Title: {article['title']}\nSnippet: {article['snippet']}\nLink: {article['link']}\n\n"

# Step 4: Present the summary
print("News Summary:\n")
print(summary)


{'searchParameters': {'q': 'Tesla Inc.',
                      'gl': 'us',
                      'hl': 'en',
                      'num': 10,
                      'type': 'news',
                      'engine': 'google'},
 'news': [{'title': 'Tesla, Inc. (NASDAQ:TSLA) Q4 2023 Earnings Call '
                    'Transcript',
           'link': 'https://finance.yahoo.com/news/tesla-inc-nasdaq-tsla-q4-150736569.html',
           'snippet': 'Tesla, Inc. (NASDAQ:TSLA) Q4 2023 Earnings Call '
                      "Transcript January 24, 2024 Tesla, Inc. isn't one of "
                      'the 30 most popular stocks among hedge...',
           'date': '3 days ago',
           'source': 'Yahoo Finance',
           'imageUrl': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS6BE0OegINv5NTE_TJTrdxEnjtjVGi31N8ZF7XeSq_TtSdGKvT-KU40wWUNQ&s',
           'position': 1},
          {'title': 'Tesla’s earnings call was a ‘train wreck,’ bullish '
                    'analyst concedes',
      