In [22]:
#!pip install openai

In [23]:
#!pip -q install langchain tiktoken chromadb pypdf transformers InstructorEmbedding sentence_transformers

In [24]:
#!pip install langchain --upgrade

In [25]:
#!pip -q install accelerate bitsandbytes sentencepiece google.generativeai tensorflow_text

In [26]:
!#pip install langchain --upgrade

In [27]:
#!pip install faiss-cpu tabulate

In [1]:
from langchain import PromptTemplate, LLMChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.chains import RetrievalQA
from langchain.document_loaders import DataFrameLoader
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.utilities import GoogleSerperAPIWrapper
from langchain.vectorstores import FAISS

import pandas as pd
import os
os.environ["OPENAI_API_KEY"] = ""

In [2]:
df = pd.read_pickle("../data/raw_data1.pkl")  

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 852 entries, 0 to 402
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date      852 non-null    datetime64[ns]
 1   source    852 non-null    object        
 2   text      852 non-null    object        
 3   title     852 non-null    object        
 4   chairman  852 non-null    object        
 5   variable  852 non-null    object        
dtypes: datetime64[ns](1), object(5)
memory usage: 46.6+ KB


In [4]:
df['source'] = df['date'].dt.strftime('%Y-%m-%d') + '_' + df['variable']  # Concatenate "date" and "variable" columns

In [5]:
# Split the text into paragraphs
df['paragraphs'] = df['text'].apply(lambda x: x.split('\n'))

In [71]:
# Explode the DataFrame on the 'paragraphs' column
df_exploded = df.explode('paragraphs')

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize the paragraphs
vectorizer = CountVectorizer(stop_words='english')
data_vectorized = vectorizer.fit_transform(df_exploded['paragraphs'])

# Apply LDA
lda_model = LatentDirichletAllocation(n_components=10, random_state=0)
lda_model.fit(data_vectorized)

# Create a DataFrame with the topic probabilities for each paragraph
df_topics = pd.DataFrame(lda_model.transform(data_vectorized))

In [73]:
# Reset the index of df_exploded
df_exploded.reset_index(drop=True, inplace=True)

# Now you can concatenate df_exploded and df_topics
df_exploded = pd.concat([df_exploded, df_topics], axis=1)

In [74]:
# Get the feature names
tf_feature_names = [word for word, index in sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])]

# Get the top words for each topi|c
topics = [[(tf_feature_names[i], topic[i]) 
            for i in topic.argsort()[:-5 - 1:-1]] 
          for topic_idx, topic in enumerate(lda_model.components_)]

# Step 1 & 2: Identify the maximum probability and corresponding topic
df_exploded['dominant_topic'] = df_exploded[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]].idxmax(axis=1)
df_exploded['dominant_topic'] = df_exploded['dominant_topic'].astype(str)

# Map the topic names to their textual representation
topics_dict = {str(i): '|'.join([word for word, _ in topic]) for i, topic in enumerate(topics)}  
df_exploded['dominant_topic'] = df_exploded['dominant_topic'].map(topics_dict)

In [75]:
df_exploded.drop(columns=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], inplace=True)

In [76]:
df_exploded.head()

Unnamed: 0,date,source,text,title,chairman,variable,paragraphs,dominant_topic
0,1994-02-04,1994-02-04_fomc_minutes,"\n\n\n\nFRB: FOMC Minutes--February 3-4, 1994\...",Minutes of the Federal Open Market Committee,Alan Greenspan,fomc_minutes,,mr|statement|january|30|29
1,1994-02-04,1994-02-04_fomc_minutes,"\n\n\n\nFRB: FOMC Minutes--February 3-4, 1994\...",Minutes of the Federal Open Market Committee,Alan Greenspan,fomc_minutes,,mr|statement|january|30|29
2,1994-02-04,1994-02-04_fomc_minutes,"\n\n\n\nFRB: FOMC Minutes--February 3-4, 1994\...",Minutes of the Federal Open Market Committee,Alan Greenspan,fomc_minutes,,mr|statement|january|30|29
3,1994-02-04,1994-02-04_fomc_minutes,"\n\n\n\nFRB: FOMC Minutes--February 3-4, 1994\...",Minutes of the Federal Open Market Committee,Alan Greenspan,fomc_minutes,,mr|statement|january|30|29
4,1994-02-04,1994-02-04_fomc_minutes,"\n\n\n\nFRB: FOMC Minutes--February 3-4, 1994\...",Minutes of the Federal Open Market Committee,Alan Greenspan,fomc_minutes,"FRB: FOMC Minutes--February 3-4, 1994",mr|statement|january|30|29


In [81]:
# save df_exploded to csv
df_exploded.to_csv('../data/df_exploded.csv', index=False)

In [59]:
llm = OpenAI(model_name="text-davinci-003", n=2,temperature=0.1, best_of=2, max_tokens=2048)

In [60]:
dates_template = """The dataframe contains data from FOMC statements, minutes, press conference rows each identified by a unique date (in the format YYYY-MM-DD) and a variable.
 The other columns are 'source', 'text', 'title', and 'chairman'. 
Based on a user's query, you need to return the appropriate 'year' and 'month'separated by a comma.

User's Query: {query}  
Answer: """


dates_prompt = PromptTemplate(template=dates_template, input_variables=["query"])
dates_chain = LLMChain(prompt=dates_prompt, llm=llm)

In [82]:
topics_template = """The dataframe contains data from FOMC statements, minutes, press conference divided into paras into each row. 
The columns are 'date', 'variable', 'source', 'text', 'title', and 'chairman'. 
The dataframe also includes topic probabilities for each paragraph.
Based on a user's query and a list of topics, return the TOP 5 most relevant topics needed to answer user's query separated by comma.

List of Topics: {topics}
User's Query: {query}  
Answer: """

topics_prompt = PromptTemplate(template=topics_template, input_variables=["topics", "query"])
topics_chain = LLMChain(prompt=topics_prompt, llm=llm)

In [83]:
def get_topics_from_query(topics_string, question):
    topics = topics_string.split('|')
    response = topics_chain.run({"topics": topics, "query": question})
    # Assuming the response is the most relevant topics separated by comma
    relevant_topics = response.split(',')
    return relevant_topics

In [88]:
def get_rows_from_query(df, question):
    response = dates_chain.run(question)
    # Assuming the response is in the format "year,month"
    year, month = response.split(',')
    # Filter the dataframe based on the date
    relevant_rows = df[(df['date'].dt.year == int(year)) & (df['date'].dt.month == int(month))]

    # Get a list of unique topics from the relevant rows
    topics_string = '|'.join(relevant_rows['dominant_topic'].unique().tolist())

    # Get the most relevant topics for the query
    relevant_topics = get_topics_from_query(topics_string, question)
    relevant_topics = [topic.strip() for topic in relevant_topics]

    # Filter the relevant rows based on the most relevant topics
    relevant_rows = relevant_rows[relevant_rows['dominant_topic'].apply(lambda x: any(topic in x for topic in relevant_topics))]

    return relevant_rows

In [89]:
get_rows_from_query(df_exploded, "What are the major topics discussed in the February 2023 meeting?")

Unnamed: 0,date,source,text,title,chairman,variable,paragraphs,dominant_topic
910902,2023-02-01,2023-02-01_fomc_presconf,"February 1, 2023 Chair Powell’s Press Confer...",Press Conference Transcript,Jerome H. Powell,fomc_presconf,Page 1 of 23,market|federal|reserve|committee|open
910906,2023-02-01,2023-02-01_fomc_presconf,"February 1, 2023 Chair Powell’s Press Confer...",Press Conference Transcript,Jerome H. Powell,fomc_presconf,"CHAIR POWELL . Good afternoon, and welcome. ...",mr|chairman|thank|president|board
910907,2023-02-01,2023-02-01_fomc_presconf,"February 1, 2023 Chair Powell’s Press Confer...",Press Conference Transcript,Jerome H. Powell,fomc_presconf,"hardship that high inflation is causing, and w...",policy|inflation|monetary|committee|term
910908,2023-02-01,2023-02-01_fomc_presconf,"February 1, 2023 Chair Powell’s Press Confer...",Press Conference Transcript,Jerome H. Powell,fomc_presconf,down to our 2 percent goal. Over the past yea...,policy|inflation|monetary|committee|term
910909,2023-02-01,2023-02-01_fomc_presconf,"February 1, 2023 Chair Powell’s Press Confer...",Press Conference Transcript,Jerome H. Powell,fomc_presconf,stance of monetary policy. We have covered a ...,policy|inflation|monetary|committee|term
...,...,...,...,...,...,...,...,...
911833,2023-02-01,2023-02-01_fomc_statements,inks provided below. \n\n\n\n\n\n\nFebruary 01...,Federal Reserve issues FOMC statement,Jerome H. Powell,fomc_statements,Link to Federal Reserve YouTube Page,market|federal|reserve|committee|open
911834,2023-02-01,2023-02-01_fomc_statements,inks provided below. \n\n\n\n\n\n\nFebruary 01...,Federal Reserve issues FOMC statement,Jerome H. Powell,fomc_statements,Link to Federal Reserve Flickr Page,market|federal|reserve|committee|open
911835,2023-02-01,2023-02-01_fomc_statements,inks provided below. \n\n\n\n\n\n\nFebruary 01...,Federal Reserve issues FOMC statement,Jerome H. Powell,fomc_statements,Federal Reserve LinkedIn Page,market|federal|reserve|committee|open
911837,2023-02-01,2023-02-01_fomc_statements,inks provided below. \n\n\n\n\n\n\nFebruary 01...,Federal Reserve issues FOMC statement,Jerome H. Powell,fomc_statements,Subscribe to Email,mr|chairman|thank|president|board


In [90]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(str(source.metadata['date']) + '_' + source.metadata['variable'])

In [96]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

def answer_query(df, query):
    rows = get_rows_from_query(df, query)

    # Combine rows with the same 'source' into a single row
    combined_rows = rows.groupby('source').agg({
        'date': 'first',
        'text': ' '.join,
        'title': 'first',
        'chairman': 'first',
        'dominant_topic': 'first',
        'variable': 'first',
        # Include any other columns that you want to keep
    }).reset_index()

    loader = DataFrameLoader(combined_rows, page_content_column="source")
    documents = loader.load()     

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)

    retriever = FAISS.from_documents(texts, OpenAIEmbeddings()).as_retriever(search_type="mmr") 

    """compressor = LLMChainExtractor.from_llm(llm)
    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)"""

    qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  chain_type="stuff", 
                                  retriever=retriever, 
                                  return_source_documents=True)

    return qa_chain(query)


In [97]:
query = "What is the sentiment around inflation in the May 2022 meeting based on the documents?"
llm_response = answer_query(df_exploded, query)
process_llm_response(llm_response)

 The sentiment around inflation in the May 2022 meeting can be found in the FOMC statements, press conference,
and minutes.


Sources:
2022-05-04 00:00:00_fomc_statements
2022-05-04 00:00:00_fomc_presconf
2022-05-04 00:00:00_fomc_minutes


In [None]:
#@title Default title text
#Define the scope of the chatbot : specific range of info or functionality that our bot will cover. Identify boundaries of our bot.
# sample general questions

# When is the next FOMC meeting?
# What is the purpose of the FOMC meetings?
# How often does the FOMC meet?
# What are the key factors considered during FOMC meetings?
# What is the current interest rate set by the FOMC?
# What was the outcome of the last FOMC meeting?
#   
# Can you explain the FOMC statement from the latest meeting?
# How does the FOMC influence the economy?
# Are FOMC meetings open to the public?

#sample questions that an investor can pose to the chatbot

# What was the outcome of the latest FOMC meeting, and how did it affect interest rates?
# Did the FOMC provide any guidance on future interest rate changes during the meeting?
# What was the rationale behind the FOMC's decision to raise/lower/keep interest rates unchanged?
# How did the market react to the FOMC's decision, and what are the implications for my investment portfolio?
# Were there any notable statements or remarks made by the FOMC chair or committee members during the meeting?
# Did the FOMC provide any insights or projections regarding the state of the economy and its impact on monetary policy?
# Were there any changes in the FOMC's assessment of inflation, employment, or other key economic indicators?
# How do the FOMC's decisions align with my investment strategy or outlook?
# What are the expectations for future FOMC meetings and potential policy shifts?
# Are there any specific sectors or industries that are likely to be affected by the FOMC's decisions?
# How does the FOMC's decision on interest rates align with the current economic conditions?
# Did the FOMC provide any forward guidance on the pace or direction of future interest rate changes?
# What are the potential implications of the FOMC's decision for bond markets and fixed-income investments?
# How might the FOMC's decision impact currency exchange rates and international investments?
# Did the FOMC express any concerns or considerations regarding financial stability or potential risks to the economy?
# Are there any specific sectors or industries that are expected to benefit or face challenges as a result of the FOMC's decision?
# How might the FOMC's decision impact borrowing costs for businesses and consumers?
# Did the FOMC provide any insights into its views on inflation expectations and their potential impact on investment strategies?
# What are the potential implications of the FOMC's decision for equity markets and specific stocks or sectors?
# Are there any changes in the FOMC's assessment of the economic outlook that might warrant adjustments to investment strategies?

#prompts for comparing topics

# "Can you provide a comparison of interest rate changes by the FOMC over the past five years?"
# "What were the key economic indicators discussed in FOMC meetings during the last quarter, and how do they compare to the previous quarter?"
# "How have the FOMC's statements on inflation evolved over the past 10 years, and what trends can be observed?"
# "Could you outline the decisions made by the FOMC regarding quantitative easing programs and their impact on the stock market over the past three years?"
# "What were the FOMC's projections for GDP growth in the past five years, and how did the actual outcomes compare to those projections?"
# "Has there been a notable change in the FOMC's tone or language regarding its stance on interest rates over the past year?"
# "Can you provide a historical comparison of the FOMC's policy statements and their impact on bond yields over the past decade?"
# "How have market reactions differed based on the FOMC's decisions to raise, lower, or keep interest rates unchanged over the past five years?"
# "What trends can be observed in the FOMC's discussions on employment and its relationship to monetary policy decisions over the past two years?"
# "Has there been any significant shift in the FOMC's outlook on global economic factors and their impact on the domestic economy over the past three years?"
# "Compare the FOMC's interest rate decisions during periods of economic expansion versus economic contraction over the past decade."
# "Provide a historical comparison of the FOMC's response to financial crises and their impact on equity markets."
# "What are the differences in the FOMC's statements regarding inflation expectations during periods of low versus high inflation over the past five years?"
# "Compare the FOMC's outlook on employment and its relationship to interest rate decisions during periods of rapid economic growth versus periods of stagnation."
# "Has there been a notable shift in the FOMC's discussions on global trade and its influence on monetary policy decisions over the past three years?"
# "Compare the FOMC's responses to geopolitical events and their effect on market volatility over the past decade."
# "What patterns can be observed in the FOMC's decisions to taper or expand quantitative easing programs during periods of economic uncertainty?"
# "Compare the FOMC's language on financial stability risks and its impact on bond markets during periods of market turbulence over the past five years."
# "Has there been a significant change in the FOMC's approach to forward guidance and its impact on market expectations over the past two years?"
# "Compare the FOMC's statements on wage growth and its influence on inflation targeting during periods of tight labor markets versus periods of high unemployment."

In [14]:
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType

In [17]:
agent = create_pandas_dataframe_agent(OpenAI(temperature=0), df, verbose=True)

In [19]:
agent.run("How many records are there in the dataframe?")



[1m> Entering new  chain...[0m


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 413 from API (<html>
<head><title>413 Request Entity Too Large</title></head>
<body>
<center><h1>413 Request Entity Too Large</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
).
Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: HTTP code 413 from API (<html>
<head><title>413 Request Entity Too Large</title></head>
<body>
<center><h1>413 Request Entity Too Large</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
).


KeyboardInterrupt: 