### Installing Neccessary Packages

In [1]:
from pinecone import Pinecone, ServerlessSpec

from langchain_openai import AzureOpenAIEmbeddings
from langchain_openai import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader
from langchain.llms import AzureOpenAI

from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
import azure.identity

from langchain import PromptTemplate, LLMChain
from langchain.chains import RetrievalQA
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

import os
import pyodbc
import json
import requests
from datetime import datetime
import pandas as pd
from datetime import datetime, timedelta


import warnings
warnings.filterwarnings('ignore')

### Connecting To Azure Open AI

In [2]:
load_dotenv() # Load environment variables from the .env file
deployment_name                       = "CART"
embedding_deployment_name             = os.getenv("embedding_deployment_name") 
AZURE_OPENAI_API_TYPE                 = os.getenv("AZURE_OPENAI_API_TYPE")
AZURE_OPENAI_API_KEY                  = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT                 = os.getenv("AZURE_OPENAI_API_BASE")
AZURE_OPENAI_API_VERSION              = os.getenv("AZURE_OPENAI_API_VERSION_CHAT")
AZURE_OPENAI_API_VERSION_EMBEDDING    = os.getenv("AZURE_OPENAI_API_VERSION_EMBEDDING")

os.environ["OPENAI_API_VERSION"]      = AZURE_OPENAI_API_VERSION
os.environ["AZURE_OPENAI_ENDPOINT"]   = AZURE_OPENAI_ENDPOINT
os.environ["AZURE_OPENAI_API_KEY"]    = AZURE_OPENAI_API_KEY

print("Establishing connection with GPT-4 Turbo OpenAI LLM.")
llm = AzureChatOpenAI(
                        deployment_name = deployment_name,
                        temperature=0.0
                    )
print("Established connection with GPT-4 Turbo OpenAI LLM.")


print("Fetching GPT-4 OpenAI Embeddings.")
embeddings_model = AzureOpenAIEmbeddings(
                                            model          = "text-embedding-ada-002",
                                            azure_endpoint = AZURE_OPENAI_ENDPOINT,
                                            api_key        = AZURE_OPENAI_API_KEY,
                                            openai_api_version = AZURE_OPENAI_API_VERSION_EMBEDDING
                                        )
print("Fetched with GPT-4 OpenAI Embeddings.")

Establishing connection with GPT-4 Turbo OpenAI LLM.
Established connection with GPT-4 Turbo OpenAI LLM.
Fetching GPT-4 OpenAI Embeddings.
Fetched with GPT-4 OpenAI Embeddings.


### Initializing Pinecone Connection

In [3]:
print("Initializing the connection to the Pinecone Vector Database.")
load_dotenv()  # Load environment variables from the .env file
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_HOST_NAME = os.getenv('PINECONE_HOST_NAME')
pc = Pinecone(api_key = PINECONE_API_KEY)
pc
print("Initialized the connection to the Pinecone Vector Database.")

Initializing the connection to the Pinecone Vector Database.
Initialized the connection to the Pinecone Vector Database.


### Connecting to Google News API

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os

def fetch_fair_lending_articles():
    # Pull the NEWS_API_KEY from environment variables
    NEWS_API_KEY = os.getenv('NEWS_API_KEY')

    # Check if API key is available
    if not NEWS_API_KEY:
        raise ValueError("Please set the 'NEWS_API_KEY' environment variable.")

    # Define the base URL for NewsAPI
    url = 'https://newsapi.org/v2/everything'

    # Get the current date and the date from 7 days ago in YYYY-MM-DD format
    current_date = datetime.now().strftime('%Y-%m-%d')
    last_week_date = (datetime.now() - timedelta(days=7)).strftime('%Y-%m-%d')

    # Define the search query with specific keywords and phrases
    search_query = (
        '("fair lending" OR "disparate treatment" OR "overt discrimination" OR '
        '"redlining" OR "mortgage discrimination" OR "banking discrimination" OR '
        '"racial discrimination in banking" OR "lending bias" OR "credit discrimination")'
    )

    # Set parameters for the request
    params = {
        'q': search_query,
        'from': last_week_date,      # Start date (7 days ago)
        'to': current_date,          # End date (today)
        'language': 'en',            # Filter to English articles
        'sortBy': 'publishedAt',     # Sort by recent publications
        'pageSize': 10,              # Limit to 10 articles
        'apiKey': NEWS_API_KEY
    }

    # Make the request to the NewsAPI
    response = requests.get(url, params=params)

    # Handle the response
    if response.status_code == 200:
        data = response.json()
        articles = data.get('articles', [])
        
        # Extract relevant metadata and store it in a DataFrame
        if articles:
            articles_data = []

            # Loop through each article to scrape and clean the full content
            for article in articles:
                article_data = {
                    'Title': article.get('title'),
                    'Author': article.get('author'),
                    'Source': article.get('source', {}).get('name'),
                    'Description': article.get('description'),
                    'URL': article.get('url'),
                    'Published At': article.get('publishedAt'),
                }

                # Scrape the full content of the article
                article_url = article.get('url')
                try:
                    page = requests.get(article_url)
                    soup = BeautifulSoup(page.content, 'html.parser')

                    # Extract paragraphs from the content
                    paragraphs = soup.find_all('p')
                    raw_content = ' '.join([para.get_text() for para in paragraphs])

                    # Clean the text by removing non-ASCII characters
                    cleaned_content = ''.join(filter(lambda x: x in set(map(chr, range(32, 127))), raw_content))

                    article_data['Content'] = cleaned_content
                except Exception as e:
                    article_data['Content'] = f"Error fetching content: {e}"

                articles_data.append(article_data)

            # Create a DataFrame from the list of dictionaries
            articles_df = pd.DataFrame(articles_data)
            return articles_df
        else:
            print("No articles found for the last week.")
            return pd.DataFrame()  # Return an empty DataFrame if no articles found
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")

# Example usage of the function
articles_df = fetch_fair_lending_articles()
articles_df


Unnamed: 0,Title,Author,Source,Description,URL,Published At,Content
0,Immigrant Underemployment Contributes to Menta...,Robert T Muller Ph.D.,Psychology Today,Immigrants and refugees often struggle to find...,https://www.psychologytoday.com/intl/blog/talk...,2024-11-17T16:05:29Z,When we fall prey to perfectionism...
1,"Ward Scull, 81, passionate Virginia advocate f...",DAVE RESS Richmond Times-Dispatch,Richmond.com,"Ward Scull fought the long, hard battle to sto...",https://richmond.com/news/state-regional/ward-...,2024-11-15T18:33:00Z,E-edition PLUS unlimited articles & videos Per...
2,Mortgage access for Black buyers blamed for wi...,Marian McPherson,Inman,Mortgage discrimination and appraisal bias are...,https://www.inman.com/2024/11/14/mortgage-acce...,2024-11-14T19:03:45Z,Mortgage discrimination and appraisal bias are...
3,On Building Git for Lawyers,Jordan Bryan,Substack.com,"Over this past weekend, Twitter discovered the...",https://jordanbryan.substack.com/p/on-building...,2024-11-14T15:48:45Z,"Over this past weekend, Twitter discovered the..."
4,Project 2025 Document Mentions Trump's Name Mo...,Aleksandra Wrona,Snopes.com,U.S. President-elect Donald Trump's name is li...,https://www.snopes.com//fact-check/trump-proje...,2024-11-14T14:00:00Z,"About this rating The word ""Trump"" appears 31..."
5,"If You Like Being Ripped Off By Comcast, You’l...",Karl Bode,Techdirt,Current FCC Commissioner Brendan Carr has spen...,https://www.techdirt.com/2024/11/13/if-you-lik...,2024-11-13T13:31:35Z,Predictions Current FCC Commissioner Brendan C...
6,Court: Connecticut High School Females’ Discri...,James Nault,Legalinsurrection.com,Connecticut federal court rules that Plaintiff...,https://legalinsurrection.com/2024/11/court-co...,2024-11-13T00:00:29Z,This website is using a security service to pr...
7,CAMEO CEO Carolina Martinez: The Untapped Powe...,"Rhett Buttle, Contributor, \n Rhett Buttle, Co...",Forbes,"As CEO of CAMEO, Carolina Martinez leads a net...",https://www.forbes.com/sites/rhettbuttle/2024/...,2024-11-12T15:57:27Z,"Carolina Martinez, CEO of the CAMEO Network A..."
8,Must the Professor Crusade? W. Ralph Eubanks o...,W. Ralph Eubanks,Lithub.com,“I’ve seen this movie before” is what I though...,https://lithub.com/must-the-professor-crusade-...,2024-11-12T13:41:54Z,Ive seen this movie before is what I thought a...
9,Dis/Trusting the Institution(s) of Literature,groenlat@tcd.ie,Upenn.edu,"updated: \r\nTuesday, November 12, 2024 - 5:41...",http://call-for-papers.sas.upenn.edu/cfp/2024/...,2024-11-12T10:19:01Z,Jump to navigation Call for Papers a service p...


### Filtering to only Fair Lending data

In [5]:
from langchain import PromptTemplate, LLMChain
import pandas as pd

def evaluate_article_content(article_content, llm_chain):
    try:
        # Use the LLMChain to evaluate the article
        response = llm_chain.run({"article_content": article_content})
        # Determine if the response indicates the article is related
        return "yes" in response.lower()
    except Exception as e:
        print(f"Error while evaluating the article: {e}")
        return False

def summarize_article_content(article_content, llm):
    try:
        # Prompt to summarize the content in 4-5 easy-to-understand sentences focusing on Fair Lending
        summary_prompt = (
            "Please provide a concise summary of the following article in 4-5 sentences, focusing on the Fair Lending aspects of the article and making it easy to understand:\n\n"
            "{article_content}"
        )
        summary_template = PromptTemplate(input_variables=["article_content"], template=summary_prompt)
        summary_chain = LLMChain(llm=llm, prompt=summary_template)
        response = summary_chain.run({"article_content": article_content})
        return response
    except Exception as e:
        print(f"Error while summarizing the article: {e}")
        return "Error generating summary"

def evaluate_articles_with_langchain(articles_df, llm):
    # Ensure the DataFrame has content to process
    if articles_df.empty:
        raise ValueError("The articles DataFrame is empty.")

    # Define the prompt template
    prompt_template = (
        "Determine if the following article content is related to fair lending practices, "
        "a financial institution, a bank, CFPB regulations, or any type of discrimination. "
        "Please respond with 'Yes' if it is related, or 'No' if it is not:\n\n"
        "{article_content}"
    )

    # Create the LangChain PromptTemplate and LLMChain
    prompt = PromptTemplate(input_variables=["article_content"], template=prompt_template)
    llm_chain = LLMChain(llm=llm, prompt=prompt)

    # Apply the evaluation function to each article content
    articles_df['Is_Related'] = articles_df['Content'].apply(lambda x: evaluate_article_content(x, llm_chain))

    # Apply the summary function to related articles only
    articles_df['Content_Summary'] = articles_df.apply(
        lambda row: summarize_article_content(row['Content'], llm) if row['Is_Related'] else "", axis=1
    )
    updated_articles_df = articles_df.loc[articles_df['Is_Related'] == True,:]
    return updated_articles_df

# Assuming articles_df is obtained using fetch_fair_lending_articles()
updated_articles_df = evaluate_articles_with_langchain(articles_df, llm)



In [6]:
updated_articles_df

Unnamed: 0,Title,Author,Source,Description,URL,Published At,Content,Is_Related,Content_Summary
2,Mortgage access for Black buyers blamed for wi...,Marian McPherson,Inman,Mortgage discrimination and appraisal bias are...,https://www.inman.com/2024/11/14/mortgage-acce...,2024-11-14T19:03:45Z,Mortgage discrimination and appraisal bias are...,True,Black homeownership gains are being eroded by ...
7,CAMEO CEO Carolina Martinez: The Untapped Powe...,"Rhett Buttle, Contributor, \n Rhett Buttle, Co...",Forbes,"As CEO of CAMEO, Carolina Martinez leads a net...",https://www.forbes.com/sites/rhettbuttle/2024/...,2024-11-12T15:57:27Z,"Carolina Martinez, CEO of the CAMEO Network A...",True,"Carolina Martinez, CEO of the CAMEO Network, l..."


In [7]:
csv_filename = "updated_fair_lending_articles.csv"
updated_articles_df.to_csv(csv_filename, index=False)

### Loading into vector database

In [8]:
import pandas as pd
import numpy as np
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore as PineconeStore

# Function to embed and load articles and other documents into Pinecone
def embed_and_load_to_pinecone(updated_articles_df, additional_document, embeddings_model, pinecone_index):
    # Initialize Pinecone using the latest Pinecone class
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create or connect to the Pinecone index
    if pinecone_index not in pc.list_indexes().names():
        pc.create_index(
            name=pinecone_index,
            dimension=1536,  # Assuming embedding dimension is 1536
            metric='cosine',  # Change metric if needed (e.g., euclidean, dotproduct)
            spec=ServerlessSpec(
                cloud='gcp',  # Replace with appropriate cloud provider if needed
                region='us-west1'  # Replace with your Pinecone region
            )
        )

    # Connect to the index and create PineconeStore with embeddings model
    pinecone_store = PineconeStore(index_name=pinecone_index, embedding=embeddings_model)

    # List to store metadata for DataFrame creation
    metadata_list = []
    load_datetime = datetime.now().isoformat()  # Capture the current datetime for loading

    # Prepare texts and metadata for news articles from updated_articles_df
    if not updated_articles_df.empty:
        texts = updated_articles_df['Content'].tolist()
        metadatas = updated_articles_df.apply(lambda row: {
            "document_type": "news_article",
            "title": row.get('Title', ''),
            "author": row.get('Author', ''),
            "source": row.get('Source', ''),
            "published_at": row.get('Published_At', ''),
            "summary": row.get('Content_Summary', ''),
            "file_name": row.get('File_Name', 'article'),
            "page_number": None,
            "table_name": None,
            "record_id": None,
            "generation_date": row.get('Generation_Date', ''),
            "month_year": row.get('Month_Year', ''),
            "document_version": 1,
            "load_datetime": load_datetime
        }, axis=1).tolist()

        # Upsert news articles into Pinecone
        for idx, (text, metadata) in enumerate(zip(texts, metadatas)):
            vector_id = f"article_{idx}"
            try:
                # Delete existing vector if it exists
                pinecone_store.delete([vector_id])
                print(f"Deleted existing vector with ID {vector_id}.")
            except Exception as e:
                print(f"Error deleting vector ID {vector_id}: {e}")

            # Replace None values with empty strings or valid defaults
            for key, value in metadata.items():
                if value is None:
                    metadata[key] = ''

            # Upsert the new vector with metadata (excluding text in metadata)
            try:
                pinecone_store.add_texts(texts=[text], metadatas=[{k: v for k, v in metadata.items() if k != 'content'}], ids=[vector_id])
                print(f"Successfully upserted vector with ID {vector_id} into Pinecone with metadata.")
                metadata_list.append(metadata)
            except Exception as e:
                print(f"Error upserting vector ID {vector_id} into Pinecone: {e}")

    # Prepare texts and metadata for additional document if it exists
    if additional_document:
        text = additional_document.get('content', '')
        metadata = {
            "document_type": additional_document.get('document_type', 'unknown'),
            "title": additional_document.get('title', ''),
            "author": additional_document.get('author', ''),
            "source": additional_document.get('source', ''),
            "published_at": additional_document.get('published_at', ''),
            "summary": additional_document.get('summary', ''),
            "file_name": additional_document.get('file_name', ''),
            "page_number": additional_document.get('page_number', ''),
            "table_name": additional_document.get('table_name', ''),
            "record_id": additional_document.get('record_id', ''),
            "generation_date": additional_document.get('generation_date', ''),
            "month_year": additional_document.get('month_year', ''),
            "document_version": additional_document.get('document_version', 1),
            "load_datetime": load_datetime
        }

        vector_id = "doc_0"
        try:
            # Delete existing vector if it exists
            pinecone_store.delete([vector_id])
            print(f"Deleted existing vector with ID {vector_id}.")
        except Exception as e:
            print(f"Error deleting vector ID {vector_id}: {e}")

        # Replace None values with empty strings or valid defaults
        for key, value in metadata.items():
            if value is None:
                metadata[key] = ''

        # Upsert the new vector with metadata (excluding text in metadata)
        try:
            pinecone_store.add_texts(texts=[text], metadatas=[{k: v for k, v in metadata.items() if k != 'content'}], ids=[vector_id])
            print(f"Successfully upserted vector with ID {vector_id} into Pinecone with metadata.")
            metadata_list.append(metadata)
        except Exception as e:
            print(f"Error upserting vector ID {vector_id} into Pinecone: {e}")

    # Create DataFrame from metadata
    metadata_df = pd.DataFrame(metadata_list)
    return metadata_df

# Example usage of embed_and_load_to_pinecone
additional_document = {}  # No additional document to upload for now

# Assuming updated_articles_df and embeddings_model are defined
metadata_df = embed_and_load_to_pinecone(updated_articles_df, additional_document, embeddings_model, "fair-lens")

# Print the metadata DataFrame
metadata_df

Error deleting vector ID article_0: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 18 Nov 2024 16:10:15 GMT', 'Content-Type': 'application/json', 'Content-Length': '55', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '40', 'x-pinecone-request-id': '4317144196064810223', 'x-envoy-upstream-service-time': '41', 'server': 'envoy'})
HTTP response body: {"code":5,"message":"Namespace not found","details":[]}

Successfully upserted vector with ID article_0 into Pinecone with metadata.
Deleted existing vector with ID article_1.
Successfully upserted vector with ID article_1 into Pinecone with metadata.


Unnamed: 0,document_type,title,author,source,published_at,summary,file_name,page_number,table_name,record_id,generation_date,month_year,document_version,load_datetime
0,news_article,Mortgage access for Black buyers blamed for wi...,Marian McPherson,Inman,,Black homeownership gains are being eroded by ...,article,,,,,,1,2024-11-18T11:10:20.207272
1,news_article,CAMEO CEO Carolina Martinez: The Untapped Powe...,"Rhett Buttle, Contributor, \n Rhett Buttle, Co...",Forbes,,"Carolina Martinez, CEO of the CAMEO Network, l...",article,,,,,,1,2024-11-18T11:10:20.207272


In [9]:
metadata_df.to_csv("metadata.csv", index = False)

In [10]:
from langchain_pinecone import PineconeVectorStore as PineconeStore

def similarity_search_in_pinecone(query, pinecone_index_name, embeddings_model, top_k=3):
    """
    Perform similarity search on the Pinecone index.

    Parameters:
    - query (str): The query to search for similar documents.
    - pinecone_index_name (str): The name of the Pinecone index.
    - embeddings_model: The embedding model used to create the query embedding.
    - top_k (int): The number of top similar documents to return.

    Returns:
    - List of most similar documents and their metadata.
    """
    # Initialize Pinecone Store
    pinecone_store = PineconeStore(index_name=pinecone_index_name, embedding=embeddings_model)

    # Perform the similarity search using the query
    try:
        results = pinecone_store.similarity_search(query=query, k=top_k)
        return results
    except Exception as e:
        print(f"Error while performing similarity search: {e}")
        return []

# Example usage:
query_text = "What are the latest updates on Fair Lending practices?"
results = similarity_search_in_pinecone(query_text, "fair-lens", embeddings_model, top_k=5)

# Display results
for idx, result in enumerate(results, start=1):
    print(f"Result {idx}:")
    print(f"Title: {result.metadata.get('title')}")
    print(f"Content: {result.page_content[:200]}...")  # Displaying first 200 chars of the content for brevity
    print(f"Source: {result.metadata.get('source')}")
    print('-' * 80)


### Creating a chain for chatbot

In [11]:
prompt_template="""
                    You are an Fair Lending chatbot and an expert fair lending and discrimination related issues.
                    Use the following pieces of information to answer the user's question.If the user's question is related to an article, and if the
                    user has not mentioned which article in any way, then ask him which article and then answer after he responds.
                    If you don't know the answer, just say that you don't know, don't try to make up an answer.
                    
                    Context: {context}
                    Question: {question}
                    
                    Only return the helpful answer below and nothing else.
                    Helpful answer:
                """
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt": PROMPT}

In [12]:
PROMPT

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="\n                    You are an Fair Lending chatbot and an expert fair lending and discrimination related issues.\n                    Use the following pieces of information to answer the user's question.If the user's question is related to an article, and if the\n                    user has not mentioned which article in any way, then ask him which article and then answer after he responds.\n                    If you don't know the answer, just say that you don't know, don't try to make up an answer.\n                    \n                    Context: {context}\n                    Question: {question}\n                    \n                    Only return the helpful answer below and nothing else.\n                    Helpful answer:\n                ")

In [13]:
# Initialize Pinecone Store
pinecone_store = PineconeStore(index_name='fair-lens', embedding=embeddings_model)

# Create retriever using PineconeStore
retriever = pinecone_store.as_retriever(search_kwargs={"k": 3})
retriever


VectorStoreRetriever(tags=['PineconeVectorStore', 'AzureOpenAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x000001768FC578E0>, search_kwargs={'k': 3})

In [14]:
# chain = (
#             {"context": retriever, "question": RunnablePassthrough()}
#             | PROMPT
#             | llm
#             | StrOutputParser()
#         )

In [15]:
# chain.invoke("Can you summarize the article with title 'Mortgage access for Black buyers blamed for widening ownership gap'?")

In [16]:
# Create a RetrievalQA chain using LangChain
retrieval_qa_chain = RetrievalQA.from_chain_type(
                                                    llm=llm,
                                                    chain_type="stuff",
                                                    retriever=retriever,
                                                    chain_type_kwargs={"prompt": PROMPT}
                                                )


In [18]:
retrieval_qa_chain.invoke("Can you summarize the article with title 'Mortgage access for Black buyers blamed for widening ownership gap'?")

{'query': "Can you summarize the article with title 'Mortgage access for Black buyers blamed for widening ownership gap'?",
 'result': "According to the article 'Mortgage access for Black buyers blamed for widening ownership gap', Black Americans face significant barriers to homeownership due to discriminatory lending practices and lack of access to affordable mortgages. This has resulted in a widening gap in homeownership rates between Black and white Americans."}