### **Configuration**

In [1]:
# Dependencies
import os
from config import DARTMOUTH_API_KEY, DARTMOUTH_CHAT_API_KEY
from langchain_dartmouth.llms import ChatDartmouth

# Retrieving keys and creating environment variables
os.environ['DARTMOUTH_CHAT_API_KEY'] = DARTMOUTH_CHAT_API_KEY
os.environ['DARTMOUTH_API_KEY'] = DARTMOUTH_API_KEY

# Defining llm and embeddings models
llm_model_name = "openai.gpt-4o-mini-2024-07-18"
embeddings_model_name = "bge-m3"

# Defining keywords and sources
keywords = "antitrust OR Lina Khan OR monopoly OR regulation"
source="breitbart-news"

# Defining testing data file
testing_data = '../input/antitrust.json'

### **Building Knowledge Base**

In [34]:
# Importing dependencies
import requests
import os
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
import re

# Creating directory to hold scraped articles
os.makedirs(name=f"../knowledge-bases/{keywords}_{source}")

# Creating newsapi client
newsapiclient = NewsApiClient(api_key="03122dc3b7b84ea29212ca965b40c7aa")

# Querying articles
articles = newsapiclient.get_everything(q=keywords, sources=source ,page_size=100)
articles = articles['articles']

# Scraping articles and saving in directory
for article in articles:
	url = article['url']
	response = requests.get(url=url)

	# Printing article content to directory if valid response
	if response.status_code==200:
		beautifulsoup = BeautifulSoup(response.content, "html.parser")
		article_paragraphs = beautifulsoup.find_all("p")

		# Cleaning article title
		article_title = re.sub(' ', '_', article['title'])

		with open(file=f"../knowledge-bases/{keywords}_{source}/{article_title}.txt", mode="w") as fp:
			for paragraph in article_paragraphs:
				paragraph_cleaned = str(paragraph.get_text()).strip()

				if paragraph_cleaned != "":
					fp.write(paragraph_cleaned)

### **Loading and Splitting Documents**

In [35]:
# Importing dependencies
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter

# Defining directory path
directory = f"../knowledge-bases/{keywords}_{source}"

# Creating tokenizer

# Creating loader and splitter
loader = DirectoryLoader(path=directory, glob="*.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base", 
													   chunk_size=256, chunk_overlap=0)

# Loading and splitting documents
docs = loader.load_and_split(text_splitter=splitter)

### **Embedding and Storing Documents**

In [36]:
# Importing dependencies
from langchain_dartmouth.embeddings import DartmouthEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Creating embeddings model
embeddings = DartmouthEmbeddings(model_name=embeddings_model_name, dartmouth_api_key=str(DARTMOUTH_API_KEY))

# Embedding documents and storing them in memory
vector_store = InMemoryVectorStore(embedding=embeddings)

for i in range(0, len(docs), 50):
	_ = vector_store.add_documents(docs[i: i+100])

### **Retrieval and Generation**

In [41]:
# Importing dependencies
from langchain_dartmouth.llms import ChatDartmouthCloud
import json

# Initializing variable referencing LLM
llm = ChatDartmouthCloud(model_name=llm_model_name)

# Open testing data file
with open(testing_data, 'r') as fp:
	test_data = json.load(fp)

counter = 1

# Iterating through each test data point
for tweet in test_data:

	# Retrieving most-similar documents
	query = tweet['Tweet']

	print("Query,", query)
	docs = vector_store.similarity_search(query, k=5)
	
	# Creating augmented prompt
	prompt = (
		"Classify as 'Real News' or 'Fake News': "
		+ query
		+ f"\n\nConsider the following info: \n\n"
		+ "Only respond with the classification"
	)

	# prompt = (
	# 	"Classify as 'Real News' or 'Fake News': "
	# 	+ query
	# 	+ "Only respond with the classification"
	# )

	for doc in docs:
		prompt += doc.page_content + "\n--\n"

	# Querying LLM and printing response to file
	response = llm.invoke(prompt)
	output = response.pretty_repr()

	with open(f"../output/{source}_monopoly", "a") as fp:
		print(output, file=fp)

Query, The US government has sued Google for violating antitrust laws, alleging that the company's search engine favors its own products over competitors.
Query, Facebook is planning to launch its own cryptocurrency, which will revolutionize the way we make transactions online.
Query, The European Commission has fined Amazon €250 million for abusing its market dominance in the e-commerce sector.
Query, Apple has been accused of using its dominant market position to stifle innovation in the smartwatch industry.
Query, The US Federal Trade Commission has cleared Google's acquisition of Waze, citing no antitrust concerns.
Query, Microsoft has been secretly working on a blockchain-based antitrust monitoring platform to track market trends.
Query, The US Justice Department has launched an investigation into Amazon's labor practices, citing concerns about worker wages and conditions.
Query, The European Union has approved a merger between two major airlines, despite concerns about reduced co

### **Evaluation**

In [42]:
import json
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix


with open(testing_data) as fp_input:
	input_data = json.load(fp_input)

	y_actual = []

	for entry in input_data:
		label = entry['Label']

		if label == "Real News":
			y_actual.append(1)
		elif label == "Fake News":
			y_actual.append(0)
	
	print(y_actual)



with open(f"../output/{source}_monopoly", "r") as fp_output:
	counter = 0

	y_pred = []

	for line in fp_output:
		if "Real News" in line:
			y_pred.append(1)
		elif "Fake News" in line:
			y_pred.append(0)
	
	print(y_pred)

recall = recall_score(y_true=y_actual, y_pred=y_pred)
precision = precision_score(y_true=y_actual, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_actual, y_pred=y_pred)
confusion_matrix = confusion_matrix(y_true=y_actual, y_pred=y_pred)

print(recall)
print(precision)
print(accuracy)
print(confusion_matrix)
		
		

[1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0]
[1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1]
0.7878787878787878
0.9122807017543859
0.8041237113402062
[[26  5]
 [14 52]]
