### **Llama w/ Augmentation** ###

In [13]:
newsapi = NewsApiClient(api_key="03122dc3b7b84ea29212ca965b40c7aa")
sources = newsapi.get_sources()
for source in sources["sources"]:
	print(source["id"])

abc-news
abc-news-au
aftenposten
al-jazeera-english
ansa
argaam
ars-technica
ary-news
associated-press
australian-financial-review
axios
bbc-news
bbc-sport
bild
blasting-news-br
bleacher-report
bloomberg
breitbart-news
business-insider
buzzfeed
cbc-news
cbs-news
cnn
cnn-es
crypto-coins-news
der-tagesspiegel
die-zeit
el-mundo
engadget
entertainment-weekly
espn
espn-cric-info
financial-post
focus
football-italia
fortune
four-four-two
fox-news
fox-sports
globo
google-news
google-news-ar
google-news-au
google-news-br
google-news-ca
google-news-fr
google-news-in
google-news-is
google-news-it
google-news-ru
google-news-sa
google-news-uk
goteborgs-posten
gruenderszene
hacker-news
handelsblatt
ign
il-sole-24-ore
independent
infobae
info-money
la-gaceta
la-nacion
la-repubblica
le-monde
lenta
lequipe
les-echos
liberation
marca
mashable
medical-news-today
msnbc
mtv-news
mtv-news-uk
national-geographic
national-review
nbc-news
news24
new-scientist
news-com-au
newsweek
new-york-magazine
next-big-fu

#### Configuration

In [16]:
# Dependencies
import os
from config import DARTMOUTH_API_KEY, DARTMOUTH_CHAT_API_KEY
from langchain_dartmouth.llms import DartmouthLLM

# Retrieving keys and creating environment variables
os.environ['DARTMOUTH_CHAT_API_KEY'] = DARTMOUTH_CHAT_API_KEY
os.environ['DARTMOUTH_API_KEY'] = DARTMOUTH_API_KEY

# Defining llm and embeddings models
llm_model_name = "llama-3-8b-instruct"
embeddings_model_name = "bge-m3"

# Defining keywords and sources
keywords = "antitrust OR Lina Khan OR monopoly OR regulation"
source="cnn"

# Defining testing data file
testing_data = '../input/antitrust.json'

#### Building Knowledge Base

In [17]:
# Importing dependencies
import requests
import os
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
import re

# Creating directory to hold scraped articles
os.makedirs(name=f"../knowledge-bases/{keywords}_{source}")

# Creating newsapi client
newsapiclient = NewsApiClient(api_key="03122dc3b7b84ea29212ca965b40c7aa")

# Querying articles
articles = newsapiclient.get_everything(q=keywords, sources=source ,page_size=100)
articles = articles['articles']

# Scraping articles and saving in directory
for article in articles:
	url = article['url']
	response = requests.get(url=url)

	# Printing article content to directory if valid response
	if response.status_code==200:
		beautifulsoup = BeautifulSoup(response.content, "html.parser")
		article_paragraphs = beautifulsoup.find_all("p")

		# Cleaning article title
		article_title = re.sub(' ', '_', article['title'])

		with open(file=f"../knowledge-bases/{keywords}_{source}/{article_title}.txt", mode="w") as fp:
			for paragraph in article_paragraphs:
				paragraph_cleaned = str(paragraph.get_text()).strip()

				if paragraph_cleaned != "":
					fp.write(paragraph_cleaned)

#### Loading and Splitting Documents

In [18]:
# Importing dependencies
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter

# Defining directory path
directory = f"../knowledge-bases/{keywords}_{source}"

# Creating tokenizer

# Creating loader and splitter
loader = DirectoryLoader(path=directory, glob="*.txt")
splitter = CharacterTextSplitter.from_tiktoken_encoder(encoding_name="cl100k_base", 
													   chunk_size=256, chunk_overlap=0)

# Loading and splitting documents
docs = loader.load_and_split(text_splitter=splitter)

Created a chunk of size 508, which is longer than the specified 256
Created a chunk of size 414, which is longer than the specified 256
Created a chunk of size 337, which is longer than the specified 256
Created a chunk of size 775, which is longer than the specified 256
Created a chunk of size 447, which is longer than the specified 256
Created a chunk of size 475, which is longer than the specified 256
Created a chunk of size 1214, which is longer than the specified 256
Created a chunk of size 529, which is longer than the specified 256
Created a chunk of size 279, which is longer than the specified 256
Created a chunk of size 306, which is longer than the specified 256
Created a chunk of size 513, which is longer than the specified 256
Created a chunk of size 1099, which is longer than the specified 256
Created a chunk of size 352, which is longer than the specified 256
Created a chunk of size 433, which is longer than the specified 256
Created a chunk of size 883, which is longer t

#### Embedding and Storing Documents

In [19]:
# Importing dependencies
from langchain_dartmouth.embeddings import DartmouthEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Creating embeddings model
embeddings = DartmouthEmbeddings(model_name=embeddings_model_name, dartmouth_api_key=str(DARTMOUTH_API_KEY))

# Embedding documents and storing them in memory
vector_store = InMemoryVectorStore(embedding=embeddings)

for i in range(0, len(docs), 50):
	_ = vector_store.add_documents(docs[i: i+100])

#### Retrieval and Generation

In [20]:
# Importing dependencies
from langchain_dartmouth.llms import ChatDartmouth
import json

# Initializing variable to hold output
output = ""

# Initializing variable referencing LLM
llm = ChatDartmouth(model_name="llama-3-1-8b-instruct")

# Open testing data file
with open(testing_data, 'r') as fp:
	test_data = json.load(fp)

# Iterating through each test data point
for tweet in test_data:

	# Retrieving most-similar documents
	query = tweet['Tweet']
	docs = vector_store.similarity_search(query, k=5)
	
	# Creating augmented prompt
	prompt = (
		"Classify tweet as 'Real News' or 'Fake News': "
		+ query
		+ f"\n\nConsider the following info: \n\n"
	)

	for doc in docs:
		prompt += doc.page_content + "\n--\n"
	
	prompt = prompt + "Only respond with the classification."

	# Querying LLM and printing response to file
	response = llm.invoke(prompt)
	output = response.pretty_repr() + output

#### Evaluation 

In [21]:
# Importing dependencies
import json
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
import io

# Loading true labels
with open(testing_data) as fp_input:
	input_data = json.load(fp_input)

	# Encoding true labels as 0s and 1s
	y_actual = []

	for entry in input_data:
		label = entry['Label']

		if label == "Real News":
			y_actual.append(1)
		elif label == "Fake News":
			y_actual.append(0)

# Extracting predicted labels
y_pred = []

# Encoding true labels as 0s and 1s
with io.StringIO(output) as fp_output:

	for line in fp_output:
		if "Real News" in line:
			y_pred.append(1)
		elif "Fake News" in line:
			y_pred.append(0)

# Calculating and printing metrics
recall = recall_score(y_true=y_actual, y_pred=y_pred)
precision = precision_score(y_true=y_actual, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_actual, y_pred=y_pred)
f1score = f1_score(y_true=y_actual, y_pred=y_pred)
confusion_matrix = confusion_matrix(y_true=y_actual, y_pred=y_pred)

print("Recall: ", recall)
print("Precision: ",precision)
print("Accuracy: ",accuracy)
print("F1 Score", f1score)
print("Confusion matrix: ",confusion_matrix)
		
		

Recall:  0.8076923076923077
Precision:  0.525
Accuracy:  0.5
F1 Score 0.6363636363636364
Confusion matrix:  [[ 3 19]
 [ 5 21]]


### **Llama w/o Augmentation** ###

#### Generation

In [22]:
# Importing dependencies
import json

# Initializing variable to hold output
output = ""

# Initializing variable referencing LLM
llm = ChatDartmouth(model_name="llama-3-1-8b-instruct")

# Open testing data file
with open(testing_data, 'r') as fp:
	test_data = json.load(fp)

# Iterating through each test data point
for tweet in test_data:

	# Creating non-augmented prompt
	query = tweet['Tweet']
	prompt = (
		"Classify this tweet as 'Real News' or 'Fake News': "
		+ query
		+ "Only respond with the classification."
	)

	# Querying LLM and printing response to file
	response = llm.invoke(prompt)
	output = response.pretty_repr() + output

#### Evaluation

In [23]:
# Importing dependencies
import json
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
import io

# Loading true labels
with open(testing_data) as fp_input:
	input_data = json.load(fp_input)

	# Encoding true labels as 0s and 1s
	y_actual = []

	for entry in input_data:
		label = entry['Label']

		if label == "Real News":
			y_actual.append(1)
		elif label == "Fake News":
			y_actual.append(0)

# Extracting predicted labels
y_pred = []

# Encoding true labels as 0s and 1s
with io.StringIO(output) as fp_output:

	for line in fp_output:
		if "Real News" in line:
			y_pred.append(1)
		elif "Fake News" in line:
			y_pred.append(0)

# Calculating and printing metrics
recall = recall_score(y_true=y_actual, y_pred=y_pred)
precision = precision_score(y_true=y_actual, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_actual, y_pred=y_pred)
f1score = f1_score(y_true=y_actual, y_pred=y_pred)
confusion_matrix = confusion_matrix(y_true=y_actual, y_pred=y_pred)

print("Recall: ", recall)
print("Precision: ",precision)
print("Accuracy: ",accuracy)
print("F1 Score", f1score)
print("Confusion matrix: ",confusion_matrix)

Recall:  0.5384615384615384
Precision:  0.4827586206896552
Accuracy:  0.4375
F1 Score 0.509090909090909
Confusion matrix:  [[ 7 15]
 [12 14]]
