In [56]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [83]:
# Input CSVs live in an "output" folder two levels above the current working directory.
base_dir = os.getcwd()
output_dir = os.path.join(base_dir, "..", "..", "output")

# Build both input paths, then load the news items (df1) and the redirected-URL table (df2).
file_path1, file_path2 = (
    os.path.join(output_dir, csv_name)
    for csv_name in ("news_items.csv", "redirected_urls_1.csv")
)
df1, df2 = pd.read_csv(file_path1), pd.read_csv(file_path2)

In [84]:
# Inspect the column names of df1 (loaded from news_items.csv) before trimming it.
df1.columns

Index(['Message_ID', 'Date', 'Query_Text', 'url1_', 'url2_', 'article1',
       'article2'],
      dtype='object')

In [85]:
# Remove the columns that are not needed for the similarity comparison.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# columns were already removed by the first one.
df1 = df1.drop(columns=['Date', 'Query_Text', 'url1_', 'url2_'], errors='ignore')

In [86]:
# Preview the first rows of df1 after the unused columns were dropped.
df1.head()

Unnamed: 0,Message_ID,article1,article2
0,1266,Profile\nSections\nLocal\ntv\nFeatured\nMore F...,When hundreds of hours of tapes from the Nixon...
1,1268,Error: No connection adapters were found for '...,Move comes five years after US did so and will...
2,1269,In Pictures\nIsrael’s war on Gaza has killed a...,Misleading videos shared by a spokesperson for...
3,1273,"This material may not be published, broadcast,...",Israeli military presents drone footage from G...
4,1274,To travel to Israel during wartime is to exper...,How is that an abstract issue like the reasona...


In [87]:
# Inspect the column names of df2 (loaded from redirected_urls_1.csv).
df2.columns

Index(['Message_ID', 'Date', 'Cleaned_text', 'Query_Text', 'news_urls', 'url1',
       'url2', 'url1_', 'url2_'],
      dtype='object')

In [88]:
# Attach the cleaned Telegram text from df2 to each row of df1 via Message_ID.
# df2 is reduced to one row per Message_ID first (the first occurrence is kept):
# with repeated keys on the right side, a left merge would repeat the matching
# df1 rows and the same message would later be scored and written out several times.
df1 = pd.merge(
    df1,
    df2[['Message_ID', 'Cleaned_text']].drop_duplicates(subset='Message_ID'),
    on='Message_ID',
    how='left',
)

In [89]:
# Put the identifier first, then the Telegram text, then the two articles.
column_order = ['Message_ID', 'Cleaned_text', 'article1', 'article2']
df1 = df1.loc[:, column_order]

In [90]:
# Instantiate the sentence-transformers model 'all-MiniLM-L6-v2'; it is used
# below to turn the Telegram text and the article texts into embeddings.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [91]:
# Missing texts become empty strings so every row can be handed to the encoder.
text_columns = ['Cleaned_text', 'article1', 'article2']
for text_column in text_columns:
    df1[text_column] = df1[text_column].fillna('')

# Encode, in this order: the Telegram text, then each of the two articles.
# The generator is consumed left to right, so the encode calls run in column order.
telegram_embeddings, article1_embeddings, article2_embeddings = (
    model.encode(df1[text_column].tolist()) for text_column in text_columns
)

In [92]:
def _paired_cosine(left, right):
    """Cosine similarity of each row of `left` with the same-index row of `right`.

    Only row i against row i is computed, so the cost is linear in the number
    of rows. This yields the diagonal of the full pairwise cosine matrix
    (up to floating-point rounding) without materialising an N x N array.

    Parameters
    ----------
    left, right : array-like of shape (n_rows, n_features)
        Embedding matrices with the same shape.

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        Row-wise cosine similarities; 0 where either row has zero norm.
    """
    left = np.asarray(left)
    right = np.asarray(right)
    dots = np.einsum('ij,ij->i', left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # Leave 0 in place where a norm is zero instead of dividing by zero.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)


# Similarity of every Telegram message with its own article1 / article2.
article1_similarity = _paired_cosine(telegram_embeddings, article1_embeddings)
article2_similarity = _paired_cosine(telegram_embeddings, article2_embeddings)

# Best match per message across the two articles, limited to the [0, 1] range.
max_similarities = np.maximum(article1_similarity, article2_similarity)
max_similarities = np.clip(max_similarities, 0, 1)

# A message is flagged when neither article reaches the similarity threshold.
# NOTE(review): rows whose text or articles were missing were filled with ''
# before encoding, so their score compares against the embedding of an empty
# string; confirm that flagging such rows is intended.
threshold = 0.7
is_fake = max_similarities < threshold

df1['Max_Cosine_Similarity'] = max_similarities
df1['Fake_News_Flag'] = is_fake

In [93]:
# Preview the first rows, now including Max_Cosine_Similarity and Fake_News_Flag.
df1.head()

Unnamed: 0,Message_ID,Cleaned_text,article1,article2,Max_Cosine_Similarity,Fake_News_Flag
0,1266,donald trump hate america want eliminate israe...,Profile\nSections\nLocal\ntv\nFeatured\nMore F...,When hundreds of hours of tapes from the Nixon...,0.573545,True
1,1268,breaking germany called european union classif...,Error: No connection adapters were found for '...,Move comes five years after US did so and will...,0.497152,True
2,1269,accurate photo right israel israel v hamas syr...,In Pictures\nIsrael’s war on Gaza has killed a...,Misleading videos shared by a spokesperson for...,0.527227,True
3,1273,breaking idf released footage precise eliminat...,"This material may not be published, broadcast,...",Israeli military presents drone footage from G...,0.536036,True
4,1274,love israel heart please comment shabbat shalo...,To travel to Israel during wartime is to exper...,How is that an abstract issue like the reasona...,0.508663,True


In [94]:
# Keep only the identifier and the computed columns for the export.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# text columns were already removed by the first one.
df1 = df1.drop(columns=['Cleaned_text', 'article1', 'article2'], errors='ignore')

In [95]:
# Write the per-message result next to the input files, without the index column.
file_path3 = os.path.join(output_dir, "fake_news_flag.csv")
df1.to_csv(path_or_buf=file_path3, index=False)