In [5]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

In [6]:
stopwords = [
    # Standard stopwords
    "a",
    "about",
    "above",
    "after",
    "again",
    "against",
    "all",
    "am",
    "an",
    "and",
    "any",
    "are",
    "arent",
    "as",
    "at",
    "be",
    "because",
    "been",
    "before",
    "being",
    "below",
    "between",
    "both",
    "but",
    "by",
    "cant",
    "cannot",
    "could",
    "couldnt",
    "did",
    "didnt",
    "do",
    "does",
    "doesnt",
    "doing",
    "dont",
    "down",
    "during",
    "each",
    "few",
    "for",
    "from",
    "further",
    "had",
    "hadnt",
    "has",
    "hasnt",
    "have",
    "havent",
    "having",
    "he",
    "hed",
    "hell",
    "hes",
    "her",
    "here",
    "heres",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "how",
    "hows",
    "i",
    "id",
    "ill",
    "im",
    "ive",
    "if",
    "in",
    "into",
    "is",
    "isnt",
    "it",
    "its",
    "itself",
    "lets",
    "me",
    "more",
    "most",
    "mustnt",
    "my",
    "myself",
    "no",
    "nor",
    "not",
    "of",
    "off",
    "on",
    "once",
    "only",
    "or",
    "other",
    "ought",
    "our",
    "ours",
    "ourselves",
    "out",
    "over",
    "own",
    "same",
    "she",
    "shed",
    "shell",
    "shes",
    "should",
    "shouldnt",
    "so",
    "some",
    "such",
    "than",
    "that",
    "thats",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "there",
    "theres",
    "these",
    "they",
    "theyd",
    "theyll",
    "theyre",
    "theyve",
    "this",
    "those",
    "through",
    "to",
    "too",
    "under",
    "until",
    "up",
    "very",
    "was",
    "wasnt",
    "we",
    "wed",
    "well",
    "were",
    "weve",
    "were",
    "werent",
    "what",
    "whats",
    "when",
    "whens",
    "where",
    "wheres",
    "which",
    "while",
    "who",
    "whos",
    "whom",
    "why",
    "whys",
    "with",
    "wont",
    "would",
    "wouldnt",
    "you",
    "youd",
    "youll",
    "youre",
    "youve",
    "your",
    "yours",
    "yourself",
    "yourselves",
    # Social media filler words
    "rt",
    "via",
    "lol",
    "lmao",
    "omg",
    "idk",
    "tbh",
    "btw",
    "pls",
    "plz",
    "u",
    "ur",
    "r",
    "imho",
    "irl",
    "smh",
    "fyi",
    "yea",
    "yeah",
    "yup",
    "nope",
    "okay",
    "ok",
    "k",
    # Noise words
    "breaking",
    "update",
    "alert",
    "exclusive",
    "viral",
    "share",
    "repost",
    "read",
    "watch",
    "click",
    "follow",
    "true",
    "false",
    "real",
    "fake",
    "hoax",
    "scam",
    # Very frequent low-signal verbs
    "say",
    "says",
    "said",
    "tell",
    "told",
    "think",
    "thought",
    "know",
    "known",
    "report",
    "reported",
    "claim",
    "claimed",
    "claims",
    "show",
    "shown",
    "shows",
    "make",
    "makes",
    "made",
    "see",
    "seen",
    "look",
    "looks",
    # Generic nouns that almost never contribute to classification
    "people",
    "person",
    "man",
    "woman",
    "guy",
    "guys",
    "thing",
    "stuff",
    "someone",
    "everyone",
    "anyone",
    # Misinformation bait
    "wow",
    "shocking",
    "unbelievable",
    "insane",
    "must",
    "watch",
    "truth",
    "facts",
    "omfg",
    "literally",
]



In [7]:
lem = WordNetLemmatizer()

def cleanse(word):
  buffer = word.split()
  stream = ""
  for token in buffer:
    clean = lem.lemmatize(re.sub(r"[^a-zA-Z0-9]", "", token).lower()).strip()
    if clean not in stopwords:
      stream += clean+" "
  return stream

In [8]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch
from nnModel import BiLSTM_Attention_Model

model = SentenceTransformer('all-mpnet-base-v2')
# model = SentenceTransformer("all-MiniLM-L6-v2")

In [9]:
model.save("sentence-transformer-all-mpnet-base-v2")

In [10]:
pip download torch==2.3.0+cpu --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.3.0+cpu
  Downloading https://download.pytorch.org/whl/cpu/torch-2.3.0%2Bcpu-cp310-cp310-win_amd64.whl (161.7 MB)
     ---------------------------------------- 0.0/161.7 MB ? eta -:--:--
     ---------------------------------------- 0.3/161.7 MB ? eta -:--:--
     ---------------------------------------- 0.8/161.7 MB 2.1 MB/s eta 0:01:17
     ---------------------------------------- 1.3/161.7 MB 2.1 MB/s eta 0:01:17
     ---------------------------------------- 1.6/161.7 MB 1.9 MB/s eta 0:01:26
      --------------------------------------- 2.4/161.7 MB 2.3 MB/s eta 0:01:09
      --------------------------------------- 2.9/161.7 MB 2.3 MB/s eta 0:01:09
      --------------------------------------- 3.4/161.7 MB 2.3 MB/s eta 0:01:08
      --------------------------------------- 3.7/161.7 MB 2.3 MB/s eta 0:01:08
     - -------------------------------------- 4.2/161.7 MB 2.3 MB/s eta 0:01:09
     - ----------------

In [None]:


def auditor(data):
  emb = model.encode(data)
  X_single = torch.tensor(emb, dtype=torch.float32).unsqueeze(0)  
  input_dim = X_single.shape[1]
  num_classes = 2
  hidden_dim = 256
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_lstm = BiLSTM_Attention_Model(input_dim=input_dim, hidden_dim=hidden_dim, num_classes=num_classes)
  model_lstm.load_state_dict(torch.load("./models/yaari-synth-auditor-model-v1.pth", map_location=device))
  model_lstm.to(device)
  model_lstm.eval()
  with torch.no_grad():
      outputs, _ = model_lstm(X_single.to(device))
      pred_class = torch.argmax(torch.softmax(outputs, dim=1), dim=1).item()

  print("Predicted class:", pred_class)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Fake
print(
    auditor(
        "Scientists at NASA announced yesterday that satellite imagery revealed the presence of a massive hidden city beneath the Pacific Ocean. According to the report, the structures are estimated to be over 20,000 years old and contain advanced technology far beyond modern capabilities. NASA plans to send a human team to explore the site by next month."
    )
)

# Fake
print(
    auditor(
        "Researchers at a private laboratory in Switzerland claim to have created a handheld device capable of translating animal sounds into human language. According to the team, early tests with dogs and cats show over 90% accuracy in interpreting emotions and simple commands. The developers say the device could be available to the public by early next year, potentially revolutionizing human–animal communication."
    )
)

# Real
print(
    auditor(
        "The World Health Organization (WHO) has released updated recommendations for seasonal influenza vaccination ahead of the upcoming flu season. The new guidelines emphasize the importance of vaccination for older adults, healthcare workers, and individuals with chronic illnesses. The WHO also urged countries to strengthen awareness campaigns to improve vaccination coverage worldwide."
    )
)


# Real
print(
    auditor(
        "The United Nations has launched a new global initiative aimed at improving access to clean drinking water for communities in low-income and drought-affected regions. The program, announced on Monday, will fund water purification systems, rainwater harvesting projects, and infrastructure upgrades across 25 countries. According to UN officials, the initiative is expected to benefit more than 40 million people over the next five years and will involve partnerships with local governments, NGOs, and private-sector organizations."
    )
)





RuntimeError: Error(s) in loading state_dict for BiLSTM_Attention_Model:
	size mismatch for lstm.weight_ih_l0: copying a param with shape torch.Size([1024, 768]) from checkpoint, the shape in current model is torch.Size([1024, 384]).
	size mismatch for lstm.weight_ih_l0_reverse: copying a param with shape torch.Size([1024, 768]) from checkpoint, the shape in current model is torch.Size([1024, 384]).