In [None]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the raw dataset; later cells read its `news` and `label` columns.
data = pd.read_csv('./_74429.csv')

In [None]:
# Sample text (row label 3 of the `news` column) used to spot-check the cleaner.
WORD = data.loc[3, 'news']

In [None]:
# Tokens removed from the text during cleaning.
#
# Entries are lower-case and apostrophe-free ("dont", "theyre", ...) because
# the cleaning step strips every non-alphanumeric character and lower-cases a
# token before looking it up here.
# NOTE(review): a side effect of dropping apostrophes is that several
# contraction remnants coincide with ordinary words ("hell", "shell", "well",
# "wed", "shed", "ill", "id", "were", "cant", "wont"), so those words are
# removed as well -- confirm this is intended.
#
# Each entry appears exactly once; order follows first occurrence.
stopwords = [
    # Standard stopwords
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "arent", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both",
    "but", "by",
    "cant", "cannot", "could", "couldnt",
    "did", "didnt", "do", "does", "doesnt", "doing", "dont", "down", "during",
    "each", "few", "for", "from", "further",
    "had", "hadnt", "has", "hasnt", "have", "havent", "having",
    "he", "hed", "hell", "hes", "her", "here", "heres", "hers", "herself",
    "him", "himself", "his", "how", "hows",
    "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "isnt", "it",
    "its", "itself",
    "lets", "me", "more", "most", "mustnt", "my", "myself",
    "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other",
    "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "shed", "shell", "shes", "should", "shouldnt", "so",
    "some", "such",
    "than", "that", "thats", "the", "their", "theirs", "them", "themselves",
    "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre",
    "theyve", "this", "those", "through", "to", "too",
    "under", "until", "up", "very",
    # "were" covers both the verb and the apostrophe-stripped "we're".
    "was", "wasnt", "we", "wed", "well", "were", "weve", "werent", "what",
    "whats", "when", "whens", "where", "wheres", "which", "while", "who",
    "whos", "whom", "why", "whys", "with", "wont", "would", "wouldnt",
    "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
    "yourselves",
    # Social media filler words
    "rt", "via", "lol", "lmao", "omg", "idk", "tbh", "btw", "pls", "plz",
    "u", "ur", "r", "imho", "irl", "smh", "fyi", "yea", "yeah", "yup",
    "nope", "okay", "ok", "k",
    # Noise words
    "breaking", "update", "alert", "exclusive", "viral", "share", "repost",
    "read", "watch", "click", "follow", "true", "false", "real", "fake",
    "hoax", "scam",
    # Very frequent low-signal verbs
    "say", "says", "said", "tell", "told", "think", "thought", "know",
    "known", "report", "reported", "claim", "claimed", "claims", "show",
    "shown", "shows", "make", "makes", "made", "see", "seen", "look", "looks",
    # Generic nouns that almost never contribute to classification
    "people", "person", "man", "woman", "guy", "guys", "thing", "stuff",
    "someone", "everyone", "anyone",
    # Misinformation bait ("watch" is already listed under noise words)
    "wow", "shocking", "unbelievable", "insane", "must", "truth", "facts",
    "omfg", "literally",
]

In [None]:
lem = WordNetLemmatizer()

# Everything outside ASCII letters and digits is removed from a token.
_NON_ALNUM = re.compile(r"[^a-zA-Z0-9]")


def cleanse(word):
  """Normalise a piece of text and drop stopwords.

  Each whitespace-separated token has every character outside
  ``[a-zA-Z0-9]`` removed, is lower-cased and is lemmatised with ``lem``.
  A token is discarded when it is empty after stripping, or when either its
  stripped form or its lemma is listed in ``stopwords``.

  Args:
    word: Text to clean. Non-string values (for example the NaN pandas uses
      for a missing cell) are treated as empty text.

  Returns:
    The surviving lemmas joined by single spaces, or "" if none survive.
  """
  if not isinstance(word, str):
    return ""

  # Built per call so edits to `stopwords` are always picked up; a set makes
  # each membership test constant-time instead of a scan of the whole list.
  blocked = frozenset(stopwords)

  kept = []
  for token in word.split():
    normalized = _NON_ALNUM.sub("", token).lower()
    # Check the surface form before lemmatising: the lemmatiser can rewrite a
    # listed word into one that is not listed (e.g. "facts" -> "fact";
    # WordNet's noun rules are also commonly seen to turn "was" into "wa").
    # Empty strings come from punctuation-only tokens and are skipped so they
    # do not leave stray spaces in the result.
    if not normalized or normalized in blocked:
      continue
    lemma = lem.lemmatize(normalized)
    if lemma not in blocked:
      kept.append(lemma)
  return " ".join(kept)


# Spot-check: show the cleaned form of the sample text held in WORD.
print(cleanse(WORD))


In [None]:
# Cleaned dataset: the cleansed `news` text next to the original `label`
# values, which are stored under the column name "class".
data_frame = pd.DataFrame(
    {
        "news": data["news"].map(cleanse),
        "class": data["label"],
    }
)

In [None]:
# Persist the cleaned dataset; index=False leaves the row index out of the file.
data_frame.to_csv("_74429_V01.csv", index=False)

# Embeddings

In [None]:
import pandas as pd
import pickle as pk
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [None]:
# Reload the cleaned dataset (columns `news` and `class`) from disk.
# Note: this rebinds `data`, which previously held the raw dataset.
data = pd.read_csv("./_74429_V01.csv")

In [None]:
# Sentence-embedding model used below to encode every `news` text.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# A text that was cleaned down to nothing is written as an empty CSV field,
# which read_csv loads back as NaN. Fill those with "" before casting,
# otherwise astype('str') turns them into the literal word "nan" and that
# word gets embedded. Rows are kept (not dropped) so positions still match
# the `class` column.
data['news'] = data['news'].fillna('').astype('str')

In [None]:
# Encode the whole column in a single call so the model can batch the texts
# instead of running one forward pass per row; encode() draws its own
# progress bar. The result is unpacked back into a list holding one 1-D
# vector per row, in the same order as `data['news']`.
embeddings = list(model.encode(data['news'].tolist(), show_progress_bar=True))

In [None]:
# Write the embeddings to disk. The context manager closes the file even if
# pickling raises part-way through.
with open('./embeddings.pkl', 'wb') as file:
    pk.dump({"embeddings": embeddings}, file)

In [None]:
# Labels are read from the same cleaned CSV the embeddings were computed from,
# so entry i here is the label of row i in that file. `classes` is a pandas
# Series, so the pickle needs pandas available when it is loaded.
classes = pd.read_csv("./_74429_V01.csv")['class']
# The context manager closes the file even if pickling raises.
with open('./embedding_classes.pkl', 'wb') as file:
    pk.dump({"embedding_classes": classes}, file)