In [1]:
import pandas as pd

In [2]:
import spacy
from pycaret.classification import *

# Ask spaCy to run on the GPU before any pipeline is loaded; the recorded
# cell output shows this call returned True.
spacy.require_gpu()

True

In [3]:
# Load the spaCy pipeline stored under 'spacy_pipeline'. `nlp` is used further
# down (via nlp.pipe) to process the article texts.
nlp = spacy.load('spacy_pipeline')

In [4]:
# Load the saved model named 'lightgbm_model' (load_model is presumably
# provided by the wildcard pycaret.classification import above).
# NOTE(review): `model` is not referenced by any of the visible cells —
# confirm it is still needed in this notebook.
model = load_model('lightgbm_model')

Transformation Pipeline and Model Successfully Loaded


In [5]:
def get_count(doc):
    """Count how often each POS tag and each entity type occurs in `doc`.

    Every token contributes one count to each of the two dictionaries.

    Args:
        doc: A re-iterable sequence of tokens exposing the `tag_` and
            `ent_type_` attributes (it is iterated twice).

    Returns:
        tuple[dict, dict]: `(pos_count, ent_count)`, where `pos_count` maps
        each observed `tag_` value and `ent_count` maps each observed
        `ent_type_` value to the number of tokens carrying it. Keys appear
        in first-seen order; both dicts are empty for an empty `doc`.
    """
    # Imported locally so this helper stays self-contained and cannot be
    # affected by names brought in through wildcard imports.
    from collections import Counter

    # NOTE(review): counts are keyed by `token.tag_`, while get_features looks
    # up universal POS names such as "NOUN" — confirm the loaded pipeline emits
    # those labels in `tag_` (otherwise `token.pos_` may be what is intended).
    pos_count = dict(Counter(token.tag_ for token in doc))
    ent_count = dict(Counter(token.ent_type_ for token in doc))

    return pos_count, ent_count

def get_embedding(doc):
    """Return one embedding vector taken from the doc's transformer output.

    Reads the first entry of `doc._.trf_data.tensors` and returns its
    `[0, 0, :]` slice, i.e. all values along the last axis at index 0 of the
    first two axes.

    NOTE(review): presumably this is the vector of the first wordpiece of the
    first span — confirm against the transformer component's configuration.
    """
    return doc._.trf_data.tensors[0][0, 0, :]

def get_features(doc):
    """Build the feature row for one processed document.

    The row contains, in this order: the counts of the selected POS tags,
    the counts of the selected entity types, the token count (`len(doc)`),
    and one `emb_<i>` entry per component of the vector returned by
    `get_embedding`.

    Args:
        doc: A processed document accepted by `get_count` and
            `get_embedding` that also supports `len()`.

    Returns:
        pandas.Series: feature values indexed by feature name.
    """
    pos_tags = ["NOUN", "VERB", "ADJ", "ADV", "PRON"]
    ent_tags = ["PER", "ORG", "LOC"]

    pos, ent = get_count(doc)

    # Tags that never occur in the document get an explicit 0.
    features = {tag: pos.get(tag, 0) for tag in pos_tags}
    features.update({tag: ent.get(tag, 0) for tag in ent_tags})
    features['token_count'] = len(doc)

    contextual_embedding = get_embedding(doc)
    # An array exposing `.get()` (as CuPy arrays do) is copied to host memory
    # once, instead of transferring every component separately inside the loop.
    if hasattr(contextual_embedding, 'get'):
        contextual_embedding = contextual_embedding.get()

    for i, embedding in enumerate(contextual_embedding):
        # Kept for containers whose individual elements still expose `.get()`.
        if hasattr(embedding, 'get'):
            embedding = embedding.get()

        features[f'emb_{i}'] = embedding

    return pd.Series(features)

In [6]:
# Read the full dataset from 'WELFake_Dataset.csv' into a DataFrame.
news = pd.read_csv(filepath_or_buffer='WELFake_Dataset.csv')

In [7]:
# Partition the dataset by its `label` column: rows labelled 0 are kept as
# `real_news`, rows labelled 1 as `fake_news`.
real_news = news.loc[news['label'].eq(0)]
fake_news = news.loc[news['label'].eq(1)]

In [8]:
# Pull out the `text` column of each subset; these Series are the inputs that
# are later fed to nlp.pipe.
real_articles = real_news.loc[:, 'text']
fake_articles = fake_news.loc[:, 'text']

In [9]:
# Extract one feature row per real article, streaming the texts through the
# pipeline in batches of 64. Entries that are not strings (such as NaN for a
# missing text) are skipped, mirroring the filter applied to the fake
# articles, so a single bad row cannot abort the whole run.
real_features = [
    get_features(doc)
    for doc in nlp.pipe(
        (article for article in real_articles if isinstance(article, str)),
        batch_size=64,
    )
]

real_df = pd.DataFrame(real_features)

In [11]:
# Keep only the entries that are actual strings; anything else is dropped
# before the texts are handed to the pipeline.
fake_articles = list(filter(lambda article: isinstance(article, str), fake_articles))

In [12]:
# Extract one feature row per fake article, streaming the texts through the
# pipeline in batches of 64, then stack the rows into a DataFrame.
fake_features = [
    get_features(doc)
    for doc in nlp.pipe(fake_articles, batch_size=64)
]

fake_df = pd.DataFrame(fake_features)

In [13]:
# Persist both feature tables as CSV files, leaving out the DataFrame index.
real_df.to_csv(path_or_buf='WELFake_real.csv', index=False)
fake_df.to_csv(path_or_buf='WELFake_fake.csv', index=False)