In [1]:
import pandas as pd

In [2]:
import spacy
from spacy.tokens import Doc

In [3]:
# Request GPU execution before the pipeline is loaded in the next cell.
# The recorded output of this cell is True.
spacy.require_gpu()

True

In [4]:
# Load the saved pipeline from disk; every later cell uses this `nlp` object.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative or configurable so the notebook runs on other machines.
nlp = spacy.load("C:/project/spacy_pipeline")

In [5]:
# Put a sentencizer at the very front of the pipeline, but only if it is not
# already there. Adding a component under a name that is already registered
# is an error in spaCy, so without the guard this cell could not be re-run on
# a live kernel.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer", first=True)

<spacy.pipeline.sentencizer.Sentencizer at 0x214a3545f80>

In [6]:
# Show the component order after the insertion above; the recorded output
# lists the sentencizer first, followed by transformer, tagger and ner.
print(nlp.pipe_names)

['sentencizer', 'transformer', 'tagger', 'ner']


In [7]:
# Read the two corpora from CSV files located relative to the working
# directory. Later cells rely on each frame having a "text" column.
fake_news = pd.read_csv("Fake.csv")
real_news = pd.read_csv("True.csv")

In [8]:
# Matches, anchored at the start of the string: any run of characters (lazy),
# a parenthesised group, optional whitespace, a hyphen, optional whitespace.
# Presumably this targets a news-wire style lead-in such as "CITY (Agency) - "
# — confirm against the data.
# NOTE(review): the lazy prefix has no length limit, so a text whose first
# "(...) -" occurs far into its first line loses everything before it as well.
LEADING_PREFIX_PATTERN = r'^.*?\([^)]*\)\s*-\s*'


def strip_leading_prefix(texts):
    """Return ``texts`` with the leading ``... (...) - `` prefix removed.

    Args:
        texts: A pandas Series of strings (anything exposing the ``.str``
            accessor).

    Returns:
        A new Series in which every value matching ``LEADING_PREFIX_PATTERN``
        has that match replaced by the empty string; other values are
        returned unchanged.
    """
    return texts.str.replace(LEADING_PREFIX_PATTERN, '', regex=True)


# Apply the same cleaning to both corpora through one helper so the two can
# never drift apart.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time would strip a further prefix from any text that still matches.
fake_news["text"] = strip_leading_prefix(fake_news["text"])
real_news["text"] = strip_leading_prefix(real_news["text"])

In [9]:
# Pull out just the "text" column of each corpus; these Series are what is
# later streamed through nlp.pipe for feature extraction.
fake_articles = fake_news["text"]
real_articles = real_news["text"]

In [10]:
def get_count(doc):
    """Count how often each ``tag_`` and each ``ent_type_`` value occurs in ``doc``.

    Args:
        doc: Iterable of tokens exposing ``tag_`` and ``ent_type_`` attributes.

    Returns:
        A ``(pos_count, ent_count)`` tuple of plain dicts mapping every
        observed value to its number of occurrences, keyed in first-seen
        order. Every token contributes to both dicts, so whatever value
        ``ent_type_`` holds for a token outside any entity becomes a key too.
    """
    def tally(values):
        # Frequency table for one stream of attribute values.
        totals = {}
        for value in values:
            totals[value] = totals.get(value, 0) + 1
        return totals

    return (
        tally(token.tag_ for token in doc),
        tally(token.ent_type_ for token in doc),
    )

In [11]:
def get_embedding(doc):
    """Return the ``[0, 0, :]`` slice of the doc's first transformer tensor.

    Args:
        doc: Object whose ``doc._.trf_data.tensors`` is an indexable sequence
            of arrays; the first array is indexed along three axes.

    Returns:
        ``doc._.trf_data.tensors[0][0, 0, :]`` — a 1-D slice along the last
        axis of that array.
    """
    # presumably axis 0 is the span/window and axis 1 the wordpiece position,
    # making this the first wordpiece vector of the first window — TODO confirm
    # NOTE(review): only index 0 along the first axis is read; if long texts
    # are split into several windows, the later ones do not contribute.
    return doc._.trf_data.tensors[0][0, 0, :]

In [None]:
def get_features(doc):
    """Build one flat feature row for ``doc``.

    The row holds, in this order: the counts of five selected ``tag_`` values,
    the counts of three selected ``ent_type_`` values, the token count
    (``len(doc)``), and one ``emb_<i>`` entry per element of the vector
    returned by ``get_embedding``.

    Args:
        doc: A processed document accepted by ``get_count``, ``len`` and
            ``get_embedding``.

    Returns:
        A ``pd.Series`` indexed by feature name. Tags that never occur in the
        document get a count of 0.
    """
    pos_tags = ["NOUN", "VERB", "ADJ", "ADV", "PRON"]
    ent_tags = ["PER", "ORG", "LOC"]

    tag_totals, entity_totals = get_count(doc)
    n_tokens = len(doc)
    vector = get_embedding(doc)

    # Insertion order fixes the column order of the resulting Series.
    row = {label: tag_totals.get(label, 0) for label in pos_tags}
    for label in ent_tags:
        row[label] = entity_totals.get(label, 0)
    row['token_count'] = n_tokens

    for position, component in enumerate(vector):
        # Elements exposing .get() are unwrapped through it (presumably CuPy
        # device values when running on GPU — confirm); anything else is
        # stored as-is.
        # NOTE(review): this unwraps one element at a time.
        row[f'emb_{position}'] = component.get() if hasattr(component, 'get') else component

    return pd.Series(row)

In [16]:
# Stream every fake article through the pipeline in batches of 64 and collect
# one feature row per processed document.
fake_features = [
    get_features(doc)
    for doc in nlp.pipe(fake_articles, batch_size=64)
]

# Stack the per-document rows into a single frame, one row per article.
fake_df = pd.DataFrame(fake_features)

Token indices sequence length is longer than the specified maximum sequence length for this model (1736 > 512). Running this sequence through the model will result in indexing errors


In [17]:
fake_df.shape

(23481, 777)

In [18]:
fake_df.head()

Unnamed: 0,NOUN,VERB,ADJ,ADV,PRON,PER,ORG,LOC,token_count,emb_0,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,70,63,32,30,49,19,38,13,621,0.49166468,...,-0.99469244,-0.9691234,-1.1756288,-0.28396177,0.38695464,-0.21877588,-0.36717254,-0.67960757,-0.73541623,-0.17068739
1,60,32,24,11,15,23,15,10,352,0.7399958,...,-1.0174954,-1.4178197,-1.3892937,-0.14997888,0.47479692,-0.29169118,-0.5290003,-1.0338486,-0.9278441,-0.025926411
2,113,78,31,23,70,22,31,13,710,0.87741137,...,-0.74752975,-1.3563344,-1.4687192,-0.24174379,0.39115188,-0.2931207,-0.64882225,-0.80387855,-0.960147,-0.11864097
3,82,55,18,26,30,26,9,5,541,0.75685537,...,-0.69399625,-1.4524714,-1.4473013,-0.24620362,0.3863606,-0.24031949,-0.7331426,-1.1221443,-1.04414,-0.008274795
4,71,65,19,16,41,14,7,16,475,0.059116684,...,0.121360816,-1.0237797,-0.38299266,-0.28531605,0.60456675,0.21088573,0.19139396,0.016843403,0.12979959,-0.61656046


In [19]:
# Stream every real article through the pipeline in batches of 64 and collect
# one feature row per processed document.
real_features = [
    get_features(doc)
    for doc in nlp.pipe(real_articles, batch_size=64)
]

# Stack the per-document rows into a single frame, one row per article.
real_df = pd.DataFrame(real_features)

In [20]:
real_df.shape

(21417, 777)

In [21]:
real_df.head()

Unnamed: 0,NOUN,VERB,ADJ,ADV,PRON,PER,ORG,LOC,token_count,emb_0,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,167,104,61,27,38,9,42,15,882,-0.34478572,...,1.1484325,0.96233004,1.4939576,0.02855343,-0.56160927,0.03955583,0.24786349,0.7841359,1.0551856,-0.011685535
1,151,103,38,14,25,12,26,11,711,-0.47281256,...,1.0105367,0.9513012,1.5497067,-0.04653076,-0.32387793,0.07917192,0.105632976,1.0099013,1.3343059,0.09810337
2,89,60,29,6,38,16,24,18,519,-0.35205472,...,0.8666072,0.89774895,1.3697631,0.11708815,-0.4107434,0.11244487,0.20330209,1.0099039,1.0146673,0.049866267
3,74,47,26,11,18,16,16,30,432,-0.4820314,...,0.90372217,1.1081113,1.5619082,-0.089562096,-0.5314894,0.28768918,0.18642981,0.7435344,1.1432315,0.109753005
4,227,83,67,33,40,11,64,23,990,-0.39500818,...,1.064916,0.9793283,1.4728062,0.005544428,-0.48552608,0.03723717,0.0794876,0.95452595,1.2365569,0.041941077


In [22]:
# Persist both feature tables as CSV in the working directory; index=False
# leaves the row index out of the files.
fake_df.to_csv('fake_data.csv', index=False)
real_df.to_csv('real_data.csv', index=False)