In [41]:
from src.utils.FeatureConfig import FeatureConfig
from src.utils.ProductionExtractor import ProductionExtractor
from src.utils.FeatureExtractor import FeatureExtractor
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import os
import warnings
from tqdm import tqdm
import re
import json
import random
import pickle
from src.utils.UtilsOfTree import computePCFG
from src.utils.UtilsOfTree import collect_productions_from_json_tree
warnings.filterwarnings("ignore")

def loadDataset(path):
    data = []
    files = os.listdir(path)
    for file in files:
        with open(os.path.join(path, file)) as f:
            data.append(f.read())
    return data

def loadTree(path):
    data = []
    files = os.listdir(path)
    for file in files:
        trees = []
        with open(os.path.join(path, file)) as f:
            trees.extend([json.loads(line) for line in f.readlines()])
        data.append(trees)
    return data

def loadPCFG(article_trees):
    pcfg = {}
    for trees in article_trees:
        for tree in trees:
            productions = collect_productions_from_json_tree(tree)
            computePCFG(productions, pcfg)
    return pcfg

def dataToFeatures(data, featureExtractor):
    
    features = []
    for doc in data:
        features.append(featureExtractor.extract())

def kFoldsValidation(X, y, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, )
    
    Return:
        a list of accuracy
    '''
    numberOfSample = X.shape[0]
    batchSize = numberOfSample//k
    X = np.concatenate([y.reshape(-1, 1), X], axis = 1)
    np.random.shuffle(X)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    
    tp = []
    tn = []
    fp = []
    fn = []
    for i in range(k):
        testingIndices = np.arange(i*batchSize, (i+1)*batchSize)
        trainingIndices = np.delete(np.arange(numberOfSample), testingIndices)
        
        trainX = X[trainingIndices, 1:]
        trainY = X[trainingIndices, 0]
        clf.fit(trainX, trainY)
        
        testX = X[testingIndices, 1:]
        testY = X[testingIndices, 0]

        predict = clf.predict(testX)
        tp.append((predict == testY)[testY == 1].sum())
        tn.append((predict == testY)[testY == 0].sum())
        fp.append((predict != testY)[testY == 0].sum())
        fn.append((predict != testY)[testY == 1].sum())
        
    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)
    
    acc = ((tp + tn)/(tp+tn+fp+fn))
    precision = ((tp)/(tp+fp))
    recall = ((tp)/(tp+fn))
    results = [acc, precision, recall]
    for res in results:
        for i, num in enumerate(res):
            if np.isnan(num):
                res[i] = 0
    
    return acc.mean(), precision.mean(), recall.mean()

def kFoldsValidationAug(X, y, X_aug, y_aug, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, )
    
    Return:
        a list of accuracy
    '''
    numberOfSample = X.shape[0]
    batchSize = numberOfSample//k

    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    
    tp = []
    tn = []
    fp = []
    fn = []
    for i in range(k):
        testingIndices = indices[i*batchSize:(i+1)*batchSize]
        trainingIndices = np.delete(indices, np.arange(i*batchSize, (i+1)*batchSize))
        
        trainX = np.concatenate([X[trainingIndices], X_aug[trainingIndices]])
        trainY = np.concatenate([y[trainingIndices], y_aug[trainingIndices]])
        
        clf.fit(trainX, trainY)
        
        testX = X[testingIndices]
        testY = y[testingIndices]

        predict = clf.predict(testX)
        tp.append((predict == testY)[testY == 1].sum())
        tn.append((predict == testY)[testY == 0].sum())
        fp.append((predict != testY)[testY == 0].sum())
        fn.append((predict != testY)[testY == 1].sum())
        
    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)
    
    acc = ((tp + tn)/(tp+tn+fp+fn))
    precision = ((tp)/(tp+fp))
    recall = ((tp)/(tp+fn))
    results = [acc, precision, recall]
    for res in results:
        for i, num in enumerate(res):
            if np.isnan(num):
                res[i] = 0
    
    return acc.mean(), precision.mean(), recall.mean()

# To test on syntax transfer fake news
def kFoldsValidationReplaceTestSet(X, y, X_test, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, )
    
    Return:
        a list of accuracy
    '''
    numberOfSample = X.shape[0]
    numberOfFeatures = X.shape[1]
    batchSize = numberOfSample//k
    X = np.concatenate([y.reshape(-1, 1), X, X_test], axis = 1)
    np.random.shuffle(X)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    
    tp = []
    tn = []
    fp = []
    fn = []
    for i in range(k):

        testingIndices = np.arange(i*batchSize, (i+1)*batchSize)
        trainingIndices = np.delete(np.arange(numberOfSample), testingIndices)
        
        trainX = X[trainingIndices, 1:1+numberOfFeatures]
        trainY = X[trainingIndices, 0]
        clf.fit(trainX, trainY)
        
        testY = X[testingIndices, 0]
        testX = np.zeros((testY.shape[0], numberOfFeatures))
        allFeatures = X[testingIndices, 1:]

        for i in range(testY.shape[0]):
            if testY[i]:
                testX[i] = allFeatures[i][numberOfFeatures:]
            else:
                testX[i] = allFeatures[i][:numberOfFeatures]

        predict = clf.predict(testX)
        tp.append((predict == testY)[testY == 1].sum())
        tn.append((predict == testY)[testY == 0].sum())
        fp.append((predict != testY)[testY == 0].sum())
        fn.append((predict != testY)[testY == 1].sum())
        
    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)
    
    acc = ((tp + tn)/(tp+tn+fp+fn))
    precision = ((tp)/(tp+fp))
    recall = ((tp)/(tp+fn))
    results = [acc, precision, recall]
    for res in results:
        for i, num in enumerate(res):
            if np.isnan(num):
                res[i] = 0
    
    return acc.mean(), precision.mean(), recall.mean()

def trainAndTest(X, y, X_test, y_test):
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    clf.fit(X, y)
    predict = clf.predict(X_test)

    tp = (predict == y_test)[y_test==1].sum()
    tn = (predict == y_test)[y_test==0].sum()
    fp = (predict != y_test)[y_test==0].sum()
    fn = (predict != y_test)[y_test==1].sum()
    
    acc = (tp+tn)/(tp+tn+fp+fn)
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)

    return acc, precision, recall


In [2]:
pathesToFiles = []
legitBase = 'data/fakeNewsDatasets/fakeNewsDataset/legit/'
fakeBase = 'data/fakeNewsDatasets/fakeNewsDataset/fake/'
for file in os.listdir(legitBase):
    pathesToFiles.append(os.path.join(legitBase, file))
for file in os.listdir(fakeBase):
    pathesToFiles.append(os.path.join(fakeBase, file))

base = "data/fakeNewsDatasets/fakeNewsDatasetTree/"

fake_files = [os.path.join(base, 'fake', file) for file in os.listdir(os.path.join(base, 'fake'))]
real_files = [os.path.join(base, 'legit', file) for file in os.listdir(os.path.join(base, 'legit'))]

pathesToParseTree = fake_files + real_files

config = FeatureConfig(
    init = True,
    pathesToFiles = pathesToFiles,
    pathesToParseTree = pathesToParseTree,
    sourceBase = 'src.utils',
    bow = True,
    vocabSize = 500,
    gi = True,
    pathToGI='inquirerbasic.xls',
    pos = True,
    posBigram = False,
    posTrigram = False,
    collectPosFromCorpus = True,
    production = True,
    collectProductionFromCorpus = True,
    readability = True,
    quantity = True,
    sentiment = True,
    punctutation = False,
)

featureExtractor = FeatureExtractor(config)

2021-04-27 15:37:12 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-04-27 15:37:12 INFO: Use device: gpu
2021-04-27 15:37:12 INFO: Loading: tokenize
2021-04-27 15:37:14 INFO: Loading: ner
2021-04-27 15:37:14 INFO: Done loading processors!


In [None]:
len(featureExtractor.extractors[3].featureName())

In [3]:
legitData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/legit/')
fakeData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/fake/')

legitTree = loadTree('data/fakeNewsDatasets/fakeNewsDatasetTree/legit/')
fakeTree = loadTree('data/fakeNewsDatasets/fakeNewsDatasetTree//fake/')

legitPCFG = loadPCFG(legitTree)
fakePCFG = loadPCFG(fakeTree)
# legitTransferData = loadDataset('data/fakeNewsDatasets/transferFakeNewsDataset/legit/')
# fakeTransferData = loadDataset('data/fakeNewsDatasets/transferFakeNewsDataset/fake/')

In [None]:
# legitTransferFeatures = featureExtractor.extract(legitTransferData)
# fakeTransferFeatures = featureExtractor.extract(fakeTransferData)
legitFeatures = featureExtractor.extract(legitData)
fakeFeatures = featureExtractor.extract(fakeData)

In [36]:
with open("save/legit_feature_new.pkl", 'rb') as f:
    legitFeatures = pickle.load(f)
with open("save/fake_feature_new.pkl", 'rb') as f:
    fakeFeatures = pickle.load(f)

In [8]:
def sample_from_production_distribution(production_counts, productions):

    out = {production : 0 for production in productions}

    def dfs(lhs, production_counts, productions, out):
        
        if lhs != "[END]":
            all_rhs = list(production_counts[lhs].keys())
            counts = [0] * len(all_rhs)
            for i, rhs in enumerate(all_rhs):
                counts[i] = production_counts[lhs][rhs]
            sampled_rhs = random.choices(all_rhs, counts)[0]
            if sampled_rhs != "[END]" and '{}->{}'.format(lhs, sampled_rhs) in out:
                out['{}->{}'.format(lhs, sampled_rhs)] += 1

                for child in sampled_rhs.split():
                    dfs(child, production_counts, productions, out)

    dfs('ROOT', production_counts, productions, out)
    features = [0] * len(productions)
    for i, production in enumerate(productions):
        features[i] = out[production]
    return np.array(features)

In [None]:
sample = sample_from_production_distribution(extractor.pcfg, extractor.featureName())

In [29]:
aug_legit = []
for features in legitFeatures:
    array = features.copy()
    array[726:3610] = sample_from_production_distribution(legitPCFG, featureExtractor.extractors[3].featureName())
    aug_legit.append(array)
aug_legit = np.array(aug_legit)

aug_fake = []
for features in fakeFeatures:
    array = features.copy()
    array[726:3610] = sample_from_production_distribution(fakePCFG, featureExtractor.extractors[3].featureName())
    aug_fake.append(array)
aug_fake = np.array(aug_fake)

In [44]:
from tqdm import tqdm
# Before attack
# X = np.concatenate([legitFeatures, fakeFeatures, aug_sample], axis = 0)
# y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures) + [1] * len(aug_sample))
# result = [0, 0, 0]
# times = 100
# for i in tqdm(range(times)):
#     acc, precision, recall = kFoldsValidation(X, y)
#     result[0] += acc
#     result[1] += precision
#     result[2] += recall
# acc = result[0]/times
# precision = result[1]/times
# recall = result[2]/times
# print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
result = [0, 0, 0]
times = 100
for i in tqdm(range(times)):
    acc, precision, recall = kFoldsValidation(X, y)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

100%|██████████| 100/100 [04:05<00:00,  2.45s/it]

accuracy: 0.59, precision: 0.58, recall: 0.70.





In [45]:
# augmentation
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
X_aug = np.concatenate([aug_legit, aug_fake])
y_aug = np.array([0] * len(aug_legit) + [1] * len(aug_fake))

result = [0, 0, 0]
times = 100
for i in tqdm(range(times)):
    acc, precision, recall = kFoldsValidationAug(X, y, X_aug, y_aug)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

100%|██████████| 100/100 [16:13<00:00,  9.74s/it]

accuracy: 0.62, precision: 0.61, recall: 0.69.





In [35]:
legitFeatures.shape

(240, 1490)

In [None]:
len(featureExtractor.featureName())

In [None]:
std = np.std(legitFeatures[:, 726:3563], axis = 0)
mean = np.mean(legitFeatures[:, 726:3563], axis = 0)

In [47]:
featureExtractor.extractorIndices()

{'GIExtractor': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174

In [46]:
# Before attack, each module
times = 10
for extractor in featureExtractor.extractors:
    indices = featureExtractor.extractorIndices()[extractor.__class__.__name__]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
    result = [0, 0, 0]

    for i in range(times):
        acc, precision, recall = kFoldsValidation(X, y, k=10)
        result[0] += acc
        result[1] += precision
        result[2] += recall

    acc = result[0]/times
    precision = result[1]/times
    recall = result[2]/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(extractor.__class__.__name__, acc, precision, recall))

    GIExtractor      accuracy: 0.45, precision: 0.46, recall: 0.47.
    BOWExtractor     accuracy: 0.50, precision: 0.50, recall: 1.00.
    PosExtractor     accuracy: 0.64, precision: 0.65, recall: 0.64.
ProductionExtractor  accuracy: 0.57, precision: 0.56, recall: 0.65.
ReadabilityExtractor accuracy: 0.59, precision: 0.58, recall: 0.67.
 QuantityExtractor   accuracy: 0.59, precision: 0.58, recall: 0.71.
 SentimentExtractor  accuracy: 0.50, precision: 0.51, recall: 0.46.


In [None]:
# Attack
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
X_Transfer = np.concatenate([legitTransferFeatures, fakeTransferFeatures], axis = 0)

result = [0, 0, 0]
times = 100
for i in range(times):
    acc, precision, recall = kFoldsValidationReplaceTestSet(X, y, X_Transfer)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

In [None]:
# Attack, each module
times = 100
for extractor in featureExtractor.extractors:
    indices = featureExtractor.extractorIndices()[extractor.__class__.__name__]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
    X_Transfer = np.concatenate([legitTransferFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)
    result = [0, 0, 0]
    for i in range(times):
        acc, precision, recall = kFoldsValidationReplaceTestSet(X, y, X_Transfer)
        result[0] += acc
        result[1] += precision
        result[2] += recall
    acc = result[0]/times
    precision = result[1]/times
    recall = result[2]/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(extractor.__class__.__name__, acc, precision, recall))

In [None]:
# data augmentation
X = np.concatenate([legitFeatures, fakeFeatures, fakeTransferFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * 2 * len(fakeFeatures))

result = [0, 0, 0]
times = 100
for i in range(times):
    acc, precision, recall = kFoldsValidation(X, y, k=10)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

In [None]:
times = 100
for extractor in featureExtractor.extractors:
    indices = featureExtractor.extractorIndices()[extractor.__class__.__name__]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)
    y = np.array([0] * len(legitFeatures) + [1] * 2 * len(fakeFeatures)) 
    result = [0, 0, 0]
    for i in range(times):
        acc, precision, recall = kFoldsValidation(X, y, k=10)
        result[0] += acc
        result[1] += precision
        result[2] += recall
    acc = result[0]/times
    precision = result[1]/times
    recall = result[2]/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(extractor.__class__.__name__, acc, precision, recall))

In [None]:
re.sub(r'-\d+', '', 'ASP-1')

In [None]:
random.choices(['a', 'b'], [1, 9])

In [None]:
list({'a':0}.keys())