In [94]:
from src.utils.FeatureConfig import FeatureConfig
from src.utils.FeatureExtractor import FeatureExtractor
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import os
import warnings
from tqdm import tqdm
import re
import json
import random
import pickle

warnings.filterwarnings("ignore")

def loadDataset(path):
    """Return the text of every entry found directly inside ``path``.

    Args:
        path: directory to read; every entry is opened as a text file.

    Returns:
        A list with one string per entry, in ``os.listdir`` order.
    """
    def readText(name):
        # Open relative to the dataset directory and slurp the whole file.
        with open(os.path.join(path, name)) as handle:
            return handle.read()

    return [readText(name) for name in os.listdir(path)]

def dataToFeatures(data, featureExtractor):
    """Extract features for every document in ``data``.

    The previous version called ``extract()`` without the document and never
    returned the collected list, so it always produced ``None``.

    Args:
        data: iterable of documents.
        featureExtractor: object exposing an ``extract(doc)`` method.

    Returns:
        A list holding ``featureExtractor.extract(doc)`` for each document,
        in input order.

    NOTE(review): other cells in this file call ``featureExtractor.extract``
    with a whole list of documents; confirm it also accepts a single one.
    """
    return [featureExtractor.extract(doc) for doc in data]

def kFoldsValidation(X, y, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    Run k-fold cross validation of a StandardScaler + SVC pipeline.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, ); the metrics treat 1 as positive and 0 as negative
        k: number of folds. Each fold tests on n // k consecutive rows, so the
           trailing n % k rows are only ever used for training.
        shuffle: when True (default) the rows are shuffled with numpy's global
           RNG before being split into folds. Previously this flag was ignored
           and the rows were always shuffled.
        on_pos, on_neg: unused; kept so existing calls keep working.

    Return:
        A tuple (accuracy, precision, recall), each averaged over the k folds.
        A fold whose ratio has a zero denominator contributes 0.
    '''
    def safeRatio(numerator, denominator):
        # Folds with a zero denominator count as 0 instead of NaN.
        return np.divide(numerator, denominator,
                         out=np.zeros(numerator.shape),
                         where=denominator != 0)

    numberOfSample = X.shape[0]
    batchSize = numberOfSample // k
    # Labels travel in column 0 so a single row shuffle keeps them aligned
    # with their features; the caller's arrays are not modified.
    labelled = np.concatenate([y.reshape(-1, 1), X], axis=1)
    if shuffle:
        np.random.shuffle(labelled)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    allRows = np.arange(numberOfSample)

    tp, tn, fp, fn = [], [], [], []
    for fold in range(k):
        testingIndices = np.arange(fold * batchSize, (fold + 1) * batchSize)
        trainingIndices = np.delete(allRows, testingIndices)

        clf.fit(labelled[trainingIndices, 1:], labelled[trainingIndices, 0])

        testY = labelled[testingIndices, 0]
        predict = clf.predict(labelled[testingIndices, 1:])

        correct = predict == testY
        positive = testY == 1
        negative = testY == 0
        tp.append((correct & positive).sum())
        tn.append((correct & negative).sum())
        fp.append((~correct & negative).sum())
        fn.append((~correct & positive).sum())

    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)

    acc = safeRatio(tp + tn, tp + tn + fp + fn)
    precision = safeRatio(tp, tp + fp)
    recall = safeRatio(tp, tp + fn)

    return acc.mean(), precision.mean(), recall.mean()

def kFoldsValidationAug(X, y, X_aug, y_aug, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    k-fold cross validation where every training fold is extended with the
    same extra (augmented) samples; testing only uses rows of X.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, ); the metrics treat 1 as positive and 0 as negative
        X_aug: extra training rows with m features, appended to every training fold
        y_aug: labels for X_aug
        k: number of folds. Each fold tests on n // k consecutive rows, so the
           trailing n % k rows are only ever used for training.
        shuffle: when True (default) the rows of X/y are shuffled with numpy's
           global RNG before folding. Previously this flag was ignored and the
           rows were always shuffled.
        on_pos, on_neg: unused; kept so existing calls keep working.

    Return:
        A tuple (accuracy, precision, recall), each averaged over the k folds.
        A fold whose ratio has a zero denominator contributes 0.

    NOTE(review): all of X_aug is added to every training fold. If its rows
    are derived from rows of X (as the augmentation cell in this file does),
    augmented copies of held-out samples end up in training, which can
    inflate the scores. Confirm this is intended.
    '''
    def safeRatio(numerator, denominator):
        # Folds with a zero denominator count as 0 instead of NaN.
        return np.divide(numerator, denominator,
                         out=np.zeros(numerator.shape),
                         where=denominator != 0)

    numberOfSample = X.shape[0]
    batchSize = numberOfSample // k
    # Labels travel in column 0 so a single row shuffle keeps them aligned
    # with their features; the caller's arrays are not modified.
    labelled = np.concatenate([y.reshape(-1, 1), X], axis=1)
    if shuffle:
        np.random.shuffle(labelled)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    allRows = np.arange(numberOfSample)

    tp, tn, fp, fn = [], [], [], []
    for fold in range(k):
        testingIndices = np.arange(fold * batchSize, (fold + 1) * batchSize)
        trainingIndices = np.delete(allRows, testingIndices)

        # Original training rows first, augmented rows after them.
        trainX = np.concatenate([labelled[trainingIndices, 1:], X_aug])
        trainY = np.concatenate([labelled[trainingIndices, 0], y_aug])
        clf.fit(trainX, trainY)

        testY = labelled[testingIndices, 0]
        predict = clf.predict(labelled[testingIndices, 1:])

        correct = predict == testY
        positive = testY == 1
        negative = testY == 0
        tp.append((correct & positive).sum())
        tn.append((correct & negative).sum())
        fp.append((~correct & negative).sum())
        fn.append((~correct & positive).sum())

    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)

    acc = safeRatio(tp + tn, tp + tn + fp + fn)
    precision = safeRatio(tp, tp + fp)
    recall = safeRatio(tp, tp + fn)

    return acc.mean(), precision.mean(), recall.mean()

# To test on syntax transfer fake news
def kFoldsValidationReplaceTestSet(X, y, X_test, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    k-fold cross validation that trains on the original features but, at test
    time, swaps in the alternative features of every positive sample.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, ); the metrics treat 1 as positive and 0 as negative
        X_test: a n * m matrix aligned row-by-row with X. In a test fold, rows
           whose label is non-zero are evaluated on their X_test features;
           rows labelled 0 keep their X features.
        k: number of folds. Each fold tests on n // k consecutive rows, so the
           trailing n % k rows are only ever used for training.
        shuffle: when True (default) the rows are shuffled with numpy's global
           RNG before folding. Previously this flag was ignored and the rows
           were always shuffled.
        on_pos, on_neg: unused; kept so existing calls keep working.

    Return:
        A tuple (accuracy, precision, recall), each averaged over the k folds.
        A fold whose ratio has a zero denominator contributes 0.

    Raises:
        ValueError: if X_test does not have the same shape as X.
    '''
    def safeRatio(numerator, denominator):
        # Folds with a zero denominator count as 0 instead of NaN.
        return np.divide(numerator, denominator,
                         out=np.zeros(numerator.shape),
                         where=denominator != 0)

    if np.shape(X_test) != np.shape(X):
        # A narrower X_test could otherwise be silently broadcast into rows.
        raise ValueError(
            "X_test must have the same shape as X, got {} and {}".format(
                np.shape(X_test), np.shape(X)))

    numberOfSample = X.shape[0]
    numberOfFeatures = X.shape[1]
    batchSize = numberOfSample // k
    # Column 0: label, next m columns: original features, last m columns:
    # replacement features. One row shuffle keeps all three aligned.
    labelled = np.concatenate([y.reshape(-1, 1), X, X_test], axis=1)
    if shuffle:
        np.random.shuffle(labelled)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    allRows = np.arange(numberOfSample)

    tp, tn, fp, fn = [], [], [], []
    for fold in range(k):
        testingIndices = np.arange(fold * batchSize, (fold + 1) * batchSize)
        trainingIndices = np.delete(allRows, testingIndices)

        trainX = labelled[trainingIndices, 1:1 + numberOfFeatures]
        trainY = labelled[trainingIndices, 0]
        clf.fit(trainX, trainY)

        testY = labelled[testingIndices, 0]
        originalPart = labelled[testingIndices, 1:1 + numberOfFeatures]
        replacedPart = labelled[testingIndices, 1 + numberOfFeatures:]
        # Rows with a non-zero label are scored on the replacement features.
        useReplacement = (testY != 0)[:, None]
        testX = np.where(useReplacement, replacedPart, originalPart).astype(np.float64)

        predict = clf.predict(testX)
        correct = predict == testY
        positive = testY == 1
        negative = testY == 0
        tp.append((correct & positive).sum())
        tn.append((correct & negative).sum())
        fp.append((~correct & negative).sum())
        fn.append((~correct & positive).sum())

    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)

    acc = safeRatio(tp + tn, tp + tn + fp + fn)
    precision = safeRatio(tp, tp + fp)
    recall = safeRatio(tp, tp + fn)

    return acc.mean(), precision.mean(), recall.mean()

def trainAndTest(X, y, X_test, y_test):
    """Fit a StandardScaler + SVC pipeline on (X, y) and score it on a test set.

    Args:
        X: training feature matrix.
        y: training labels.
        X_test: test feature matrix.
        y_test: test labels; 1 is treated as positive and 0 as negative.
            Plain sequences are accepted and converted to an array.

    Returns:
        A tuple (accuracy, precision, recall). A ratio whose denominator is
        zero is reported as 0, matching the k-fold helpers in this file
        (previously it came back as NaN).
    """
    def safeRatio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    # Boolean masks below need an array; a list would compare as a whole.
    y_test = np.asarray(y_test)

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    clf.fit(X, y)
    predict = clf.predict(X_test)

    correct = predict == y_test
    positive = y_test == 1
    negative = y_test == 0
    tp = (correct & positive).sum()
    tn = (correct & negative).sum()
    fp = (~correct & negative).sum()
    fn = (~correct & positive).sum()

    acc = safeRatio(tp + tn, tp + tn + fp + fn)
    precision = safeRatio(tp, tp + fp)
    recall = safeRatio(tp, tp + fn)

    return acc, precision, recall


In [73]:
# Corpus directories for the two classes.
legitBase = 'data/fakeNewsDatasets/fakeNewsDataset/legit/'
fakeBase = 'data/fakeNewsDatasets/fakeNewsDataset/fake/'

# Every legit file path first, then every fake file path.
pathesToFiles = [
    os.path.join(base, name)
    for base in (legitBase, fakeBase)
    for name in os.listdir(base)
]

# Feature switches handed to FeatureConfig as keyword arguments.
featureSettings = {
    'init': False,
    'pathesToFiles': pathesToFiles,
    'sourceBase': 'src.utils',
    'bow': True,
    'vocabSize': 500,
    'gi': True,
    'pathToGI': 'inquirerbasic.xls',
    'pos': True,
    'posBigram': False,
    'posTrigram': False,
    'collectPosFromCorpus': True,
    'production': True,
    'collectProductionFromCorpus': True,
    'readability': True,
    'quantity': True,
    'sentiment': True,
    'punctutation': False,
}
config = FeatureConfig(**featureSettings)

featureExtractor = FeatureExtractor(config)

In [74]:
# Load the extractor state saved at save/state_backup.pkl.
# NOTE(review): presumably this restores corpus-derived state (vocabulary,
# production list, ...) so it need not be rebuilt - confirm in FeatureExtractor.load.
featureExtractor.load("save/state_backup.pkl")

In [71]:
# Raw document texts for both classes, one string per file.
legitData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/legit/')
fakeData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/fake/')

# Transfer (attack) corpora are currently disabled; the "Attack" cells further
# down need these lines (and the matching extraction) re-enabled to run.
# legitTransferData = loadDataset('data/fakeNewsDatasets/transferFakeNewsDataset/legit/')
# fakeTransferData = loadDataset('data/fakeNewsDatasets/transferFakeNewsDataset/fake/')

In [None]:
# Transfer features are disabled together with the transfer corpora; cells
# that reference legitTransferFeatures / fakeTransferFeatures need them.
# legitTransferFeatures = featureExtractor.extract(legitTransferData)
# fakeTransferFeatures = featureExtractor.extract(fakeTransferData)
# Feature extraction for the two original corpora (whole list per call).
legitFeatures = featureExtractor.extract(legitData)
fakeFeatures = featureExtractor.extract(fakeData)

In [77]:
# import pickle

# One-off dump used to create the cached feature files loaded below.
# with open('save/legit_feature.pkl', 'wb') as f:
#     pickle.dump(legitFeatures, f)
# with open('save/fake_feature.pkl', 'wb') as f:
#     pickle.dump(fakeFeatures, f)
# Reload the cached features; this overwrites any values computed earlier.
# NOTE: pickle.load executes arbitrary code - only load files you created.
with open('save/legit_feature.pkl', 'rb') as f:
    legitFeatures = pickle.load(f)
with open('save/fake_feature.pkl', 'rb') as f:
    fakeFeatures = pickle.load(f)

In [88]:
def addProduction(tree, productions):
    """Accumulate grammar production counts from one parse tree.

    Args:
        tree: dict mapping a node id to the list of its child ids. Traversal
            starts at the key 'ROOT'. A child that is not itself a key of
            ``tree`` is treated as a leaf. Any ``-<digits>`` suffix in an id
            is stripped to obtain the grammar symbol.
        productions: dict updated in place so that ``productions[lhs][rhs]``
            counts how often ``lhs -> rhs`` occurs. A node whose only child
            is a leaf is recorded with the rhs "[END]".

    The left-hand side used to be normalised with ``r'.\\d+'`` (any character
    followed by digits) while the right-hand side used ``r'-\\d+'``; both now
    share one pattern so a symbol is spelled the same on either side.
    """
    def symbol(node):
        # Drop the "-<n>" disambiguation suffix from a node id.
        return re.sub(r'-\d+', '', node)

    def dfs(node):
        if node not in tree:
            return  # leaf: contributes no production of its own

        children = tree[node]
        lhs = symbol(node)
        if len(children) == 1 and children[0] not in tree:
            rhs = "[END]"
        else:
            rhs = ' '.join(symbol(child) for child in children)

        rhsCounts = productions.setdefault(lhs, {})
        rhsCounts[rhs] = rhsCounts.get(rhs, 0) + 1

        for child in children:
            dfs(child)

    dfs('ROOT')

def normalize_productions(productions):
    """Turn per-lhs production counts into relative frequencies.

    Args:
        productions: dict of dicts, ``productions[lhs][rhs]`` being a count.

    Returns:
        A new dict of dicts with the same keys where each count is divided by
        the total count of its lhs. The input is left untouched.
    """
    normalized = {}
    for lhs, rhsCounts in productions.items():
        total = sum(rhsCounts.values())
        normalized[lhs] = {rhs: count / total for rhs, count in rhsCounts.items()}
    return normalized
        
# Production counts per class, filled from the pre-parsed tree files.
legit_production_counts = {}
fake_production_counts = {}

treeSources = (
    ('data/fakeNewsDatasets/fakeNewsDatasetTree/legit/', legit_production_counts),
    ('data/fakeNewsDatasets/fakeNewsDatasetTree/fake/', fake_production_counts),
)

for path, counts in treeSources:
    for file in os.listdir(path):
        with open(os.path.join(path, file)) as f:
            lines = f.readlines()
        # Only the first line of each file is parsed as the JSON tree.
        tree = json.loads(lines[0])
        addProduction(tree, counts)

In [79]:
def sample_from_production_distribution(production_counts, productions):
    """Sample one derivation from the counted grammar and tally its rules.

    Starting at 'ROOT', a right-hand side is drawn for each symbol with
    probability proportional to its count. A drawn rule is tallied, and its
    children expanded, only when it is not "[END]" and its name
    ``"<lhs> -> <rhs>"`` is one of ``productions``; otherwise that branch
    stops.

    Args:
        production_counts: dict of dicts, ``production_counts[lhs][rhs]``
            being the weight of the rule.
        productions: sequence of rule names fixing the output order.

    Returns:
        A numpy array with, for each entry of ``productions``, how many times
        that rule was used in the sampled derivation.
    """
    tally = dict.fromkeys(productions, 0)

    def expand(symbol):
        if symbol == "[END]":
            return
        options = production_counts[symbol]
        choice = random.choices(list(options.keys()), list(options.values()))[0]
        if choice == "[END]":
            return
        ruleName = '{} -> {}'.format(symbol, choice)
        if ruleName not in tally:
            return  # untracked rule: stop descending here
        tally[ruleName] += 1
        for child in choice.split():
            expand(child)

    expand('ROOT')
    return np.array([tally[production] for production in productions])

In [80]:
# Smoke test: draw one synthetic production-count vector.
# The original referenced `production_counts`, which no visible cell defines
# (NameError on a fresh run); the legit grammar counts are used instead.
sample = sample_from_production_distribution(legit_production_counts, featureExtractor.extractors[3].featureName())

In [97]:
# augmentation
def _withResampledProductions(row, production_counts):
    # Copy one feature row and overwrite columns 726:3563 with a freshly
    # sampled production-count vector (names taken from extractors[3]).
    augmented = row.copy()
    augmented[726:3563] = sample_from_production_distribution(production_counts, featureExtractor.extractors[3].featureName())
    return augmented

# One augmented row per original row, sampled from the same class's grammar.
aug_legit = np.array([_withResampledProductions(row, legit_production_counts) for row in legitFeatures])
aug_fake = np.array([_withResampledProductions(row, fake_production_counts) for row in fakeFeatures])

In [87]:
from tqdm import tqdm
# Before attack: plain 10-fold validation on the original features, repeated
# `times` times and averaged. (An older variant that mixed an `aug_sample`
# set into X used to be kept here as commented-out code.)
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
times = 100
totals = np.zeros(3)  # running sums of accuracy, precision, recall
for _ in tqdm(range(times)):
    totals += kFoldsValidation(X, y)
acc, precision, recall = totals / times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

100%|██████████| 100/100 [03:18<00:00,  1.98s/it]

accuracy: 0.59, precision: 0.57, recall: 0.71.





In [98]:
# augmentation: same evaluation, with the resampled rows added to training
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
X_aug = np.concatenate([aug_legit, aug_fake])
y_aug = np.array([0] * len(aug_legit) + [1] * len(aug_fake))

times = 100
totals = np.zeros(3)  # running sums of accuracy, precision, recall
for _ in tqdm(range(times)):
    totals += kFoldsValidationAug(X, y, X_aug, y_aug)
acc, precision, recall = totals / times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

100%|██████████| 100/100 [19:09<00:00, 11.49s/it]

accuracy: 0.78, precision: 0.78, recall: 0.77.





In [None]:
# Scratch cell: evaluates to the function object only; nothing is run.
kFoldsValidationAug

In [None]:
# Scratch cell: number of feature names reported by the extractor.
len(featureExtractor.featureName())

In [None]:
# Per-column spread and average of columns 726:3563 of the legit features,
# the same column range the augmentation cell overwrites.
std = np.std(legitFeatures[:, 726:3563], axis = 0)
mean = np.mean(legitFeatures[:, 726:3563], axis = 0)

In [None]:
# Scratch cell: largest per-column mean computed in the previous cell.
mean.max()

In [None]:
# Before attack, each module: repeat the baseline evaluation using only the
# feature columns that belong to one extractor at a time.
times = 100
for extractor in featureExtractor.extractors:
    moduleName = extractor.__class__.__name__
    indices = featureExtractor.extractorIndices()[moduleName]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))

    totals = np.zeros(3)  # running sums of accuracy, precision, recall
    for _ in range(times):
        totals += kFoldsValidation(X, y, k=10)

    acc, precision, recall = totals / times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(moduleName, acc, precision, recall))

In [None]:
# Attack: train on the original features, test positives on transfer features.
# Needs legitTransferFeatures / fakeTransferFeatures, whose extraction is
# currently commented out earlier in the notebook.
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
X_Transfer = np.concatenate([legitTransferFeatures, fakeTransferFeatures], axis = 0)

times = 100
totals = np.zeros(3)  # running sums of accuracy, precision, recall
for _ in range(times):
    totals += kFoldsValidationReplaceTestSet(X, y, X_Transfer)
acc, precision, recall = totals / times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

In [None]:
# Attack, each module: the attack evaluation restricted to one extractor's
# feature columns at a time. Needs the transfer features, whose extraction is
# currently commented out earlier in the notebook.
times = 100
for extractor in featureExtractor.extractors:
    moduleName = extractor.__class__.__name__
    indices = featureExtractor.extractorIndices()[moduleName]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))
    X_Transfer = np.concatenate([legitTransferFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)

    totals = np.zeros(3)  # running sums of accuracy, precision, recall
    for _ in range(times):
        totals += kFoldsValidationReplaceTestSet(X, y, X_Transfer)

    acc, precision, recall = totals / times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(moduleName, acc, precision, recall))

In [None]:
# data augmentation: add the transfer fake news as extra positive samples.
# Needs fakeTransferFeatures, whose extraction is currently commented out
# earlier in the notebook.
X = np.concatenate([legitFeatures, fakeFeatures, fakeTransferFeatures], axis = 0)
# Count the positives from the arrays actually stacked above; the previous
# `2 * len(fakeFeatures)` silently assumed both fake sets have equal size.
y = np.array([0] * len(legitFeatures) + [1] * (len(fakeFeatures) + len(fakeTransferFeatures)))

result = [0, 0, 0]
times = 100
for i in range(times):
    acc, precision, recall = kFoldsValidation(X, y, k=10)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

In [None]:
# Transfer-augmented evaluation restricted to one extractor's feature columns
# at a time. Needs fakeTransferFeatures, whose extraction is currently
# commented out earlier in the notebook.
times = 100
for extractor in featureExtractor.extractors:
    indices = featureExtractor.extractorIndices()[extractor.__class__.__name__]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)
    # Count the positives from the arrays actually stacked above; the previous
    # `2 * len(fakeFeatures)` silently assumed both fake sets have equal size.
    y = np.array([0] * len(legitFeatures) + [1] * (len(fakeFeatures) + len(fakeTransferFeatures)))
    result = [0, 0, 0]
    for i in range(times):
        acc, precision, recall = kFoldsValidation(X, y, k=10)
        result[0] += acc
        result[1] += precision
        result[2] += recall
    acc = result[0]/times
    precision = result[1]/times
    recall = result[2]/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(extractor.__class__.__name__, acc, precision, recall))

In [2]:
# Scratch check: stripping the "-<digits>" suffix from a node id.
re.sub(r'-\d+', '', 'ASP-1')

'ASP'

In [31]:
# Scratch check: weighted draw; returns a one-element list.
random.choices(['a', 'b'], [1, 9])

['b']

In [34]:
# Scratch check: dict keys materialised as a list.
list({'a':0}.keys())

['a']