In [1]:
from src.utils.FeatureConfig import FeatureConfig
from src.utils.FeatureExtractor import FeatureExtractor
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import os
import warnings
import tqdm
warnings.filterwarnings("ignore")

def loadDataset(path):
    '''
    Read every regular file directly inside a directory.

    Args:
        path: directory that holds one document per file

    Return:
        a list of str with the content of each file, ordered by file name
    '''
    data = []
    # os.listdir gives no ordering guarantee. Sorting makes the result
    # reproducible and lets directories with matching file names line up
    # row by row when their contents are later paired by position.
    for name in sorted(os.listdir(path)):
        fullPath = os.path.join(path, name)
        # Sub-directories (e.g. notebook checkpoint folders) cannot be opened as documents.
        if not os.path.isfile(fullPath):
            continue
        with open(fullPath) as f:
            data.append(f.read())
    return data

def dataToFeatures(data, featureExtractor):
    '''
    Convert a collection of documents into features.

    Args:
        data: a list of documents, e.g. the list returned by loadDataset
        featureExtractor: an object exposing extract(documents)

    Return:
        the value returned by featureExtractor.extract(data)
        (presumably a n * m feature matrix, one row per document -- confirm
        against FeatureExtractor)
    '''
    # The extractor is handed the whole collection in a single call, which is
    # how the rest of this notebook invokes it. The previous implementation
    # called extract() without any document and never returned its result.
    return featureExtractor.extract(data)

def kFoldsValidation(X, y, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    K-fold cross-validation of a StandardScaler + SVC(gamma='auto', C=10) pipeline.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, ). Label 1 is counted as positive and
            label 0 as negative; other labels are not counted at all
        k: number of folds. Every fold tests n // k samples, so the last
            n % k samples (after shuffling) are only ever used for training
        shuffle: when True the samples are randomly permuted (np.random)
            before the folds are cut; when False the given order is kept
        on_pos: unused, kept so existing calls keep working
        on_neg: unused, kept so existing calls keep working

    Return:
        a tuple (accuracy, precision, recall), each one averaged over the k
        folds. A fold in which a metric is undefined (zero denominator)
        contributes 0 to that average.
    '''
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    numberOfSample = X.shape[0]
    batchSize = numberOfSample//k

    if shuffle:
        # Permute features and labels together; indexing copies, so the
        # caller's arrays are left untouched.
        order = np.random.permutation(numberOfSample)
        X = X[order]
        y = y[order]

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))

    allIndices = np.arange(numberOfSample)
    tp, tn, fp, fn = [], [], [], []
    for fold in range(k):
        testingIndices = np.arange(fold*batchSize, (fold+1)*batchSize)
        trainingIndices = np.delete(allIndices, testingIndices)

        # fit() retrains the pipeline from scratch, so reusing clf is safe.
        clf.fit(X[trainingIndices], y[trainingIndices])

        testY = y[testingIndices]
        predict = clf.predict(X[testingIndices])

        correct = predict == testY
        isPositive = testY == 1
        isNegative = testY == 0
        tp.append((correct & isPositive).sum())
        tn.append((correct & isNegative).sum())
        fp.append((~correct & isNegative).sum())
        fn.append((~correct & isPositive).sum())

    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)

    # 0/0 yields NaN; silence the warning locally and map those folds to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        acc = (tp + tn)/(tp+tn+fp+fn)
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
    for res in (acc, precision, recall):
        res[np.isnan(res)] = 0

    return acc.mean(), precision.mean(), recall.mean()

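# K-fold evaluation that tests on syntax-transferred fake news: the model is trained on the original features, while positive test samples are taken from X_test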
def kFoldsValidationReplaceTestSet(X, y, X_test, k=10, shuffle=True, on_pos=False, on_neg=False):
    '''
    K-fold cross-validation in which the positive test samples are replaced.

    The StandardScaler + SVC(gamma='auto', C=10) pipeline is always trained on
    rows of X. At test time a sample with a non-zero label is represented by
    its row of X_test, a sample with label 0 by its row of X.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, ). Label 1 is counted as positive and
            label 0 as negative
        X_test: a n * m matrix; row i is the replacement for row i of X, so it
            must be aligned with X sample by sample
        k: number of folds. Every fold tests n // k samples, so the last
            n % k samples (after shuffling) are only ever used for training
        shuffle: when True the samples are randomly permuted (np.random)
            before the folds are cut; when False the given order is kept
        on_pos: unused, kept so existing calls keep working
        on_neg: unused, kept so existing calls keep working

    Return:
        a tuple (accuracy, precision, recall), each one averaged over the k
        folds. A fold in which a metric is undefined (zero denominator)
        contributes 0 to that average.

    Raises:
        ValueError: if X and X_test do not have the same shape
    '''
    X = np.asarray(X)
    X_test = np.asarray(X_test)
    y = np.asarray(y).reshape(-1)
    if X.shape != X_test.shape:
        raise ValueError(
            "X and X_test must have the same shape, got {} and {}".format(X.shape, X_test.shape))

    numberOfSample = X.shape[0]
    batchSize = numberOfSample//k

    if shuffle:
        # One permutation for all three arrays keeps every sample paired with
        # its label and its replacement row.
        order = np.random.permutation(numberOfSample)
        X = X[order]
        y = y[order]
        X_test = X_test[order]

    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))

    allIndices = np.arange(numberOfSample)
    tp, tn, fp, fn = [], [], [], []
    for fold in range(k):
        testingIndices = np.arange(fold*batchSize, (fold+1)*batchSize)
        trainingIndices = np.delete(allIndices, testingIndices)

        # Training only ever sees the original features.
        clf.fit(X[trainingIndices], y[trainingIndices])

        testY = y[testingIndices]
        # Start from the original rows, then overwrite the rows whose label is
        # non-zero with their replacement from X_test.
        testX = X[testingIndices].astype(float)
        replaced = testY.astype(bool)
        testX[replaced] = X_test[testingIndices][replaced]

        predict = clf.predict(testX)

        correct = predict == testY
        isPositive = testY == 1
        isNegative = testY == 0
        tp.append((correct & isPositive).sum())
        tn.append((correct & isNegative).sum())
        fp.append((~correct & isNegative).sum())
        fn.append((~correct & isPositive).sum())

    tp = np.array(tp)
    tn = np.array(tn)
    fp = np.array(fp)
    fn = np.array(fn)

    # 0/0 yields NaN; silence the warning locally and map those folds to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        acc = (tp + tn)/(tp+tn+fp+fn)
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
    for res in (acc, precision, recall):
        res[np.isnan(res)] = 0

    return acc.mean(), precision.mean(), recall.mean()

def trainAndTest(X, y, X_test, y_test):
    '''
    Fit a StandardScaler + SVC(gamma='auto', C=10) pipeline and score it once.

    Args:
        X: a n * m matrix of training features
        y: training labels with shape (n, )
        X_test: a matrix of test features with m columns
        y_test: test labels, one per row of X_test. Label 1 is counted as
            positive and label 0 as negative

    Return:
        a tuple (accuracy, precision, recall). A metric whose denominator is
        zero is reported as 0, matching the k-fold helpers in this notebook.
    '''
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))
    clf.fit(X, y)
    predict = clf.predict(X_test)

    # A plain list would make `y_test == 1` a single bool instead of a mask.
    y_test = np.asarray(y_test)
    correct = predict == y_test
    isPositive = y_test == 1
    isNegative = y_test == 0

    tp = (correct & isPositive).sum()
    tn = (correct & isNegative).sum()
    fp = (~correct & isNegative).sum()
    fn = (~correct & isPositive).sum()

    def _ratio(numerator, denominator):
        # Avoid the NaN (and RuntimeWarning) produced by 0/0.
        return numerator/denominator if denominator else 0.0

    acc = _ratio(tp+tn, tp+tn+fp+fn)
    precision = _ratio(tp, tp+fp)
    recall = _ratio(tp, tp+fn)

    return acc, precision, recall


In [2]:
# Corpus locations: one directory per class.
legitBase = 'data/fakeNewsDatasets/preprocessedFakeNewsDatasets/legit/'
fakeBase = 'data/fakeNewsDatasets/preprocessedFakeNewsDatasets/fake/'

# Path of every corpus file: legit documents first, fake documents after.
pathesToFiles = []
for base in (legitBase, fakeBase):
    pathesToFiles.extend(os.path.join(base, name) for name in os.listdir(base))

# Keyword arguments for FeatureConfig, kept in the order they were written.
configOptions = dict(
    init = False,
    pathesToFiles = pathesToFiles,
    sourceBase = 'src.utils',
    bow = True,
    vocabSize = 500,
    gi = True,
    pathToGI='inquirerbasic.xls',
    pos = True,
    posBigram = False,
    posTrigram = False,
    collectPosFromCorpus = True,
    production = True,
    collectProductionFromCorpus = True,
    readability = True,
    quantity = True,
    sentiment = True,
    # NOTE(review): "punctutation" looks like a misspelling of "punctuation";
    # it must match the parameter name declared by FeatureConfig -- confirm there.
    punctutation = False,
)
config = FeatureConfig(**configOptions)

featureExtractor = FeatureExtractor(config)

In [3]:
# Load extractor state from disk (presumably state that was fitted and saved
# in an earlier session -- confirm against FeatureExtractor.load).
# NOTE(review): the .pkl suffix suggests a pickle; only load files you trust.
featureExtractor.load("save/state.pkl")

In [4]:
# Display the mapping from extractor class name to the feature-column indices
# it owns; later cells use it to evaluate one extractor at a time.
featureExtractor.extractorIndices()

{'GIExtractor': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174

In [4]:
# Raw documents of the original corpus, one list per class.
legitData, fakeData = (
    loadDataset(directory)
    for directory in (
        'data/fakeNewsDatasets/preprocessedFakeNewsDatasets/legit/',
        'data/fakeNewsDatasets/preprocessedFakeNewsDatasets/fake/',
    )
)

# Raw documents of the transfer corpus, one list per class.
legitTransferData, fakeTransferData = (
    loadDataset(directory)
    for directory in (
        'data/fakeNewsDatasets/transferFakeNewsDataset/legit/',
        'data/fakeNewsDatasets/transferFakeNewsDataset/fake/',
    )
)

In [9]:
# Turn every document collection into features (used below as 2-D arrays whose
# columns are selected per extractor). According to the captured log each call
# starts a CoreNLP server and takes minutes, so the calls are kept as separate
# statements: a failure in a later one does not discard the earlier results.
legitTransferFeatures = featureExtractor.extract(legitTransferData)
fakeTransferFeatures = featureExtractor.extract(fakeTransferData)
legitFeatures = featureExtractor.extract(legitData)
fakeFeatures = featureExtractor.extract(fakeData)

2021-03-19 12:09:57 INFO: Writing properties to tmp file: corenlp_server-17d2ec0310a94ece.props
2021-03-19 12:09:57 INFO: Starting server with command: java -Xmx16G -cp /home/allen/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-17d2ec0310a94ece.props -annotators tokenize,ssplit,parse -preload -outputFormat serialized
100%|██████████| 240/240 [01:16<00:00,  3.13it/s]
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
2021-03-19 12:11:18 INFO: Writing properties to tmp file: corenlp_server-0e519f39ba154c2c.props
20

  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
2021-03-19 12:22:51 INFO: Writing properties to tmp file: corenlp_server-1e7a62b512e8480c.props
2021-03-19 12:22:51 INFO: Starting server with command: java -Xmx16G -cp /home/allen/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-1e7a62b512e8480c.props -annotators tokenize,ssplit,parse -preload -outputFormat serialized
100%|██████████| 240/240 [07:12<00:00,  1.80s/it]
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as

In [174]:
from tqdm import tqdm
# Before attack: baseline on the original corpus with all features.
# Labels: 0 for every legit row, 1 for every fake row.
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.repeat([0, 1], [len(legitFeatures), len(fakeFeatures)])

# Each run reshuffles the folds, so average the metrics over many runs.
times = 100
result = np.zeros(3)
for run in tqdm(range(times)):
    result += kFoldsValidation(X, y)
acc, precision, recall = result/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

100%|██████████| 100/100 [04:17<00:00,  2.57s/it]

accuracy: 0.60, precision: 0.58, recall: 0.71.





In [176]:
# Before attack, each module: same baseline, restricted to the feature
# columns of one extractor at a time.
times = 100
for extractor in featureExtractor.extractors:
    moduleName = extractor.__class__.__name__
    indices = featureExtractor.extractorIndices()[moduleName]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.repeat([0, 1], [len(legitFeatures), len(fakeFeatures)])

    # Running sums of (accuracy, precision, recall) over the repeated runs.
    result = np.zeros(3)
    for run in range(times):
        result += kFoldsValidation(X, y, k=10)

    acc, precision, recall = result/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(moduleName, acc, precision, recall))

    GIExtractor      accuracy: 0.45, precision: 0.45, recall: 0.46.
    BOWExtractor     accuracy: 0.47, precision: 0.18, recall: 0.26.
    PosExtractor     accuracy: 0.64, precision: 0.64, recall: 0.63.
ProductionExtractor  accuracy: 0.56, precision: 0.55, recall: 0.66.
ReadabilityExtractor accuracy: 0.58, precision: 0.57, recall: 0.65.
 QuantityExtractor   accuracy: 0.59, precision: 0.58, recall: 0.66.
 SentimentExtractor  accuracy: 0.51, precision: 0.52, recall: 0.49.


In [177]:
# Attack: train on the original features, test fake samples on their
# transfer-corpus features.
# X_Transfer is paired with X row by row, so both corpora must list their
# documents in the same order.
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.repeat([0, 1], [len(legitFeatures), len(fakeFeatures)])
X_Transfer = np.concatenate([legitTransferFeatures, fakeTransferFeatures], axis = 0)

# Running sums of (accuracy, precision, recall) over the repeated runs.
times = 100
result = np.zeros(3)
for run in range(times):
    result += kFoldsValidationReplaceTestSet(X, y, X_Transfer)
acc, precision, recall = result/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

accuracy: 0.55, precision: 0.54, recall: 0.61.


In [178]:
# Attack, each module: the attack evaluation restricted to the feature
# columns of one extractor at a time.
times = 100
for extractor in featureExtractor.extractors:
    moduleName = extractor.__class__.__name__
    indices = featureExtractor.extractorIndices()[moduleName]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices]], axis = 0)
    y = np.repeat([0, 1], [len(legitFeatures), len(fakeFeatures)])
    X_Transfer = np.concatenate([legitTransferFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)

    # Running sums of (accuracy, precision, recall) over the repeated runs.
    result = np.zeros(3)
    for run in range(times):
        result += kFoldsValidationReplaceTestSet(X, y, X_Transfer)
    acc, precision, recall = result/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(moduleName, acc, precision, recall))

    GIExtractor      accuracy: 0.68, precision: 0.62, recall: 0.92.
    BOWExtractor     accuracy: 0.49, precision: 0.66, recall: 0.28.
    PosExtractor     accuracy: 0.36, precision: 0.17, recall: 0.07.
ProductionExtractor  accuracy: 0.40, precision: 0.40, recall: 0.36.
ReadabilityExtractor accuracy: 0.71, precision: 0.65, recall: 0.92.
 QuantityExtractor   accuracy: 0.71, precision: 0.65, recall: 0.91.
 SentimentExtractor  accuracy: 0.47, precision: 0.47, recall: 0.41.


In [179]:
# data augmentation: add the transfer-corpus fake samples as extra positives.
# NOTE(review): if each transfer sample is a rewrite of the original fake
# sample in the same row, random folds can put one version in training and
# the other in testing, which would inflate these scores -- confirm.
X = np.concatenate([legitFeatures, fakeFeatures, fakeTransferFeatures], axis = 0)
# One label per row actually stacked above; the positive count is derived from
# both fake sets instead of assuming they have the same size.
y = np.array([0] * len(legitFeatures) + [1] * (len(fakeFeatures) + len(fakeTransferFeatures)))

# Each run reshuffles the folds, so average the metrics over many runs.
result = [0, 0, 0]
times = 100
for i in range(times):
    acc, precision, recall = kFoldsValidation(X, y, k=10)
    result[0] += acc
    result[1] += precision
    result[2] += recall
acc = result[0]/times
precision = result[1]/times
recall = result[2]/times
print("accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(acc, precision, recall))

accuracy: 0.64, precision: 0.70, recall: 0.81.


In [181]:
# data augmentation, each module: the augmented evaluation restricted to the
# feature columns of one extractor at a time.
times = 100
for extractor in featureExtractor.extractors:
    indices = featureExtractor.extractorIndices()[extractor.__class__.__name__]
    X = np.concatenate([legitFeatures[:, indices], fakeFeatures[:, indices], fakeTransferFeatures[:, indices]], axis = 0)
    # One label per row actually stacked above; the positive count is derived
    # from both fake sets instead of assuming they have the same size.
    y = np.array([0] * len(legitFeatures) + [1] * (len(fakeFeatures) + len(fakeTransferFeatures)))
    result = [0, 0, 0]
    for i in range(times):
        acc, precision, recall = kFoldsValidation(X, y, k=10)
        result[0] += acc
        result[1] += precision
        result[2] += recall
    acc = result[0]/times
    precision = result[1]/times
    recall = result[2]/times
    print("{:^20} accuracy: {:.2f}, precision: {:.2f}, recall: {:.2f}.".format(extractor.__class__.__name__, acc, precision, recall))

    GIExtractor      accuracy: 0.64, precision: 0.72, recall: 0.74.
    BOWExtractor     accuracy: 0.65, precision: 0.66, recall: 0.98.
    PosExtractor     accuracy: 0.75, precision: 0.80, recall: 0.83.
ProductionExtractor  accuracy: 0.62, precision: 0.69, recall: 0.78.
ReadabilityExtractor accuracy: 0.74, precision: 0.78, recall: 0.85.
 QuantityExtractor   accuracy: 0.73, precision: 0.77, recall: 0.86.
 SentimentExtractor  accuracy: 0.64, precision: 0.67, recall: 0.93.
