In [255]:
from src.utils.FeatureConfig import FeatureConfig
from src.utils.FeatureExtractor import FeatureExtractor
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import os

def loadDataset(path):
    '''
    Read every regular file directly under ``path`` and return the contents.

    Args:
        path: directory that holds one document per file.

    Return:
        a list of str, one entry per file, ordered by file name.
    '''
    data = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # document order (and every result derived from it) reproducible.
    for name in sorted(os.listdir(path)):
        fullPath = os.path.join(path, name)
        # Sub-directories (e.g. .ipynb_checkpoints) cannot be open()ed as
        # documents, so only regular files are read.
        if not os.path.isfile(fullPath):
            continue
        # NOTE(review): files are decoded with the platform default encoding,
        # as before; confirm the dataset encoding before pinning one.
        with open(fullPath) as f:
            data.append(f.read())
    return data

def dataToFeatures(data, featureExtractor):
    '''
    Turn raw documents into features using the given extractor.

    Args:
        data: a list of documents (e.g. the list returned by loadDataset).
        featureExtractor: an object exposing ``extract(documents)``.

    Return:
        whatever ``featureExtractor.extract`` returns for ``data``.
    '''
    # The previous body called extract() with no argument once per document
    # and never returned the result. Elsewhere in this notebook extract() is
    # called with the whole list of documents, so the call is forwarded in
    # that form.
    return featureExtractor.extract(data)

def kFoldsValidation(X, y, k=10, shuffle=True, clf=None, seed=None):
    '''
    Run k-fold cross validation and report the accuracy of every fold.

    Args:
        X: a n * m matrix. n is the number of samples and m is the dimension of features
        y: a ndarray. with shape (n, )
        k: number of folds, 2 <= k <= n
        shuffle: when True the samples are permuted before being split into
            folds; when False the given order is kept
        clf: optional classifier exposing fit(X, y) and predict(X). Defaults to
            a StandardScaler + SVC(gamma='auto', C=10) pipeline
        seed: optional int making the shuffle reproducible. When None the
            global numpy random state is used

    Return:
        a list of accuracy, one float per fold

    Raises:
        ValueError: if X and y disagree on the number of samples, or k is
            outside [2, n]
    '''
    X = np.asarray(X)
    y = np.asarray(y).reshape(-1)
    numberOfSample = X.shape[0]
    if y.shape[0] != numberOfSample:
        raise ValueError("X has {} samples but y has {}.".format(numberOfSample, y.shape[0]))
    # k < 2 leaves nothing to train on; k > n would create empty test folds
    # whose accuracy is a division by zero.
    if not 2 <= k <= numberOfSample:
        raise ValueError("k must be between 2 and the number of samples ({}), got {}.".format(numberOfSample, k))

    # Honour the shuffle flag. Features and labels are indexed with the same
    # permutation, so labels keep their own dtype instead of being glued onto X.
    if shuffle:
        rng = np.random if seed is None else np.random.RandomState(seed)
        order = rng.permutation(numberOfSample)
    else:
        order = np.arange(numberOfSample)

    if clf is None:
        clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10))

    # array_split spreads the n % k leftover samples over the first folds, so
    # every sample is used for testing exactly once.
    folds = np.array_split(order, k)
    out = []
    for i, testingIndices in enumerate(folds):
        trainingIndices = np.concatenate(folds[:i] + folds[i + 1:])

        # fit() is called afresh on each fold's training split.
        clf.fit(X[trainingIndices], y[trainingIndices])

        predicted = clf.predict(X[testingIndices])
        out.append(float(np.mean(predicted == y[testingIndices])))
    return out

In [269]:
# Build the feature configuration and the extractor used by the rest of the
# notebook. Both collect*FromCorpus switches are turned off here.
config = FeatureConfig(pathToDataset='data/fakeNewsDatasets/fakeNewsDataset/',
                       pathToGI='inquirerbasic.xls',
                       collectPosFromCorpus=False,
                       collectProductionFromCorpus=False,
                       )
featureExtractor = FeatureExtractor(config)
# NOTE(review): the .pkl suffix suggests pickled state; if load() unpickles,
# only point it at files you trust.
featureExtractor.load("save/state.pkl")
# Remove three extractors before features are computed; the recorded output
# prints one "Dropped ..." line per call.
featureExtractor.drop("BOWExtractor")
featureExtractor.drop("GIExtractor")
featureExtractor.drop("ProductionExtractor")

  'stop_words.' % sorted(inconsistent))


Dropped BOWExtractor.
Dropped GIExtractor.
Dropped ProductionExtractor.


In [268]:
# List the extractors currently registered on featureExtractor.
# NOTE(review): this cell's execution count (268) is lower than the drop cell's
# (269), so the recorded output still lists GIExtractor and ProductionExtractor.
featureExtractor.extractorName()

['GIExtractor',
 'PosExtractor',
 'ProductionExtractor',
 'ReadabilityExtractor',
 'QuantityExtractor',
 'SentimentExtractor']

In [6]:
# Read the raw documents of both classes; each variable is a list with one
# string per file (see loadDataset).
# NOTE(review): execution count 6 is far below the neighbouring cells, so these
# lists were loaded much earlier in the kernel session.
legitData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/legit/')
fakeData = loadDataset('data/fakeNewsDatasets/fakeNewsDataset/fake/')

In [270]:
# Extract features for every legit and fake document with the extractors that
# remain on featureExtractor. The results are concatenated row-wise into X in
# a later cell.
legitFeatures = featureExtractor.extract(legitData)
fakeFeatures = featureExtractor.extract(fakeData)

  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))


In [271]:
# Stack both classes into one feature matrix (legit rows first, fake rows
# after) and build the matching label vector: 0 = legit, 1 = fake.
X = np.concatenate([legitFeatures, fakeFeatures], axis = 0)
y = np.array([0] * len(legitFeatures) + [1] * len(fakeFeatures))

In [291]:
# Cross-validate on the combined features with the default k=10; acc is the
# list of per-fold accuracies.
acc = kFoldsValidation(X, y)

In [294]:
# Mean accuracy over the folds. Dividing by len(acc) instead of a literal 10
# keeps the average correct if kFoldsValidation is called with another k.
sum(acc)/len(acc)

0.712962962962963

In [293]:
# List the extractors that remain after the drop() calls; the recorded output
# shows the four that contribute to the features evaluated above.
featureExtractor.extractorName()

['PosExtractor',
 'ReadabilityExtractor',
 'QuantityExtractor',
 'SentimentExtractor']

In [295]:
# Evaluate every remaining extractor on its own to see how much each feature
# group contributes. Loop-local names are used so the notebook-level
# X / y / acc / legitFeatures / fakeFeatures of the combined model are not
# overwritten by the last extractor's values.
for extractor in featureExtractor.extractors:
    extractorLegit = extractor.extract(legitData)
    extractorFake = extractor.extract(fakeData)
    extractorX = np.concatenate([extractorLegit, extractorFake], axis = 0)
    # Same labelling as the combined model: 0 = legit, 1 = fake.
    extractorY = np.array([0] * len(extractorLegit) + [1] * len(extractorFake))
    extractorAcc = kFoldsValidation(extractorX, extractorY)
    # Report the class name rather than the default object repr (which embeds
    # a memory address), and average over the actual number of folds.
    print("{} Accuracy: {}.".format(type(extractor).__name__,
                                    sum(extractorAcc)/len(extractorAcc)))

<src.utils.PosExtractor.PosExtractor object at 0x7f3cfcd86790> Accuracy: 0.64375.


  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))
  " assigning as vowel: '{}'".format(c))


<src.utils.ReadabilityExtractor.ReadabilityExtractor object at 0x7f3cfccd9610> Accuracy: 0.5229166666666666.
<src.utils.QuantityExtractor.QuantityExtractor object at 0x7f3cfccd9650> Accuracy: 0.5374999999999999.
<src.utils.SentimentExtractor.SentimentExtractor object at 0x7f3cfccd9690> Accuracy: 0.5020833333333333.


In [267]:
# Scratch experiment: fit a TF-IDF vectorizer with English stop words on a toy
# corpus and print the learned vocabulary.
# NOTE(review): this import lives outside the notebook's import cell.
# NOTE(review): X is rebound to the TF-IDF matrix here, the same name used for
# the classifier feature matrix in other cells.
# NOTE(review): the recorded output lists 'documents', which does not occur in
# this corpus, so it appears to come from an earlier version of the cell.
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); confirm against the installed version.
print(vectorizer.get_feature_names())

['document', 'documents', 'second']


In [265]:
# Transform an unseen sentence with the fitted vectorizer and show it densely.
# NOTE(review): the recorded output has 9 columns while the vocabulary printed
# by the fitting cell has 3 entries; the outputs come from different runs
# (execution counts 265 vs 267).
vectorizer.transform(['He is is one of a dog.']).toarray()

array([[0.        , 0.        , 0.        , 0.722056  , 0.69183461,
        0.        , 0.        , 0.        , 0.        ]])

In [264]:
# Dense view of X. This only works while X is the matrix returned by
# vectorizer.fit_transform; other cells bind X to a numpy array via
# np.concatenate, which has no toarray().
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [314]:
# Scratch experiment: start a CoreNLP server through stanza's client and
# annotate one sentence with the 'parse' annotator.
# NOTE(review): this import lives outside the notebook's import cell.
from stanza.server import CoreNLPClient

# The recorded server command shows these settings forwarded as
# "-timeout 30000" and "-Xmx16G"; the context manager scopes the client to
# this block.
with CoreNLPClient(
    annotators=['parse'],
    timeout=30000,
    memory='16G') as client:
    text = 'This Trump is the second document and documents.'
    # The annotation result is stored but not inspected in the visible cells.
    annotate = client.annotate(text)

2021-03-17 10:52:40 INFO: Writing properties to tmp file: corenlp_server-cc936bcc5b364f5f.props
2021-03-17 10:52:40 INFO: Starting server with command: java -Xmx16G -cp /home/allen/stanza_corenlp/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-cc936bcc5b364f5f.props -annotators parse -preload -outputFormat serialized


In [318]:
# Scratch experiment: named-entity recognition with a stanza pipeline.
# NOTE(review): this import lives outside the notebook's import cell.
import stanza
# Fetches the default English package; the recorded log shows a 411M archive
# being downloaded on this run.
stanza.download('en')
# Only tokenization and NER are loaded (see the processor table in the log).
nlp = stanza.Pipeline(lang='en', processors='tokenize, ner')
doc = nlp("Chris Manning teaches at Stanford University. He lives in the Bay Area.")
# One line per token with its NER tag; the recorded output shows the
# B-/I-/E-/O tags this produces.
print(*[f'token: {token.text}\tner: {token.ner}' for sent in doc.sentences for token in sent.tokens], sep='\n')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 18.6MB/s]                    
2021-03-17 10:57:32 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.2.0/en/default.zip: 100%|██████████| 411M/411M [01:16<00:00, 5.38MB/s] 
2021-03-17 10:58:53 INFO: Finished downloading models and saved to /home/allen/stanza_resources.
2021-03-17 10:58:53 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2021-03-17 10:58:53 INFO: Use device: gpu
2021-03-17 10:58:53 INFO: Loading: tokenize
2021-03-17 10:58:55 INFO: Loading: ner
2021-03-17 10:58:55 INFO: Done loading processors!


token: Chris	ner: B-PERSON
token: Manning	ner: E-PERSON
token: teaches	ner: O
token: at	ner: O
token: Stanford	ner: B-ORG
token: University	ner: E-ORG
token: .	ner: O
token: He	ner: O
token: lives	ner: O
token: in	ner: O
token: the	ner: B-LOC
token: Bay	ner: I-LOC
token: Area	ner: E-LOC
token: .	ner: O


In [330]:
# Inspect the third token of the first sentence; the recorded output shows its
# id, text, character offsets and NER tag.
doc.sentences[0].tokens[2]

[
  {
    "id": 3,
    "text": "teaches",
    "misc": "start_char=14|end_char=21",
    "ner": "O"
  }
]

In [328]:
# NER tag of the second token of the first sentence ('E-PERSON' in the
# recorded output).
doc.sentences[0].tokens[1].ner

'E-PERSON'

In [332]:
# Scratch check that compares two identical string literals (always True).
# NOTE(review): leftover experiment; candidate for removal from the notebook.
"O"=="O"

True