In [1]:
from nltk.corpus import stopwords
from os import listdir
from collections import Counter
from numpy import array
import numpy as np
import string
import re

In [2]:
# loading files
def load_file(filename):
    file = open(filename, encoding="utf8")
    content = file.read()
    file.close()
    return content

In [3]:
# clean file
def clean_file(file):
    # splitting into tokens
    tokens = file.split()
    # filter character
    punc = re.compile('[%s]'% re.escape(string.punctuation))
    # remove punctuation
    tokens = [punc.sub(' ', w) for w in tokens]
    # remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # filter stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter other
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [4]:
# add file to vocab
def add_file_to_vocab(filename, vocab):
    file = load_file(filename)
    tokens = clean_file(file)
    vocab.update(tokens)

In [5]:
# load all file from a folder
def process_files(folder, vocab):
    for filename in listdir(folder):
        path = folder + '/' + filename
        add_file_to_vocab(path, vocab)

In [6]:
# save list
def save_list(lines, filename):
    # convert line to text
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding="utf-8")
    file.write(data)
    file.close()

In [7]:
# load tokens
vocab = load_file('vocab2.txt')
vocab = set(vocab.split())

In [8]:
# this cleaning involves filtering out
# all the tokens not in the vocabulary
def clean_file_vocab(file, vocab):
    tokens = file.split()
    punc = re.compile('[%s]' %re.escape(string.punctuation))
    tokens = [punc.sub('', w) for w in tokens]
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

In [9]:
def process_files_vocab(folder, vocab):
    documents = list()
    for filename in listdir(folder):
        path = folder + '/' + filename
        file = load_file(path)
        tokens = clean_file_vocab(file, vocab)
        documents.append(tokens)
    return documents

In [10]:
# load and clean dataset for training
def clean_dataset(vocab, left_data, leanleft_data, centre_data, leanright_data, right_data):
    # load files
    left = process_files_vocab(left_data, vocab)
    leanleft = process_files_vocab(leanleft_data, vocab)
    centre = process_files_vocab(centre_data, vocab)
    leanright = process_files_vocab(leanright_data, vocab)
    right = process_files_vocab(right_data, vocab)
    
    files = left + leanleft + centre + leanright + right
    # labelling dataset
    # taken help from here: https://nlp.stanford.edu/IR-book/html/htmledition/classification-with-more-than-two-classes-1.html
    
    labels = array([[1, 0, 0, 0, 0] for _ in range(len(left))] + [[0, 1, 0, 0, 0] for _ in range(len(leanleft))] 
                   + [[0, 0, 1, 0, 0] for _ in range(len(centre))] + [[0, 0, 0, 1, 0] for _ in range(len(leanright))]
                   + [[0, 0, 0, 0, 1] for _ in range(len(right))])
    return files, labels

In [11]:
# necessary modules to tokenize the text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [12]:
# fitting tokenizer on texts
def new_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [13]:
# encode documents as integer and pad documents
def encode_and_pad(tokenizer, max_length, files):
    # encoding
    encoded = tokenizer.texts_to_sequences(files)
    # padding
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded

In [14]:
# got this while training
vocab_size = 91679
max_length = 8346

In [15]:
test_files, ytest = clean_dataset(vocab, 'data/test/left', 'data/test/leanleft', 
                                    'data/test/centre', 'data/test/leanright', 'data/test/right')
tokenizer_test = new_tokenizer(test_files)
Xtest = encode_and_pad(tokenizer_test, max_length, test_files)
from keras.models import load_model


In [16]:
modelv1 = load_model('sequential_10epochv1.h5')
acc, _ = modelv1.evaluate(Xtest, ytest, verbose=0)
print("Accuracy on Test Data when using Val Data: ", acc*10)

W0809 17:15:32.016240 33784 deprecation_wrapper.py:119] From C:\Users\Vedansh\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:4070: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0809 17:15:35.036723 33784 deprecation_wrapper.py:119] From C:\Users\Vedansh\Anaconda3\envs\tensorflow\lib\site-packages\keras\backend\tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Accuracy on Test Data when using Val Data:  61.31669714536275


In [17]:
modelv2 = load_model('sequential_10epoch.h5')
acc, _ = modelv2.evaluate(Xtest, ytest, verbose=0)
print("Accuracy on Test Data: ", acc*10)

Accuracy on Test Data:  74.35901250447586


In [18]:
def predict_bias(text, vocab, tokenizer, max_length, model):
    line = clean_file_vocab(text, vocab)
    padded = encode_and_pad(tokenizer, max_length, [line])
    yhat = model.predict(padded, verbose=0)
    ideologies = ["Left", "Lean Left", "Center", "Lean Right", "Right"]
    index = np.argmax(yhat)
    print(ideologies[index])

In [19]:
# Trump's recent Tweet
text = 'Sleepy Joe Biden just agreed with the Radical Left Democrats to raise Taxes by Three Trillion Dollars. Everyone will pay - Will kill your Stocks, 401k’s, and the ECONOMY. BIG CRASH!'
review = predict_bias(text, vocab, tokenizer_test, max_length, modelv2)

Lean Right


In [20]:
# Snippet of NYT article
text2 = 'President Trump took executive action on Saturday to circumvent Congress and try to extend an array of federal pandemic relief, resorting to a legally dubious set of edicts whose impact was unclear, as negotiations over an economic recovery package appeared on the brink of collapse.It was not clear what authority Mr. Trump had to act on his own on the measures or what immediate effect, if any, they would have, given that Congress controls federal spending. But his decision to sign the measures — billed as a federal eviction ban, a payroll tax suspension, and relief for student borrowers and $400 a week for the unemployed — reflected the failure of two weeks of talks between White House officials and top congressional Democrats to strike a deal on a broad relief plan as crucial benefits have expired with no resolution in sight.Mr. Trump’s move also illustrated the heightened concern of a president staring down re-election in the middle of a historic recession and a pandemic, and determined to show voters that he was doing something to address the crises. But despite Mr. Trump’s assertions on Saturday that his actions “will take care of this entire situation,” the orders also leave a number of critical bipartisan funding proposals unaddressed, including providing assistance to small businesses, billions of dollars to schools ahead of the new school year, aid to states and cities and a second round of $1,200 stimulus checks to Americans.“Nancy Pelosi and Chuck Schumer have chosen to hold this vital assistance hostage,” Mr. Trump said, savaging the two top Democrats and their $3.4 trillion opening offer during a news conference at his private golf club in New Jersey, his second in two days. A few dozen club guests were in attendance, and the president appeared to revel in their laughter at his jokes denouncing his political rivals.'
review = predict_bias(text2, vocab, tokenizer_test, max_length, modelv2)

Center


In [21]:
# Some Fun
text3 = "Vedansh's idology?"
review = predict_bias(text3, vocab, tokenizer_test, max_length, modelv2)

Lean Right
