In [2]:
import numpy as np
import pandas as pd

df = pd.read_csv('/home/rachel/FakeNewsProject/FakeNews/news_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [3]:
df = df.dropna()
try:
    df = df.drop(columns=['Unnamed: 0'])
except Exception as e:
    print(f"An error occurred while dropping the column 'Unnamed: 0': {e}")
df['label'] = pd.to_numeric(df['label'], errors='coerce')
df.head()
df = df.drop_duplicates()
df = df.dropna(subset=['text', 'title'])

In [4]:
import torch
torch.cuda.is_available()

False

In [5]:
df = df[df['text'].str.strip() != '']
df = df[df['title'].str.strip() != '']

In [6]:
#some regex cleaning

import re

def cleanup(text):
    if pd.isna(text):
        return ""
    text = text.strip()  
    #remove any web urls
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 
    #remove email addresses
    text = re.sub(r'\b\w+@\w+\.\w+\b', '', text)  
    #get rid of any duplicate spacing
    text = re.sub(r'\s+', ' ', text)  
    #get rid of punctuaation
    text = re.sub(r'[^\w\s]', '', text) 
    #lowercase all text
    text = text.lower() 
    return text

df['text'] = df['text'].apply(cleanup)
df['title'] = df['title'].apply(cleanup)

In [7]:
df['combined'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [8]:

text = ' '.join(df['combined'])
all_words = text.split()

In [9]:
#Now that we have a list of all words we have to create a dictionary based on the most common words
#This is a parameter that we can adjust to see how it affects our training
from collections import Counter
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(15000)
words = [word for word, _ in most_common_words]

print(words)




In [10]:
#now create the integer equivalent vocabulary
integer_dictionary = {word: ii for ii, word in enumerate(words, 1)}
print(integer_dictionary)
print(len(integer_dictionary))
integer_dictionary["<UNK>"] = len(integer_dictionary) + 1
integer_dictionary["<0>"] = 0


15000


In [11]:
integer_dictionary["<UNK>"]

15001

In [12]:
def get_key_by_value(d, value):
    for key, val in d.items():
        if val == value:
            return key
    return None

get_key_by_value(integer_dictionary, 332)

'decision'

In [13]:

def get_words(text):
    return text.split() 

article_to_word_ints = []
articles = df['combined']
article_to_label = df['label'].to_numpy()
assert(len(articles) == len(article_to_label))

for article in articles:
    words = get_words(article)
    word_ints = []
    for word in words:
        word_ints.append(integer_dictionary.get(word, integer_dictionary['<UNK>']))
    article_to_word_ints.append(word_ints)


In [14]:
def add_padding(article_to_word_ints, length_of_sequence):
    article_to_sequence = np.zeros((len(article_to_word_ints),length_of_sequence),dtype=int)
    #make all the sequences padded by adding zeros to the beginning if smaller than sequence length or clip articles that are too long.
    num_padded = 0
    num_clipped = 0
    for i in range(len(article_to_word_ints)):
        word_ints = article_to_word_ints[i]
        n_words = len(word_ints)
        
        if n_words >= length_of_sequence:
            num_clipped += 1
            for j in range(length_of_sequence):
                article_to_sequence[i,j] = word_ints[j]
        else:
            num_padded += 1
            for j in range(n_words):
                article_to_sequence[i,length_of_sequence-n_words+j] = word_ints[j]
    print(f"num padded: {num_padded}")
    print(f"num clipped: {num_clipped}")
    return article_to_sequence

In [15]:
sequence_length = 500
article_to_sequence = add_padding(article_to_word_ints, length_of_sequence=sequence_length)

print(article_to_sequence[200:, -10:])

num padded: 38091
num clipped: 24501
[[   25 14499   385 ...     1  6641  6431]
 [   11   133    19 ...  1765    54  2018]
 [    4 11211  5926 ...     1  1358     2]
 ...
 [  513    88    18 ...     6  1614  4987]
 [ 1741 15001    78 ...  7247   401   675]
 [14298    94    41 ...     8   321   140]]


In [16]:
import sklearn
from sklearn.model_selection import train_test_split
X = article_to_sequence
y = article_to_label

#training data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

#rest is validation and testing
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(X_train.shape)

(43814, 500)


In [17]:
import torch
from torch.utils.data import TensorDataset, DataLoader

#make tensor dataset
training_tensor = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
validation_tensor = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
test_tensor = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))

#try different batch sizes
batch_size = 16
#more workers = more gpu usage
num_workers = 10

#make loaders for all data
training_loader = DataLoader(training_tensor, shuffle=True, batch_size=batch_size, num_workers=num_workers)
validation_loader = DataLoader(validation_tensor, shuffle=False, batch_size=batch_size, num_workers=num_workers)
test_loader = DataLoader(test_tensor, shuffle=False, batch_size=batch_size, num_workers=num_workers)

In [18]:
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"device is {device}")

device is cpu


In [19]:
import torch
torch.cuda.is_available()

False

In [20]:
print(len(integer_dictionary))

15002


In [21]:
import torch.nn as nn


class LSTM_Model(nn.Module):
    def __init__(self, dictionary_size, embedding_size, hidden_size, number_layers, lstm_drop=0.5, fully_connected_drop=0.5):
        super(LSTM_Model, self).__init__()
        self.number_layers = number_layers
        self.hidden_size = hidden_size
        self.embedding_layer = nn.Embedding(dictionary_size,embedding_size)
        self.lstm_layer = nn.LSTM(embedding_size, hidden_size, number_layers, dropout=lstm_drop, batch_first=True)
        self.fully_connected_dropout = nn.Dropout(fully_connected_drop)
        #the output dimension should be 1
        self.fully_connected_layer = nn.Linear(hidden_size, 1)
        self.sigmoid_function = nn.Sigmoid()

    def forward(self, x, hidden_state):
        batch_size = x.size(0)
        # print(f"batch size {batch_size}")
        x = self.embedding_layer(x)
        x, hidden_state = self.lstm_layer(x, hidden_state)
        # print(f"lstm out shape: {x.shape}")
        x = x.contiguous().view(-1, self.hidden_size)
        # print(f"lstm out shape: {x.shape}")
        x = self.fully_connected_dropout(x)
        x = self.fully_connected_layer(x)
        # print(f"fully connected out shape: {x.shape}")
        x = self.sigmoid_function(x)
        # print(f"sigmoid out 1: {x.shape}")
        x = x.view(batch_size, -1)
        # print(f"sigmoid: {x.shape}")
        x = x[:, -1] 
        # print(f"sigmoid out3: {x.shape}")
        return x, hidden_state

    def initialize_hidden_state(self, batch_size):
        weight = next(self.parameters()).data
        if device == "cuda":
            hidden_state = (weight.new(self.number_layers, batch_size, self.hidden_size).zero_().cuda(),
                      weight.new(self.number_layers, batch_size, self.hidden_size).zero_().cuda())
        else:
            hidden_state = (weight.new(self.number_layers, batch_size, self.hidden_size).zero_(),
                      weight.new(self.number_layers, batch_size, self.hidden_size).zero_())

        return hidden_state
    


In [22]:
dictionary_size = len(integer_dictionary)
number_layers = 1
embedding_size = 100
hidden_size = 100



In [23]:

model = LSTM_Model(dictionary_size=dictionary_size, embedding_size=embedding_size, hidden_size=hidden_size, number_layers=number_layers)
if device == "cuda":
    model.cuda()
    
print(model)

LSTM_Model(
  (embedding_layer): Embedding(15002, 100)
  (lstm_layer): LSTM(100, 100, batch_first=True, dropout=0.5)
  (fully_connected_dropout): Dropout(p=0.5, inplace=False)
  (fully_connected_layer): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid_function): Sigmoid()
)




In [24]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import SGD
import torch.nn as nn
import torch.optim as optim

loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=1, verbose=True)



In [25]:
number_epochs = 5
gradient_clip = 10

for epoch in range(number_epochs):
    train_loss_total = 0.0
    val_loss_total = 0.0
    train_correct = 0
    val_correct = 0
    train_samples = 0
    val_samples = 0

    model.train()
    for inputs, labels in training_loader:
        batch_size = inputs.size(0)
        if device == "cuda":
            inputs, labels = inputs.cuda(), labels.cuda()

        hidden = model.initialize_hidden_state(batch_size)
        hidden = tuple([h.data for h in hidden])

        model.zero_grad()
        outputs, hidden = model(inputs, hidden)
        loss = loss_function(outputs.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
        optimizer.step()

        train_loss_total += loss.item() * batch_size
        train_samples += batch_size
        prediction_train = torch.round(outputs.squeeze())
        correct_train = prediction_train.eq(labels.float().view_as(prediction_train))
        correct_train_np = correct_train.cpu().numpy() if device == "cuda" else correct_train.numpy()
        train_correct += np.sum(correct_train_np)

    avg_train_loss = train_loss_total / train_samples
    train_accuracy = train_correct / train_samples

    model.eval()
    with torch.no_grad():
        for inputs, labels in validation_loader:
            batch_size = inputs.size(0)
            if device == "cuda":
                inputs, labels = inputs.cuda(), labels.cuda()

            val_hidden = model.initialize_hidden_state(batch_size)
            val_hidden = tuple([h.data for h in val_hidden])

            outputs, val_hidden = model(inputs, val_hidden)
            loss = loss_function(outputs.squeeze(), labels.float())

            val_loss_total += loss.item() * batch_size
            val_samples += batch_size
            pred_val = torch.round(outputs.squeeze())
            correct_val = pred_val.eq(labels.float().view_as(pred_val))
            correct_val_np = correct_val.cpu().numpy() if device == "cuda" else correct_val.numpy()
            val_correct += np.sum(correct_val_np)

    avg_val_loss = val_loss_total / val_samples
    val_accuracy = val_correct / val_samples
    scheduler.step(avg_val_loss)

    print(f"finished training epoch: {epoch + 1}/{number_epochs}",
          f"training loss: {avg_train_loss:.5f} ------------------ training accuracy: {train_accuracy:.5f}",
          f"validation loss: {avg_val_loss:.5f} ------------------ validation accuracy: {val_accuracy:.5f}",)


KeyboardInterrupt: 

: 

In [None]:
testing_losses = [] 
number_correct_outputs = 0
hidden_state = model.initialize_hidden_state(batch_size)

model.eval()

for inputs, labels in test_loader:
    if device == "cuda":
        inputs, labels = inputs.cuda(), labels.cuda()
    hidden_state = model.initialize_hidden_state(inputs.size(0))
    output, hidden_state = model(inputs, hidden_state)

    loss_of_test_data = loss_function(output.squeeze(), labels.float())
    testing_losses.append(loss_of_test_data.item())
    prediction = torch.round(output.squeeze())
    correct_tensor = prediction.eq(labels.float().view_as(prediction))
    correct = np.squeeze(correct_tensor.numpy()) if not device == "cuda" else np.squeeze(correct_tensor.cpu().numpy())
    number_correct_outputs += np.sum(correct)

test_accuracy = number_correct_outputs/len(test_loader.dataset)

print("test data loss: {:.5f}".format(np.mean(testing_losses)))
print("test data accuracy: {:.5f}%".format(test_accuracy*100))