In [5]:
# ---- LIBRARY IMPORTS ----
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# ---- DEVICE SET UP ----
def get_gpu():
    if torch.backends.mps.is_available():
        print("Using mps")
        return "mps"
    elif torch.backend.cuda.is_available():
        print("Using cuda")
        return"cuda"
    else:
        print("Using CPU")
        return "cpu"
device = get_gpu()

Using mps


# Cleaning Data

In [2]:
# ---- DATA IMPORT ----
# Specify data types for each column
dtypes = {0: "UInt64", 1: "string", 2: "string", "3": "UInt64"}

# Import data
data_raw = pd.read_csv('data/fake_news/news_data.csv', dtype=dtypes)

# Remove NA values
data_no_na = data_raw.dropna()

# Drop columns 
data_drop_columns = data_no_na.drop(columns=['Unnamed: 0', 'title'])

# Drop duplicate rows
clean_data = data_drop_columns.drop_duplicates()

In [3]:
"""
---- CLEANING TEXT DATA ----
The following steps will be taken to clean data
- all non-alphabetical data will be removed
- all emails and web urls will be removed
- all stop words will be removed based on the stop words in the ntlk dictionary
- all text will be made lowercase
"""

# ---- REMOVING STOP WORDS ----
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

clean_data['text'] = clean_data['text'].apply(remove_stop_words)

# --- LEMMATIZE THE TEXT ----
lemmatizer = WordNetLemmatizer()

def wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v' 
    elif treebank_tag.startswith('N'):
        return 'n' 
    elif treebank_tag.startswith('R'):
        return 'r'  
    else:
        return 'n'  

# function to lemmatize words
def pos_lemmatize(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, wordnet_pos(tag)) for token, tag in pos_tags]
    return ' '.join(lemmatized_tokens)


# Regex function to clean strings
def regex_cleaner(text):
    try:
    # remove any web urls
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 
    # remove email addresses
        text = re.sub(r'\b\w+@\w+\.\w+\b', '', text) 
    # Make all text lowercase
        text_lowercase = text.lower()
    # Remove all non-alphanumeric text
        text_alphanumeric = re.sub(r'[^a-z\s\-]', '', text_lowercase)
    # Combine words that overlap to a new line
        no_overlap = re.sub(r'(\-\n)', '', text_alphanumeric)
    # remove \n and "-"
        no_new_line = re.sub(r'[\n\-]', ' ', no_overlap)
    # Remove extra spacing
        clean_text = re.sub(r'\s+', r' ', no_new_line)
        return clean_text
    except:
        raise Exception("Something went wrong :(")


# Clean text data!
clean_data['text'] = clean_data['text'].apply(regex_cleaner)
clean_data['text'] = clean_data['text'].apply(lambda x: pos_lemmatize(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['text'] = clean_data['text'].apply(remove_stop_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['text'] = clean_data['text'].apply(regex_cleaner)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_data['text'] = clean_data['text'].apply(lambda x: pos_lemmatize(str(x)))


In [22]:
# split data into train, test sets
train, test = train_test_split(clean_data, test_size=0.2, random_state=0)

# Create an iterator object for train and test data

data_iter = clean_data.iterrows()
train_iter = train.iterrows()
test_iter = test.iterrows()

# Convert generators to list of tuples because DataLoader does not work well with pandas dataframes
# Use this as inputs for DataLoader
data_list_of_tuples = [(row.text, row.label) for index, row in data_iter]
train_list_of_tuples = [(row.text, row.label) for index, row in train_iter]
test_list_of_tuples = [(row.text, row.label) for index, row in test_iter]

# Taken from pytorch documentation tutorials -> https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
# tokenization for word sequences
# No tokenizer is required as data was tokenized in previous step.  We only require tokenizer to split articles by word to create the sequences
tokenizer = get_tokenizer(tokenizer=None)

def yield_tokens(data):
    # pull the text data from series to tokenize it
    # Each row is a series when calling the iterrows() method, you must call the text column to pull its value
    for index, row in data.iterrows():
        text = row.text
        yield tokenizer(text)

# vocab_dict is now a function that takes a list of words as an input and returns integers based on the indexes found in the vocab_dict's dictionary
# <unk> -> In case a word is not in vocab_dict, we default it to a special index for words not in vocab_dict
vocab_dict = build_vocab_from_iterator(iterator=yield_tokens(clean_data), specials=["<unk>"])
vocab_dict.set_default_index(vocab_dict["<unk>"])
# text_sequencer is a function that takes a string and returns a list of integers based off vocab_dict
text_sequencer = lambda string: vocab_dict(tokenizer(string))


def collate_batch(batch):
    """
    This function takes a batch created from the DataLoader function and does data preprocessing to it
    """
    labels, text_tensors_list = [], []
    for example in batch:
    # Get data from pandas series
        text = example[0]
        label = example[1]
    # convert text to sequences of integers
        text_sequence = text_sequencer(text)
    # convert text_sequence to tensor
        text_sequence_tensor = torch.tensor(text_sequence, dtype=torch.int64)
    # append tensors to lists
        labels.append(label)
        text_tensors_list.append(text_sequence_tensor)
    # add padding of 0 to text_tensors (All articles have a different number of words and we want all tensors to be the same size)
    text_tensors = pad_sequence(text_tensors_list, batch_first=True, padding_value = 0)
    
    # convert labels lists to tensor
    labels_tensor = torch.tensor(labels, dtype=torch.int64)
    return labels_tensor.to(device), text_tensors.to(device)

# Model Building

## RNN

In [28]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.hidden_layer_input = nn.Linear(in_features=input_size, out_features=hidden_size, bias=False)
        self.hidden_layer_previous = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=False)
        self.hidden_layer_out = nn.Linear(in_features=hidden_size, out_features=1)
        self.activation = nn.Sigmoid()

    def forward(self, input_, hidden):
        hidden = F.tanh(self.hidden_layer_input(input_) + self.hidden_layer_previous(hidden))
        output = self.hidden_layer_out(hidden)
        output = self.activation(output)
        return output, hidden

    def initHidden(self):
    # Return a matrix of 1 row and k columns where k=hidden_size
        return torch.zeros(1, self.hidden_size)
        

In [30]:
# Initialize Model
n_hidden = 24
input_size = len(vocab_dict)
rnn_model = RNN(input_size=input_size, hidden_size=n_hidden)

### Training

In [404]:
train_loader = torch.utils.data.DataLoader(train_list_of_tuples, batch_size=8, shuffle=True)

Help on DataLoader in module torch.utils.data.dataloader object:

class DataLoader(typing.Generic)
 |  DataLoader(dataset: torch.utils.data.dataset.Dataset[+T_co], batch_size: Optional[int] = 1, shuffle: Optional[bool] = None, sampler: Union[torch.utils.data.sampler.Sampler, Iterable, NoneType] = None, batch_sampler: Union[torch.utils.data.sampler.Sampler[List], Iterable[List], NoneType] = None, num_workers: int = 0, collate_fn: Optional[Callable[[List[~T]], Any]] = None, pin_memory: bool = False, drop_last: bool = False, timeout: float = 0, worker_init_fn: Optional[Callable[[int], NoneType]] = None, multiprocessing_context=None, generator=None, *, prefetch_factor: Optional[int] = None, persistent_workers: bool = False, pin_memory_device: str = '')
 |
 |  Data loader combines a dataset and a sampler, and provides an iterable over the given dataset.
 |
 |  The :class:`~torch.utils.data.DataLoader` supports both map-style and
 |  iterable-style datasets with single- or multi-process load

## LSTM

# References
[https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html)