In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from random import sample
import numpy as np
from random import shuffle
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

DEVICE = 'cpu'

In [5]:
filepath = "data/news.csv"
df = pd.read_csv(filepath)
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [6]:
# df.drop('Unnamed: 0', axis=1, inplace=True)
df['titletext'] = df['title'] + " " + df['text']
# Cap the sentences length
df['titletext'] = df['titletext'].str.slice(start=0, stop=1000)  # 최대 1000개 까지만
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,titletext
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...


In [7]:
UNK_TOKEN = 9  # 미등록 어휘

class Vocab:
    def __init__(self):
        self.word2id = {"__unk__": UNK_TOKEN}
        self.id2word = {UNK_TOKEN: "__unk__"}
        self.n_words = 1

        self.tag2id = {"FAKE": 0, "REAL": 1}
        self.id2tag = {0: "FAKE", 1: "REAL"}

    def index_words(self, words):
        word_indexes = [self.index_word(w) for w in words]
        return word_indexes

    def index_tags(self, tag):
        tag_index = self.tag2id[tag]
        return tag_index

    def index_word(self, w):
        if w not in self.word2id:
            self.word2id[w] = self.n_words
            self.id2word[self.n_words] = w
            self.n_words += 1
        return self.word2id[w]

In [8]:
vocab = Vocab()
def prepare_data(data, vocab, input_field):
    data_sequences = []

    for _, row in data.iterrows():
        words = row[input_field].split()
        tags = row["label"]
        word_ids = torch.tensor(vocab.index_words(words), dtype=torch.long).to(DEVICE)
        tag_ids = torch.tensor(vocab.index_tags(tags), dtype=torch.long).to(DEVICE)
        data_sequences.append([word_ids, tag_ids])

    return data_sequences, vocab

In [9]:
#Create data sequnce

sequences, vocab = prepare_data(df, vocab, "titletext")
x = [i[0] for i in sequences]
y = [i[1] for i in sequences]

# pad sentences to use batches
padded_x = torch.nn.utils.rnn.pad_sequence(x, batch_first=True)
x = [i for i in padded_x]

In [10]:
# Number of unique words

print(vocab.n_words)

84569


In [11]:
# Split data to train, validation and test

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

test_sequences = list(zip(x_test,y_test))
test_sequences = [list(x) for x in test_sequences]

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=0)

train_sequences = list(zip(x_train,y_train))
train_sequences = [list(x) for x in train_sequences]
val_sequences = list(zip(x_val,y_val))
val_sequences = [list(x) for x in val_sequences]

In [190]:
class LSTMNet(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, bidirectional, dropout):
        super(LSTMNet, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        directions = 2 if self.bidirectional else 1
        self.embedding = nn.Embedding(input_size, embedding_size)

        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=self.bidirectional, batch_first=True)
        self.fc1 = nn.Linear(hidden_size*directions, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_sentence):
        num_dimensions = len(input_sentence)
        sentence = input_sentence.clone().detach().to(DEVICE)
        embedded = self.embedding(sentence)
        packed_output, (hidden, cell) = self.lstm(embedded.view(num_dimensions, sentence.size()[1], self.embedding_size))
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        output = self.dropout(self.fc1(hidden))
        output = self.out(output)

        return output

In [227]:
class RNNNet(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, bidirectional, dropout):
        super(RNNNet, self).__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.directions = 2 if self.bidirectional else 1
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, n_layers, dropout=dropout, batch_first=True, bidirectional=self.bidirectional)
        # self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, bidirectional=self.bidirectional, batch_first=True)
        self.fc1 = nn.Linear(hidden_size*self.directions, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_sentence):
        sentence = input_sentence.clone().detach().to(DEVICE)
        embedded = self.embedding(sentence)  # shape=[batch, num_words, embedding_size]

        # print(embedded.shape)
        output, hidden = self.rnn(embedded) # output shape = [batch, 53(padding size), hidden_size]

        # output shape = torch.Size([32, 53, 64])
        output = self.fc1(output[:, -1])  # output[:, -1] -> 문장의 마지막 단어
        output = self.out(output)

        return output

In [228]:
# Dataset
title_sequences, vocab = prepare_data(df, vocab, "title")
title_x = [i[0] for i in title_sequences]
title_y = [i[1] for i in title_sequences]

# pad sentences to use batches
title_padded_x = torch.nn.utils.rnn.pad_sequence(title_x, batch_first=True)
title_x = [i for i in title_padded_x]

title_x_train, title_x_test, title_y_train, title_y_test = train_test_split(title_x, title_y, test_size=0.2, random_state=42)

title_test_sequences = list(zip(title_x_test,title_y_test))
title_test_sequences = [list(x) for x in title_test_sequences]
title_train_sequences = list(zip(title_x_train,title_y_train))
title_train_sequences = [list(x) for x in title_train_sequences]

In [229]:
BATCH_SIZE = 32
EPOCHS = 10

In [233]:
model = LSTMNet(input_size=vocab.n_words, embedding_size=32, hidden_size=64, output_size=len(vocab.id2tag), n_layers=2, bidirectional=True, dropout=0.2)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
train_loader = DataLoader(title_train_sequences, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(title_test_sequences, batch_size=BATCH_SIZE, shuffle=True)

print(model)

LSTMNet(
  (embedding): Embedding(84569, 32)
  (lstm): LSTM(32, 64, num_layers=2, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [234]:
def evaluate(eval_sequences, batch_size):
    eval_loader = DataLoader(eval_sequences, batch_size=batch_size, shuffle=True)
    preds = []
    tags = []
    with torch.no_grad():
        for words, tag in eval_loader:
            preds.append(model(words).argmax(dim=1).cpu().data.numpy())
            tags.append(tag.cpu().data.numpy())
    preds = np.concatenate(preds).ravel()
    tags = np.concatenate(tags).ravel()
    accuracy = (preds == tags).sum() / len(tags) * 100
    return accuracy

In [235]:
for e in range(EPOCHS):
    count = 0
    epoch_loss = 0.
    model.train()
    for words, tags in  iter(train_loader):
        model.zero_grad()
        seq_len = len(words)
        sentence_loss = 0
        output = model(words)
        sentence_loss = criterion(output, tags)
        sentence_loss.backward()
        optimizer.step()
        epoch_loss += sentence_loss.item()

    print(f"Epoch #{e}, Batch: {count},  Loss: {sentence_loss/len(train_loader)}")


    train_accuracy = evaluate(title_train_sequences, BATCH_SIZE)
    print(f"Epoch {e}, Training Accuracy: {train_accuracy}%")

    test_accuracy = evaluate(title_test_sequences, BATCH_SIZE)
    print(f"Epoch {e}, Validation Accuracy: {test_accuracy}%")


KeyboardInterrupt: 