In [1]:
import numpy as np
import pandas as pd

In [2]:
df_trump = pd.read_csv("../data/style/trump/df_trump.csv")
df_trump

Unnamed: 0,sequences
0,"Thank you, thank you Wow Wow, and I'm thrill..."
1,"Please, let's have a little fun We got plent..."
2,"You go back … go top-20, top-30 Take a look ..."
3,"And we go back, and we take a look We want t..."
4,Kim Jong-un doesn't know about the problems t...
...,...
1880,""" No, Trump is stopping it Trump is stopping ..."
1881,"That's why I say, ""Hey, if they don't like it..."
1882,And if you don't support me you're going to b...
1883,That had to be a set up I never saw that guy...


In [3]:
df_david = pd.read_csv("../data/style/david/df_david.csv", encoding="ISO-8859-1")
df_david

Unnamed: 0,text
0,A hundred years ago\n\nthere were one and a ha...
1,Human beings venture into the highest parts of...
2,Only 3 percent of the water\non our planet is ...
3,This is our planet's final frontier.\n\nAn inn...
4,A third of the land\non our planet is desert.\...
5,Both poles of our planet are covered with ice....
6,Vast open plains.\n\nImmense spaces.\n\nEerie ...
7,The coast - the frontier\nbetween land and sea...
8,Our planet's continents are fringed\nby shallo...
9,Trees.\n\nSurely among the most magnificent\no...


In [4]:
import re

In [5]:
df_david["text"] = df_david["text"].apply(lambda x: re.sub('\s+', ' ', x))

In [6]:
df_david

Unnamed: 0,text
0,A hundred years ago there were one and a half ...
1,Human beings venture into the highest parts of...
2,Only 3 percent of the water on our planet is f...
3,This is our planet's final frontier. An inner ...
4,A third of the land on our planet is desert. T...
5,Both poles of our planet are covered with ice....
6,Vast open plains. Immense spaces. Eerie silenc...
7,The coast - the frontier between land and sea....
8,Our planet's continents are fringed by shallow...
9,Trees. Surely among the most magnificent of al...


In [7]:
# df david split by sentence
df_david["text"] = df_david["text"].str.split(".")
df_david

Unnamed: 0,text
0,[A hundred years ago there were one and a half...
1,[Human beings venture into the highest parts o...
2,[Only 3 percent of the water on our planet is ...
3,"[This is our planet's final frontier, An inne..."
4,"[A third of the land on our planet is desert, ..."
5,[Both poles of our planet are covered with ice...
6,"[Vast open plains, Immense spaces, Eerie sil..."
7,[The coast - the frontier between land and sea...
8,[Our planet's continents are fringed by shallo...
9,"[Trees, Surely among the most magnificent of ..."


In [8]:
df_david = df_david.explode(column="text").reset_index(drop=True)

In [9]:
df_david

Unnamed: 0,text
0,A hundred years ago there were one and a half ...
1,"Now, over six billion crowd our fragile planet"
2,"But even so, there are still places barely to..."
3,This series will take to the last wildernesse...
4,Imagine our world without sun
...,...
2637,"As we explore them, so we gain not only under..."
2638,It's not just the future of the whale that to...
2639,"We can now destroy, or we can cherish"
2640,The choice is ours


In [10]:
# filter the sentences that are too long or too short
df_david = df_david[(df_david["text"].str.len() >= 50) & (df_david["text"].str.len() <= 150)]

In [11]:
df_trump.rename(columns={"sequences": "text"}, inplace=True)

In [12]:
df_trump

Unnamed: 0,text
0,"Thank you, thank you Wow Wow, and I'm thrill..."
1,"Please, let's have a little fun We got plent..."
2,"You go back … go top-20, top-30 Take a look ..."
3,"And we go back, and we take a look We want t..."
4,Kim Jong-un doesn't know about the problems t...
...,...
1880,""" No, Trump is stopping it Trump is stopping ..."
1881,"That's why I say, ""Hey, if they don't like it..."
1882,And if you don't support me you're going to b...
1883,That had to be a set up I never saw that guy...


In [13]:
df_trump["label"] = 1
df_david["label"] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_david["label"] = 0


In [187]:
df_david.to_csv("../data/style/david/df_david_filtered.csv", index=False)

# combine dataset

In [14]:
df = pd.concat([df_trump, df_david], ignore_index=True)
df

Unnamed: 0,text,label
0,"Thank you, thank you Wow Wow, and I'm thrill...",1
1,"Please, let's have a little fun We got plent...",1
2,"You go back … go top-20, top-30 Take a look ...",1
3,"And we go back, and we take a look We want t...",1
4,Kim Jong-un doesn't know about the problems t...,1
...,...,...
3513,They take many tons of water into their ballo...,0
3514,"Every day, each one swallows some four millio...",0
3515,Once and not so long ago three hundred thousa...,0
3516,"As we explore them, so we gain not only under...",0


In [15]:
# reshuffle
df = df.sample(frac=1)
df

Unnamed: 0,text,label
1292,I won't mention the name They've got a lot o...,1
883,He's on top of it And his father is a great ...,1
3515,Once and not so long ago three hundred thousa...,0
2267,They work together to drive shoals of fish in...,0
2411,Receptors in the snake's head pick up the hea...,0
...,...,...
217,They never catch us Because Mexico's paying ...,1
2790,At the heart of all that happens here is a si...,0
100,"Let me tell you, what's going to happen on No...",1
2091,It's hard to imagine what could have attracte...,0


# Clean Text

In [16]:
import nltk
from nltk import word_tokenize

from tqdm.notebook import tqdm
tqdm.pandas()

In [17]:
def clean_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', "\"\'"))
    text = re.sub(r'\s+', ' ', text)
    text = word_tokenize(text.lower().strip())
    text = [token.strip() for token in text if token.strip() != ""]

    return text

In [18]:
df["clean_text"] = df["text"].progress_apply(clean_text)

  0%|          | 0/3518 [00:00<?, ?it/s]

In [19]:
df["clean_text"] # already tokenized

1292    [i, wont, mention, the, name, theyve, got, a, ...
883     [hes, on, top, of, it, and, his, father, is, a...
3515    [once, and, not, so, long, ago, three, hundred...
2267    [they, work, together, to, drive, shoals, of, ...
2411    [receptors, in, the, snakes, head, pick, up, t...
                              ...                        
217     [they, never, catch, us, because, mexicos, pay...
2790    [at, the, heart, of, all, that, happens, here,...
100     [let, me, tell, you, ,, whats, going, to, happ...
2091    [its, hard, to, imagine, what, could, have, at...
3104    [its, actually, two, thousand, separate, reefs...
Name: clean_text, Length: 3518, dtype: object

In [20]:
# Step 3: Build Vocab
from collections import Counter


counts = Counter()
for row in df["clean_text"]:
    counts.update(row)

vocab2index = {"": 0, "UNK": 1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

len(vocab2index)

6164

In [21]:
df["clean_text"].apply(len).describe()

count    3518.000000
mean       19.839113
std         6.610641
min         7.000000
25%        14.000000
50%        20.000000
75%        25.000000
max        45.000000
Name: clean_text, dtype: float64

In [22]:
def encode_sentence(text, vocab2index, max_len=50):
    encoded = np.zeros(max_len, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in text])
    length = min(max_len, len(enc1)) # if above max len, cut the rest
    encoded[:length] = enc1[:length]

    return encoded

In [24]:
MAX_ENCODED_LEN = 50

In [25]:
df["clean_text_encoded"] = df["clean_text"].progress_apply(lambda x: encode_sentence(x, vocab2index, max_len=MAX_ENCODED_LEN))

  0%|          | 0/3518 [00:00<?, ?it/s]

In [26]:
df

Unnamed: 0,text,label,clean_text,clean_text_encoded
1292,I won't mention the name They've got a lot o...,1,"[i, wont, mention, the, name, theyve, got, a, ...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
883,He's on top of it And his father is a great ...,1,"[hes, on, top, of, it, and, his, father, is, a...","[26, 27, 28, 11, 29, 30, 31, 32, 18, 9, 33, 34..."
3515,Once and not so long ago three hundred thousa...,0,"[once, and, not, so, long, ago, three, hundred...","[43, 30, 24, 51, 52, 42, 53, 54, 55, 56, 57, 5..."
2267,They work together to drive shoals of fish in...,0,"[they, work, together, to, drive, shoals, of, ...","[67, 68, 69, 70, 71, 72, 11, 73, 74, 5, 75, 0,..."
2411,Receptors in the snake's head pick up the hea...,0,"[receptors, in, the, snakes, head, pick, up, t...","[76, 77, 5, 78, 79, 80, 81, 5, 82, 83, 84, 85,..."
...,...,...,...,...
217,They never catch us Because Mexico's paying ...,1,"[they, never, catch, us, because, mexicos, pay...","[67, 287, 2001, 463, 630, 2611, 2612, 99, 5, 1..."
2790,At the heart of all that happens here is a si...,0,"[at, the, heart, of, all, that, happens, here,...","[101, 5, 1586, 11, 25, 64, 1107, 107, 18, 9, 5..."
100,"Let me tell you, what's going to happen on No...",1,"[let, me, tell, you, ,, whats, going, to, happ...","[612, 37, 186, 48, 45, 723, 98, 70, 1725, 27, ..."
2091,It's hard to imagine what could have attracte...,0,"[its, hard, to, imagine, what, could, have, at...","[159, 147, 70, 4022, 178, 231, 124, 4455, 29, ..."


# DataLoader for Torch

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

In [28]:
np.random.seed(3407)
torch.manual_seed(3407)
torch.mps.manual_seed(3407)

In [29]:
X_train, X_val, y_train, y_val = train_test_split(df["clean_text_encoded"], df["label"], test_size=0.2, random_state=3407, stratify=df["label"])

In [30]:
class TextStyleDataset(Dataset):
    def __init__(self, data, labels):
        self.data = np.vstack(data)
        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.float)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index], self.labels.data[index]

In [31]:
y_val.value_counts()

1    377
0    327
Name: label, dtype: int64

In [32]:
train_ds = TextStyleDataset(X_train, y_train)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

val_ds = TextStyleDataset(X_val, y_val)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

In [33]:
for x in train_loader:
    print(x[0].shape)
    print(x[1])
    break

torch.Size([32, 50])
tensor([1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0.,
        0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0.])


# ML Modeling

In [34]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps')
device

device(type='mps')

In [35]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [36]:
vocab_size = len(vocab2index)
vocab_size

6164

In [37]:
import json

with open("../models/vocab2index_style_classification.json", 'w') as f:
    json.dump(vocab2index, f, indent=4)

In [192]:
class SimpleLinearModel(nn.Module):
    def __init__(self, vocab_size, input_size, output_size):
        super(SimpleLinearModel, self).__init__()

        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim=128)

        self.linear_size = input_size * 128
        self.linear1 = nn.Linear(self.linear_size, 128)
        self.linear2 = nn.Linear(128, 64)
        self.dropout1 = nn.Dropout(0.5)
        self.linear3 = nn.Linear(64, output_size)


    def forward(self, inputs):
        # we assume the inputs already in embedding dimension
        output = self.embedding(inputs).view(-1, self.linear_size)
        output = F.relu(self.linear1(output))
        output = F.relu(self.linear2(output))
        output = self.dropout1(output)
        output = self.linear3(output)

        return output

In [193]:
linear_model = SimpleLinearModel(vocab_size=vocab_size, input_size=MAX_ENCODED_LEN, output_size=1).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(linear_model.parameters(), lr=0.001, weight_decay=1e-5)

In [194]:
EPOCHS = 10

In [195]:
def fit(model, criterion, optimizer, train_loader, val_loader, epochs=EPOCHS):
    train_accuracies = []
    val_accuracies = []
    train_losses = []
    val_losses = []

    for epoch in range(epochs):  # Loop over the dataset multiple times
        running_loss = 0.0
        total = 0
        correct = 0
        model.train()
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
    
            outputs = model(inputs)
            outputs = outputs.squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
    
            running_loss += loss.item() 

            predicted = torch.round(F.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
        # get train loss and accuracy
        train_loss = running_loss / len(train_loader.dataset)
        train_accuracy = correct / total
        train_losses.append(train_loss)
        train_accuracies.append(train_accuracy)
    
        # get test loss and accuracy
        val_loss = 0.0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            model.eval()
            for data in val_loader:
                inputs, labels = data
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                outputs = outputs.squeeze(-1)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                predicted = torch.round(F.sigmoid(outputs))
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_total
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch}: train_loss: {train_loss:.4f}; train_accuracy: {train_accuracy:.4f}; val_loss: {val_loss:.4f}; val_accuracy: {val_accuracy:.4f}")

    return {
        "train_loss": train_losses,
        "train_accuracy": train_accuracies,
        "val_loss": val_losses,
        "val_accuracy": val_accuracies
    }

In [196]:
linear_model_result = fit(linear_model, criterion, optimizer, train_loader, val_loader, epochs=EPOCHS)

Epoch 0: train_loss: 0.0147; train_accuracy: 0.7854; val_loss: 0.3179; val_accuracy: 0.8565
Epoch 1: train_loss: 0.0063; train_accuracy: 0.9204; val_loss: 0.2351; val_accuracy: 0.9077
Epoch 2: train_loss: 0.0022; train_accuracy: 0.9751; val_loss: 0.2006; val_accuracy: 0.9304
Epoch 3: train_loss: 0.0009; train_accuracy: 0.9922; val_loss: 0.1468; val_accuracy: 0.9531
Epoch 4: train_loss: 0.0002; train_accuracy: 0.9982; val_loss: 0.2341; val_accuracy: 0.9403
Epoch 5: train_loss: 0.0002; train_accuracy: 0.9986; val_loss: 0.2424; val_accuracy: 0.9361
Epoch 6: train_loss: 0.0002; train_accuracy: 0.9975; val_loss: 0.1743; val_accuracy: 0.9631
Epoch 7: train_loss: 0.0000; train_accuracy: 1.0000; val_loss: 0.1857; val_accuracy: 0.9645
Epoch 8: train_loss: 0.0000; train_accuracy: 1.0000; val_loss: 0.1975; val_accuracy: 0.9645
Epoch 9: train_loss: 0.0000; train_accuracy: 1.0000; val_loss: 0.1953; val_accuracy: 0.9659


In [197]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, output_size):
        super(RNNModel, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, 128)
        
        self.bilstm = nn.LSTM(128, 128, bidirectional=True, batch_first=True, dropout=0.1, num_layers=2)
        self.linear1 = nn.Linear(128 * 2, 64)
        self.dropout = nn.Dropout(0.2)
        self.linear2 = nn.Linear(64, output_size)


    def forward(self, x):
        output = self.embedding(x)
        output, _ = self.bilstm(output)

        output = output[:, -1, :]  # Get the output of the last time step
        output = F.relu(self.linear1(output))
        output = self.dropout(output)
        output = self.linear2(output)
    
        return output

In [198]:
rnn_model = RNNModel(vocab_size=vocab_size, output_size=1).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.001, weight_decay=1e-5)

In [199]:
rnn_model_result = fit(rnn_model, criterion, optimizer, train_loader, val_loader, epochs=EPOCHS)

Epoch 0: train_loss: 0.0216; train_accuracy: 0.5327; val_loss: 0.6749; val_accuracy: 0.5355
Epoch 1: train_loss: 0.0142; train_accuracy: 0.8262; val_loss: 0.2555; val_accuracy: 0.9119
Epoch 2: train_loss: 0.0036; train_accuracy: 0.9648; val_loss: 0.1323; val_accuracy: 0.9574
Epoch 3: train_loss: 0.0019; train_accuracy: 0.9797; val_loss: 0.1112; val_accuracy: 0.9702
Epoch 4: train_loss: 0.0005; train_accuracy: 0.9957; val_loss: 0.1304; val_accuracy: 0.9673
Epoch 5: train_loss: 0.0006; train_accuracy: 0.9929; val_loss: 0.1076; val_accuracy: 0.9716
Epoch 6: train_loss: 0.0003; train_accuracy: 0.9979; val_loss: 0.1099; val_accuracy: 0.9716
Epoch 7: train_loss: 0.0001; train_accuracy: 0.9993; val_loss: 0.1508; val_accuracy: 0.9688
Epoch 8: train_loss: 0.0000; train_accuracy: 1.0000; val_loss: 0.1705; val_accuracy: 0.9716
Epoch 9: train_loss: 0.0000; train_accuracy: 1.0000; val_loss: 0.1860; val_accuracy: 0.9673


# Predict

In [211]:
style_map = {
    0: "david",
    1: "trump"
}

In [214]:
def predict(model, text):
    text = clean_text(text)
    encoded_text = torch.tensor(encode_sentence(text, vocab2index, MAX_ENCODED_LEN)).unsqueeze(0).to(device)
    style_label = torch.tensor([0], dtype=torch.long).repeat(encoded_text.shape[0]).to(device)

    with torch.no_grad():
        model.eval()
        outputs = model(encoded_text)
        predicted_proba = F.sigmoid(outputs)
        predicted = torch.round(predicted_proba)
        predicted_style = style_map[int(predicted.data[0][0].cpu().numpy())]

    return predicted_proba, predicted, predicted_style

In [215]:
predict(linear_model, "You are fake news, believe me, nobody knows fake news better than I do!")

(tensor([[1.0000]], device='mps:0'), tensor([[1.]], device='mps:0'), 'trump')

In [216]:
predict(linear_model, "Observe this fake news. In the rich tapestry of media, there are some who, regrettably, disseminate misinformation")

(tensor([[0.8561]], device='mps:0'), tensor([[1.]], device='mps:0'), 'trump')

In [217]:
predict(rnn_model, "You are fake news, believe me, nobody knows fake news better than I do!")

(tensor([[0.9968]], device='mps:0'), tensor([[1.]], device='mps:0'), 'trump')

In [218]:
predict(rnn_model, "Observe this fake news. In the rich tapestry of media, there are some who, regrettably, disseminate misinformation")

(tensor([[0.0459]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')

In [219]:
torch.save(linear_model.state_dict(), "../models/style_classification_linear.pth")

In [220]:
torch.save(rnn_model.state_dict(), "../models/style_classification_rnn.pth")

In [221]:
import json

with open("../data/style/vocab2index.json", 'w') as f:
    json.dump(vocab2index, f, indent=4)

In [227]:
print(predict(linear_model, "i i the the the the the"))
print(predict(linear_model, "to to to to to to to to to"))
print(predict(linear_model, "the to to to to to"))

(tensor([[9.7496e-08]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')
(tensor([[4.1616e-09]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')
(tensor([[3.4351e-09]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')


In [226]:
print(predict(rnn_model, "i i the the the the the"))
print(predict(rnn_model, "to to to to to to to to to"))
print(predict(rnn_model, "the to to to to to"))

(tensor([[0.9891]], device='mps:0'), tensor([[1.]], device='mps:0'), 'trump')
(tensor([[0.0457]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')
(tensor([[0.0457]], device='mps:0'), tensor([[0.]], device='mps:0'), 'david')
