In [None]:
import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics


This file implements the neural network architecture for the synthetic data. First, the data is loaded in, labels are cleaned, and data is limited to only the posts/comments and the appropriate tag.

In [None]:
# Load the synthetic corpus and keep only the two model inputs: the post text
# (copied from "total_post") and its class tag (copied from "label_type").
df = pd.read_parquet("synthetic.parquet").assign(
    category=lambda frame: frame["label_type"],
    text=lambda frame: frame["total_post"],
)[["text", "category"]]


Here we load the chosen tokenizer and define numerical labels for the categories.

In [None]:
# Tokenizer for the same cased BERT checkpoint name that the classifier loads.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Integer id assigned to each category string.
labels = dict(republican=0, democrat=1, neutral=2)

Below we define our dataset class, which is used for loading data into the neural net.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing each tokenized text with its integer label.

    Relies on the module-level ``tokenizer`` and ``labels`` objects. Every row
    is tokenized once, in the constructor, and the results are kept in memory.
    """

    def __init__(self, df):
        """Encode the labels and tokenize the texts of ``df``.

        Args:
            df: Frame with a 'category' column (values looked up in
                ``labels``) and a 'text' column.
        """
        # Every text is padded or truncated to exactly 512 tokens and returned
        # as PyTorch tensors.
        encode_options = dict(
            padding='max_length',
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        self.labels = [labels[name] for name in df['category']]
        self.texts = [tokenizer(entry, **encode_options) for entry in df['text']]

    def classes(self):
        """Return the stored integer labels, in row order."""
        return self.labels

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Here we split the data into train, validation, and test.

In [None]:
# Seed the NumPy and PyTorch global generators so that shuffling, dropout and
# layer initialisation start from a fixed state on every run.
np.random.seed(112)
torch.manual_seed(112)

# Shuffle once with a fixed seed, then cut 80% / 10% / 10% by position.
# Positional .iloc slices replace np.split, which goes through the deprecated
# DataFrame.swapaxes; .copy() yields independent frames so that adding columns
# to a split later does not trigger SettingWithCopyWarning.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
df_train = df_shuffled.iloc[:train_end].copy()
df_val = df_shuffled.iloc[train_end:val_end].copy()
df_test = df_shuffled.iloc[val_end:].copy()

Now we define the actual network and the training function

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward head over three classes.

    ``forward`` returns raw, unnormalised class scores (logits). The training
    loop passes this output to ``nn.CrossEntropyLoss``, which applies
    log-softmax itself; normalising with ``Softmax`` here as well would apply
    softmax twice and flatten the gradients. ``argmax`` over logits selects the
    same class as ``argmax`` over probabilities, so predictions are unchanged.
    Apply ``torch.softmax(logits, dim=1)`` on the result if probabilities are
    needed.
    """

    def __init__(self, dropout=0.5):
        """Build the encoder and the classification head.

        Args:
            dropout: Dropout probability applied to the encoder's pooled output.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # 768 must equal the width of the encoder's pooled output.
        self.linear1 = nn.Linear(768, 30)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(30, 3)

    def forward(self, input_id, mask):
        """Compute class logits for a batch.

        Args:
            input_id: Token ids, forwarded to the encoder as ``input_ids``.
            mask: Attention mask, forwarded to the encoder as ``attention_mask``.

        Returns:
            Tensor of unnormalised scores with one column per class (3).
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.relu(self.linear1(self.dropout(pooled_output)))
        return self.linear2(hidden)

In [None]:
def _run_epoch(model, batches, criterion, device, optimizer=None):
    """Run one pass over ``batches``; update weights only if ``optimizer`` is given.

    Returns:
        ``(summed_loss, n_correct)``: the loss summed over examples and the
        number of correctly classified examples.
    """
    training = optimizer is not None
    model.train(training)  # train mode enables dropout; eval mode disables it
    summed_loss, n_correct = 0.0, 0

    with torch.set_grad_enabled(training):
        for batch_input, batch_label in batches:
            # Class-index targets must be int64 for the loss.
            batch_label = batch_label.to(device).long()
            # input_ids carries a singleton axis 1 that is squeezed away; the
            # attention mask presumably has the same layout, and squeeze(1) is
            # a no-op if it does not.
            mask = batch_input["attention_mask"].squeeze(1).to(device)
            input_id = batch_input["input_ids"].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, batch_label)

            # The criterion returns a batch mean; weight it by the real batch
            # size because the final batch can be shorter than the others.
            summed_loss += batch_loss.item() * batch_label.size(0)
            n_correct += (output.argmax(dim=1) == batch_label).sum().item()

            if training:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

    return summed_loss, n_correct


def train(model, train_data, val_data, learning_rate, epochs, batch_size=4):
    """Fine-tune ``model`` and print loss/accuracy for both splits every epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one
            score per class for each example.
        train_data: Frame used to build the training ``Dataset``.
        val_data: Frame used to build the validation ``Dataset``.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of passes over the training data.
        batch_size: Examples per batch for both loaders (default 4).

    NOTE(review): ``nn.CrossEntropyLoss`` applies log-softmax internally, so it
    expects unnormalised scores; verify that the model's ``forward`` does not
    already apply softmax.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        print("it's working!")

    # Move the model before building the optimizer so the optimizer is created
    # over the parameters on their final device.
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate, weight_decay=1e-7)

    for epoch_num in range(epochs):
        total_loss_train, total_acc_train = _run_epoch(
            model, tqdm(train_dataloader), criterion, device, optimizer
        )
        total_loss_val, total_acc_val = _run_epoch(
            model, val_dataloader, criterion, device
        )

        print(
            f"Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}"
        )


After defining the model class, we set the remaining hyperparameters and train the model.

In [None]:
# Training hyperparameters.
EPOCHS = 15
LR = 1e-8  # NOTE(review): very small step size for Adam; confirm this is intended

# Instantiate the classifier and fit it on the training split, using the
# validation split for the per-epoch validation metrics.
model = BertClassifier()
train(
    model=model,
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
)


Here we define a function to evaluate our model on the test data.

In [None]:
def evaluate(model, test_data, batch_size=1):
    """Predict a class for every row of ``test_data`` and print the accuracy.

    The model is switched to eval mode for the duration of the call so that
    dropout is disabled and predictions do not vary between calls; its previous
    train/eval mode is restored afterwards.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one
            score per class for each example.
        test_data: Frame used to build the test ``Dataset``.
        batch_size: Examples per batch (default 1).

    Returns:
        List of predicted class ids, one per row, in row order.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    predictions = []
    total_acc_test = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device)
                # input_ids carries a singleton axis 1 that is squeezed away;
                # the attention mask presumably has the same layout, and
                # squeeze(1) is a no-op if it does not.
                mask = test_input["attention_mask"].squeeze(1).to(device)
                input_id = test_input["input_ids"].squeeze(1).to(device)

                predicted = model(input_id, mask).argmax(dim=1)
                predictions.extend(predicted.tolist())
                total_acc_test += (predicted == test_label).sum().item()
    finally:
        model.train(was_training)

    print(f"Test Accuracy: {total_acc_test / len(test_data): .3f}")
    return predictions


Next, we utilized that function to evaluate the test data, and then built a confusion matrix.

In [None]:
# Predicted class ids for the held-out test split (also prints the accuracy).
output = evaluate(model=model, test_data=df_test)


In [None]:
# Inverse of `labels`: integer id -> category name.
reverse = dict(enumerate(["republican", "democrat", "neutral"]))

# Attach readable class names for the predictions and the ground truth.
df_test["predicted"] = pd.Series(output, index=df_test.index).map(reverse)
df_test["actual"] = df_test["category"].map(labels).map(reverse)

In [None]:
# Cross-tabulate true vs. predicted class names. Reindexing to the full, sorted
# class list keeps the matrix square (zero-filled) even when a class is never
# predicted or is absent from the test split, so fixed tick labels stay aligned.
class_order = sorted(labels)
confusion_matrix = (
    pd.crosstab(df_test['actual'], df_test['predicted'], rownames=['Actual'], colnames=['Predicted'])
    .reindex(index=class_order, columns=class_order, fill_value=0)
    .rename_axis(index='Actual', columns='Predicted')
)


In [None]:
# Display the confusion matrix as a table (rows: actual, columns: predicted).
confusion_matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap and save it to disk.
ax = sns.heatmap(confusion_matrix, annot=True, fmt="g", cmap="Blues")
ax.set_title("Text Classification Confusion Matrix\n\n")
ax.set_xlabel("\nPredicted Values")
ax.set_ylabel("Actual Values ")
# Take the tick labels from the matrix itself rather than a hard-coded list, so
# each label always matches its row/column even if a class is missing.
ax.xaxis.set_ticklabels([str(name).title() for name in confusion_matrix.columns])
ax.yaxis.set_ticklabels([str(name).title() for name in confusion_matrix.index])
plt.savefig("../30_outputs/synthetic_neural_net.png", bbox_inches="tight", dpi = 300)



Finally, we report the precision, recall, and F1-score for each category, along with the overall accuracy.

In [None]:
# Per-class precision, recall and F1. Passing `labels` pins the class list and
# its order so it always lines up with `target_names`, even when a class is
# missing from the test labels or predictions.
class_names = ["democrat", "neutral", "republican"]
print(
    metrics.classification_report(
        df_test["actual"],
        df_test["predicted"],
        labels=class_names,
        target_names=class_names,
    )
)

In [None]:
# Persist only the model's state dict (its parameters and buffers), not the
# whole module object.
torch.save(obj=model.state_dict(), f="../20_models/model_synthetic")