In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
conservative_keywords = [
    "Conservatism",
    "Moral order",
    "Custom",
    "Continuity",
    "Prescription",
    "Prudence",
    "Variety",
    "Property",
    "Voluntary community",
    "Restraints on power",
    "Permanence",
    "Progression",
    "Change",
    "Human nature",
    "Liberty",
    "Society",
    "Order",
    "Justice",
    "Responsibility",
    "Tradition",
    "Community",
    "Anarchy",
    "Tyranny",
    "Voluntarism",
    "Individualism",
    "Family",
    "Equality",
    "Property rights",
    "Freedom",
    "Decentralization",
    "Conservatism vs. radicalism",
    "Reconciliation",
    "Balance",
    "Permanence vs. Progression",
    "Constitution",
    "Checks and balances",
    "Prudent reform",
    "Stability",
    "Continuity",
    "Rationality",
    "Diversity",
    "Rights",
    "Society's complexity",
    "Oligarchy",
    "Property ownership",
    "Social institutions",
    "Private possession",
    "Private property",
    "Human passions",
    "Tension"
]


In [5]:
democracy_keywords = [
    "Participation",
    "Democracy",
    "Citizens",
    "Direct democracy",
    "Representative democracy",
    "Citizen participation",
    "Public debate",
    "Town meetings",
    "Peaceful protests",
    "Civil society",
    "Equality",
    "Discrimination",
    "Equal access",
    "Voting",
    "Accountability",
    "Transparency",
    "Corruption",
    "Political tolerance",
    "Minority rights",
    "Multi-party system",
    "Abuse of power",
    "Free and fair elections",
    "Freedom of economy",
    "Bill of rights",
    "Human rights",
    "Free courts",
    "Accepting election results",
    "Rule of law",
    "Government accountability",
    "Independence of judiciary",
    "Constitutional rights",
    "Access to justice",
    "Freedom of speech",
    "Freedom of assembly",
    "Economic freedom",
    "Equal voting rights",
    "Political debate",
    "Government transparency",
    "Public information",
    "Judicial system",
    "Dispute resolution",
    "Peaceful transfer of power",
    "Democratic process",
    "Majority support",
    "Equal application of laws",
    "Liberties",
    "Shadow report",
    "EU",
    "Rule of law situation",
    "Member states"
]

In [6]:
conservative_keywords = np.array([key.lower() for key in conservative_keywords])
democracy_keywords = np.array([key.lower() for key in democracy_keywords])

In [7]:
# Dataset from: https://github.com/JerryWeiAI/NewB/blob/master/README.md
f = open('political_affiliation_dataset.txt', 'r')
X = []
y = []
for line in f.readlines():
    X.append(line.split('\t')[1])
    y.append(int(line.split('\t')[0]))
X = np.array(X)
y = np.array(y)

In [8]:
# 0 is liberal, 1 is neutral, 2 is conservative
def classify(x):
    if x <= 4:
        return 0
    elif x >= 6:
        return 2
    else:
        return 1
y = np.array(list(map(classify, y)))

In [9]:
label_0_indices = np.where(y == 0)[0]
label_1_indices = np.where(y == 1)[0]
label_2_indices = np.where(y == 2)[0]

num_samples_per_label = 5000

selected_label_0_indices = np.random.choice(label_0_indices, num_samples_per_label, replace=False)
selected_label_1_indices = np.random.choice(label_1_indices, num_samples_per_label, replace=False)
selected_label_2_indices = np.random.choice(label_2_indices, num_samples_per_label, replace=False)

selected_indices = np.concatenate((selected_label_0_indices, selected_label_1_indices, selected_label_2_indices))

X_sample = X[selected_indices]
y_sample = y[selected_indices]

In [10]:
def contains_democratic_keyword(text):
    for keyword in democracy_keywords:
        if keyword.lower() in text.lower():
            return True
    return False
def contains_conservative_keyword(text):
    for keyword in conservative_keywords:
        if keyword.lower() in text.lower():
            return True
    return False

In [27]:
democratic = [contains_democratic_keyword(text) for text in X_sample]
sum(y_sample[democratic] == 2)

110

In [30]:
conservative = [contains_conservative_keyword(text) for text in X_sample]
sum(y_sample[conservative] == 0)

426

In [8]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_sentences = [tokenizer.encode(text, add_special_tokens=True) for text in X_sample]
labels = y_sample

In [9]:
encoded_set = [torch.LongTensor(text) for text in encoded_sentences]
train_labels = list(y_sample)

In [10]:
# Train-validation-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(encoded_set, train_labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)

In [11]:
train_texts = pad_sequence(train_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
train_dataset = TensorDataset(torch.LongTensor(train_texts), torch.LongTensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [12]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [14]:
for epoch in range(10):
    model.train()
    for batch in train_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(input_ids, labels=labels)
        loss = loss_fn(output.logits, labels)
        loss.backward()
        optimizer.step()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [15]:
val_texts = pad_sequence(val_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
val_dataset = TensorDataset(torch.LongTensor(val_texts), torch.LongTensor(val_labels))
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [16]:
model.eval()
with torch.no_grad():
    val_loss = 0
    correct = 0
    for batch in val_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        output = model(input_ids)
        labels = labels.to(device)
        val_loss += loss_fn(output.logits, labels).item()
        pred = output.logits.argmax(dim=1, keepdim=True)
        correct += pred.eq(labels.view_as(pred)).sum().item()

accuracy = correct / len(val_texts)
print(f"Valid accuracy: {accuracy}")

Test accuracy: 0.45911111111111114


In [17]:
#Classifier = Nearest Neighbors, Score (test, accuracy) = 42.78, Training time = 1729.20 seconds
#Classifier = Linear SVM, Score (test, accuracy) = 43.77, Training time = 57367.66 seconds

In [18]:
# Test accuracy: 0.6268256803614585 with only 2 epochs
# Sample test accuracy: 0.548 with 50 epochs, not equal size
# Test accuracy: 0.5417777777777778 with 5000 samples of each