In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Dataset from: https://github.com/JerryWeiAI/NewB/blob/master/README.md
f = open('political_affiliation_dataset.txt', 'r')
X = []
y = []
for line in f.readlines():
    X.append(line.split('\t')[1])
    y.append(int(line.split('\t')[0]))
X = np.array(X)
y = np.array(y)

In [3]:
# 0 is liberal, 1 is neutral, 2 is conservative
def classify(x):
    if x <= 4:
        return 0
    elif x >= 6:
        return 2
    else:
        return 1
y = np.array(list(map(classify, y)))

In [54]:
label_0_indices = np.where(y == 0)[0]
label_1_indices = np.where(y == 1)[0]
label_2_indices = np.where(y == 2)[0]

num_samples_per_label = 5000

selected_label_0_indices = np.random.choice(label_0_indices, num_samples_per_label, replace=False)
selected_label_1_indices = np.random.choice(label_1_indices, num_samples_per_label, replace=False)
selected_label_2_indices = np.random.choice(label_2_indices, num_samples_per_label, replace=False)

selected_indices = np.concatenate((selected_label_0_indices, selected_label_1_indices, selected_label_2_indices))

X_sample = X[selected_indices]
y_sample = y[selected_indices]

In [59]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
train_df = pd.DataFrame({'text': X, 'label': y})
joined_text = []
stemmer = LancasterStemmer()
stopwords_punc = list(stopwords.words('english'))+list(punctuation)
for i in train_df.index:
    text = re.sub('[^a-zA-Z]', ' ', str(train_df['text'][i]))
    text = text.lower()
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    text=re.sub("(\\d|\\W)+"," ",text)
    text = text.split()
    lem = SnowballStemmer("english")
    text = [lem.stem(word) for word in text if not word in stopwords_punc] 
    joined_text.append(" ".join(text))
train_clean = pd.DataFrame({'text':joined_text,'label':train_df['label']})
X = train_clean['text']
y = train_clean['label']
tfidf = TfidfVectorizer(norm = "l2")
tfidf.fit(X)
Xt = tfidf.transform(X)
X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.2, random_state= 21)
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
np.mean(y_pred == np.array(y_test))
# 0.5593120160766003

In [None]:
# import numpy as np
# import time
# import matplotlib.pyplot as plt
# from matplotlib.colors import ListedColormap
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_moons, make_circles, make_classification
# from sklearn.neural_network import MLPClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.svm import SVC
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.linear_model import LogisticRegression

# names = [#"Nearest Neighbors", "Linear SVM", 
#          "RBF SVM", #"Gaussian Process",
#          "Decision Tree", "Random Forest", "Neural Net", "AdaBoost", "Logistic Regression","Naive Bayes",
# # "QDA"
#     ]


# classifiers = [
# #     KNeighborsClassifier(2),
# #     SVC(kernel="linear", C=0.025),
#     SVC(gamma=2, C=1),
#     #GaussianProcessClassifier(1.0 * RBF(1.0)),
#     DecisionTreeClassifier(max_depth=5),
#     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
#     MLPClassifier(alpha=1, max_iter=1000),
#     AdaBoostClassifier(),
#     LogisticRegression(max_iter=1000),
#     GaussianNB()
# #     QuadraticDiscriminantAnalysis()
# ]

# X = train_clean['text']
# y = train_clean['label']

# tfidf = TfidfVectorizer(norm = "l2")
# tfidf.fit(X)
# Xt = tfidf.transform(X)
# Xt_dense = Xt.toarray()
# X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.2, random_state= 21)

# # TODO (Apply): All cross-validation

# max_score = 0.0
# max_class = ''

# # iterate over classifiers
# for name, clf in zip(names, classifiers):
#     start_time = time.time()
    
#     # Do Cross Validation
#     if name == "Naive Bayes" or name == "QDA":
#         # For GaussianNB, use the dense data
#         scores = cross_val_score(clf, Xt_dense, y, cv=5)
#     else:
#         scores = cross_val_score(clf, Xt, y, cv=5)
    
#     score = 100.0 * np.mean(scores)
    
#     print('Classifier = %s, Score (test, accuracy) = %.2f,' %(name, score), 'Training time = %.2f seconds' % (time.time() - start_time))
    
#     if score > max_score:
#         clf_best = clf
#         max_score = score
#         max_class = name

# print(80*'-' )
# print('Best --> Classifier = %s, Score (test, accuracy) = %.2f' %(max_class, max_score))
# #plot the output of the various algorithms

In [58]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [60]:
max_seq_length = 512
Trimmed_X = [text[:max_seq_length] for text in X_sample]

In [61]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
encoded_sentences = [tokenizer.encode(text, add_special_tokens=True) for text in Trimmed_X]
labels = y_sample

In [62]:
encoded_set = [torch.LongTensor(text) for text in encoded_sentences]
train_labels = list(y_sample)

In [63]:
# Train-validation-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(encoded_set, train_labels, test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)

In [64]:
train_texts = pad_sequence(train_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
train_dataset = TensorDataset(torch.LongTensor(train_texts), torch.LongTensor(train_labels))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [65]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [66]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [67]:
for epoch in range(50):
    model.train()
    for batch in train_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(input_ids, labels=labels)
        loss = loss_fn(output.logits, labels)
        loss.backward()
        optimizer.step()

In [68]:
val_texts = pad_sequence(val_texts, batch_first=True, padding_value=tokenizer.pad_token_id)
val_dataset = TensorDataset(torch.LongTensor(val_texts), torch.LongTensor(val_labels))
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

In [69]:
model.eval()
with torch.no_grad():
    val_loss = 0
    correct = 0
    for batch in val_loader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        output = model(input_ids)
        labels = labels.to(device)
        val_loss += loss_fn(output.logits, labels).item()
        pred = output.logits.argmax(dim=1, keepdim=True)
        correct += pred.eq(labels.view_as(pred)).sum().item()

accuracy = correct / len(val_texts)
print(f"Test accuracy: {accuracy}")

Test accuracy: 0.5417777777777778


In [None]:
#Classifier = Nearest Neighbors, Score (test, accuracy) = 42.78, Training time = 1729.20 seconds
#Classifier = Linear SVM, Score (test, accuracy) = 43.77, Training time = 57367.66 seconds

In [None]:
# Test accuracy: 0.6268256803614585 with only 2 epochs
# Sample test accuracy: 0.548 with 50 epochs, not equal size
# Test accuracy: 0.5417777777777778 with 5000 samples of each