# Analysing Persuasion Strategies in Social Media -
## Evidence type identification using DistilBERT
Author: Robin Snoek\
Written using tutorial by curiousily: https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/

In [1]:
%%capture
pip install transformers

In [2]:
import transformers
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.dummy import DummyClassifier
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Data overview
# "None" is a genuine class label in `evidence_no_continue`, but recent pandas releases
# include "None" in read_csv's default NA markers and would silently turn it into NaN.
# Only empty cells are treated as missing here, so label (and sentence) text is kept verbatim.
df = pd.read_csv("final_dataset.csv", keep_default_na=False, na_values=[""])
# df.info()

In [4]:
# Encode the string labels as integer class ids so they can later be converted into tensors.
# The fitted encoder `le` keeps the id -> label mapping; the ids are stored in a new
# `targets` column on `df`.
le = preprocessing.LabelEncoder()
df['targets'] = le.fit_transform(df['evidence_no_continue'])
# df.astype({'targets': 'long'}).dtypes
df.head()

Unnamed: 0,sentence,evidence,thread_id,comment_id,evidence_no_continue,targets
0,I don't know anyone who buys Apple products to...,Anecdote,t3_71l9yj,dnbz2sl,Anecdote,0
1,At work I use a Dell desktop that probably cos...,Anecdote,t3_71l9yj,dnbz2sl,Anecdote,0
2,"At home I have a $1,500 MacBook Air with a sol...",Anecdote,t3_71l9yj,dnbz2sl,Anecdote,0
3,My MacBook is *always* faster and more reliabl...,Continue,t3_71l9yj,dnbz2sl,Anecdote,0
4,I think both product lines probably have their...,Assumption,t3_71l9yj,dnbz2sl,Assumption,1


In [5]:
# Label distribution
df.evidence_no_continue.value_counts()

Assumption          3992
None                1557
Anecdote             727
Definition           143
Statistics/Study     100
Other                 62
Testimony             59
Name: evidence_no_continue, dtype: int64

In [6]:
%%capture
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Sequence Length. 

Most sentences contain fewer than 75 tokens, but to be safe we choose a maximum length of 100.

In [None]:
# token_lens = []

# for txt in df.sentence:
#   tokens = tokenizer.encode(txt, max_length=512)
#   token_lens.append(len(tokens))

# sns.set(rc={'figure.figsize':(11.7,8.27)})
# sns.distplot(token_lens)
# plt.xlim([0, 256]);
# plt.xlabel('Token count');

In [8]:
MAX_LEN = 100

# Pytorch dataset

In [9]:
class EvidenceDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per item.

  Args:
    sentences: indexable collection of sentences; each item is passed through ``str``.
    labels: indexable collection of integer class ids, aligned with ``sentences``.
    tokenizer: callable tokenizer (Hugging Face style) that returns ``input_ids`` and
      ``attention_mask`` tensors with a leading batch dimension of one.
    max_len: every sentence is padded or truncated to this many tokens.
  """

  def __init__(self, sentences, labels, tokenizer, max_len):
    self.sentences = sentences
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.sentences)

  def __getitem__(self, item):
    """Return the raw sentence, its token ids, attention mask and label tensor."""
    sentence = str(self.sentences[item])

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`,
    # which accepts the same keyword arguments.
    encoding = self.tokenizer(
      sentence,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,  # explicitly truncate examples to max length
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'sentences': sentence,
      # flatten() drops the batch dimension added by return_tensors='pt'
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(self.labels[item], dtype=torch.long)
    }

Split the threads (topics) 60% / 20% / 20% into training, validation and test sets, so that all sentences from one thread end up in the same split.

In [10]:
# Random split at thread level: first hold out 40% of the thread ids,
# then cut that hold-out in half to obtain the validation and test ids.
train_ids, test_ids = train_test_split(
  df['thread_id'].unique(), test_size=0.4, random_state=RANDOM_SEED
)
val_ids, test_ids = train_test_split(
  test_ids, test_size=0.5, random_state=RANDOM_SEED
)

# Select the rows belonging to each group of threads.
df_train, df_val, df_test = (
  df[df.thread_id.isin(ids)] for ids in (train_ids, val_ids, test_ids)
)

tuple(part.shape for part in (df_train, df_val, df_test))

((4002, 6), (1320, 6), (1318, 6))

# Data loaders

In [11]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe in an ``EvidenceDataset`` and return a ``DataLoader`` over it.

  Args:
    df: dataframe with a ``sentence`` column and an integer ``targets`` column.
    tokenizer: tokenizer handed to ``EvidenceDataset``.
    max_len: maximum token length handed to ``EvidenceDataset``.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples every epoch. Defaults to ``False``, which keeps
      the dataframe order.
    num_workers: number of loader worker processes. Defaults to 4.

  Returns:
    A ``torch.utils.data.DataLoader`` yielding the dict batches built by ``EvidenceDataset``.
  """
  ds = EvidenceDataset(
    # to_numpy() gives positional indexing regardless of the dataframe's index
    sentences=df.sentence.to_numpy(),
    labels=df.targets.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

# DistilBert classifier

In [12]:
class EvidenceClassifier(nn.Module):
  """DistilBERT encoder followed by a pooling head, dropout and a linear classifier.

  Args:
    n_classes: number of output classes (size of the final linear layer).
  """

  def __init__(self, n_classes):
    super(EvidenceClassifier, self).__init__()
    self.distilbert = AutoModel.from_pretrained(
        "distilbert-base-uncased", num_labels= n_classes,
        return_dict=False)
    # Pooling
    self.dense = nn.Linear(self.distilbert.config.hidden_size, self.distilbert.config.hidden_size)
    self.activation = nn.Tanh()

    self.drop = nn.Dropout()
    self.out = nn.Linear(self.distilbert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores, one row per input sequence."""
    output = self.distilbert(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )
    # Pool output because DistilBert does not have pooled_output:
    # take the hidden state of the first token and pass it through dense + tanh.
    first_token_tensor = output[0][:, 0]
    pooled_output = self.dense(first_token_tensor)
    pooled_output = self.activation(pooled_output)

    # Apply the dropout layer created in __init__ (it was previously never used).
    # nn.Dropout is only active in train() mode, so evaluation is unaffected.
    pooled_output = self.drop(pooled_output)

    return self.out(pooled_output)

In [13]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one optimisation pass over ``data_loader``.

  Each batch is a dict with ``input_ids``, ``attention_mask`` and ``labels``. After
  every batch the gradients are clipped to a norm of 1.0 and both the optimizer and
  the scheduler are stepped.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    ``n_examples`` (a double tensor) and the mean of the per-batch losses.
  """
  model.train()
  batch_losses = []
  correct_predictions = 0

  for batch in data_loader:
    targets = batch["labels"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device),
    )

    loss = loss_fn(logits, targets)
    batch_losses.append(loss.item())

    # Predicted class = index of the largest score in each row.
    preds = logits.max(dim=1).indices
    correct_predictions = correct_predictions + (preds == targets).sum()

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [14]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score ``model`` on ``data_loader`` without tracking gradients.

  Each batch is a dict with ``input_ids``, ``attention_mask`` and ``labels``.

  Returns:
    (accuracy, mean_loss): the number of correct predictions divided by
    ``n_examples`` (a double tensor) and the mean of the per-batch losses.
  """
  model.eval()
  batch_losses = []
  correct_predictions = 0

  with torch.no_grad():
    for batch in data_loader:
      targets = batch["labels"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
      )

      batch_losses.append(loss_fn(logits, targets).item())

      # Predicted class = index of the largest score in each row.
      preds = logits.max(dim=1).indices
      correct_predictions = correct_predictions + (preds == targets).sum()

  accuracy = correct_predictions.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
EPOCHS = 20
BATCH_SIZE = 16

class_names = ['Anecdote', 'Assumption', 'Definition', 'None', 'Other', 'Statistics/Study', 'Testimony']
# train_epoch / eval_model move every batch to `device`, so the model must live on
# the same device before training starts (otherwise a CUDA run fails with a device mismatch).
model = EvidenceClassifier(len(class_names)).to(device)

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so the schedule spans all batches of all epochs.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# class_weights = (1 - (df['targets'].value_counts().sort_index() / len(df))).values
# class_weights = torch.from_numpy(class_weights).float().to(device)

loss_fn = nn.CrossEntropyLoss().to(device)

# Train best model

In [None]:
%%time

# Per-epoch metrics, keyed by 'train_acc' / 'train_loss' / 'val_acc' / 'val_loss'.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimisation pass over the training split.
  train_acc, train_loss = train_epoch(
    model=model,
    data_loader=train_data_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
    n_examples=len(df_train),
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')

  # Score the current weights on the validation split.
  val_acc, val_loss = eval_model(
    model=model,
    data_loader=val_data_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(df_val),
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Checkpoint whenever the validation accuracy improves on the best seen so far.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# Training vs. validation accuracy
# The accuracies stored in `history` are 0-d tensors that may live on the GPU, which
# matplotlib cannot convert to NumPy; turn them into plain floats before plotting.
plt.plot([float(acc) for acc in history['train_acc']], label='train accuracy')
plt.plot([float(acc) for acc in history['val_acc']], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# Evaluate DistilBert

In [None]:
# map_location lets a checkpoint that was saved from the GPU be restored on a
# CPU-only machine as well.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
model = model.to(device)

In [None]:
# Accuracy on the held-out test split. The loss gets a real name instead of `_`,
# which would otherwise clobber IPython's last-output variable.
test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device, len(df_test))
test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
  """Collect predictions for every example in ``data_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that returns
      one row of class scores per example.
    data_loader: iterable of dict batches with ``sentences``, ``input_ids``,
      ``attention_mask`` and ``labels``.
    device: device to run on. Defaults to the device of the model's parameters.

  Returns:
    (sentences, predictions, prediction_probs, real_values): the raw sentences as a
    list, the predicted class ids, the softmax class probabilities (one row per
    example) and the true class ids. All tensors are on the CPU.
  """
  model = model.eval()
  if device is None:
    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device

  sentences = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:
      outputs = model(
        input_ids=d["input_ids"].to(device),
        attention_mask=d["attention_mask"].to(device)
      )

      _, preds = torch.max(outputs, dim=1)

      sentences.extend(d["sentences"])
      predictions.append(preds.cpu())
      # The model returns raw scores; normalise them so the returned values are
      # real probabilities, as the name `prediction_probs` promises.
      prediction_probs.append(torch.softmax(outputs, dim=1).cpu())
      real_values.append(d["labels"].cpu())

  # Concatenate whole batches instead of stacking one element at a time.
  predictions = torch.cat(predictions)
  prediction_probs = torch.cat(prediction_probs)
  real_values = torch.cat(real_values)

  return sentences, predictions, prediction_probs, real_values

In [None]:
# get predictions
y_sentences, y_pred, y_pred_probs, y_test = get_predictions(model, test_data_loader)

In [None]:
# `labels` pins the report rows to the encoded class ids so they always line up with
# `class_names`, even when a rare class is missing from the test split.
# zero_division=0 reports 0 for undefined precision/recall without emitting warnings.
print(classification_report(
  y_test,
  y_pred,
  labels=list(range(len(class_names))),
  target_names=class_names,
  zero_division=0,
))

In [None]:
def show_confusion_matrix(confusion_matrix):
  """Draw an annotated heatmap of a confusion matrix.

  Args:
    confusion_matrix: matrix of integer counts (e.g. a labelled DataFrame) with true
      classes on the rows and predicted classes on the columns.
  """
  hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  # The classes are evidence types, not sentiments.
  plt.ylabel('True evidence type')
  plt.xlabel('Predicted evidence type');

# Passing `labels` keeps the matrix at len(class_names) x len(class_names) even when a
# class occurs in neither y_test nor y_pred; otherwise the DataFrame below would fail
# with a shape mismatch.
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

In [None]:
# pred_dict = {'sentence': y_sentences, 'label': y_test, 'prediction': y_pred} 
# pred_df = pd.DataFrame(pred_dict)
# pred_df.to_csv('predictions.csv', sep = ';') 

# Baseline

In [17]:
# Majority-class baseline. It is fitted on the training split only, so the most frequent
# class is learned without looking at the test data; `y_sentences` / `y` hold the test
# split that the baseline is evaluated on.
y_sentences = df_test['sentence']
y = df_test['evidence_no_continue']
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(df_train['sentence'], df_train['evidence_no_continue'])

DummyClassifier(strategy='most_frequent')

In [21]:
y_pred = dummy_clf.predict(y_sentences)
dummy_clf.score(y_sentences, y)

0.5493171471927162

In [22]:
# The baseline works on the string labels, so `labels` is the list of class names itself.
# zero_division=0 reports 0 for the classes that are never predicted without emitting
# an undefined-metric warning for each of them.
print(classification_report(
  y,
  y_pred,
  labels=class_names,
  target_names=class_names,
  zero_division=0,
))

                  precision    recall  f1-score   support

        Anecdote       0.00      0.00      0.00       139
      Assumption       0.55      1.00      0.71       724
      Definition       0.00      0.00      0.00        14
            None       0.00      0.00      0.00       374
           Other       0.00      0.00      0.00        17
Statistics/Study       0.00      0.00      0.00        31
       Testimony       0.00      0.00      0.00        19

        accuracy                           0.55      1318
       macro avg       0.08      0.14      0.10      1318
    weighted avg       0.30      0.55      0.39      1318



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
