In [None]:
# Evaluating Student Writing 
# The classes being predicted are 'Claim','Counterclaim','Evidence','Position','Rebuttal','Concluding Statement' 
#and 'Lead'

# Built this model using BERT. Starting with the 'bert-base-cased' model 
# I had also built a simple BOW model and a TF-IDF model, but they struggled to identify certain classes 
# especially CounterClaims and Rebuttals (understandable given the types of words in them). 
# Initial testing with BERT has looked better.
# I had also trained a BERT based model with just 5 classes - removed 'Concluding Statement' and 'Lead' from the train set
# with a minor improvement in accuracy. The accuracy improvement was  not enough to justify a separate model.
# This one uses all seven classes in one model

# One area I am struggling with is classifying text as "Evidence". "Evidence" texts tend to be large bodies of text 
# with  Positions and Claims in the body. Often times, individual sentences get classified as Claims. 
# The prediction model returns a list of Probabilities and, in a lot of the cases, for text which is Evidence,
# the second highest Probability has the correct one. Perhaps the output would be more accurate if I could provide 
# a couple of predictions for each piece of text. (That is against the rules of the Competition though)

# Noticing a similar issue with "Lead" text though not as severe

# Running this on Kaggle GPU and starting with the 'bert-base-cased' model

In [None]:
# !pip install transformers --upgrade --quiet

In [None]:
import os
from collections import defaultdict
from textwrap import wrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import transformers
from matplotlib import rc
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
#from transformers import AutoModel, AutoTokenizer

In [None]:
%matplotlib inline

In [None]:
# Seed NumPy and PyTorch so that repeated runs start from the same random state
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

## Load Train Data and EDA

In [None]:
#load training data
df = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
# text_length is the number of characters in each discourse text (not words or tokens)
df["text_length"] = df["discourse_text"].str.len()

In [None]:
# Integer label for each discourse type. The numbering follows the same order as the
# class_names list used later to turn predicted indices back into class names.
codes = {'Claim':0, 'Counterclaim':1, 'Evidence':2, 'Position':3, 'Rebuttal':4, 'Concluding Statement':5, 'Lead':6 }
# Numeric target column consumed by the dataset / data loaders below
df['discourse_type_num'] = df['discourse_type'].map(codes)
df.head()

In [None]:
df.info()

In [None]:
df.groupby("discourse_type").text_length.mean().plot.bar(ylim=0, title="Average text Length by type")

In [None]:
df.groupby("discourse_type").discourse_type.count().plot.bar(ylim=0, title="Count of data points by Class")

In [None]:
# see one essay
df_DBF7EB6A9E02 = df.loc[df.id.isin(['DBF7EB6A9E02'])]
df_DBF7EB6A9E02

In [None]:
fn = '/kaggle/input/feedback-prize-2021/train/DBF7EB6A9E02.txt'
with open(fn) as f:
    contents = f.read()
    print(contents)

In [None]:
df_Conclusion = df.loc[df.discourse_type.isin(['Concluding Statement'])]
df_Conclusion.head(15)

## Choosing MAX Sequence Length

In [None]:
max(df.text_length)

In [None]:
plt.hist(df.text_length,  bins=500)  
plt.ylabel('Count')
plt.xlabel('Length of text');

In [None]:
# max length is 4099 but most are <100.
#I'm a bit torn about what length to choose. Leads seem to be very long strings of text. So does Evidence
# I might have trouble identifying Leads. Looking at some samples, Leads can be anything from a 
# couple of paragraphs up front to no Lead at all with the writer jumping straight into a Position.
# I tried an initial model dropping Leads and Conclusions from the corpus. This one includes all the classes in one model
# Pad to 100 tokens and truncate anything longer. Note that text_length above counts characters, not tokens.

In [None]:
MAX_LEN = 100
BATCH_SIZE = 16

In [None]:
df_final = df
df.shape, df_final.shape

## Define Datasets

In [None]:
# keeping case helps keep some info.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
#create a Dataset
class MyDataset(Dataset):
  """Dataset that tokenizes one piece of discourse text per item.

  Args:
    inputtext: indexable collection of raw texts.
    targets: indexable collection of integer class labels, aligned with inputtext.
    tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer).
    max_len: fixed sequence length; shorter texts are padded, longer ones truncated.
  """

  def __init__(self, inputtext, targets, tokenizer, max_len):
    self.inputtext = inputtext
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of examples."""
    return len(self.inputtext)

  def __getitem__(self, item):
    """Return the raw text, its encoded ids / attention mask and the label tensor."""
    inputtext = str(self.inputtext[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      inputtext,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # `pad_to_max_length=True` is deprecated; this is the supported spelling.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'input_text': inputtext,
      # encode_plus returns a leading batch dimension of 1; flatten drops it
      # so the DataLoader can stack items into (batch, max_len).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
# First carve off 10% of the rows as a hold-out set, keeping class proportions.
df_train, df_holdout = train_test_split(
  df_final,
  test_size=0.1,
  random_state=42,
  stratify=df_final["discourse_type"].to_numpy(),
)

# Then split the hold-out set in half: one half for validation, one for test.
df_val, df_test = train_test_split(
  df_holdout,
  test_size=0.5,
  random_state=42,
  stratify=df_holdout["discourse_type"].to_numpy(),
)

In [None]:
df_train.groupby("discourse_type").discourse_type.count().plot.bar(ylim=0, title="Count of data points by Class")

In [None]:
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a dataframe of discourse texts in a batched DataLoader.

  Args:
    df: dataframe with ``discourse_text`` and ``discourse_type_num`` columns.
    tokenizer: tokenizer handed to MyDataset.
    max_len: fixed token length handed to MyDataset.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples every epoch (useful for the training
      split). Defaults to False, which keeps the previous behaviour.

  Returns:
    A torch DataLoader yielding the dict batches produced by MyDataset.
  """
  ds = MyDataset(
    inputtext=df.discourse_text.to_numpy(),
    targets=df.discourse_type_num.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=4
  )

In [None]:
# One loader per split; all three use the same MAX_LEN and BATCH_SIZE.
# NOTE(review): no shuffle option is requested here, so the training loader serves
# its batches in the same order every epoch - confirm that this is intended.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

## Create a Model

In [None]:
class TheClassifier(nn.Module):
  """Pre-trained BERT encoder with a dropout + linear classification head."""

  def __init__(self, n_classes):
    super().__init__()
    # Attribute names double as state_dict key prefixes, so they are kept stable.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return one row of unnormalised class scores (logits) per input sequence."""
    encoder_result = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    regularised = self.drop(encoder_result["pooler_output"])
    logits = self.out(regularised)
    return logits

In [None]:
device

In [None]:
model = TheClassifier(7) # classifying to one of Claim, CounterClaim, Evidence, Position, Rebuttal, Concluding Statement, Lead
model = model.to(device)

## Train

In [None]:
EPOCHS = 10

# NOTE(review): this AdamW is the implementation imported from transformers; it is
# believed to be deprecated in favour of torch.optim.AdamW - confirm against the
# installed transformers version before upgrading.
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False)
# The scheduler is stepped once per batch in train_epoch, so the schedule has to
# span every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule with no warmup steps
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# Multi-class loss applied to the raw scores returned by the classifier
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over data_loader.

  Each batch is a dict with "input_ids", "attention_mask" and "targets".
  Gradients are clipped to a norm of 1.0, and both the optimizer and the
  scheduler are stepped once per batch.

  Returns:
    (accuracy, mean_loss): accuracy is a float64 tensor equal to the number
    of correct predictions divided by n_examples; mean_loss is the average
    of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  correct_predictions = 0

  for batch in data_loader:
    targets = batch["targets"].to(device)
    logits = model(
      input_ids=batch["input_ids"].to(device),
      attention_mask=batch["attention_mask"].to(device),
    )

    loss = loss_fn(logits, targets)
    predicted = logits.argmax(dim=1)
    correct_predictions += (predicted == targets).sum()
    batch_losses.append(loss.item())

    # Backward pass, clip, then update weights and learning rate.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score the model on data_loader without updating any weights.

  The model is switched to eval mode and gradients are disabled.

  Returns:
    (accuracy, mean_loss): accuracy is a float64 tensor equal to the number
    of correct predictions divided by n_examples; mean_loss is the average
    of the per-batch losses.
  """
  model = model.eval()
  batch_losses = []
  correct_predictions = 0

  with torch.no_grad():
    for batch in data_loader:
      targets = batch["targets"].to(device)
      logits = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
      )

      predicted = logits.argmax(dim=1)
      batch_losses.append(loss_fn(logits, targets).item())
      correct_predictions += (predicted == targets).sum()

  return correct_predictions.double() / n_examples, np.mean(batch_losses)

In [None]:
%%time
history = defaultdict(list)

best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), '/kaggle/working/best_model_state.bin')
    best_accuracy = val_acc

In [None]:
print('Done')

In [None]:
# The recorded accuracies may be 0-dim tensors living on the GPU, which matplotlib
# cannot convert to NumPy; cast every entry to a Python float before plotting.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_curve, label='train accuracy')
plt.plot(val_acc_curve, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.ylim([0, 1]);

## Evaluation

In [None]:
# Load the best model: start from a freshly constructed classifier; the saved
# checkpoint weights are loaded into it in the following cells.
model_best = TheClassifier(7)

In [None]:
m = torch.load('/kaggle/working/best_model_state.bin')

In [None]:
model_best.load_state_dict(m)

In [None]:
tokenizer.save_pretrained('/kaggle/working/saved_model/')

In [None]:
# Specify a path
PATH = "/kaggle/working/saved_model/whole_model.pt"

# Save
torch.save(model_best, PATH)

In [None]:
model_best = model_best.to(device)

In [None]:
test_acc, _ = eval_model(
  model_best,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run the model over data_loader and collect texts, predictions and labels.

  Args:
    model: classifier returning one row of logits per input sequence.
    data_loader: iterable of dict batches with "input_text", "input_ids",
      "attention_mask" and "targets".
    device: device the batches are moved to. Defaults to the device of the
      model's first parameter (CPU if the model has no parameters).

  Returns:
    (input_texts, predictions, prediction_probs, real_values) where
    input_texts is a list of strings, predictions and real_values are 1-D
    CPU tensors of class indices, and prediction_probs is a 2-D CPU tensor
    of softmax probabilities (each row sums to 1).
  """
  model = model.eval()
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  input_texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      input_texts.extend(d["input_text"])
      predictions.append(outputs.argmax(dim=1).cpu())
      # The model emits raw logits; normalise them so the values really are
      # probabilities, as the name promises.
      prediction_probs.append(torch.softmax(outputs, dim=1).cpu())
      real_values.append(targets.cpu())

  if not predictions:
    # Empty loader: return empty tensors instead of failing on concatenation.
    empty_idx = torch.empty(0, dtype=torch.long)
    return input_texts, empty_idx, torch.empty(0), empty_idx.clone()

  predictions = torch.cat(predictions)
  prediction_probs = torch.cat(prediction_probs)
  real_values = torch.cat(real_values)
  return input_texts, predictions, prediction_probs, real_values

In [None]:
y_input_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model_best,
  test_data_loader
)

In [None]:
class_names = ['Claim','Counterclaim','Evidence','Position','Rebuttal','Concluding Statement','Lead']
print(classification_report(y_test, y_pred, target_names=class_names))

## Retrieve Preds for 1 piece of text

In [None]:
input_text = "The most detrimental outcome is death." #Claim
#input_text = "In conclusion, drivers should not be able to use"

In [None]:
# Encode the sample text the same way the training data was encoded
# (fixed length MAX_LEN, padded and truncated).
encoded_text = tokenizer.encode_plus(
  input_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  # `pad_to_max_length=True` is deprecated; this is the supported spelling.
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

In [None]:
input_ids = encoded_text['input_ids'].to(device)
attention_mask = encoded_text['attention_mask'].to(device)

# Inference only: make sure dropout is off and no autograd graph is built.
model_best.eval()
with torch.no_grad():
  output = model_best(input_ids, attention_mask)

_, prediction = torch.max(output, dim=1)

print(f'Review text: {input_text}')
# prediction is a 1-element tensor; convert it to a plain int before indexing the list
print(f'Class  : {class_names[prediction.item()]}')

In [None]:
_, preds = torch.topk(output, 2)

In [None]:
print(f'Predicted Class  : {class_names[preds[0,0].tolist()]}')
print(f'Second Predicted Class  : {class_names[preds[0,1].tolist()]}')

#class_names[preds[0,0].tolist()], class_names[preds[0,1].tolist()]

In [None]:
output.shape

In [None]:
print(output)

## Classify a piece of text

In [None]:
def file_split_train(fname, data_dir='/kaggle/input/feedback-prize-2021/train/'):
    """Split one essay into sentence-like phrases with their word indices.

    The essay is split on newlines, then on ". ". Words are counted with
    whitespace splitting, so runs of spaces do not produce phantom words and
    the running index stays aligned with the essay's whitespace-separated
    word positions.

    Args:
        fname: essay id (file name without the ``.txt`` extension).
        data_dir: directory holding the essay files. Defaults to the Kaggle
            training folder.

    Returns:
        DataFrame with columns ``id``, ``class`` (empty string),
        ``predictionstring`` (space-terminated word indices, e.g. ``"0 1 "``)
        and ``phrase`` (the text with a trailing space, so that phrases can
        be concatenated directly).
    """
    fn = os.path.join(data_dir, fname + '.txt')
    with open(fn, encoding='utf-8') as f:
        contents = f.read()

    rows = []
    curr_index = 0
    for para in contents.split('\n'):
        pieces = para.split(". ")
        last_pos = len(pieces) - 1
        for pos, sent in enumerate(pieces):
            words = sent.split()
            if not words:
                # Empty or whitespace-only fragment: nothing to classify.
                continue
            index_list = ''.join(
                str(i) + ' ' for i in range(curr_index, curr_index + len(words))
            )
            curr_index += len(words)
            # Only fragments followed by ". " in the source lost their period;
            # the last fragment of a paragraph keeps whatever ending it had.
            phrase = sent + ('. ' if pos < last_pos else ' ')
            rows.append([fname, "", index_list, phrase])

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows, columns=["id","class","predictionstring","phrase"])

In [None]:
def file_split(fname, data_dir='/kaggle/input/feedback-prize-2021/test/'):
    """Split one essay into sentence-like phrases with their word indices.

    The essay is split on newlines, then on ". ". Words are counted with
    whitespace splitting, so runs of spaces do not produce phantom words and
    the running index stays aligned with the essay's whitespace-separated
    word positions.

    Args:
        fname: essay id (file name without the ``.txt`` extension).
        data_dir: directory holding the essay files. Defaults to the Kaggle
            test folder.

    Returns:
        DataFrame with columns ``id``, ``class`` (empty string),
        ``predictionstring`` (space-terminated word indices, e.g. ``"0 1 "``)
        and ``phrase`` (the text with a trailing space, so that phrases can
        be concatenated directly).
    """
    fn = os.path.join(data_dir, fname + '.txt')
    with open(fn, encoding='utf-8') as f:
        contents = f.read()

    rows = []
    curr_index = 0
    for para in contents.split('\n'):
        pieces = para.split(". ")
        last_pos = len(pieces) - 1
        for pos, sent in enumerate(pieces):
            words = sent.split()
            if not words:
                # Empty or whitespace-only fragment: nothing to classify.
                continue
            index_list = ''.join(
                str(i) + ' ' for i in range(curr_index, curr_index + len(words))
            )
            curr_index += len(words)
            # Only fragments followed by ". " in the source lost their period;
            # the last fragment of a paragraph keeps whatever ending it had.
            phrase = sent + ('. ' if pos < last_pos else ' ')
            rows.append([fname, "", index_list, phrase])

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows, columns=["id","class","predictionstring","phrase"])

In [None]:
# Split two training essays into phrases and stack the results.
# DataFrame.append no longer exists in pandas 2.x, so concatenate instead.
df_submission_train = pd.concat(
    [file_split_train('A97DE0D49AEA'), file_split_train('DBF7EB6A9E02')],
    ignore_index=True,
)
df_submission_train

In [None]:
class_names = ['Claim','Counterclaim','Evidence','Position','Rebuttal','Concluding Statement','Lead']

In [None]:
# Ids of the training essays that are run through the model below
file_names = ['A97DE0D49AEA', 'DBF7EB6A9E02']

In [None]:
file_names 

In [None]:
# Make Predictions From Train
# Every phrase is classified with the best checkpoint (model_best). Once a phrase
# is predicted as "Concluding Statement", it and every later phrase of the essay
# are merged into a single concluding row.
out_columns = ["id","class","predictionstring","phrase","raw_output","seondaryclass"]
out_rows = []

model_best.eval()  # inference only: dropout off
for fnm in file_names:
    df_split = df_submission_train.loc[(df_submission_train.id == fnm)].reset_index()
    o_index_list = ''
    o_input_text = ''
    reached_conc = False
    for index, row in df_split.iterrows():
        input_text = row['phrase']
        prev_index_list = row['predictionstring']

        encoded_text = tokenizer.encode_plus(
              input_text,
              max_length=MAX_LEN,
              add_special_tokens=True,
              return_token_type_ids=False,
              # `pad_to_max_length=True` is deprecated; this is the supported spelling.
              padding='max_length',
              truncation=True,
              return_attention_mask=True,
              return_tensors='pt',
        )
        input_ids = encoded_text['input_ids'].to(device)
        attention_mask = encoded_text['attention_mask'].to(device)
        with torch.no_grad():
            output = model_best(input_ids, attention_mask)
        _, preds = torch.topk(output, 2)
        curr_pred = class_names[preds[0,0].tolist()]  # most likely class
        sec_pred = class_names[preds[0,1].tolist()]   # runner-up class
        if (curr_pred == "Concluding Statement"):
            reached_conc = True
        if(reached_conc):
            # Accumulate the indices and text of the merged conclusion.
            o_index_list = o_index_list + prev_index_list
            o_input_text = o_input_text + input_text
        else:
            # Keep the raw scores as a plain list so no device tensor is held in the frame.
            raw_output = output.squeeze(0).cpu().tolist()
            out_rows.append([fnm, curr_pred, prev_index_list, input_text, raw_output, sec_pred])
    if(reached_conc):
        out_rows.append([fnm, "Concluding Statement", o_index_list, o_input_text, "", ""])

# Build the frame once instead of appending row by row.
df_out = pd.DataFrame(out_rows, columns=out_columns)

In [None]:
df_out

In [None]:
df_out.to_csv('/kaggle/working/out-tain_set.csv', index=False)

## Run on Test Set

In [None]:
# Collect every test essay id (sorted, so the run order is reproducible) and split
# each essay into phrases.
TEST_DIR = '/kaggle/input/feedback-prize-2021/test'

file_names = sorted(
    os.path.splitext(filename)[0]
    for filename in os.listdir(TEST_DIR)
    if filename.endswith('.txt')
)

# Concatenate once; DataFrame.append no longer exists in pandas 2.x and growing a
# frame inside a loop is quadratic.
split_frames = [file_split(doc_id) for doc_id in file_names]
if split_frames:
    df_submission = pd.concat(split_frames, ignore_index=True)
else:
    # No essays found: keep the expected (empty) schema.
    df_submission = pd.DataFrame(columns=["id","class","predictionstring","phrase"])
        #with open(fn) as f:
            #contents = f.read()
            #print(contents)

In [None]:
df_submission

In [None]:
file_names

In [None]:
df_submission.to_csv('/kaggle/working/test_sub_only_split_by_period.csv', index=False)

In [None]:
# Make Predictions From Test
# Every phrase is classified with the best checkpoint (model_best). Once a phrase
# is predicted as "Concluding Statement", it and every later phrase of the essay
# are merged into a single concluding row.
out_columns = ["id","class","predictionstring","phrase","raw_output","seondaryclass"]
out_rows = []

model_best.eval()  # inference only: dropout off
for fnm in file_names:
    df_split = df_submission.loc[(df_submission.id == fnm)].reset_index()
    o_index_list = ''
    o_input_text = ''
    reached_conc = False
    for index, row in df_split.iterrows():
        input_text = row['phrase']
        prev_index_list = row['predictionstring']

        encoded_text = tokenizer.encode_plus(
              input_text,
              max_length=MAX_LEN,
              add_special_tokens=True,
              return_token_type_ids=False,
              # `pad_to_max_length=True` is deprecated; this is the supported spelling.
              padding='max_length',
              truncation=True,
              return_attention_mask=True,
              return_tensors='pt',
        )
        input_ids = encoded_text['input_ids'].to(device)
        attention_mask = encoded_text['attention_mask'].to(device)
        with torch.no_grad():
            output = model_best(input_ids, attention_mask)
        _, preds = torch.topk(output, 2)
        curr_pred = class_names[preds[0,0].tolist()]  # most likely class
        sec_pred = class_names[preds[0,1].tolist()]   # runner-up class
        if (curr_pred == "Concluding Statement"):
            reached_conc = True
        if(reached_conc):
            # Accumulate the indices and text of the merged conclusion.
            o_index_list = o_index_list + prev_index_list
            o_input_text = o_input_text + input_text
        else:
            # Keep the raw scores as a plain list so no device tensor is held in the frame.
            raw_output = output.squeeze(0).cpu().tolist()
            out_rows.append([fnm, curr_pred, prev_index_list, input_text, raw_output, sec_pred])
    if(reached_conc):
        out_rows.append([fnm, "Concluding Statement", o_index_list, o_input_text, "", ""])

# Build the frame once instead of appending row by row.
df_out = pd.DataFrame(out_rows, columns=out_columns)

In [None]:
 df_out.to_csv('/kaggle/working/submission.csv', index=False)

In [None]:
df_out