In [1]:
import pandas as pd
import numpy as np




In [2]:
# load train data
file_path = "./data/feedback_scores/train.csv"

df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [4]:
# how many unique values does each column have?
df.nunique()

text_id        3911
full_text      3911
cohesion          9
syntax            9
vocabulary        9
phraseology       9
grammar           9
conventions       9
dtype: int64

In [6]:
# list all unique values in ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
for col in ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']:
		print(col, df[col].unique())

cohesion [3.5 2.5 3.  4.5 4.  2.  1.  5.  1.5]
syntax [3.5 2.5 4.5 3.  4.  2.  1.  1.5 5. ]
vocabulary [3.  4.5 4.  3.5 2.5 2.  5.  1.5 1. ]
phraseology [3.  2.  4.5 3.5 2.5 4.  5.  1.5 1. ]
grammar [4.  2.  3.  2.5 3.5 4.5 5.  1.5 1. ]
conventions [3.  2.5 5.  4.  2.  3.5 4.5 1.  1.5]


So, looking at the dataset, here there are two things I can do, either I can classify into the 9 buckets that each of the scores has. Or, I can do a regression and move into the closest score to the predicted regression


In [None]:
df['full_text'][0]

In [None]:
# convert the scores into categorical variables of 0, 1, 2, 3, 4, etc for the number of unique values in each score column 

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df['cohesion'] = le.fit_transform(df['cohesion'])
df['syntax'] = le.fit_transform(df['syntax'])
df['vocabulary'] = le.fit_transform(df['vocabulary'])
df['phraseology'] = le.fit_transform(df['phraseology'])
df['grammar'] = le.fit_transform(df['grammar'])
df['conventions'] = le.fit_transform(df['conventions'])

In [8]:
# convert the 6 columns into 1 column with a list of 6 values
df['scores'] = df[['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']].values.tolist()

df.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,scores
0,0016926B079C,I think that students would benefit from learn...,5,5,4,4,6,4,"[5, 5, 4, 4, 6, 4]"
1,0022683E9EA5,When a problem is a change you have to let it ...,3,3,4,2,2,3,"[3, 3, 4, 2, 2, 3]"
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",4,5,4,4,4,3,"[4, 5, 4, 4, 4, 3]"
3,003885A45F42,The best time in life is when you become yours...,7,7,7,7,6,8,"[7, 7, 7, 7, 6, 8]"
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,3,4,4,4,3,3,"[3, 4, 4, 4, 3, 3]"


In [9]:
# remove the 6 columns
df = df.drop(columns=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'])

In [10]:
df.head(10)

Unnamed: 0,text_id,full_text,scores
0,0016926B079C,I think that students would benefit from learn...,"[5, 5, 4, 4, 6, 4]"
1,0022683E9EA5,When a problem is a change you have to let it ...,"[3, 3, 4, 2, 2, 3]"
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...","[4, 5, 4, 4, 4, 3]"
3,003885A45F42,The best time in life is when you become yours...,"[7, 7, 7, 7, 6, 8]"
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,"[3, 4, 4, 4, 3, 3]"
5,004AC288D833,"Dear Principal,\r\n\r\nOur school should have ...","[5, 6, 6, 5, 5, 6]"
6,005661280443,Imagine if you could prove other people that y...,"[5, 6, 5, 5, 6, 6]"
7,008DDDDD8E8D,I think it's a good idea for the estudnets to ...,"[3, 3, 3, 3, 3, 2]"
8,009BCCC61C2A,positive attitude is the key to success. I agr...,"[4, 4, 5, 5, 4, 4]"
9,009F4E9310CB,Asking more than one person for and advice hel...,"[4, 4, 5, 3, 4, 3]"



---

# Custom Dataset and, Dataloader for BERT

In [11]:
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# set up the GPU
APPLE_M1_FLAG=1

device = None
if APPLE_M1_FLAG:
	# try to setup M1 GPU
	is_gpu = torch.backends.mps.is_available()
	if is_gpu:
		device = torch.device("mps")
		print("DEVICE: M1 GPU")
	else:
		device = torch.device("cpu")
		print("DEVICE: CPU")
else:
	# use GPU if available
	if torch.cuda.is_available():       
		device = torch.device("cuda")
		print("DEVICE: GPU")
	else:
		device = torch.device("cpu")
		print("DEVICE: CPU")

DEVICE: M1 GPU


In [25]:
class CustomDataset(Dataset):
	def __init__(self, dataframe, tokenizer, max_len):
		self.tokenizer = tokenizer
		self.data = dataframe
		self.full_text = dataframe.full_text
		self.targets = self.data.scores
		self.max_len = max_len

	def __len__(self):
		return len(self.full_text)
	
	def __getitem__(self, index):
		full_text = str(self.full_text[index])
		full_text = " ".join(full_text.split())

		inputs = self.tokenizer.encode_plus(
			full_text,
			None,
			add_special_tokens=True,
			max_length=self.max_len,
			padding='max_length',
			truncation=True,
			return_token_type_ids=True
		)

		ids = inputs['input_ids']
		mask = inputs['attention_mask']
		token_type_ids = inputs["token_type_ids"]

		return {
			'ids': torch.LongTensor(ids),
			'mask': torch.LongTensor(mask),
			'token_type_ids': torch.LongTensor(token_type_ids),
			'targets': torch.LongTensor(self.targets[index])
		}



---

### HYPERPARAMETERS:

In [46]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [47]:
# creating the dataset and the dataloader
train_size = 0.8

print(f"FULL Dataset: {df.shape}")

train_ds = df.sample(frac=train_size, random_state=200)
test_ds = df.drop(train_ds.index).reset_index(drop=True)
train_ds = train_ds.reset_index(drop=True)

print(f"TRAIN Dataset: {train_ds.shape}")
print(f"TEST Dataset: {test_ds.shape}")

train_set = CustomDataset(train_ds, tokenizer, MAX_LEN)
test_set = CustomDataset(test_ds, tokenizer, MAX_LEN)



FULL Dataset: (3911, 3)
TRAIN Dataset: (3129, 3)
TEST Dataset: (782, 3)


In [48]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
				'shuffle': True,
				'num_workers': 0
				}

test_params = {'batch_size': VALID_BATCH_SIZE,
				'shuffle': True,
				'num_workers': 0
				}

train_loader = DataLoader(train_set, **train_params)
test_loader = DataLoader(test_set, **test_params)


---

# Model:

In [61]:
from torch import nn
class BERT_Classifier(nn.Module):
	def __init__(self):
		super(BERT_Classifier, self).__init__()
		self.bert = BertModel.from_pretrained('bert-base-uncased')
		self.drop = nn.Dropout(0.0)
		self.out = nn.Linear(768, 6)

	def forward (self, ids, mask, token_type_ids):
		_, pooled_output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
		output_2 = self.drop(pooled_output)
		output = self.out(output_2)
		return output
	
model = BERT_Classifier()
model.to(device)

BERT_Classifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

#### How does the Model work?

- We are using the BERT model. We then added a `Dropout` and `Linear Layer` as well. We add these layers to ensure the model is able to regularize and classify the data better.

- In the forward loop, there are 2 outputs from the `BERT` model layer. 

- The output of this, `pooled_output` is passed through the `Dropout` layer and then the `Linear` layer.

- We set the number of dimesnions in the `Linear` layer to be equal to the number of classes we have in the dataset.

In [62]:
def loss_fn(outputs, targets):
	return nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


---

# Training:

In [63]:
def train(epoch):
	model.train()
	for idx, data in enumerate(train_loader, 0):
		ids = data['ids'].to(device, dtype=torch.long)
		mask = data['mask'].to(device, dtype=torch.long)
		token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
		targets = data['targets'].to(device, dtype=torch.float)

		optimizer.zero_grad()
		outputs = model(ids, mask, token_type_ids)
		loss = loss_fn(outputs, targets)
		if idx % 100 == 0:
			print(f"Batch: {idx} | Loss: {loss.item()}")
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()
	
	print(f"EPOCH: {epoch} | LOSS: {loss.item()}")


In [64]:
for epoch in range(EPOCHS):
	train(epoch)

Batch: 0 | Loss: 0.38547179102897644
Batch: 100 | Loss: -9.934698104858398
Batch: 200 | Loss: -13.504341125488281
Batch: 300 | Loss: -19.44586181640625
EPOCH: 0 | LOSS: -30.354496002197266
Batch: 0 | Loss: -19.200740814208984
Batch: 100 | Loss: -24.988859176635742
Batch: 200 | Loss: -30.479351043701172
Batch: 300 | Loss: -38.318939208984375
EPOCH: 1 | LOSS: -46.97802734375
Batch: 0 | Loss: -30.341270446777344


KeyboardInterrupt: 

In [53]:
def validation(epoch):
	model.eval()
	fin_targets = []
	fin_outputs = []

	with torch.no_grad():
		for _, data in enumerate(test_loader, 0):
			ids = data['ids'].to(device, dtype=torch.long)
			mask = data['mask'].to(device, dtype=torch.long)
			token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
			targets = data['targets'].to(device, dtype=torch.long)
			outputs = model(ids, mask, token_type_ids)
			fin_targets.extend(targets.cpu().detach().numpy().tolist())
			fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
	return fin_outputs, fin_targets

In [59]:
def calculate_accuracy(outputs, targets):
	print(outputs[0], targets[0])

In [60]:
for epoch in range(EPOCHS):
	outputs, targets = validation(epoch=epoch)
	accuracy = calculate_accuracy(outputs, targets)
	print(f"EPOCH: {epoch} | ACCURACY: {accuracy}")
	break
	


[0.9999994039535522, 0.9999992847442627, 0.9999995231628418, 0.9999997615814209, 0.9999997615814209, 0.9999986886978149] [5, 3, 3, 3, 2, 4]
EPOCH: 0 | ACCURACY: None
