In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
PARENT_DIR = "/content/gdrive/My Drive/EBAY"
DATA_DIR = PARENT_DIR + "/data"
DATA_PATH = DATA_DIR + "/data.csv"
VER = "BertWithFeatureClassification"

OUTPUT_DIR = PARENT_DIR + "/output/"+VER
MODEL_DIR = PARENT_DIR + "/model/"+VER

MODEL_PATH = MODEL_DIR + "/pytorch_model.bin"
CONFIG_PATH = MODEL_DIR + "/config.json"
VOCAB_PATH = MODEL_DIR + "/vocab.txt"
BERT = "bert-base-cased"
REPORT_PATH = OUTPUT_DIR + "/results.txt"

In [4]:
import os
if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
if not os.path.exists(OUTPUT_DIR):
  os.makedirs(OUTPUT_DIR)

In [5]:
batch_size = 8
max_len = 150
epochs = 20
max_grad_norm = 1.0
full_finetuning = False
lr = 3e-5

In [6]:
!pip install seqeval
!pip install transformers

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 19.8MB/s eta 0:00:01[K     |███████████████                 | 20kB 4.8MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 6.0MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 6.2MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.4MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=3ff900cdc3ba661ca5f05c8ee56db0175ec9fe64c89f855687a452cefb941c2a
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting transformers

In [7]:
import pandas as pd
import math
import numpy as np
from seqeval.metrics import f1_score
from sklearn.metrics import classification_report,accuracy_score,f1_score, multilabel_confusion_matrix
import torch.nn.functional as F

import torch
import os
from tqdm import tqdm,trange
from torch.optim import AdamW
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, TensorDataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import seaborn as sns

In [8]:
df = pd.read_csv(DATA_PATH)

# creating instance of labelencoder
labelencoder = LabelEncoder()

# Assigning numerical values for decades
df['label'] = labelencoder.fit_transform(df['Decade'])
num_of_labels = len(df['label'].unique())
classes = list(labelencoder.classes_)
df.head()

Unnamed: 0,Decade,Age,Title,Review Text,Division Name,Department Name,Class Name,temp,Text,Word Count,label
0,1980,33,,Absolutely wonderful - silky and sexy and comf...,Intimates,Intimate,Intimates,,"Title: , Division: Intimates, Department: Inti...",8,6
1,1980,34,,Love this dress! it's sooo pretty. i happene...,General,Dresses,Dresses,,"Title: , Division: General, Department: Dresse...",62,6
2,1960,60,Some major design flaws,I had such high hopes for this dress and reall...,General,Dresses,Dresses,Some major design flaws,"Title: Some major design flaws, Division: Gene...",98,4
3,1970,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",General Petite,Bottoms,Pants,My favorite buy!,"Title: My favorite buy!, Division: General Pet...",22,5
4,1970,47,Flattering shirt,This shirt is very flattering to all due to th...,General,Tops,Blouses,Flattering shirt,"Title: Flattering shirt, Division: General, De...",36,5


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
n_gpu

1

In [10]:
# Splitting into train val and test splits
train_text, temp_text, train_labels, temp_labels = train_test_split(df['Text'], df['label'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['label'])


val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

In [11]:
print(train_text)
print(train_labels)

21285    Title: Classic, Division: General, Department:...
19034    Title: A tad thin and see through, Division: G...
22154    Title: Failure, Division: General Petite, Depa...
5378     Title: Frankenstein experiment gone wrong, Div...
6909     Title: Beautiful design, tight fit, Division: ...
                               ...                        
21304    Title: Beautiful dress, Division: General Peti...
16418    Title: , Division: General, Department: Tops, ...
7368     Title: Soft blouse, Division: General, Departm...
6050     Title: You will love this!, Division: General ...
10761    Title: So cozy and lovely!, Division: General,...
Name: Text, Length: 15848, dtype: object
21285    7
19034    6
22154    3
5378     5
6909     5
        ..
21304    4
16418    5
7368     7
6050     4
10761    5
Name: label, Length: 15848, dtype: int64


In [12]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT,do_lower_case=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [13]:
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = max_len,
    pad_to_max_length=True,
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = max_len,
    pad_to_max_length=True,
    truncation=True
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = max_len,
    pad_to_max_length=True,
    truncation=True
)

## convert lists to tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)



In [14]:
print(f"Train size: {train_seq.shape}, Val size: {val_seq.shape}, Test size: {test_seq.shape}")

Train size: torch.Size([15848, 150]), Val size: torch.Size([3396, 150]), Test size: torch.Size([3397, 150])


In [15]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_weights = compute_class_weight('balanced', np.unique(train_labels), train_labels)

print("Class Weights:",class_weights)

# converting list of class weights to a tensor
weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
weights = weights.to(device)

# define the loss function
cross_entropy  = nn.CrossEntropyLoss() 

Class Weights: [195.65432099  29.34814815   9.07674685   1.25241031   0.66473722
   0.44099396   0.3333123    0.81901809  16.93162393]


In [16]:
# model = BertForSequenceClassification.from_pretrained(BERT, num_labels=num_of_labels)
bert = BertModel.from_pretrained('bert-base-uncased')

for param in bert.parameters():
    param.requires_grad = False

class BERT_Arch(nn.Module):

    def __init__(self, bert, num_of_labels):
      
      super(BERT_Arch, self).__init__()

      self.bert = bert 
      
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      
      # relu activation function
      self.relu =  nn.ReLU()

      # dense layer 1
      self.fc1 = nn.Linear(768,512)
      
      # dense layer decades (Output layer)
      self.fc2 = nn.Linear(512,num_of_labels)

    #define the forward pass
    def forward(self, sent_id, mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(sent_id, attention_mask=mask)
      
      x = self.fc1(cls_hs)

      x = self.relu(x)

      x = self.dropout(x)

      # output layer
      x = self.fc2(x)
      
      return x

model = BERT_Arch(bert, len(classes))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [17]:
model.cuda();

In [18]:
optimizer = AdamW(model.parameters(), lr=lr)

In [19]:
# function to train the model
def train():
  print("\nTraining...")
  
  model.train()

  total_loss = 0

  # iterate over batches
  for step,batch in enumerate(train_dataloader):
    
    # push the batch to gpu
    batch = [r.to(device) for r in batch]
 
    b_input_ids, b_input_mask, b_labels = batch    

    # clear previously calculated gradients 
    model.zero_grad()      

    # get model predictions for the current batch
    outputs = model(b_input_ids, b_input_mask)

    # compute the loss between actual and predicted values
    loss = cross_entropy(outputs.view(-1, num_of_labels), b_labels.view(-1))

    # add on to the total loss
    total_loss = total_loss + loss.item()

    # backward pass to calculate the gradients
    loss.backward()

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

    # update parameters
    optimizer.step()

  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)
  
  return avg_loss

In [20]:
# function for evaluating the model
def evaluate():
  
  print("\nEvaluating...")
  
  # deactivate dropout layers
  model.eval()

  total_loss = 0
  
  # empty list to save the model predictions
  y_pred, y_true = [], []

  # iterate over batches
  for step,batch in enumerate(val_dataloader):

    # push the batch to gpu
    batch = [t.to(device) for t in batch]

    b_input_ids, b_input_mask, b_labels = batch

    # deactivate autograd
    with torch.no_grad():
      
      # model predictions
      outputs = model(b_input_ids, b_input_mask)

      # compute the validation loss between actual and predicted values
      loss = cross_entropy(outputs.view(-1, num_of_labels), b_labels.view(-1))

      total_loss = total_loss + loss.item()

      y_pred.extend(torch.argmax(outputs, 1).tolist())
      y_true.extend(b_labels.tolist())

  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader)

  print('Classification Report:')
  print(classification_report(y_true, y_pred, target_names=[str(c) for c in classes], digits=4))

  return avg_loss, y_true, y_pred

In [None]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for _ in trange(epochs,desc="Epoch"):
    
    #train model
    train_loss = train()

    # append training loss
    train_losses.append(train_loss)
    print(f'\nTraining Loss: {train_loss:.3f}')

    #evaluate model
    valid_loss, _, _ = evaluate()

    valid_losses.append(valid_loss)

    print(f'Validation Loss: {valid_loss:.3f}')
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_DIR+'/saved_weights.pt')
    
    

In [22]:
!ls '$MODEL_DIR'

saved_weights.pt


In [None]:
# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(train_losses, 'b-o', label="training loss")
plt.plot(valid_losses, 'r-o', label="validation loss")

# Label the plot.
plt.title("Learning curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.savefig(OUTPUT_DIR + "/loss.png")

plt.show()

# Test

In [23]:
path = MODEL_DIR + '/saved_weights.pt'
model.load_state_dict(torch.load(path))

# wrap tensors
test_data = TensorDataset(test_seq, test_mask, test_y)

# sampler for sampling the data during training
test_sampler = SequentialSampler(test_data)

# dataLoader for validation set
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size=batch_size)

print("\nTesting...")
  
# deactivate dropout layers
model.eval()

# empty list to save the model predictions
y_pred, y_true = [], []

# iterate over batches
for step,batch in enumerate(test_dataloader):

  # push the batch to gpu
  batch = [t.to(device) for t in batch]

  b_input_ids, b_input_mask, b_labels = batch

  # deactivate autograd
  with torch.no_grad():
    
    # model predictions
    outputs = model(b_input_ids, b_input_mask)

    y_pred.extend(torch.argmax(outputs, 1).tolist())
    y_true.extend(b_labels.tolist())




Testing...


In [24]:
# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred, zero_division=1, digits=4)


# Save the report into file
with open(REPORT_PATH, "w") as writer:
    print("***** Eval results(Lenient) *****")
    print("\n%s"%(report))
    print("F1 score: %f"%(f1_score(y_true, y_pred, average='micro')))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("F1 score(Lenient):\n")
    writer.write(str(f1_score(y_true, y_pred, average='micro')))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Eval results(Lenient) *****

              precision    recall  f1-score   support

           0     1.0000    0.0000    0.0000         1
           1     1.0000    0.0000    0.0000        13
           2     1.0000    0.0000    0.0000        41
           3     1.0000    0.0000    0.0000       302
           4     1.0000    0.0000    0.0000       568
           5     1.0000    0.0000    0.0000       856
           6     0.3335    1.0000    0.5002      1133
           7     1.0000    0.0000    0.0000       461
           8     1.0000    0.0000    0.0000        22

    accuracy                         0.3335      3397
   macro avg     0.9259    0.1111    0.0556      3397
weighted avg     0.7777    0.3335    0.1668      3397

F1 score: 0.333530
Accuracy score: 0.333530


In [25]:
print([(x,y) for x,y in zip(y_true, y_pred)])

[(4, 6), (4, 6), (4, 6), (6, 6), (6, 6), (4, 6), (6, 6), (6, 6), (6, 6), (2, 6), (6, 6), (5, 6), (6, 6), (6, 6), (7, 6), (4, 6), (6, 6), (6, 6), (6, 6), (6, 6), (5, 6), (3, 6), (4, 6), (6, 6), (6, 6), (5, 6), (4, 6), (3, 6), (5, 6), (3, 6), (4, 6), (3, 6), (5, 6), (7, 6), (6, 6), (4, 6), (5, 6), (6, 6), (7, 6), (6, 6), (5, 6), (6, 6), (6, 6), (4, 6), (5, 6), (6, 6), (5, 6), (7, 6), (5, 6), (3, 6), (3, 6), (6, 6), (4, 6), (4, 6), (5, 6), (6, 6), (4, 6), (6, 6), (5, 6), (6, 6), (5, 6), (4, 6), (6, 6), (7, 6), (4, 6), (7, 6), (5, 6), (6, 6), (6, 6), (4, 6), (6, 6), (6, 6), (6, 6), (4, 6), (6, 6), (3, 6), (6, 6), (6, 6), (6, 6), (6, 6), (6, 6), (7, 6), (4, 6), (4, 6), (6, 6), (4, 6), (5, 6), (4, 6), (3, 6), (6, 6), (6, 6), (7, 6), (2, 6), (4, 6), (4, 6), (7, 6), (6, 6), (4, 6), (3, 6), (2, 6), (5, 6), (6, 6), (3, 6), (7, 6), (7, 6), (4, 6), (6, 6), (6, 6), (6, 6), (5, 6), (6, 6), (6, 6), (6, 6), (3, 6), (5, 6), (5, 6), (5, 6), (4, 6), (6, 6), (5, 6), (5, 6), (3, 6), (6, 6), (6, 6), (2, 6),

In [26]:
confusion_matrix = [[0 for _ in range(num_of_labels)] for _ in range(num_of_labels)]
for true, actual in zip(y_true,y_pred):
  confusion_matrix[true][actual] += 1

print(confusion_matrix)

[[0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 13, 0, 0], [0, 0, 0, 0, 0, 0, 41, 0, 0], [0, 0, 0, 0, 0, 0, 302, 0, 0], [0, 0, 0, 0, 0, 0, 568, 0, 0], [0, 0, 0, 0, 0, 0, 856, 0, 0], [0, 0, 0, 0, 0, 0, 1133, 0, 0], [0, 0, 0, 0, 0, 0, 461, 0, 0], [0, 0, 0, 0, 0, 0, 22, 0, 0]]


In [27]:
conf_matrix = pd.DataFrame(confusion_matrix, columns=[c for c in classes], index=[c for c in classes])

In [28]:
conf_matrix

Unnamed: 0,1920,1930,1940,1950,1960,1970,1980,1990,2000
1920,0,0,0,0,0,0,1,0,0
1930,0,0,0,0,0,0,13,0,0
1940,0,0,0,0,0,0,41,0,0
1950,0,0,0,0,0,0,302,0,0
1960,0,0,0,0,0,0,568,0,0
1970,0,0,0,0,0,0,856,0,0
1980,0,0,0,0,0,0,1133,0,0
1990,0,0,0,0,0,0,461,0,0
2000,0,0,0,0,0,0,22,0,0
