**NLP-Interview**

In [1]:
import os, sys, gc
from pathlib import Path
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
class CFG:
    SEED = 2342
    Data_dir = Path("../data/")
    model_name = "bert-base-uncased"
    MAX_LEN = 256
    
Data_dir = CFG.Data_dir
os.listdir(Data_dir)

['test_data.xlsx', '.ipynb_checkpoints']

In [3]:
# 类目: Category
# 类目: Title
df = pd.read_excel(Data_dir/'test_data.xlsx')
df.head(10)

Unnamed: 0,类目,标题
0,Pet Hair Trimmer,4 in 1 Pet Hair Clipper With 4 Blades Grooming...
1,Pet Hair Trimmer,"Animal Clipper pet care series sonar sn-270, r..."
2,Pet Hair Trimmer,Babyliss 35007690 block knife set (40mm) Clipp...
3,Pet Hair Trimmer,Dog Hair Trimmer USB Rechargeable Professional...
4,Pet Hair Trimmer,Dropshipping Dog Noise-Low Design Pet Hair Cli...
5,Pet Hair Trimmer,For cutting animals rechargeable-network goods...
6,Pet Hair Trimmer,Glove quitapelos so pets removes the hair and ...
7,Pet Hair Trimmer,"NANI Pet Clippers Dog, area Dog preparation Ki..."
8,Pet Hair Trimmer,New USB Rechargeable Pet Hair Trimmer for Dogs...
9,Pet Hair Trimmer,Pet dog clippers electric hair cutting machine...


In [4]:
df.shape

(100003, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   类目      100002 non-null  object
 1   标题      99970 non-null   object
dtypes: object(2)
memory usage: 1.5+ MB


**The distribution of the targets**

In [6]:
df.iloc[:, 0].value_counts()

Home & Garden                                              19888
Beauty & Health                                            11266
Automobiles, Parts & Accessories                           10570
Consumer Electronics                                       10438
Computer & Office                                           6325
                                                           ...  
Drawer Organizers                                              1
Elephants.                                                     1
Dowel                                                          1
Double-Sided Tape                                              1
Conga 1790 vacuum cleaner Robot  Cleaner Accessories           1
Name: 类目, Length: 1461, dtype: int64

In [7]:
df.describe()  # classification problem

Unnamed: 0,类目,标题
count,100002,99970
unique,1461,99565
top,Home & Garden,SSD External Hard Drive 2TB 1TB HD Externo USB...
freq,19888,4


In [8]:
# check for missing values 
# the missing rows in the data
df[df.isnull().any(axis=1)]

Unnamed: 0,类目,标题
13555,"School lunch box, excursions, picnics.",
13556,670 ml - 1020 ml,
26379,Fitted Sheet + fitted sheet + pillowcase,
26394,"White, Gray.",
26395,Elephants.,
27234,Educational Building Construction,
27239,Educational,
32245,100% po,
32253,100% polyester,
32255,100% polyester,


In [9]:
# no dropping any missing values because there are specific number of categories needed
# therefore all missing values will be replaced by "unknown"
df['类目'] = df['类目'].astype('str')

In [10]:
# using the description of the items as inputs
# for bert, input is represented as [CLS] + " " + [SEP] + " " + [SEP]
df['input'] = '[CLS] ' + df.标题.astype('str') + ' [SEP]'

In [11]:
df['input']

0         [CLS] 4 in 1 Pet Hair Clipper With 4 Blades Gr...
1         [CLS] Animal Clipper pet care series sonar sn-...
2         [CLS] Babyliss 35007690 block knife set (40mm)...
3         [CLS] Dog Hair Trimmer USB Rechargeable Profes...
4         [CLS] Dropshipping Dog Noise-Low Design Pet Ha...
                                ...                        
99998     [CLS] Multi-purpose Handheld Electric High-pre...
99999     [CLS] Multi-set Dirt Disposal Replacement Bags...
100000    [CLS] Multicooker Rice Cooker 11 in 1 DIY Func...
100001    [CLS] Multifunction Automatic UV Sterilizer fo...
100002    [CLS] Multifunction Electric Remove Calluses H...
Name: input, Length: 100003, dtype: object

**Splitting the data into Train, Validation and Test sets**

Since this is a classification problem, I will be testing different versions of the Stratify

In [12]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
n_splits = 5
cv = StratifiedKFold(n_splits=5)

# to avoid data leakage, the titles or just the training data will not be exposed
X = df.index.values
y = df.iloc[:, 0].values

df['fold'] = -1

for fold, (tr_idx, val_idx) in enumerate(tqdm(cv.split(X, y), file=sys.stdout, total=n_splits)):
    df.iloc[val_idx, -1] = fold

  0%|          | 0/5 [00:00<?, ?it/s]

In [13]:
# checking the distribution of the folds
df['fold'].value_counts()

0    20001
1    20001
2    20001
3    20000
4    20000
Name: fold, dtype: int64

In [14]:
# now creating the splits
df['stage'] = df['fold'].apply(lambda x: 'test' if x==0 else ('valid' if x == 4 else 'train'))

train_df = df[df['stage'] == 'train']
valid_df = df[df['stage'] == 'valid']
test_df = df[df['stage'] == 'test']

df['stage'].value_counts()

train    60002
test     20001
valid    20000
Name: stage, dtype: int64

In [15]:
# drop off the fold and stage columns from the train, valid and test sets
train_df.drop(['fold', 'stage'], axis=1, inplace=True)
valid_df.drop(['fold', 'stage'], axis=1, inplace=True)
test_df.drop(['fold', 'stage'], axis=1, inplace=True)

**Tokenization**

In [16]:
# Before tokenizing the inputs, first need to convert the 类目 (categories) into numbers
from sklearn.preprocessing import LabelEncoder
# since there are specific categories that will be used, then using the whole dataset to 
# label encode will be fine
encoder = LabelEncoder().fit(df['类目'])

train_df['类目'] = encoder.transform(train_df['类目'])
valid_df['类目'] = encoder.transform(valid_df['类目'])
test_df['类目'] = encoder.transform(test_df['类目'])

In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import BertForSequenceClassification, BertTokenizer

In [18]:
class ItemsDataset(Dataset):
    def __init__(self, df, isTest=False):
        self.df = df
        self.isTest = isTest
        self.tokenizer = BertTokenizer.from_pretrained(CFG.model_name)
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        inputs = self.tokenizer.encode_plus(
            row['标题'],
            None,
            add_special_tokens=True,
            max_length=CFG.MAX_LEN,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)
        token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long)
        
        if self.isTest:
            return (ids, mask, token_type_ids)
        else:
            target = torch.tensor(row['类目'], dtype=torch.int)
            return (ids, mask, token_type_ids, target)

In [19]:
trainSet = ItemsDataset(train_df)

In [20]:
trainDataLoader = DataLoader(trainSet, batch_size=32, num_workers=2, shuffle=True)

In [21]:
for batch in trainDataLoader:
    break

In [22]:
batch

[tensor([[  101,  5055,  6861,  ...,     0,     0,     0],
         [  101, 29454, 13700,  ...,     0,     0,     0],
         [  101,  2176,  1011,  ...,     0,     0,     0],
         ...,
         [  101,  4524, 21025,  ...,     0,     0,     0],
         [  101,  9530, 29336,  ...,     0,     0,     0],
         [  101,  5423, 27068,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([1455,   40,  237, 1455, 1071,  676, 1455,  932, 1456,  237, 1455,  692,
          237, 1455, 1455,  748,   97, 1250, 1455, 1455, 1250, 1455,  997,  692,
         1455, 1

In [27]:
# baseline BertModel
class ItemsBertModel(nn.Module):
    def __init__(self):
        super(ItemsBertModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained(CFG.model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(3072, 1462)  # since there are 1442 unique categories that can be predicted
        
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, mask, token_type_ids, return_dict=False)
        output = self.drop(output)
        output = self.fc(output)
        return output

In [24]:
model = BertForSequenceClassification.from_pretrained(CFG.model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [26]:
len(df.iloc[:, 0].unique())

1462

In [28]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using {device}")

Using cpu


In [31]:
def read_data(data):
    return (data[0].to(device), data[1].to(device), data[2].to(device)), data[-1].to(device)

In [39]:
def train(model, optimizer, loss_fn, dataloader, epochs=2):
    model = model.to(device)
    model.train()  # setting the model for training
    
    tbar = tqdm(enumerate(dataloader), total=len(dataloader))
    for epoch in range(epochs):
        for i, data in tbar:
            inputs, targets = read_data(data)
            outputs = model(inputs[0], inputs[1], inputs[2])
            
            loss = loss_fn(outputs, targets)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [42]:
bert_model = ItemsBertModel()
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

train(
    model=bert_model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    dataloader=trainDataLoader
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/1876 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid 286) is killed by signal: Killed. 

In [37]:
model(i[0], i[1], i[2])

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1450,  0.3444],
        [-0.1975,  0.3207],
        [-0.0806,  0.2845],
        [-0.0675,  0.2766],
        [-0.1638,  0.3340],
        [-0.1287,  0.2182],
        [-0.0280,  0.2735],
        [-0.0734,  0.3042],
        [-0.1054,  0.1189],
        [-0.1563,  0.3533],
        [-0.2042,  0.3164],
        [-0.1648,  0.2444],
        [-0.0964,  0.2343],
        [-0.1582,  0.2922],
        [-0.0766,  0.2321],
        [-0.2211,  0.2530],
        [-0.1638,  0.2067],
        [-0.1941,  0.3093],
        [-0.0618,  0.2394],
        [-0.1630,  0.3626],
        [-0.2241,  0.2636],
        [-0.0163,  0.1854],
        [-0.1381,  0.3077],
        [-0.1952,  0.2983],
        [-0.1372,  0.3189],
        [-0.0617,  0.2539],
        [-0.1952,  0.3123],
        [-0.1026,  0.2432],
        [-0.1222,  0.3108],
        [-0.0505,  0.2921],
        [-0.0847,  0.2059],
        [-0.1074,  0.3166]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=No

In [38]:
o

tensor([1455,   40,  237, 1455, 1071,  676, 1455,  932, 1456,  237, 1455,  692,
         237, 1455, 1455,  748,   97, 1250, 1455, 1455, 1250, 1455,  997,  692,
        1455, 1250,  125,  878,  692,  161,  693,  237], dtype=torch.int32)