**NLP-Interview**

In [1]:
import os, sys, gc
from pathlib import Path
from tqdm.auto import tqdm

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
class CFG:
    # Notebook-wide configuration constants.
    SEED = 2332  # random_state used when the frame is shuffled/sampled
    Data_dir = Path("../data/")  # folder that holds test_data.xlsx
    model_name = "bert-base-uncased"  # only referenced by the commented-out BERT cells
    MAX_LEN = 256  # only referenced by the commented-out BERT cells (tokenizer max_length)
    BATCH_SIZE = 32  # only referenced by the commented-out BERT cells (DataLoader batch_size)
    N_SPLITS = 25  # n_splits handed to StratifiedShuffleSplit
    RATIO = 1  # fraction handed to DataFrame.sample(frac=...); 1 keeps every row
    
# Module-level alias so later cells can build paths as Data_dir / '<file>'.
Data_dir = CFG.Data_dir
# Show the directory contents to confirm the input workbook is present.
os.listdir(Data_dir)

['test_data.xlsx', '.ipynb_checkpoints']

In [3]:
# Column 类目 holds the product category (the classification target).
# Column 标题 holds the product title (the text used as model input).
df = pd.read_excel(Data_dir/'test_data.xlsx')
df.head(10)

Unnamed: 0,类目,标题
0,Pet Hair Trimmer,4 in 1 Pet Hair Clipper With 4 Blades Grooming...
1,Pet Hair Trimmer,"Animal Clipper pet care series sonar sn-270, r..."
2,Pet Hair Trimmer,Babyliss 35007690 block knife set (40mm) Clipp...
3,Pet Hair Trimmer,Dog Hair Trimmer USB Rechargeable Professional...
4,Pet Hair Trimmer,Dropshipping Dog Noise-Low Design Pet Hair Cli...
5,Pet Hair Trimmer,For cutting animals rechargeable-network goods...
6,Pet Hair Trimmer,Glove quitapelos so pets removes the hair and ...
7,Pet Hair Trimmer,"NANI Pet Clippers Dog, area Dog preparation Ki..."
8,Pet Hair Trimmer,New USB Rechargeable Pet Hair Trimmer for Dogs...
9,Pet Hair Trimmer,Pet dog clippers electric hair cutting machine...


In [4]:
# (rows, columns) of the raw frame
df.shape

(100003, 2)

In [5]:
# Per-column dtype and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   类目      100002 non-null  object
 1   标题      99970 non-null   object
dtypes: object(2)
memory usage: 1.5+ MB


**The distribution of the targets**

In [6]:
# Select the 40 most frequent categories (类目) together with their counts.
# The frame keeps the name `top_10_df` because later cells read it.
top_10_df = (
    df.iloc[:, 0]
    .value_counts()
    .sort_values(ascending=False)
    .head(40)
    .to_frame()
    .reset_index()
)
top_10_df.columns = ['类目', 'Occurrence']
top_10_df

Unnamed: 0,类目,Occurrence
0,Home & Garden,19888
1,Beauty & Health,11266
2,"Automobiles, Parts & Accessories",10570
3,Consumer Electronics,10438
4,Computer & Office,6325
5,Furniture,5017
6,Home Appliances,3582
7,Electronic Components & Supplies,3340
8,EL Products,1575
9,Electronics Stocks,1486


In [7]:
# Count how often each category (类目) occurs and attach that count to every
# row, so rows can later be selected with plain comparisons (<, >, ...).
overall_df = df.iloc[:, [0]].value_counts().reset_index()
overall_df.columns = ['类目', 'Occurrence']
new_df = df.merge(overall_df, how='left', on='类目')
new_df.head()

Unnamed: 0,类目,标题,Occurrence
0,Pet Hair Trimmer,4 in 1 Pet Hair Clipper With 4 Blades Grooming...,13.0
1,Pet Hair Trimmer,"Animal Clipper pet care series sonar sn-270, r...",13.0
2,Pet Hair Trimmer,Babyliss 35007690 block knife set (40mm) Clipp...,13.0
3,Pet Hair Trimmer,Dog Hair Trimmer USB Rechargeable Professional...,13.0
4,Pet Hair Trimmer,Dropshipping Dog Noise-Low Design Pet Hair Cli...,13.0


In [8]:
# `top_10_df` holds the 40 most frequent categories. Its smallest count is the
# cut-off: keep only rows whose category occurs at least that often.

min_occ = top_10_df['Occurrence'].min()
new_df = (
    new_df[new_df['Occurrence'] >= min_occ]
    .sort_values('Occurrence')
    .reset_index(drop=True)
)

new_df.shape

(83220, 3)

In [9]:
# Display every row that still has a missing value in any column
# (an empty result means nothing is missing).
new_df.loc[new_df.isna().any(axis='columns')]

Unnamed: 0,类目,标题,Occurrence


In [10]:
# The item titles are the model input. Each title is wrapped in BERT-style
# markers, giving "[CLS] <title> [SEP]".
# NOTE(review): the commented-out BERT encoder further down passes
# add_special_tokens=True, which would presumably add these markers a second
# time -- confirm before enabling those cells.
new_df['input'] = new_df['标题'].astype(str).map('[CLS] {} [SEP]'.format)
new_df['input'].head()

0    [CLS] Doll enchantimals base with pet сэйдж ск...
1    [CLS] L.O.L. Surprise 566977 pupa remix hairfl...
2    [CLS] L.O.L. Surprise winter chill spaces-4 11...
3         [CLS] LOL Surprise set advent calendar [SEP]
4                 [CLS] Machine Barbie ambulance [SEP]
Name: input, dtype: object

**Tokenize the categories into numbers**

This can be done using pandas .factorize()

*.factorize()* returns a tuple `(codes, uniques)`: an integer code for every row and the distinct values those codes refer to. Only the codes array is needed here.

In [11]:
# Hugging Face transformers expects the target column to be called `label`,
# so the integer code of each product category is stored under that name.

new_df['label'] = pd.factorize(new_df['类目'])[0]
new_df['label'].unique() # one code per selected category (40 in the recorded output)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [12]:
# store the corresponding ids and categories for future use

In [13]:
# Lookup tables: category name -> integer label, and the inverse mapping.
category_to_label = dict(zip(new_df['类目'], new_df['label']))
label_to_category = {
    label: category for category, label in category_to_label.items()
}

**Splitting the data into Train, Validation and Test sets**

Since this is a classification problem, I will be testing different stratified splitting strategies (e.g. StratifiedKFold and StratifiedShuffleSplit).

In [14]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

# Shuffle the rows and keep a CFG.RATIO fraction of them (RATIO = 1 keeps every row).
new_df = new_df.sample(frac=CFG.RATIO, random_state=CFG.SEED).reset_index(drop=True)

# Seed the splitter so the fold assignment -- and therefore the train/valid/test
# membership derived from it -- is identical on every run.
cv = StratifiedShuffleSplit(n_splits=CFG.N_SPLITS, random_state=CFG.SEED)

# Only row positions are handed to the splitter; the titles are not needed to split.
X = new_df.index.values
# Stratify on the category column, addressed by name rather than by position.
y = new_df['类目'].values

new_df['fold'] = -1
# Look the column position up explicitly: if this cell is re-run after 'stage'
# was added, 'fold' is no longer the last column and `-1` would hit 'stage'.
fold_col = new_df.columns.get_loc('fold')

# Each split draws its own random validation sample, so a row may be drawn by
# several splits; the highest-numbered split wins and rows that are never
# drawn keep fold == -1.
for fold, (tr_idx, val_idx) in enumerate(tqdm(cv.split(X, y), file=sys.stdout, total=CFG.N_SPLITS)):
    new_df.iloc[val_idx, fold_col] = fold

  0%|          | 0/25 [00:00<?, ?it/s]

In [15]:
# checking the distribution of the folds
# new_df['fold'].value_counts()

In [16]:
# Derive the stage from the fold id: fold 7 -> test, fold 15 -> valid,
# every other fold id -> train.
new_df['stage'] = new_df['fold'].map({7: 'test', 15: 'valid'}).fillna('train')

train_df, valid_df, test_df = (
    new_df[new_df['stage'] == stage_name]
    for stage_name in ('train', 'valid', 'test')
)

# new_df['stage'].value_counts()

In [17]:
# Remove the helper columns from the train, valid and test sets.
# The three frames are boolean-filtered selections of new_df, so rebind each
# name to the frame returned by drop() instead of mutating it in place (which
# triggers SettingWithCopyWarning). errors='ignore' keeps the cell re-runnable
# once the columns are already gone.
helper_columns = ['fold', 'stage']
train_df = train_df.drop(columns=helper_columns, errors='ignore')
valid_df = valid_df.drop(columns=helper_columns, errors='ignore')
test_df = test_df.drop(columns=helper_columns, errors='ignore')

**Classical Machine learning algorithms**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word uni- and bi-gram TF-IDF features; terms seen in fewer than 5 documents
# and English stop words are discarded.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                            min_df=5,
                            ngram_range=(1, 2),
                            stop_words='english')

# Learn the vocabulary/IDF weights on the training split only, then reuse them
# for the validation and test splits.
# The matrices are kept sparse: the Naive Bayes classifier used below accepts
# sparse input, and densifying a (rows x vocabulary) matrix costs a large
# amount of memory for no benefit. Call .toarray() only for an estimator that
# requires dense input.
X_train = vectorizer.fit_transform(train_df['input'])
X_valid = vectorizer.transform(valid_df['input'])
X_test = vectorizer.transform(test_df['input'])

In [19]:
# Integer class labels for each split.
y_train = train_df['label']
y_valid = valid_df['label']
y_test = test_df['label']

In [20]:
from sklearn.ensemble import HistGradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [21]:
# %%time
# bag_clf = HistGradientBoostingClassifier().fit(X_train, y_train)
# bag_preds = bag_clf.predict(X_valid)

# print("BaggingBoostingClassifier\n")
# print(classification_report(y_valid, bag_preds, target_names=new_df['类目'].unique()))

In [22]:
%%time
bayes_clf = ComplementNB().fit((X_train), y_train)
bayes_preds = bayes_clf.predict((X_valid))

print("MultinomialNB\n")
print(classification_report(y_valid, bayes_preds, target_names=new_df['类目'].unique()))

MultinomialNB

                                  precision    recall  f1-score   support

            Consumer Electronics       1.00      0.50      0.67         6
                      Capacitors       1.00      0.56      0.71         9
                   Home & Garden       1.00      1.00      1.00         5
Automobiles, Parts & Accessories       0.89      1.00      0.94         8
                       Gift Sets       0.00      0.00      0.00         4
                 Beauty & Health       0.78      1.00      0.88         7
                       Furniture       0.78      0.58      0.67        12
                        Classics       1.00      0.71      0.83         7
           Furniture Accessories       0.70      1.00      0.82         7
                       Backpacks       1.00      0.62      0.77         8
            Digital Wristwatches       1.00      0.73      0.84        11
               Computer & Office       0.86      1.00      0.92         6
Electronic Components 

**Inference using the Naive bayes complementNB**

In [24]:
# test_preds = bayes_clf.predict(X_test)

# print("Inference on my select test set\n")
# print(classification_report(y_test, test_preds, target_names=new_df['类目'].unique()))

**Model performance with no tweaking**

*Naive bayes*

    - The biggest challenge is ensuring that the unique targets belong to all the dataset

    - Training on 40 targets and the whole dataset, the model doesn't perform well, but in terms of running time it's all under 1 month
    
    - Naive Bayes rocks. 
    
    - StratifiedShuffleSplit takes the performance to the next level; I am currently getting the best scores with it, and the running time is less than 5s

    - When the number of targets increases, all the metrics drastically improve and running time is still less than 5s

    - With increase in the data, it still gets better, so far with 40% of the data, running times are all less than 5s and all metrics improve 

    *MultinomialNB*
    - Faster running time with wall time of 68.8 ms and total: 143 ms 
    
    - Also very good precision, recall and f1-score 
    
    *ComplementNB*
    
    - Faster than MultinomialNB: wall time 53.1 ms, total 129 ms, even when sampling 25% of the data
    
    - Also far superior metrics (f1-score)
    
    *GaussianNB*
    
    - Performs poorly compared to the other Naive Bayes techniques but still much better than the Gradient Boosting machines 
    
    - Increasing the number of splits from 10 to 15 improves the recall, while the precision improves slowly.
    

*Bagging*

    - Wall time: 1min 36s (faster than HistGradient by far)
    
    - Higher precision, recall and f1-score compared to the HistGradient
    
*Bonus*: Transforming the data by MinMax doesn't contribute that much to the performance of the model

**Saving the Model**

Using pickle to save the model

In [25]:
import pickle

# Persist the fitted classifier. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
with open('bayes_clf.pkl', 'wb') as model_file:
    pickle.dump(bayes_clf, model_file)

In [28]:
# Load the trained model back from disk; the context manager closes the file.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files that
# this notebook produced itself.
with open('bayes_clf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Drop columns that are no longer needed. Rebinding (instead of inplace=True)
# together with errors='ignore' keeps this cell re-runnable after the columns
# have already been removed.
new_df = new_df.drop(columns=['Occurrence', 'fold', 'stage'], errors='ignore')

In [29]:
# Vectorise every row of new_df with the TF-IDF vectorizer fitted earlier.
# NOTE(review): new_df also contains the rows the classifier was trained on,
# so these predictions are not a held-out evaluation.
X_full_data = vectorizer.transform(new_df['input'])

# Predict a label id for every row (new_df holds the 40 most popular product categories)
predictions = model.predict(X_full_data)

In [31]:
# Attach the predicted label ids to the full frame, write it to an Excel
# workbook, and read the workbook back to check what was written.
new_df['predictions'] = predictions
new_df.to_excel('Predicted_data.xlsx', index=False)
pd.read_excel('Predicted_data.xlsx').head(10)

Unnamed: 0,类目,标题,input,label,predictions
0,Consumer Electronics,T95 TV Box Android 10.0 H616 Quad Core 2.4G Wi...,[CLS] T95 TV Box Android 10.0 H616 Quad Core 2...,36,36
1,Capacitors,10pcs Film capacitors К73-17 CL21 400V 15nF 22...,[CLS] 10pcs Film capacitors К73-17 CL21 400V 1...,11,32
2,Home & Garden,Rodanny Dog Car Seat Cover Waterproof Folding ...,[CLS] Rodanny Dog Car Seat Cover Waterproof Fo...,39,39
3,Home & Garden,Pet Dog Cat Feeder Bowl With Water Bottle Auto...,[CLS] Pet Dog Cat Feeder Bowl With Water Bottl...,39,39
4,"Automobiles, Parts & Accessories",SRXTZM New 12v 18A Max.120W Female Car Cigaret...,[CLS] SRXTZM New 12v 18A Max.120W Female Car C...,37,37
5,Gift Sets,"Candy Cane jet's, 42g","[CLS] Candy Cane jet's, 42g [SEP]",24,24
6,Beauty & Health,Korea Lip Sleeping Mask Moisturizing Lip Balm ...,[CLS] Korea Lip Sleeping Mask Moisturizing Lip...,38,38
7,Furniture,20 cm 4Pcs Duty Plastic Strong Loading Furnitu...,[CLS] 20 cm 4Pcs Duty Plastic Strong Loading F...,34,34
8,Classics,PLASTIC 2°PRIMARIA. PIECE To PIECE,[CLS] PLASTIC 2°PRIMARIA. PIECE To PIECE [SEP],15,39
9,Home & Garden,202525-Cat & Doglife For Cats Neck Collar Ring...,[CLS] 202525-Cat & Doglife For Cats Neck Colla...,39,39


**Preparation for Bert**

*Tokenization* using *BertTokenizer*

In [None]:
# Due to a shortage of GPU compute on SageMaker labs, the BERT cells below are left commented out.
# from transformers import BertForSequenceClassification, BertTokenizer

# tokenizer = BertTokenizer.from_pretrained(CFG.model_name)
# tokenizer

In [None]:
# # function to encode
# def encode_func(data):
#     encoded = tokenizer.batch_encode_plus(data,
#                                          add_special_tokens=True,
#                                          max_length=CFG.MAX_LEN,
#                                          padding="max_length",
#                                          return_attention_mask=True,
#                                          truncation=True,
#                                          return_tensors='pt')
    
#     input_ids = encoded['input_ids']
#     attention_mask = encoded['attention_mask']
#     return input_ids, attention_mask

In [None]:
# # encoding the datasets
# train_input_ids, train_attn_masks = encode_func(train_df['input'].values.tolist())
# valid_input_ids, valid_attn_masks = encode_func(valid_df['input'].values.tolist())
# test_input_ids, test_attn_masks = encode_func(test_df['input'].values.tolist())

In [None]:
# # creating Datasets and Dataloaders
# import torch
# from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler

# # turing the labels into tensors
# y_train = torch.LongTensor(train_df['label'].values.tolist())
# y_valid = torch.LongTensor(valid_df['label'].values.tolist())
# y_test = torch.LongTensor(test_df['label'].values.tolist())

In [None]:
# # create dataloaders for training
# trainDataset = TensorDataset(train_input_ids, train_attn_masks, y_train)
# trainSampler = RandomSampler(trainDataset)
# trainDataloader = DataLoader(trainDataset, sampler=trainSampler, batch_size=CFG.BATCH_SIZE)

# # creating validation dataloaders
# validDataset = TensorDataset(valid_input_ids, valid_attn_masks, y_valid)
# validSampler = RandomSampler(validDataset)
# validDataloader = DataLoader(validDataset, sampler=validSampler, batch_size=CFG.BATCH_SIZE)

# # creating test dataloader
# testDataset = TensorDataset(test_input_ids, test_attn_masks, y_test)
# testSampler = RandomSampler(testDataset)
# testDataloader = DataLoader(testDataset, sampler=testSampler, batch_size=CFG.BATCH_SIZE)

In [None]:
# # baseline bert model
# N_labels = len(train_df.label.unique())

# model = BertForSequenceClassification.from_pretrained(CFG.model_name,
#                                                      num_labels=N_labels,
#                                                      output_attentions=False,
#                                                      output_hidden_states=False)

In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(f"Using {device}")

In [None]:
# # setting up the optimizer
# from torch.optim import AdamW
# from transformers import get_linear_schedule_with_warmup

# Epochs = 5
# LR = 1e-5

# optimizer = AdamW(model.parameters(), lr=LR)
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                            num_warmup_steps=0,
#                                            num_training_steps=len(trainDataloader)*Epochs)

In [None]:
# # training loop
# from torch.nn.utils import clip_grad_norm_

# train_loss_per_epoch = []
# valid_loss_per_epoch = []

# for epoch in range(Epochs):
#     print(f'Epoch {epoch+1}')
    
#     # Training loop
#     model.train()
#     train_loss = 0
#     for step, batch in enumerate(tqdm(trainDataloader, desc='training', file=sys.stdout)):
#         input_ids, attn_mask, labels = [data.to(device) for data in batch]
#         output = model(input_ids = input_ids, attention_mask = attn_mask, labels=labels)
        
#         loss = output.loss
#         train_loss += loss.item()
        
#         optimizer.zero_grad()
#         loss.backward()
#         del loss
        
#         clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
#         optimizer.step()
#         scheduler.step()
        
#     train_loss_per_epoch.append(train_loss/(step+1))
    
#     # Validation loop
#     model.eval()
#     valid_loss = 0
#     valid_pred = []
#     with torch.no_grad():
#         for step_val, batch in enumerate(tqdm(validDataloader, desc='validation', file=sys.stdout)):
#             input_ids, attn_mask, labels = [data.to(device) for data in batch]
#             output = model(input_ids=input_ids, attention_mask=attn_mask, labels=labels)
            
#             loss = output.loss
#             valid_loss += loss.item()
            
#             valid_pred.append(np.argmax(output.logits.cpu().detach().numpy(), axis=-1))
            
#     valid_loss_per_epoch.append(valid_loss/(step_val+1))
#     valid_pred = np.concatenate(valid_pred)
    
#     # output message
#     print(f"{step+1}/{math.ceil(len(train_df)/CFG.BATCH_SIZE)} train loss {train_loss/(step+1)}")
#     print(f"{step_val+1}/{math.ceil(len(valid_df)/CFG.BATCH_SIZE)} valid loss {valid_loss/(step_val+1)}")

In [None]:
# for batch in trainDataloader:
#     break

In [None]:
# batch