<a href="https://colab.research.google.com/github/pythonuzgit/elmurodov/blob/master/Natural%20Language%20Processing%20with%20PyTorch/Luxury_Products_Apparel_analysis_with_Hugging_Face_using_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


import torch
# Alias torch.nn as `nn`; binding it to `np` would clobber the numpy alias.
import torch.nn as nn
# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader


In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Load the product catalogue from an absolute path under /content and preview it.
# NOTE(review): the path presumably targets a Colab runtime — adjust when running elsewhere.
df =pd.read_csv('/content/Luxury_Products_Apparel_Data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Category,SubCategory,ProductName,Description
0,8037,Accessories,Bags,"""Prada Striped Shell Belt Bag""","""One of Prada's most functional designs, this ..."
1,13670,Accessories,Socks,"""Falke - Lhasa Wool And Cashmere-blend Socks -...","""Falke - Casual yet luxurious, Falke's dark na..."
2,13983,Suits,Tuxedos,"""peak lapel tuxedo suit jacket""","""White and black linen blend peak lapel tuxedo..."
3,12081,Accessories,Gloves,"""Thom Browne Navy 4-Bar Rib Gloves""","""Pair of rib knit cashmere gloves in navy. Sig..."
4,15617,Accessories,Cufflinks,"""Alice Made This - Bayley Round Patina-brass C...","""Alice Made This - Made in the UK, these teal ..."


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0      0
Category       37
SubCategory    37
ProductName     0
Description     0
dtype: int64

In [None]:
# Drop, in place, every row that has a missing value in any column.
# The trailing semicolon suppresses the cell output.
df.dropna(axis = 0, how ='any',inplace=True) ;

We are only interested in the text and target columns, so we drop the rest.

In [None]:
# Remove, in place, the three columns that are not used for classification.
df.drop(columns = ['Unnamed: 0', 'SubCategory', 'ProductName'], inplace = True)

A function to clean the text: it lowercases the descriptions, replaces URLs with a placeholder, and removes `#`, `@` and most other punctuation marks.

In [None]:
def normalize_text(Description):
    """Clean a pandas Series of free-text descriptions.

    Steps, applied in order:
      1. lowercase the text;
      2. delete ``#`` characters;
      3. replace anything starting with ``http`` (up to the next whitespace)
         with the placeholder ``URL``;
      4. delete ``@`` characters;
      5. replace every character other than letters, digits and ``()!?'`"``
         with a space;
      6. collapse runs of two or more whitespace characters into one space.

    Parameters
    ----------
    Description : pandas.Series
        Series of strings, accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The cleaned strings, with the same index as the input.
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of earlier ones.
    substitutions = [
        (r"\#", ""),
        (r"http\S+", "URL"),
        (r"@", ""),
        (r"[^A-Za-z0-9()!?\'\`\"]", " "),
        (r"\s{2,}", " "),
    ]

    Description = Description.str.lower()
    for pattern, replacement in substitutions:
        # regex=True must be explicit: without it, recent pandas versions treat
        # the pattern as a literal string and the cleaning silently does nothing.
        Description = Description.str.replace(pattern, replacement, regex=True)

    return Description

In [None]:
# Overwrite the raw descriptions with their cleaned version.
df['Description'] = normalize_text(df['Description'])

In [None]:
# Preview the frame after text normalization.
df.head()

Unnamed: 0,Category,Description
0,Accessories,"""one of prada's most functional designs this b..."
1,Accessories,"""falke casual yet luxurious falke's dark navy ..."
2,Suits,"""white and black linen blend peak lapel tuxedo..."
3,Accessories,"""pair of rib knit cashmere gloves in navy sign..."
4,Accessories,"""alice made this made in the uk these teal bay..."


In [None]:
# Number of products per category (class balance).
df.Category.value_counts()

Accessories                1277
Shoes                      1048
Shirts                      569
Activewear                  537
Pants                       516
Jackets/Coats               315
Underwear and Nightwear     266
Suits                       195
Sweaters                    141
Jewelry                      99
Name: Category, dtype: int64

In [None]:
# Keep only rows whose category is one of the ten target classes.
# .copy() makes the result an independent frame: later cells assign new columns
# into `df`, and assigning into a boolean-mask selection without copying
# triggers pandas' SettingWithCopyWarning.
df = df[df.Category.isin(['Accessories', 'Shoes', 'Shirts', 'Activewear', 'Pants', 'Jackets/Coats',
                          'Underwear and Nightwear', 'Suits', 'Sweaters', 'Jewelry'])].copy()

In [None]:
# Re-check the class counts after filtering.
df.Category.value_counts()

Accessories                1277
Shoes                      1048
Shirts                      569
Activewear                  537
Pants                       516
Jackets/Coats               315
Underwear and Nightwear     266
Suits                       195
Sweaters                    141
Jewelry                      99
Name: Category, dtype: int64

In [None]:
# Distinct category names, in order of first appearance in the frame.
possible_labels = df['Category'].unique()

In [None]:
# Map each category name to an integer class id (its position in possible_labels).
label_dict = {name: class_id for class_id, name in enumerate(possible_labels)}

In [None]:
# Display the category-name -> class-id mapping.
label_dict

{'Accessories': 0,
 'Activewear': 3,
 'Jackets/Coats': 2,
 'Jewelry': 8,
 'Pants': 7,
 'Shirts': 6,
 'Shoes': 5,
 'Suits': 1,
 'Sweaters': 9,
 'Underwear and Nightwear': 4}

In [None]:
# Replace category names with their integer class ids.
df['Category'] = df['Category'].map(label_dict)

In [None]:
# Preview the frame with integer-encoded categories.
df.head()

Unnamed: 0,Category,Description
0,0,"""one of prada's most functional designs this b..."
1,0,"""falke casual yet luxurious falke's dark navy ..."
2,1,"""white and black linen blend peak lapel tuxedo..."
3,0,"""pair of rib knit cashmere gloves in navy sign..."
4,0,"""alice made this made in the uk these teal bay..."


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the row index labels (not the text itself) 85/15 into train and
# validation sets, stratified on the class id so both keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.Category.values,
    stratify=df.Category.values,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Add a split marker column, initialised to the same placeholder for every row.
df['data_type'] = 'not_set'

In [None]:
# Preview the frame with the new data_type column.
df.head()

Unnamed: 0,Category,Description,data_type
0,0,"""one of prada's most functional designs this b...",not_set
1,0,"""falke casual yet luxurious falke's dark navy ...",not_set
2,1,"""white and black linen blend peak lapel tuxedo...",not_set
3,0,"""pair of rib knit cashmere gloves in navy sign...",not_set
4,0,"""alice made this made in the uk these teal bay...",not_set


In [None]:
# Tag every row with the split it was assigned to; X_train / X_val hold index labels.
for split_name, split_index in (('train', X_train), ('val', X_val)):
    df.loc[split_index, 'data_type'] = split_name

In [None]:
# Row counts per (class id, split) pair, to check the stratified split.
df.groupby(['Category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Description
Category,data_type,Unnamed: 2_level_1
0,train,1085
0,val,192
1,train,166
1,val,29
2,train,268
2,val,47
3,train,456
3,val,81
4,train,226
4,val,40


Loading Tokenizer and Encoding our Data

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# Load the pretrained tokenizer for the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize the training descriptions into PyTorch tensors, padded and
# truncated to at most 256 tokens, with attention masks.
encoded_data_train = tokenizer.batch_encode_plus(
    df.loc[df.data_type == 'train', 'Description'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)


In [None]:
# Tokenize the validation descriptions with the same settings as the training set.
encoded_data_val = tokenizer.batch_encode_plus(
    df.loc[df.data_type == 'val', 'Description'].values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='pt',
)

In [None]:
# Training tensors: token ids, attention masks and integer class labels.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df.loc[df.data_type == 'train', 'Category'].values)

In [None]:
# Validation tensors: token ids, attention masks and integer class labels.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df.loc[df.data_type == 'val', 'Category'].values)

In [None]:
# Bundle (input ids, attention mask, label) per example; the tensor order here
# is the order in which each batch is unpacked later.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Number of training examples.
len(dataset_train)

4218

In [None]:
# Inspect the validation tensors (input ids, attention masks, labels).
dataset_val.tensors

(tensor([[  101,  1000,  6904,  ...,     0,     0,     0],
         [  101,  1000, 10353,  ...,     0,     0,     0],
         [  101,  1000,  2304,  ...,     0,     0,     0],
         ...,
         [  101,  1000,  7683,  ...,     0,     0,     0],
         [  101,  1000,  2085,  ...,     0,     0,     0],
         [  101,  1000, 22953,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 6, 0, 3, 0, 2, 0, 0, 3, 4, 2, 3, 7, 0, 6, 5, 5, 3, 0, 2, 6, 6, 3, 4,
         6, 3, 6, 3, 3, 1, 0, 4, 4, 5, 3, 1, 7, 5, 5, 7, 5, 3, 5, 7, 6, 5, 0, 5,
         6, 0, 6, 2, 3, 5, 0, 5, 5, 5, 0, 6, 6, 0, 8, 4, 7, 3, 4, 7, 7, 6, 8, 0,
         0, 9, 6, 9, 6, 6, 0, 0, 5, 4, 2, 7, 3, 5, 5, 7, 0, 6, 5, 0, 5, 5, 3, 5,
         7, 0, 0, 5, 1, 5, 5, 1, 7, 0, 0, 5, 1, 0, 7, 0, 0, 0, 0, 5, 7, 5, 0, 0,

Setting up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Pretrained BERT encoder with a classification head sized to the number of
# categories; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Creating Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Mini-batch size used for training.
batch_size = 4

# Training batches are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation batches are read sequentially: shuffling brings nothing to
# evaluation, and a fixed order keeps the returned predictions aligned with
# the dataset order from run to run. No gradients are computed during
# evaluation, so a larger batch size is used.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

Setting Up Optimizer and Scheduler

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# AdamW over all model parameters, with a small fine-tuning learning rate.
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
# Number of full passes over the training set.
epochs = 10

# Linear learning-rate schedule with no warmup, spanning every optimizer step
# of the run (batches per epoch times epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=len(dataloader_train) * epochs,
    num_warmup_steps=0,
)

Defining our Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of `preds` against `labels`.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids, flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, how many examples were classified correctly.

    preds:  2-D array of per-class scores; the predicted class of each row is
            the argmax along axis 1.
    labels: array of true class ids.

    Class names are looked up through the module-level `label_dict`
    (name -> id), inverted here. Nothing is returned.
    """
    id_to_name = {class_id: name for name, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        in_class = actual == class_id
        n_total = int(np.count_nonzero(in_class))
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

Creating our Training Loop

In [None]:
import random

# Seed every random number generator used in this notebook (PyTorch CPU and
# CUDA, NumPy, Python's `random`) with the same value.
seed_val = 17
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU, and move
# the model's parameters there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_val):
    """Run the module-level `model` over `dataloader_val` without gradient tracking.

    Each batch is expected to be indexable as (input ids, attention mask,
    labels) and is moved to the module-level `device` before the forward pass.

    Returns a 3-tuple:
      - the summed batch losses divided by the number of batches,
      - the model's logits for all batches, concatenated along axis 0 (NumPy),
      - the corresponding true labels, concatenated along axis 0 (NumPy).
    """
    model.eval()

    running_loss = 0
    collected_logits = []
    collected_labels = []

    for batch in tqdm(dataloader_val):
        on_device = tuple(tensor.to(device) for tensor in batch)
        model_kwargs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        with torch.no_grad():
            outputs = model(**model_kwargs)

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        collected_logits.append(outputs[1].detach().cpu().numpy())
        collected_labels.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)
    all_logits = np.concatenate(collected_logits, axis=0)
    all_labels = np.concatenate(collected_labels, axis=0)
    return mean_loss, all_logits, all_labels


In [None]:
from tqdm.notebook import trange, tqdm

In [None]:
# Fine-tune the model for `epochs` passes over the training set, reporting the
# mean training loss, validation loss and weighted F1 after every epoch.
for epoch in tqdm(range(1, epochs+1)):

    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. It was previously
        # divided by len(batch), but `batch` is the tuple of tensors (length 3),
        # not the samples, so that ratio was not a per-sample loss.
        # NOTE(review): presumably the model's loss is already averaged over the batch — confirm.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # f-string so the epoch number is interpolated instead of printing "{epoch}".
    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss : {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.787890510878998


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.2901279953463624
F1 Score (weighted): 0.9271038522466378


Epoch 2:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.2186015558275681


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.28752636135322973
F1 Score (weighted): 0.9429186394011164


Epoch 3:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.16151185675664315


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.295975726878775
F1 Score (weighted): 0.9541223781759617


Epoch 4:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.11743752471330218


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.3168622847442748
F1 Score (weighted): 0.9478782829378576


Epoch 5:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.09469170505884413


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.3013980497974747
F1 Score (weighted): 0.954260836718603


Epoch 6:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.06799450201771091


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.32082577767869225
F1 Score (weighted): 0.9543316562120097


Epoch 7:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.05082225065339871


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.36988856074094656
F1 Score (weighted): 0.9451140981558782


Epoch 8:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.03672879717400478


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.3534217172054923
F1 Score (weighted): 0.950378872808031


Epoch 9:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.02902702393627646


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.3767620804392209
F1 Score (weighted): 0.9490935602646803


Epoch 10:   0%|          | 0/1055 [00:00<?, ?it/s]


Epoch {epoch}
Training loss : 0.025701828086101813


  0%|          | 0/24 [00:00<?, ?it/s]

Validation loss: 0.37924096743760555
F1 Score (weighted): 0.9490935602646803


Evaluating our Model

In [None]:
# Per-class correct/total counts on the validation set, using the predictions
# left over from the last epoch of the training loop.
accuracy_per_class(predictions, true_vals)

Class: Accessories
Accuracy:185/192

Class: Suits
Accuracy:27/29

Class: Jackets/Coats
Accuracy:44/47

Class: Activewear
Accuracy:71/81

Class: Underwear and Nightwear
Accuracy:39/40

Class: Shoes
Accuracy:155/157

Class: Shirts
Accuracy:82/85

Class: Pants
Accuracy:71/78

Class: Jewelry
Accuracy:14/15

Class: Sweaters
Accuracy:19/21

