In [1]:
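# One-time environment setup, left commented out so "Run All" does not reinstall.
# Prefer %pip over !pip so the packages land in the running kernel's environment.
# %pip install torch torchvision torchaudio
# %pip install transformers datasets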

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting torchaudio
  Downloading torchaudio-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
     |████████████████████████████████| 2.9 MB 15.4 MB/s            
  Downloading torchaudio-0.10.1-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
     |████████████████████████████████| 2.9 MB 91.8 MB/s            
[?25h  Downloading torchaudio-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
     |████████████████████████████████| 2.9 MB 92.6 MB/s            
[?25h  Downloading torchaudio-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
     |████████████████████████████████| 1.9 MB 107.1 MB/s            
[?25h  Downloading torchaudio-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
     |████████████████████████████████| 1.9 MB 112.5 MB/s            
[?25h  Downloading torchaudio-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
     |████████████████████████████████| 1.9 MB 30.1 MB/s            
[?25hInstall

In [2]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

In [3]:
# Read both CSV splits up front: `train` carries the labelled sentences used
# for fine-tuning, `test` is only consumed by the final prediction pass.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [4]:
# Preview the first rows of the training data (columns: sentence, label).
train.head()

Unnamed: 0,sentence,label
0,You guys provide EMI option?,0
1,Do you offer Zero Percent EMI payment options?,0
2,0% EMI.,0
3,EMI,0
4,I want in installment,0


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation. The split is stratified on
# the label column and uses a fixed random_state so it is repeatable. Each part
# is renumbered from zero because the split keeps the original row labels.
train_df, eval_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train,
        test_size=0.2,
        random_state=42,
        stratify=train['label'].tolist(),
    )
)
train_df.head()

Unnamed: 0,sentence,label
0,Pillows,19
1,Down payments,0
2,How can I replace the mattress.,17
3,Tell me about the latest offers,20
4,My order Number,16


In [6]:
# Preview the first rows of the held-out evaluation split.
eval_df.head()

Unnamed: 0,sentence,label
0,King Size,8
1,Distributors/Retailers/Showrooms,11
2,Can I get delivery on this pincode,10
3,Do you have any distributors in Mumbai city,11
4,Paisa Finance,0


In [7]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face `Dataset` so it can be tokenized
# with `.map` and passed to the Trainer.
softmattress_train, softmattress_eval = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)
softmattress_train

Dataset({
    features: ['sentence', 'label'],
    num_rows: 262
})

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer belonging to the `bert-base-uncased` checkpoint.
# Padding and truncation are requested per call (see `preprocess_function`),
# and `num_labels` is a model setting, so none of them are passed here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"sentence"`` entry. When called through
            ``Dataset.map(..., batched=True)`` that entry is a list of strings.

    Returns:
        The tokenizer output for those sentences, requested with
        ``padding=True`` and ``truncation=True``.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, padding=True, truncation=True)
    return encoded

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
# Tokenize both splits in batches.
tokenized_softmattress_train = softmattress_train.map(preprocess_function, batched=True)
tokenized_softmattress_eval = softmattress_eval.map(preprocess_function, batched=True)

# Return only these columns, as torch tensors, when the datasets are indexed.
columns_to_return = ['input_ids', 'label', 'attention_mask']
for _tokenized_split in (tokenized_softmattress_train, tokenized_softmattress_eval):
    _tokenized_split.set_format(type='torch', columns=columns_to_return)

# Spot-check one formatted training example.
tokenized_softmattress_train[2]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'label': tensor(17),
 'input_ids': tensor([  101,  2129,  2064,  1045,  5672,  1996, 13342,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0])}

In [10]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer below; it uses the tokenizer to pad the
# examples of a batch to a common length when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [11]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder plus a sequence-classification head sized for 21
# classes. The load log reports that some weights are not initialized from the
# checkpoint, so the model has to be fine-tuned before its predictions are useful.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=21)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    """Turn raw model outputs into accuracy and weighted F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is indexed as
            (example, class) — the arg-max is taken along axis 1 — and
            ``labels`` holds the true class ids.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1_score"``.
    """
    scores, labels = p
    predicted_classes = np.argmax(scores, axis=1)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted_classes),
        "f1_score": f1_score(labels, predicted_classes, average='weighted'),
    }

In [13]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration.
# The run is short: 262 training rows at batch size 16 for 25 epochs gives 425
# optimizer steps. That is below the Trainer's default logging interval of 500
# steps, so without an explicit schedule no training loss is ever reported.
# Logging once per epoch makes the loss curve visible without changing training.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=25,
    weight_decay=0.01,
    logging_strategy="epoch",
)

# The Trainer ties together the model, the tokenized splits, the padding
# collator and the metric function used by `trainer.evaluate()`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_softmattress_train,
    eval_dataset=tokenized_softmattress_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence.
***** Running training *****
  Num examples = 262
  Num Epochs = 25
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 425


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=425, training_loss=1.161474609375, metrics={'train_runtime': 65.3043, 'train_samples_per_second': 100.3, 'train_steps_per_second': 6.508, 'total_flos': 100996371639000.0, 'train_loss': 1.161474609375, 'epoch': 25.0})

In [14]:
# Score the fine-tuned model on the held-out evaluation split; the returned
# dict includes the loss plus the accuracy / F1 from `compute_metrics`.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 16


{'eval_loss': 0.48421910405158997,
 'eval_accuracy': 0.9393939393939394,
 'eval_f1_score': 0.9361231361231361,
 'eval_runtime': 0.3833,
 'eval_samples_per_second': 172.196,
 'eval_steps_per_second': 13.045,
 'epoch': 25.0}

In [15]:
import torch

# Tokenize the test sentences with the same preprocessing as the training data.
softmattress_test = Dataset.from_pandas(test)
tokenized_softmattress_test = softmattress_test.map(preprocess_function, batched=True)

# Only the model inputs are returned; the test `label` column is not, so
# `predict` runs without labels.
columns_to_return = ['input_ids', 'attention_mask']
tokenized_softmattress_test.set_format(type='torch', columns=columns_to_return)

# Softmax over the class axis turns the raw model outputs into per-class scores.
preds = trainer.predict(test_dataset=tokenized_softmattress_test)
probs = torch.from_numpy(preds.predictions).softmax(1)
predictions = probs.numpy()
top_predicted = predictions.argmax(axis=1)

# Work on a copy so the raw `test` frame is not mutated and this cell can be
# re-run safely. The score of the arg-max class is simply the row maximum.
out_df = test.copy()
out_df['predicted_node'] = top_predicted
out_df['predicted_node_score'] = predictions.max(axis=1)
out_df

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence.
***** Running Prediction *****
  Num examples = 397
  Batch size = 16


Unnamed: 0,sentence,label,predicted_node,predicted_node_score
0,There are only 2 models,21,8,0.159618
1,Single,21,0,0.172722
2,What's difference between ergo and ortho,4,4,0.279291
3,Return order,17,16,0.473220
4,Hai not recieved my product,15,17,0.573305
...,...,...,...,...
392,Trail option are there,6,11,0.716519
393,I want to buy SOF ergo mattress,3,3,0.603088
394,Pillow extra shoft and support difference?,19,7,0.298042
395,Extra support and extra shift difference?,21,7,0.213597


In [21]:
# Count the test rows handled correctly under a confidence cut-off:
#   * score above the threshold -> the predicted class must equal the label;
#   * otherwise the prediction is rejected, which only counts as correct when
#     the label is 21 (a value outside the 21 classes the model was built with).
threshold = 0.25

# Columns are addressed by name: positional integer lookups on a row Series
# (e.g. `rows[3]`) are deprecated in pandas and silently change meaning if the
# column order changes.
# NOTE(review): positions 1/2/3 are taken to be label / predicted_node /
# predicted_node_score, i.e. `test.csv` contributes exactly the columns
# sentence,label — confirm.
true_label = out_df['label']
is_confident = out_df['predicted_node_score'] > threshold
accepted_hit = is_confident & (true_label == out_df['predicted_node'])
rejected_hit = ~is_confident & (true_label == 21)
count = int((accepted_hit | rejected_hit).sum())

print(count)

229
