In [3]:
!pip install nltk

[0m

In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=5def2711278b9df7fb8aaedde969401d131bab96c09f18616d39ad671b3798df
  Stored in directory: /root/.cache/pip/wheels/83/71/2b/40d17d21937fed496fb99145227eca8f20b4891240ff60c86f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [3]:
from datasets import load_dataset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from textblob import TextBlob

from sentence_transformers import SentenceTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from catboost import CatBoostClassifier, Pool

from tqdm.notebook import tqdm
tqdm.pandas()

In [4]:
nltk.download("stopwords")
stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
ds = load_dataset("banking77")

Downloading builder script:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading and preparing dataset banking77/default (download: 1.03 MiB, generated: 897.51 KiB, post-processed: Unknown size, total: 1.91 MiB) to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b...


Downloading data:   0%|          | 0.00/158k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10003 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3080 [00:00<?, ? examples/s]

Dataset banking77 downloaded and prepared to /root/.cache/huggingface/datasets/banking77/default/1.1.0/aec0289529599d4572d76ab00c8944cb84f88410ad0c9e7da26189d31f62a55b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [53]:
X_train = pd.DataFrame(ds['train']['text'], columns=['text'])
y_train = ds['train']['label']

X_test = pd.DataFrame(ds['test']['text'], columns=['text'])
y_test = ds['test']['label']

# Counting statistics of texts for CatBoost

In [8]:
def lemmatizations(text: str) -> str:
    return " ".join([
        token.lemma_.lower() for token in nlp(text) 
        if not token.is_stop and not token.is_digit and not token.is_punct
    ])


def count_text_stats(text: str) -> pd.Series:
    stats = {}
    doc = nlp(text)
    
    stats["chars"] = len(text)
    stats["tokens"] = len(doc)
    stats["chars_per_token"] = stats["chars"] / stats["tokens"]
    
    vowels = set("aeiouy")
    consonants = set("bcdfghjklmnpqrstvwxz")
    stats["vowels"] = 0
    stats["consonants"] = 0
    for token in doc:
        for char in str(token).lower():
            if char in vowels:
                stats["vowels"] += 1
            elif char in consonants:
                stats["consonants"] += 1
    stats["vowels_share"] = stats["vowels"] / stats["chars"]
    stats["vowels_per_token"] = stats["vowels"] / stats["tokens"]
    stats["consonants_share"] = stats["consonants"] / stats["chars"]
    stats["consonants_per_token"] = stats["consonants"] / stats["tokens"]
    
    stats["verbs"] = 0
    stats["adverbs"] = 0
    stats["nouns"] = 0
    stats["pronouns"] = 0
    stats["adjectives"] = 0
    stats["numerals"] = 0
    stats["punct"] = 0
    stats["posother"] = 0
    for token in doc:
        if token.pos_ in {"VERB", "AUX"}:
            stats["verbs"] += 1
        elif token.pos_ == "ADV":
            stats["adverbs"] += 1
        elif token.pos_ in {"NOUN", "PROPN"}:
            stats["nouns"] += 1
        elif token.pos_ == "PRON":
            stats["pronouns"] += 1
        elif token.pos_ == "ADJ":
            stats["adjectives"] += 1
        elif token.pos_ == "NUM":
            stats["numerals"] += 1
        elif token.pos_ == "PUNCT":
            stats["punct"] += 1
        else:
            stats["posother"] += 1
    stats["verbs_share"] = stats["verbs"] / stats["tokens"]
    stats["adverbs_share"] = stats["adverbs"] / stats["tokens"]
    stats["nouns_share"] = stats["nouns"] / stats["tokens"]
    stats["pronouns_share"] = stats["pronouns"] / stats["tokens"]
    stats["adjectives_share"] = stats["adjectives"] / stats["tokens"]
    stats["numerals_share"] = stats["numerals"] / stats["tokens"]
    stats["punct_share"] = stats["punct"] / stats["tokens"]
    stats["posother_share"] = stats["posother"] / stats["tokens"]
    stats["stops"] = sum([token.is_stop for token in doc])
    stats["stops_share"] = stats["stops"] / stats["tokens"]
    stats["ners"] = len(doc.ents)
    stats["polarity"] = TextBlob(text).sentiment.polarity
    
    return pd.Series(stats, dtype=np.float32)


def add_stats_features(dataset: pd.DataFrame, text_col: str) -> pd.DataFrame:
    stats = pd.DataFrame([
        count_text_stats(text) for text in tqdm(dataset[text_col], desc="Counting Texts Stats")
    ])
    stats.columns = [f"{text_col.split('_')[0]}.{scol}" for scol in stats.columns]
    return pd.concat([dataset, stats], axis=1)

In [9]:
X_train_stats = add_stats_features(X_train, 'text')
X_test_stats = add_stats_features(X_test, 'text')

Counting Texts Stats:   0%|          | 0/10003 [00:00<?, ?it/s]

Counting Texts Stats:   0%|          | 0/3080 [00:00<?, ?it/s]

In [10]:
X_train_stats["lemmas"] = X_train["text"].progress_apply(lambda x: lemmatizations(x))
X_test_stats["lemmas"] = X_test["text"].progress_apply(lambda x: lemmatizations(x))

  0%|          | 0/10003 [00:00<?, ?it/s]

  0%|          | 0/3080 [00:00<?, ?it/s]

In [11]:
X_train_stats, X_val_stats, y_train_stats, y_val_stats = train_test_split(X_train_stats, y_train, test_size=0.3, random_state=42)

# A. Train CatBoost without adding sentence embeddings

In [17]:
def calc_metrics(y_pred, y_test):
    print('Accuracy:', accuracy_score(y_pred, y_test))
    print('Weighted F1:', f1_score(y_pred, y_test, average='weighted'))
    print('Micro F1:', f1_score(y_pred, y_test, average='micro'))
    print('Macro F1:', f1_score(y_pred, y_test, average='macro'))

In [16]:
train_pool = Pool(
    X_train_stats.drop(["text"], axis=1),
    label=y_train_stats,
    text_features=["lemmas"], 
)

val_pool = Pool(
    X_val_stats.drop(["text"], axis=1),
    label=y_val_stats,
    text_features=["lemmas"], 
)

test_pool = Pool(
    X_test_stats.drop(["text"], axis=1),
    text_features=["lemmas"],   
)

In [18]:
catboost = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    early_stopping_rounds=20, 
    task_type="GPU",
)

catboost.fit(train_pool, eval_set=val_pool, verbose=50)
y_pred = catboost.predict(test_pool)
calc_metrics(y_pred, y_test)

Learning rate set to 0.127997
0:	learn: 4.1241955	test: 4.1443419	best: 4.1443419 (0)	total: 123ms	remaining: 2m 2s
50:	learn: 1.5743492	test: 1.4311431	best: 1.4311431 (50)	total: 4.99s	remaining: 1m 32s
100:	learn: 1.0424298	test: 0.9489913	best: 0.9489913 (100)	total: 11s	remaining: 1m 37s
150:	learn: 0.8719927	test: 0.8186267	best: 0.8186267 (150)	total: 15.8s	remaining: 1m 28s
200:	learn: 0.7911411	test: 0.7639872	best: 0.7639872 (200)	total: 20.5s	remaining: 1m 21s
250:	learn: 0.7264193	test: 0.7263629	best: 0.7263629 (250)	total: 25.6s	remaining: 1m 16s
300:	learn: 0.6706372	test: 0.6952283	best: 0.6952283 (300)	total: 30.3s	remaining: 1m 10s
350:	learn: 0.6359994	test: 0.6801354	best: 0.6801354 (350)	total: 35.4s	remaining: 1m 5s
400:	learn: 0.5961825	test: 0.6635057	best: 0.6635057 (400)	total: 40.8s	remaining: 1m
450:	learn: 0.5640552	test: 0.6528548	best: 0.6528501 (449)	total: 46.4s	remaining: 56.5s
500:	learn: 0.5352344	test: 0.6424881	best: 0.6424881 (500)	total: 51.6s	re

# B. Get sentence embeddings using SentenceTransformer _nq-distilbert-base-v1_

In [19]:
model_distilbert = SentenceTransformer('nq-distilbert-base-v1')

Downloading (…)a2d19/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)17900a2d19/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)900a2d19/config.json:   0%|          | 0.00/540 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)a2d19/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/554 [00:00<?, ?B/s]

Downloading (…)17900a2d19/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)00a2d19/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [22]:
train_emb_distilbert = model_distilbert.encode(X_train_stats['text'].to_list())

Batches:   0%|          | 0/219 [00:00<?, ?it/s]

In [23]:
val_emb_distilbert = model_distilbert.encode(X_val_stats['text'].to_list())

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

In [25]:
test_emb_distilbert = model_distilbert.encode(X_test_stats['text'])

Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [77]:
X_train_stats['distilbert_emb'] = [i for i in list(train_emb_distilbert)]
X_val_stats['distilbert_emb'] = [i for i in list(val_emb_distilbert)]
X_test_stats['distilbert_emb'] = [i for i in list(test_emb_distilbert)]

In [78]:
X_train_stats.head(2)

Unnamed: 0,text,text.chars,text.tokens,text.chars_per_token,text.vowels,text.consonants,text.vowels_share,text.vowels_per_token,text.consonants_share,text.consonants_per_token,...,text.adjectives_share,text.numerals_share,text.punct_share,text.posother_share,text.stops,text.stops_share,text.ners,text.polarity,lemmas,distilbert_emb
6123,"What does it mean when a transfer is ""pending""?",47.0,12.0,3.916667,13.0,23.0,0.276596,1.083333,0.489362,1.916667,...,0.0,0.0,0.25,0.166667,6.0,0.5,0.0,-0.3125,mean transfer pende,"[0.32410955, -0.3853174, 1.0511389, -0.1438839..."
9072,How much are the fees for adding funds using a...,67.0,13.0,5.153846,21.0,34.0,0.313433,1.615385,0.507463,2.615385,...,0.153846,0.0,0.076923,0.307692,7.0,0.538462,0.0,0.1,fee add fund international card,"[-0.49318787, 0.29713967, -0.06914067, -0.3041..."


### Training CatBoost

In [80]:
train_pool = Pool(
    X_train_stats.drop(["text"], axis=1),
    label=y_train_stats,
    text_features=["lemmas"], 
    embedding_features=['distilbert_emb'],   
)

val_pool = Pool(
    X_val_stats.drop(["text"], axis=1),
    label=y_val_stats,
    text_features=["lemmas"], 
    embedding_features=['distilbert_emb'],   
)

test_pool = Pool(
    X_test_stats.drop(["text"], axis=1),
    text_features=["lemmas"], 
    embedding_features=['distilbert_emb'],   
)

In [81]:
catboost = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    early_stopping_rounds=20, 
    task_type="GPU",
)

In [82]:
catboost.fit(train_pool, eval_set=val_pool, verbose=50)
y_pred = catboost.predict(test_pool)

Learning rate set to 0.127997
0:	learn: 4.1370844	test: 4.1584042	best: 4.1584042 (0)	total: 128ms	remaining: 2m 8s
50:	learn: 1.5056401	test: 1.2914619	best: 1.2914619 (50)	total: 6.15s	remaining: 1m 54s
100:	learn: 0.9368360	test: 0.7906873	best: 0.7906873 (100)	total: 13s	remaining: 1m 55s
150:	learn: 0.7311382	test: 0.6470560	best: 0.6470560 (150)	total: 18.8s	remaining: 1m 45s
200:	learn: 0.6312710	test: 0.5967614	best: 0.5967614 (200)	total: 24.7s	remaining: 1m 38s
250:	learn: 0.5689882	test: 0.5728252	best: 0.5723770 (249)	total: 30.6s	remaining: 1m 31s
300:	learn: 0.5265271	test: 0.5563918	best: 0.5562002 (299)	total: 36.7s	remaining: 1m 25s
350:	learn: 0.4812349	test: 0.5477748	best: 0.5469993 (330)	total: 43.6s	remaining: 1m 20s
bestTest = 0.5469993075
bestIteration = 330
Shrink model to first 331 iterations.


In [83]:
calc_metrics(y_pred, y_test)

Accuracy: 0.8844155844155844
Weighted F1: 0.8846075063246825
Micro F1: 0.8844155844155844
Macro F1: 0.884223662506486


# C. Get sentence embeddings using SentenceTransformer _nq-distilbert-base-v1_

In [84]:
model_miniLM = SentenceTransformer('all-MiniLM-L6-v2')

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [86]:
train_emb_miniLM = model_miniLM.encode(X_train_stats['text'].to_list())
val_emb_miniLM = model_miniLM.encode(X_val_stats['text'].to_list())
test_emb_miniLM = model_miniLM.encode(X_test_stats['text'].to_list())

Batches:   0%|          | 0/219 [00:00<?, ?it/s]

Batches:   0%|          | 0/94 [00:00<?, ?it/s]

Batches:   0%|          | 0/97 [00:00<?, ?it/s]

In [87]:
X_train_stats['miniLM_emb'] = [i for i in list(train_emb_miniLM)]
X_val_stats['miniLM_emb'] = [i for i in list(val_emb_miniLM)]
X_test_stats['miniLM_emb'] = [i for i in list(test_emb_miniLM)]

In [88]:
train_pool = Pool(
    X_train_stats.drop(["text", "distilbert_emb"], axis=1),
    label=y_train_stats,
    text_features=["lemmas"], 
    embedding_features=['miniLM_emb'],   
)

val_pool = Pool(
    X_val_stats.drop(["text", "distilbert_emb"], axis=1),
    label=y_val_stats,
    text_features=["lemmas"], 
    embedding_features=['miniLM_emb'],   
)

test_pool = Pool(
    X_test_stats.drop(["text", "distilbert_emb"], axis=1),
    text_features=["lemmas"], 
    embedding_features=['miniLM_emb'],   
)

In [89]:
catboost = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="MultiClass",
    early_stopping_rounds=20, 
    task_type="GPU",
)

In [90]:
catboost.fit(train_pool, eval_set=val_pool, verbose=50)
y_pred = catboost.predict(test_pool)

Learning rate set to 0.127997
0:	learn: 4.1010434	test: 4.1237263	best: 4.1237263 (0)	total: 164ms	remaining: 2m 43s
50:	learn: 1.3519264	test: 1.1563761	best: 1.1563761 (50)	total: 6.16s	remaining: 1m 54s
100:	learn: 0.8204609	test: 0.6460964	best: 0.6460964 (100)	total: 12.4s	remaining: 1m 50s
150:	learn: 0.6414609	test: 0.5344791	best: 0.5344791 (150)	total: 18.5s	remaining: 1m 44s
200:	learn: 0.5724199	test: 0.4991681	best: 0.4991681 (200)	total: 24.3s	remaining: 1m 36s
250:	learn: 0.5162643	test: 0.4720092	best: 0.4717839 (249)	total: 30.1s	remaining: 1m 29s
300:	learn: 0.4654509	test: 0.4531031	best: 0.4531031 (300)	total: 37.2s	remaining: 1m 26s
350:	learn: 0.4216975	test: 0.4419919	best: 0.4419919 (350)	total: 42.7s	remaining: 1m 18s
400:	learn: 0.3888459	test: 0.4305068	best: 0.4305068 (400)	total: 48.6s	remaining: 1m 12s
bestTest = 0.428711507
bestIteration = 423
Shrink model to first 424 iterations.


In [91]:
calc_metrics(y_pred, y_test)

Accuracy: 0.9149350649350649
Weighted F1: 0.915300248802941
Micro F1: 0.9149350649350649
Macro F1: 0.9145698810671887
