Tutorial from: https://towardsdatascience.com/text-classification-with-state-of-the-art-nlp-library-flair-b541d7add21f

In [1]:
# !pip install flair
!pip install --upgrade git+https://github.com/flairNLP/flair.git

Collecting git+https://github.com/flairNLP/flair.git
  Cloning https://github.com/flairNLP/flair.git to /tmp/pip-req-build-62cnu63h
  Running command git clone -q https://github.com/flairNLP/flair.git /tmp/pip-req-build-62cnu63h
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: flair
  Building wheel for flair (PEP 517) ... [?25l[?25hdone
  Created wheel for flair: filename=flair-0.4.5-cp36-none-any.whl size=148758 sha256=9b7bf6bad97647872e6d328a980d3ce157e9ca185bf2d3eb7cec95e5a579c576
  Stored in directory: /tmp/pip-ephem-wheel-cache-1pvibebp/wheels/84/82/73/d2b3b59b7be74ea05f2c6d64132efe27df52daffb47d1dc7bb
Successfully built flair
Installing collected packages: flair
  Found existing installation: flair 0.4.5
    Uninstalling flair-0.4.5:
      Successfully uninstalled flair-0.4.5
Successfully installed flair-0.4.5


In [0]:
import pandas as pd
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

# Using a Pre-Trained Classification Model

In [3]:
# Demo: classify a single sentence with Flair's pre-trained 'en-sentiment' model
# (originally noted as trained on the IMDB dataset).

from flair.data import Sentence
from flair.models import TextClassifier

# Load the pre-trained sentiment classifier by name.
classifier = TextClassifier.load('en-sentiment')

# Wrap the raw text in a Sentence and run the classifier on it.
sentence = Sentence('Flair is pretty neat!')
classifier.predict(sentence)

# The predicted labels are read back from the sentence object itself.
print('Sentence above is: ', sentence.labels)

2020-05-23 14:29:30,919 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert.pt not found in cache, downloading to /tmp/tmp7mioqugz


100%|██████████| 266170364/266170364 [00:19<00:00, 13665621.54B/s]

2020-05-23 14:29:50,757 copying /tmp/tmp7mioqugz to cache at /root/.flair/models/sentiment-en-mix-distillbert.pt





2020-05-23 14:29:51,490 removing temp file /tmp/tmp7mioqugz
2020-05-23 14:29:52,068 loading file /root/.flair/models/sentiment-en-mix-distillbert.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Sentence above is:  [POSITIVE (0.9997)]


# Training a Custom Text Classifier

## Load and Pre-process the data

In [0]:
# Load spam.csv, keep the label (v1) and text (v2) columns, shuffle, and write
# an 80/10/10 train/test/dev split as FastText-style files.

SPLIT_SEED = 42  # fixed seed so the shuffle, and therefore the split, is reproducible


def write_fasttext(frame, path):
    """Write `frame` to `path` with one tab-separated "<label> <text>" line per row.

    Lines are written by hand instead of with `DataFrame.to_csv`, because the
    csv writer wraps any text containing a double quote in extra quote
    characters, and those would end up inside the document text.
    Whitespace inside each text is collapsed to single spaces so an embedded
    tab or newline cannot split one document across several lines.

    Parameters:
        frame: DataFrame with a 'label' column and a 'text' column.
        path: destination file path; an existing file is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        for label, text in zip(frame['label'], frame['text']):
            handle.write(label + '\t' + ' '.join(str(text).split()) + '\n')


data = (
    pd.read_csv("./spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={"v1": "label", "v2": "text"})
    # Rows without a label or a text cannot be used for training.
    .dropna()
    # De-duplicate on (label, text) only, so the same message cannot land in
    # more than one split because of differences in the unused columns.
    .drop_duplicates()
    .sample(frac=1, random_state=SPLIT_SEED)
    # Prefix every label with '__label__', as expected by the FastText format.
    .assign(label=lambda frame: '__label__' + frame['label'].astype(str))
)

# Split boundaries: first 80% train, next 10% test, final 10% dev.
train_end = int(len(data) * 0.8)
test_end = int(len(data) * 0.9)

write_fasttext(data.iloc[:train_end], 'train.csv')
write_fasttext(data.iloc[train_end:test_end], 'test.csv')
write_fasttext(data.iloc[test_end:], 'dev.csv')

## Train the model

In [6]:
# The split files are in FastText format: every line is one text document,
# preceded by one or more labels, each starting with the prefix __label__.

from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# Directory that holds train.csv, dev.csv and test.csv
data_folder = './'

# Build a single corpus object from the three split files
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

2020-05-23 14:35:32,274 Reading data from .
2020-05-23 14:35:32,276 Train: train.csv
2020-05-23 14:35:32,277 Dev: dev.csv
2020-05-23 14:35:32,278 Test: test.csv


In [7]:
from pathlib import Path

from flair import datasets
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# `corpus` is the ClassificationCorpus built in the previous cell. An earlier
# variant of this notebook loaded it with
# NLPTaskDataFetcher.load_classification_corpus(Path('./'), ...) instead.

# Word-level embeddings: GloVe stacked with the fast forward and backward Flair models.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# Document-level embedding computed over the stacked word embeddings.
# NOTE(review): DocumentLSTMEmbeddings may be deprecated in newer Flair releases - confirm.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier over the labels found in the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Train for 10 epochs, using the current directory as the trainer's base path.
# Kept as the last expression so the value returned by train() is displayed.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

2020-05-23 14:35:32,840 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpt9hftob_


100%|██████████| 160000128/160000128 [00:07<00:00, 21389040.83B/s]

2020-05-23 14:35:40,750 copying /tmp/tmpt9hftob_ to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2020-05-23 14:35:41,247 removing temp file /tmp/tmpt9hftob_
2020-05-23 14:35:42,578 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp9qe4c8q3


100%|██████████| 21494764/21494764 [00:01<00:00, 13444180.89B/s]


2020-05-23 14:35:44,599 copying /tmp/tmp9qe4c8q3 to cache at /root/.flair/embeddings/glove.gensim
2020-05-23 14:35:44,624 removing temp file /tmp/tmp9qe4c8q3


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2020-05-23 14:35:46,771 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpehcekv1q


100%|██████████| 19689779/19689779 [00:01<00:00, 12910411.31B/s]

2020-05-23 14:35:48,839 copying /tmp/tmpehcekv1q to cache at /root/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2020-05-23 14:35:48,865 removing temp file /tmp/tmpehcekv1q





2020-05-23 14:35:49,658 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpxo8zwuwu


100%|██████████| 19689779/19689779 [00:01<00:00, 12972707.70B/s]

2020-05-23 14:35:51,701 copying /tmp/tmpxo8zwuwu to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2020-05-23 14:35:51,729 removing temp file /tmp/tmpxo8zwuwu





2020-05-23 14:35:52,097 Computing label dictionary. Progress:


  if sys.path[0] == '':
100%|██████████| 4635/4635 [00:03<00:00, 1352.57it/s]

2020-05-23 14:35:55,699 [b'spam', b'ham']
2020-05-23 14:35:55,722 ----------------------------------------------------------------------------------------------------
2020-05-23 14:35:55,723 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=




2020-05-23 14:36:52,042 epoch 1 - iter 12/129 - loss 0.32657409 - samples/sec: 6.86
2020-05-23 14:37:42,181 epoch 1 - iter 24/129 - loss 0.28789113 - samples/sec: 7.73
2020-05-23 14:38:29,614 epoch 1 - iter 36/129 - loss 0.26211145 - samples/sec: 8.10
2020-05-23 14:39:20,707 epoch 1 - iter 48/129 - loss 0.23121870 - samples/sec: 7.52
2020-05-23 14:40:10,757 epoch 1 - iter 60/129 - loss 0.21610988 - samples/sec: 7.68
2020-05-23 14:41:04,395 epoch 1 - iter 72/129 - loss 0.20536498 - samples/sec: 7.17
2020-05-23 14:42:01,033 epoch 1 - iter 84/129 - loss 0.19021510 - samples/sec: 6.85
2020-05-23 14:42:56,510 epoch 1 - iter 96/129 - loss 0.18295101 - samples/sec: 6.92
2020-05-23 14:43:40,902 epoch 1 - iter 108/129 - loss 0.17265417 - samples/sec: 8.66
2020-05-23 14:44:30,018 epoch 1 - iter 120/129 - loss 0.16223200 - samples/sec: 7.82
2020-05-23 14:45:02,216 ----------------------------------------------------------------------------------------------------
2020-05-23 14:45:02,218 EPOCH 1 d

{'dev_loss_history': [0.06923426687717438,
  0.06292398273944855,
  0.06887958943843842,
  0.06310033053159714,
  0.06192026287317276,
  0.09851007908582687,
  0.06243825703859329,
  0.06359170377254486,
  0.06407099962234497,
  0.06524772942066193],
 'dev_score_history': [0.9805825242718447,
  0.9805825242718447,
  0.9805825242718447,
  0.9825242718446602,
  0.9825242718446602,
  0.9728155339805825,
  0.9805825242718447,
  0.9786407766990292,
  0.9786407766990292,
  0.9786407766990292],
 'test_score': 0.9941634241245136,
 'train_loss_history': [0.15621276603751752,
  0.08184008190206202,
  0.06434077575492363,
  0.046985920841711784,
  0.06609469038243009,
  0.045215376169969464,
  0.044740884456523626,
  0.04659193183986701,
  0.035643750678724426,
  0.027727153767243228]}