In [1]:
!pip install flair
# !pip install --upgrade git+https://github.com/flairNLP/flair.git



In [0]:
import pandas as pd
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

## Using a Pre-Trained Classification Model

In [5]:
# Sentiment analysis with Flair's ready-made English sentiment model ('en-sentiment').
# The model file is downloaded and cached the first time it is loaded.

from flair.data import Sentence
from flair.models import TextClassifier

# Build the input first, then load the classifier; the two steps are independent.
sentence = Sentence('Flair is pretty neat!')
classifier = TextClassifier.load('en-sentiment')

# predict() stores the predicted label(s) on the sentence object itself
classifier.predict(sentence)
print('Sentence above is: ', sentence.labels)

2020-05-22 19:18:48,874 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert.pt not found in cache, downloading to /tmp/tmpxavbeoff


100%|██████████| 266170364/266170364 [00:11<00:00, 22756707.52B/s]

2020-05-22 19:19:00,934 copying /tmp/tmpxavbeoff to cache at /root/.flair/models/sentiment-en-mix-distillbert.pt





2020-05-22 19:19:01,669 removing temp file /tmp/tmpxavbeoff
2020-05-22 19:19:02,131 loading file /root/.flair/models/sentiment-en-mix-distillbert.pt


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Sentence above is:  [POSITIVE (0.9997)]


# Training a Custom Text Classifier

## Load and Pre-process the data

In [0]:
# Fixed seed for the shuffle so that the train/test/dev split (and every model
# trained on it) is identical on each "Restart & Run All".
SEED = 42

# Shuffle the raw rows and drop exact duplicate rows.
data = (
    pd.read_csv("./spam.csv", encoding='latin-1')
    .sample(frac=1, random_state=SEED)
    .drop_duplicates()
)
# Keep only the label column (v1) and the message column (v2), under descriptive names.
data = data[['v1', 'v2']].rename(columns={"v1": "label", "v2": "text"})

# FastText-style classification files mark every label with the __label__ prefix.
data['label'] = '__label__' + data['label'].astype(str)

# Row boundaries of the 80% / 10% / 10% split, computed once.
n_rows = len(data)
train_end = int(n_rows * 0.8)
test_end = int(n_rows * 0.9)

# Tab-separated, no index column, no header row: "<label>\t<text>" per line.
csv_kwargs = {'sep': '\t', 'index': False, 'header': False}
data.iloc[:train_end].to_csv('train.csv', **csv_kwargs)
data.iloc[train_end:test_end].to_csv('test.csv', **csv_kwargs)
data.iloc[test_end:].to_csv('dev.csv', **csv_kwargs)

## Train the model

In [12]:
# The train/test/dev files follow the FastText convention: one text document per line,
# with its label(s) at the start of the line, each marked by the __label__ prefix.

from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# directory that holds train.csv, dev.csv and test.csv
data_folder = './'

# read the three splits into a single corpus object
corpus: Corpus = ClassificationCorpus(
    data_folder,
    train_file='train.csv',
    dev_file='dev.csv',
    test_file='test.csv',
)

2020-05-22 19:29:23,220 Reading data from .
2020-05-22 19:29:23,221 Train: train.csv
2020-05-22 19:29:23,221 Dev: dev.csv
2020-05-22 19:29:23,222 Test: test.csv


In [0]:
from pathlib import Path

from flair import datasets
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

# Word-level features: GloVe vectors combined with the "fast" forward and
# backward Flair language-model embeddings.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# Document-level encoder on top of the word embeddings: words are reprojected
# to 256 dimensions and the recurrent hidden state has 512 units.
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier whose label set is derived from the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Train for at most 10 epochs, using the current directory as the training base path.
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


2020-05-22 19:29:32,975 Computing label dictionary. Progress:


  if sys.path[0] == '':
100%|██████████| 4634/4634 [00:03<00:00, 1373.55it/s]

2020-05-22 19:29:36,520 [b'ham', b'spam']
2020-05-22 19:29:36,539 ----------------------------------------------------------------------------------------------------
2020-05-22 19:29:36,540 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=




2020-05-22 19:29:36,554  - anneal_factor: "0.5"
2020-05-22 19:29:36,555  - max_epochs: "10"
2020-05-22 19:29:36,555  - shuffle: "True"
2020-05-22 19:29:36,556  - train_with_dev: "False"
2020-05-22 19:29:36,557  - batch_growth_annealing: "False"
2020-05-22 19:29:36,558 ----------------------------------------------------------------------------------------------------
2020-05-22 19:29:36,559 Model training base path: "."
2020-05-22 19:29:36,559 ----------------------------------------------------------------------------------------------------
2020-05-22 19:29:36,560 Device: cpu
2020-05-22 19:29:36,561 ----------------------------------------------------------------------------------------------------
2020-05-22 19:29:36,562 Embeddings storage mode: cpu
2020-05-22 19:29:36,563 ----------------------------------------------------------------------------------------------------




2020-05-22 19:30:18,456 epoch 1 - iter 12/129 - loss 0.36587998 - samples/sec: 9.21
2020-05-22 19:31:05,270 epoch 1 - iter 24/129 - loss 0.31755644 - samples/sec: 8.28
2020-05-22 19:31:54,322 epoch 1 - iter 36/129 - loss 0.27401958 - samples/sec: 7.84
2020-05-22 19:32:42,719 epoch 1 - iter 48/129 - loss 0.24629015 - samples/sec: 7.94


2020-05-22 19:28:27,125 Reading data from .
2020-05-22 19:28:27,126 Train: train.csv
2020-05-22 19:28:27,128 Dev: dev.csv
2020-05-22 19:28:27,130 Test: test.csv
