In [1]:
!pip install flair





In [2]:
from flair.models import TextClassifier
from flair.data import Sentence
# Load Flair's pre-trained 'en-sentiment' text classifier.
classifier = TextClassifier.load('en-sentiment')

# Wrap the raw text in a Sentence and run the classifier on it.
sentence = Sentence('Flair is pretty neat!')
classifier.predict(sentence)

# Show the labels now attached to the sentence.
predicted_labels = sentence.labels
print('Sentence above is: ', predicted_labels)

2020-06-08 21:42:48,322 loading file /Users/patsnap/.flair/models/imdb-v0.4.pt
Sentence above is:  [POSITIVE (0.6636101007461548)]


Flair’s classification dataset format is based on Facebook’s FastText format: each line holds a label prefixed with `__label__`, followed by a tab and the text.

        __label__<class_1>\t<text>

In [3]:
import pandas as pd
# NOTE(review): absolute, machine-specific path -- point this at your own copy of spam.csv.
SPAM_CSV = "/Users/patsnap/Desktop/Neo4J_and_other_codes/text_classification/spam.csv"
SEED = 42  # fixed seed so the shuffle (and therefore the train/test/dev split) is reproducible

# Read the raw file, shuffle all rows and drop exact duplicate rows.
data = (
    pd.read_csv(SPAM_CSV, encoding='latin-1')
    .sample(frac=1, random_state=SEED)
    .drop_duplicates()
)
print(data.head(3))

# Keep only the label/text columns and give them descriptive names.
data = data[['v1', 'v2']].rename(columns={"v1": "label", "v2": "text"})
print(data.head(3))

# Prefix every label with '__label__', as the FastText-style format expects.
data['label'] = '__label__' + data['label'].astype(str)
print(data.head(3))

# Split boundaries, computed once: first 80% -> train, next 10% -> test, last 10% -> dev.
train_end = int(len(data) * 0.8)
test_end = int(len(data) * 0.9)

# Tab-separated, without header or index: one "<label>\t<text>" record per line.
csv_options = dict(sep='\t', index=False, header=False)
data.iloc[:train_end].to_csv('train.csv', **csv_options)
data.iloc[train_end:test_end].to_csv('test.csv', **csv_options)
data.iloc[test_end:].to_csv('dev.csv', **csv_options)

       v1                                                 v2 Unnamed: 2  \
5176  ham  Company is very good.environment is terrific a...        NaN   
1250  ham  Ummmmmaah Many many happy returns of d day my ...        NaN   
748   ham  Is there a reason we've not spoken this year? ...        NaN   

     Unnamed: 3 Unnamed: 4  
5176        NaN        NaN  
1250        NaN        NaN  
748         NaN        NaN  
     label                                               text
5176   ham  Company is very good.environment is terrific a...
1250   ham  Ummmmmaah Many many happy returns of d day my ...
748    ham  Is there a reason we've not spoken this year? ...
             label                                               text
5176  __label__ham  Company is very good.environment is terrific a...
1250  __label__ham  Ummmmmaah Many many happy returns of d day my ...
748   __label__ham  Is there a reason we've not spoken this year? ...


In [4]:
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path


In [5]:
# Build the corpus from the train/dev/test files in the working directory.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)

# Word-level embeddings: GloVe plus the "fast" forward and backward Flair embeddings.
word_embeddings = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast'),
]

# Document-level embedding built on top of the word embeddings
# (hidden size 512, words re-projected to 256 dimensions).
document_embeddings = DocumentLSTMEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# Single-label classifier over the label dictionary derived from the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dictionary,
    multi_label=False,
)

# Train for at most 10 epochs, using './' as the output location
# (a later cell loads './best-model.pt' from there).
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)

2020-06-08 21:49:56,667 Reading data from .
2020-06-08 21:49:56,670 Train: train.csv
2020-06-08 21:49:56,672 Dev: dev.csv
2020-06-08 21:49:56,673 Test: test.csv


  """Entry point for launching an IPython kernel.


2020-06-08 21:50:00,077 Computing label dictionary. Progress:


  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 4135/4135 [00:00<00:00, 230710.71it/s]

2020-06-08 21:50:00,121 [b'ham', b'spam']
2020-06-08 21:50:00,175 ----------------------------------------------------------------------------------------------------
2020-06-08 21:50:00,176 Model: "TextClassifier(
  (document_embeddings): DocumentLSTMEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.25, inplace=False)
          (encoder): Embedding(275, 100)
          (rnn): LSTM(100, 1024)
          (decoder): Linear(in_features=1024, out_features=275, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=2148, out_features=




2020-06-08 21:50:00,281 ----------------------------------------------------------------------------------------------------
2020-06-08 21:50:47,472 epoch 1 - iter 13/130 - loss 0.37714449 - samples/sec: 8.82
2020-06-08 21:51:20,023 epoch 1 - iter 26/130 - loss 0.29513310 - samples/sec: 12.79
2020-06-08 21:51:51,910 epoch 1 - iter 39/130 - loss 0.26151532 - samples/sec: 13.06
2020-06-08 21:52:30,293 epoch 1 - iter 52/130 - loss 0.23365807 - samples/sec: 10.85
2020-06-08 21:53:17,562 epoch 1 - iter 65/130 - loss 0.21558568 - samples/sec: 8.81
2020-06-08 21:53:54,579 epoch 1 - iter 78/130 - loss 0.19767844 - samples/sec: 11.25
2020-06-08 21:54:36,469 epoch 1 - iter 91/130 - loss 0.18322272 - samples/sec: 9.94
2020-06-08 21:55:12,339 epoch 1 - iter 104/130 - loss 0.17251134 - samples/sec: 11.61
2020-06-08 21:55:48,757 epoch 1 - iter 117/130 - loss 0.16295676 - samples/sec: 11.43
2020-06-08 21:56:34,481 epoch 1 - iter 130/130 - loss 0.15653595 - samples/sec: 9.10
2020-06-08 21:56:34,515 --

2020-06-08 22:02:34,920 epoch 7 - iter 65/130 - loss 0.05686576 - samples/sec: 77.82
2020-06-08 22:02:40,117 epoch 7 - iter 78/130 - loss 0.05415901 - samples/sec: 80.45
2020-06-08 22:02:45,077 epoch 7 - iter 91/130 - loss 0.05013913 - samples/sec: 84.25
2020-06-08 22:02:49,131 epoch 7 - iter 104/130 - loss 0.04935752 - samples/sec: 103.22
2020-06-08 22:02:53,426 epoch 7 - iter 117/130 - loss 0.04655833 - samples/sec: 97.39
2020-06-08 22:02:58,240 epoch 7 - iter 130/130 - loss 0.04683353 - samples/sec: 86.83
2020-06-08 22:02:58,268 ----------------------------------------------------------------------------------------------------
2020-06-08 22:02:58,269 EPOCH 7 done: loss 0.0468 - lr 0.1000
2020-06-08 22:02:59,936 DEV : loss 0.014583970420062542 - score 0.9942
2020-06-08 22:02:59,974 BAD EPOCHS (no improvement): 3
2020-06-08 22:02:59,976 ----------------------------------------------------------------------------------------------------
2020-06-08 22:03:05,048 epoch 8 - iter 13/130 - 

{'test_score': 0.9807,
 'dev_score_history': [0.9787,
  0.9942,
  0.9942,
  0.9961,
  0.9923,
  0.9942,
  0.9942,
  0.9942,
  0.9845,
  0.9923],
 'train_loss_history': [0.15653595264571218,
  0.07622880785940932,
  0.0681796175271022,
  0.06285607342679914,
  0.05896615899848537,
  0.04831122554695377,
  0.04683353302467507,
  0.043395707786476126,
  0.032521696647521685,
  0.029768515147644885],
 'dev_loss_history': [tensor(0.0440),
  tensor(0.0218),
  tensor(0.0198),
  tensor(0.0160),
  tensor(0.0192),
  tensor(0.0137),
  tensor(0.0146),
  tensor(0.0194),
  tensor(0.0438),
  tensor(0.0276)]}

In [8]:
from flair.models import TextClassifier
from flair.data import Sentence
# Load the classifier from the './best-model.pt' checkpoint.
classifier = TextClassifier.load('./best-model.pt')

# Run the classifier on a sample message and show the resulting labels.
sentence = Sentence('Hi. Yes mum, I will...')
classifier.predict(sentence)
predicted_labels = sentence.labels
print(predicted_labels)

2020-06-09 08:35:48,208 loading file ./best-model.pt
[ham (0.999051034450531)]


#add discrete parameters
search_space.add(Parameter.PARAMNAME, hp.choice, options=[1, 2, ..])
#add continuous parameters
search_space.add(Parameter.PARAMNAME, hp.uniform, low=0.0, high=0.5)

# Parameters list
The full list of tunable parameters is defined in:
https://github.com/flairNLP/flair/blob/master/flair/hyperparameter/parameter.py

In [None]:
#optimising hyperparameters
from flair.hyperparameter.param_selection import TextClassifierParamSelector, OptimizationValue
from hyperopt import hp
from flair.hyperparameter.param_selection import SearchSpace, Parameter
from flair.embeddings import WordEmbeddings, FlairEmbeddings
from flair.data_fetcher import NLPTaskDataFetcher
from pathlib import Path
# Load the train/dev/test files from the working directory.
corpus = NLPTaskDataFetcher.load_classification_corpus(
    Path('./'),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)

# The options list for the embeddings parameter holds a single candidate stack.
word_embeddings = [
    [
        WordEmbeddings('glove'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
    ],
]

# Each entry is (parameter, hyperopt sampler, sampler arguments); registered in this order.
search_dimensions = [
    (Parameter.EMBEDDINGS, hp.choice, {'options': word_embeddings}),
    (Parameter.HIDDEN_SIZE, hp.choice, {'options': [32, 64, 128, 256, 512]}),
    (Parameter.RNN_LAYERS, hp.choice, {'options': [1, 2]}),
    (Parameter.DROPOUT, hp.uniform, {'low': 0.0, 'high': 0.5}),
    (Parameter.LEARNING_RATE, hp.choice, {'options': [0.05, 0.1, 0.15, 0.2]}),
    (Parameter.MINI_BATCH_SIZE, hp.choice, {'options': [16, 32, 64]}),
]
search_space = SearchSpace()
for parameter, sampler, sampler_kwargs in search_dimensions:
    search_space.add(parameter, sampler, **sampler_kwargs)

# Selector optimising the dev score; results are stored under 'resources/results'.
param_selector = TextClassifierParamSelector(
    corpus=corpus,
    multi_label=False,
    base_path='resources/results',
    document_embedding_type='lstm',
    max_epochs=10,
    training_runs=1,
    optimization_value=OptimizationValue.DEV_SCORE,
)

# NOTE: the recorded run reports roughly 5000 s per trial, so 100 evaluations take days.
param_selector.optimize(search_space, max_evals=100)

2020-06-09 08:40:33,287 Reading data from .
2020-06-09 08:40:33,288 Train: train.csv
2020-06-09 08:40:33,289 Dev: dev.csv
2020-06-09 08:40:33,289 Test: test.csv


  import sys


2020-06-09 08:40:37,259 Computing label dictionary. Progress:


100%|██████████| 4135/4135 [00:00<00:00, 261839.26it/s]

2020-06-09 08:40:37,291 [b'ham', b'spam']
  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]2020-06-09 08:40:37,326 ----------------------------------------------------------------------------------------------------
2020-06-09 08:40:37,327 Evaluation run: 1
2020-06-09 08:40:37,327 Evaluating parameter combination:
2020-06-09 08:40:37,328 	dropout: 0.42660427747990465
2020-06-09 08:40:37,330 	embeddings: /Users/patsnap/.flair/embeddings/glove.gensim,/Users/patsnap/.flair/embeddings/news-forward-0.4.1.pt,/Users/patsnap/.flair/embeddings/news-backward-0.4.1.pt
2020-06-09 08:40:37,332 	hidden_size: 256
2020-06-09 08:40:37,333 	learning_rate: 0.15
2020-06-09 08:40:37,335 	mini_batch_size: 16
2020-06-09 08:40:37,340 	rnn_layers: 2
2020-06-09 08:40:37,342 ----------------------------------------------------------------------------------------------------
2020-06-09 08:40:37,413 ----------------------------------------------------------------------------------------------------
2020-06-0




2020-06-09 08:40:37,650 ----------------------------------------------------------------------------------------------------
2020-06-09 08:40:37,651 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
      (list_embedding_1): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
      (list_embedding_2): FlairEmbeddings(
        (lm): LanguageModel(
          (drop): Dropout(p=0.05, inplace=False)
          (encoder): Embedding(300, 100)
          (rnn): LSTM(100, 2048)
          (decoder): Linear(in_features=2048, out_features=300, bias=True)
        )
      )
    )
    (word_reprojection_map): Linear(in_features=4196, out_features=4196, bias=True)
    (rnn): GRU(4196, 256, 

2020-06-09 09:36:49,318 epoch 5 - iter 150/259 - loss 0.02544574 - samples/sec: 16.95
2020-06-09 09:37:12,624 epoch 5 - iter 175/259 - loss 0.02751743 - samples/sec: 17.16
2020-06-09 09:37:36,886 epoch 5 - iter 200/259 - loss 0.02898525 - samples/sec: 16.49
2020-06-09 09:37:59,150 epoch 5 - iter 225/259 - loss 0.02725408 - samples/sec: 17.97
2020-06-09 09:38:23,295 epoch 5 - iter 250/259 - loss 0.03028203 - samples/sec: 16.57
2020-06-09 09:38:31,066 ----------------------------------------------------------------------------------------------------
2020-06-09 09:38:31,067 EPOCH 5 done: loss 0.0313 - lr 0.1500
2020-06-09 09:38:38,461 DEV : loss 0.026333022862672806 - score 0.9903
Epoch     5: reducing learning rate of group 0 to 7.5000e-02.
  0%|          | 0/100 [58:01<?, ?trial/s, best loss=?]2020-06-09 09:38:38,515 BAD EPOCHS (no improvement): 4
2020-06-09 09:38:38,517 ----------------------------------------------------------------------------------------------------
2020-06-09 09:3

2020-06-09 10:07:01,964 Done evaluating parameter combination:
2020-06-09 10:07:01,966 	dropout: 0.42660427747990465
2020-06-09 10:07:01,968 	embeddings: 0-/Users/patsnap/.flair/embeddings/glove.gensim,1-/Users/patsnap/.flair/embeddings/news-forward-0.4.1.pt,2-/Users/patsnap/.flair/embeddings/news-backward-0.4.1.pt
2020-06-09 10:07:01,969 	hidden_size: 256
2020-06-09 10:07:01,970 	learning_rate: 0.15
2020-06-09 10:07:01,971 	mini_batch_size: 16
2020-06-09 10:07:01,972 	rnn_layers: 2
2020-06-09 10:07:01,973 score: 0.009633333333333346
2020-06-09 10:07:01,974 variance: 7.475555555555338e-06
2020-06-09 10:07:01,975 test_score: 0.9826

2020-06-09 10:07:01,976 ----------------------------------------------------------------------------------------------------
  1%|          | 1/100 [1:26:24<142:34:42, 5184.67s/trial, best loss: 0.009633333333333346]2020-06-09 10:07:02,044 ----------------------------------------------------------------------------------------------------
2020-06-09 10:07:02

2020-06-09 10:52:50,080 ----------------------------------------------------------------------------------------------------
2020-06-09 10:53:16,035 epoch 4 - iter 25/259 - loss 0.03537118 - samples/sec: 15.41
2020-06-09 10:53:41,244 epoch 4 - iter 50/259 - loss 0.04567709 - samples/sec: 15.87
2020-06-09 10:54:06,370 epoch 4 - iter 75/259 - loss 0.04161712 - samples/sec: 15.92
2020-06-09 10:54:31,670 epoch 4 - iter 100/259 - loss 0.03571075 - samples/sec: 15.81
2020-06-09 10:54:56,819 epoch 4 - iter 125/259 - loss 0.03149224 - samples/sec: 15.91
2020-06-09 10:55:24,026 epoch 4 - iter 150/259 - loss 0.03560795 - samples/sec: 14.70
2020-06-09 10:55:49,201 epoch 4 - iter 175/259 - loss 0.03686586 - samples/sec: 15.89
2020-06-09 10:56:15,110 epoch 4 - iter 200/259 - loss 0.03503803 - samples/sec: 15.44
2020-06-09 10:56:42,789 epoch 4 - iter 225/259 - loss 0.03186551 - samples/sec: 14.45
2020-06-09 10:57:07,081 epoch 4 - iter 250/259 - loss 0.03104917 - samples/sec: 16.47
2020-06-09 10:57:1

2020-06-09 11:44:48,657 epoch 10 - iter 75/259 - loss 0.00974862 - samples/sec: 15.44
2020-06-09 11:45:10,560 epoch 10 - iter 100/259 - loss 0.00797147 - samples/sec: 18.26
2020-06-09 11:45:32,854 epoch 10 - iter 125/259 - loss 0.00814239 - samples/sec: 17.94
2020-06-09 11:45:57,216 epoch 10 - iter 150/259 - loss 0.00720294 - samples/sec: 16.42
2020-06-09 11:46:20,872 epoch 10 - iter 175/259 - loss 0.00662584 - samples/sec: 16.91
2020-06-09 11:46:45,383 epoch 10 - iter 200/259 - loss 0.00612845 - samples/sec: 16.32
2020-06-09 11:47:11,883 epoch 10 - iter 225/259 - loss 0.00599642 - samples/sec: 15.10
2020-06-09 11:47:39,193 epoch 10 - iter 250/259 - loss 0.00556203 - samples/sec: 14.65
2020-06-09 11:47:47,919 ----------------------------------------------------------------------------------------------------
2020-06-09 11:47:47,919 EPOCH 10 done: loss 0.0054 - lr 0.1000
2020-06-09 11:47:58,627 DEV : loss 0.02171877957880497 - score 0.9942
2020-06-09 11:47:58,661 BAD EPOCHS (no improvem