In [1]:
!pip install spacy-transformers
!python -m spacy download en_trf_bertbaseuncased_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_trf_bertbaseuncased_lg')


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!cp /content/drive/MyDrive/emd/test.csv .
!cp /content/drive/MyDrive/emd/train.csv .

In [4]:
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [5]:
seed = 0
random.seed(seed)
np.random.seed(seed)

In [6]:
train_df = pd.read_csv("train.csv")
train_df = train_df[['reviewText','score']]
train_df['score'] = train_df['score'].astype(int)

In [7]:
train_df.head()

Unnamed: 0,reviewText,score
0,I enjoyed this game. There was just enough dif...,4
1,I guess it's a good app if your like REALLY go...,3
2,I never would have expected the level of quali...,4
3,"Love to play game, lots of fun and pass the ti...",5
4,this is a great game my grandson loves that th...,5


In [8]:
DF_TRAIN, DF_VAL = train_test_split(train_df, test_size=0.1, stratify=train_df['score'])
DF_TRAIN.shape

(450153, 2)

In [9]:
DF_TRAIN = DF_TRAIN.sample(40000)

In [10]:
from collections import Counter

Counter(DF_TRAIN['score'])

Counter({1: 4411, 2: 2419, 3: 4584, 4: 8470, 5: 20116})

In [11]:

def sample_from_df_row(row):
    return row['reviewText'], {'cats': {str(i):row['score']==i for i in range(1,6)}}

row = train_df.iloc[0]
print(row)
sample_from_df_row(row)

reviewText    I enjoyed this game. There was just enough dif...
score                                                         4
Name: 0, dtype: object


('I enjoyed this game. There was just enough difficulty to keep me playing. I may have wished for a hint once in a while... but I made it through. I would recommend this game to those who enjoy seek/find games &amp; strategy.',
 {'cats': {'1': False, '2': False, '3': False, '4': True, '5': False}})

In [12]:
train_dataset = [sample_from_df_row(row) for idx, row in tqdm(DF_TRAIN.iterrows(), total=len(DF_TRAIN))]

HBox(children=(FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [13]:
train_dataset[0]

("These are the same psychological tests the professionals give.  I think it's interesting to find out what brain functions I should exercise more often. :)",
 {'cats': {'1': False, '2': False, '3': False, '4': False, '5': True}})

In [14]:
DF_VAL = DF_VAL.sample(10000)

In [15]:
import spacy
from spacy.util import minibatch
import random
import torch

is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_bertbaseuncased_lg")
print(nlp.pipe_names) # ["sentencizer", "trf_wordpiecer", "trf_tok2vec"]
textcat = nlp.create_pipe("trf_textcat", config={"exclusive_classes": True})
for label in (1,2,3,4,5):
    textcat.add_label(str(label))
nlp.add_pipe(textcat)

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [16]:
from sklearn.metrics import f1_score

def maximum_keys(dic):
    maximum = max(dic.values())
    keys = list(filter(lambda x:dic[x] == maximum,dic.keys()))
    return int(keys[0])

pipe_exceptions = ["trf_textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.resume_training()
    for i in range(5):
        random.shuffle(train_dataset)
        losses = {}

        for batch in tqdm(minibatch(train_dataset, size=64), total=len(train_dataset)//64):
            texts, cats = zip(*batch)
            nlp.update(texts, cats, sgd=optimizer, losses=losses)

        with textcat.model.use_params(optimizer.averages):
            y_pred = list()
            for ind, row in DF_VAL.iterrows():
                cats = nlp(row['reviewText']).cats
                y_pred.append(maximum_keys(cats))
            score = f1_score(y_pred, DF_VAL['score'].tolist(), average='macro')

        print(i, losses, score)

HBox(children=(FloatProgress(value=0.0, max=625.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


RuntimeError: ignored

In [None]:
nlp.to_disk("/bert-textcat")

In [None]:
test_text = "This movie sucked"
doc = nlp(test_text)
print(test_text, doc.cats)