In [3]:
import pandas as pd

#### Taking 4 hand-annotated datasets

In [4]:
# Read the four annotated workbooks in a fixed order and bind each resulting
# DataFrame to the name of the file it came from.
annotation_paths = (
    'All_Data/model.xlsx',
    'All_Data/feature.xlsx',
    'All_Data/train.xlsx',
    'All_Data/encoding.xlsx',
)
model, feature, train, encoding = map(pd.read_excel, annotation_paths)

In [5]:
# Stack the comment text and its MLTD label from every source frame into one
# table. Row labels from the individual frames are kept as-is.
label_columns = ['Comment', 'MLTD']
df = pd.concat(
    [frame[label_columns] for frame in (model, feature, train, encoding)],
    axis=0,
)

In [6]:
# Show how many rows carry each MLTD label in the combined frame.
df['MLTD'].value_counts()

Yes    1207
No      313
Name: MLTD, dtype: int64

#### Dropping duplicate values

In [7]:
# Keep only the first occurrence of each (Comment, MLTD) row; later exact
# repeats are filtered out with a boolean mask.
df = df[~df.duplicated(keep='first')]

In [8]:
# Display the frame after duplicate rows have been dropped.
df

Unnamed: 0,Comment,MLTD
0,TODO cont feat names no longer required only n...,Yes
1,TODO: check whether model_builder is necessary...,No
2,TODO model.default_collection only in BaseMode...,No
3,TODO Lida Xu please re-write the CNN model,Yes
4,"\""\""\"" || TODO: This test fails due to the ran...",Yes
...,...,...
39,TODO: remove label encoding when class bug is ...,Yes
44,TODO: Check if weight is tied to encoding embe...,Yes
45,TODO: use `tokenize.detect_encoding`,Yes
46,TODO don't like that: encoding after each event,Yes


#### Decoding escaped unicode characters

In [9]:
def _unescape_comment(text):
    r"""Decode backslash escape sequences in a single comment string.

    For plain-ASCII text with well-formed escapes this is the same as
    ``text.encode('ascii').decode('unicode_escape')``. It differs from that
    expression in three ways:

    * Non-string cells (for example NaN from an empty spreadsheet cell) are
      returned unchanged instead of raising ``AttributeError``.
    * Non-ASCII characters are first written as backslash escapes, which the
      decode step turns back into the same characters, instead of raising
      ``UnicodeEncodeError``.
    * If the text contains a malformed escape (for example a trailing
      backslash) the original text is returned instead of raising
      ``UnicodeDecodeError``.

    NOTE: a lone backslash immediately followed by a non-ASCII character is
    not round-tripped exactly; that combination is not special-cased here.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode('ascii', 'backslashreplace').decode('unicode_escape')
    except UnicodeDecodeError:
        # Malformed escape sequence: keep the comment as it was.
        return text


# Build a new frame with the decoded column rather than writing into `df`
# in place, since `df` is itself the result of an earlier filtering step.
df = df.assign(Comment=df['Comment'].map(_unescape_comment))

  df['Comment'] = df['Comment'].apply(lambda x:x.encode('ascii').decode('unicode_escape'))


In [10]:
# Preview the first rows to inspect the comments after decoding.
df.head()

Unnamed: 0,Comment,MLTD
0,TODO cont feat names no longer required only n...,Yes
1,TODO: check whether model_builder is necessary...,No
2,TODO model.default_collection only in BaseMode...,No
3,TODO Lida Xu please re-write the CNN model,Yes
4,""""""" || TODO: This test fails due to the random...",Yes


#### Setting up the spaCy pipeline (text classifier; the transformer component is currently commented out)

In [43]:
from spacy.training.example import Example
from spacy.util import minibatch
import random

In [44]:
import spacy
# Start from a blank English pipeline; components are added explicitly below.
nlp = spacy.blank('en')

# Settings for a transformer component backed by the CodeBERT checkpoint.
# NOTE(review): this dict is not used in this cell because the add_pipe call
# that would consume it is commented out below.
config = {
    "model" : {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "microsoft/codebert-base"
    }
}

# nlp.add_pipe("transformer", config = config)
# Append a text-categorizer component and register the two labels that the
# MLTD column takes ("Yes" / "No").
textcat = nlp.add_pipe("textcat", last=True)
textcat.add_label("Yes")
textcat.add_label("No")

1

In [45]:
# Turn every row into a (text, annotations) pair. Exactly one of the two
# category flags is True: "Yes" when MLTD equals "Yes", otherwise "No".
train_data = [
    (comment, {"cats": {"Yes": is_yes, "No": not is_yes}})
    for comment, is_yes in zip(df['Comment'], (df['MLTD'] == "Yes").tolist())
]

In [46]:
# Wrap the first 200 (text, annotations) pairs as spaCy Example objects;
# each text is tokenized with the pipeline's own tokenizer.
train_examples = [
    Example.from_dict(nlp.make_doc(comment), cats)
    for comment, cats in train_data[:200]
]

In [47]:
# Initialize the pipeline before the training loop. The call returns an
# optimizer (shown as this cell's output); it is not stored in a variable.
nlp.initialize()

<thinc.optimizers.Optimizer at 0x7f78f29447c0>

In [48]:
# Hyper-parameters for the training run.
n_iterations = 20   # number of passes over the training examples
batch_size = 8      # examples per update step
dropout_rate = 0.5  # dropout applied during each update

for iteration in range(n_iterations):
    # Reshuffle in place so batches differ between passes.
    random.shuffle(train_examples)
    # Fresh accumulator per pass; nlp.update adds each batch's loss to it.
    losses = {}
    for batch in minibatch(train_examples, size=batch_size):
        nlp.update(batch, drop=dropout_rate, losses=losses)
    print(losses)

{'textcat': 4.565066196024418}
{'textcat': 3.3305786291603}
{'textcat': 3.2697595632635057}
{'textcat': 3.226115185767412}
{'textcat': 3.180007532937452}
{'textcat': 3.099624315276742}
{'textcat': 2.9841340337879956}
{'textcat': 2.9971739314496517}
{'textcat': 2.4882253310643137}
{'textcat': 2.753918915288523}
{'textcat': 2.503231597424019}
{'textcat': 2.3171451425587293}
{'textcat': 2.0215828239452094}
{'textcat': 1.8175035582389683}
{'textcat': 1.7241439405770507}
{'textcat': 1.3627582510307548}
{'textcat': 1.251774375410605}
{'textcat': 1.3541165466576786}
{'textcat': 0.8423196174865097}
{'textcat': 1.1964323555948795}


#### Checking the accuracy on the test dataset