In [27]:
import pandas as pd

#### Loading the 4 hand-annotated datasets

In [28]:
# Read the four hand-annotated workbooks in one pass; each name on the left
# is bound to the DataFrame for the path in the same position on the right.
model, feature, train, encoding = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        'All_Data/model.xlsx',
        'All_Data/feature.xlsx',
        'All_Data/train.xlsx',
        'All_Data/encoding.xlsx',
    )
]

In [29]:
# Keep only the comment text and its MLTD label from every annotated sheet and
# stack them row-wise. Row labels are carried over from each sheet unchanged,
# so index values repeat across the combined frame.
labelled_columns = ['Comment', 'MLTD']
df = pd.concat(
    [sheet[labelled_columns] for sheet in (model, feature, train, encoding)],
    axis=0,
)

In [30]:
# Label distribution of the combined frame, counted before duplicates are dropped.
df['MLTD'].value_counts()

Yes    1207
No      313
Name: MLTD, dtype: int64

#### Dropping duplicate rows

In [31]:
# Drop rows whose (Comment, MLTD) pair repeats an earlier row. The surviving
# rows keep their original row labels, so the index is left with gaps.
df = df.drop_duplicates()

In [32]:
# Display the de-duplicated frame (pandas truncates the middle rows in the rendering).
df

Unnamed: 0,Comment,MLTD
0,TODO cont feat names no longer required only n...,Yes
1,TODO: check whether model_builder is necessary...,No
2,TODO model.default_collection only in BaseMode...,No
3,TODO Lida Xu please re-write the CNN model,Yes
4,"\""\""\"" || TODO: This test fails due to the ran...",Yes
...,...,...
39,TODO: remove label encoding when class bug is ...,Yes
44,TODO: Check if weight is tied to encoding embe...,Yes
45,TODO: use `tokenize.detect_encoding`,Yes
46,TODO don't like that: encoding after each event,Yes


#### Unescaping unicode escape sequences

In [33]:
def _unescape_comment(text):
    r"""Decode backslash escape sequences in a single comment.

    A literal backslash-quote pair becomes a plain quote, a literal
    backslash-n becomes a newline, and so on, following Python's
    ``unicode_escape`` codec.

    Characters outside ASCII are first rewritten as backslash escapes
    (``backslashreplace``) so that the ASCII encoding step cannot raise
    ``UnicodeEncodeError``; the decode step then restores them unchanged.
    For pure-ASCII input the result is identical to a strict
    ``encode('ascii')``.

    Non-string cells (for example NaN from an empty spreadsheet cell) are
    returned untouched instead of failing on the missing ``.encode`` method.
    """
    if not isinstance(text, str):
        return text
    return text.encode('ascii', 'backslashreplace').decode('unicode_escape')


# NOTE: this overwrites df['Comment'] with its decoded form, so running the
# cell a second time decodes the already-decoded text again.
df['Comment'] = df['Comment'].apply(_unescape_comment)

  df['Comment'] = df['Comment'].apply(lambda x:x.encode('ascii').decode('unicode_escape'))


In [34]:
# Preview the first rows after decoding the escape sequences in 'Comment'.
df.head()

Unnamed: 0,Comment,MLTD
0,TODO cont feat names no longer required only n...,Yes
1,TODO: check whether model_builder is necessary...,No
2,TODO model.default_collection only in BaseMode...,No
3,TODO Lida Xu please re-write the CNN model,Yes
4,""""""" || TODO: This test fails due to the random...",Yes


#### Loading a transformer model into the spaCy pipeline

In [35]:
from spacy.training.example import Example
from spacy.util import minibatch
import random

In [10]:
import spacy
# Start from an empty English pipeline (no components yet).
nlp = spacy.blank('en')

# Settings for a spacy-transformers component backed by microsoft/codebert-base.
# NOTE: the only consumer of this dict is the commented-out add_pipe call
# below, so as written no transformer is added to the pipeline.
config = {
    "model" : {
        "@architectures": "spacy-transformers.TransformerModel.v3",
        "name": "microsoft/codebert-base"
    }
}

# nlp.add_pipe("transformer", config = config)
# Append a text categorizer as the last component and register the two labels
# used in the 'MLTD' column. The value of the final add_label call is what the
# cell displays.
textcat = nlp.add_pipe("textcat", last=True)
textcat.add_label("Yes")
textcat.add_label("No")

1

In [11]:
# Turn every row into a (text, annotations) pair in the shape consumed by
# Example.from_dict: exactly one of the two categories is True per comment.
# Any MLTD value other than the exact string "Yes" is treated as "No".
train_data = [
    (
        comment,
        {"cats": {"Yes": bool(mltd == "Yes"), "No": not bool(mltd == "Yes")}},
    )
    for comment, mltd in zip(df['Comment'], df['MLTD'])
]

In [12]:
# Wrap the (text, annotations) pairs as spaCy Example objects.
# NOTE(review): only the first 200 pairs of train_data are used; the slice
# follows row order, so confirm this subset is intentional.
train_examples = [
    Example.from_dict(nlp.make_doc(comment_text), gold)
    for comment_text, gold in train_data[:200]
]

In [13]:
# Train the model
# Initialize the pipeline before the update loop. The call returns an
# optimizer, which is shown as the cell output but not stored in a variable.
nlp.initialize()

<thinc.optimizers.Optimizer at 0x7faa62680cc0>

In [14]:
# Training hyper-parameters, named so the loop below reads without magic numbers.
n_iterations = 50   # passes over train_examples
batch_size = 8      # examples per nlp.update call
dropout_rate = 0.5  # dropout applied during each update

for _iteration in range(n_iterations):
    # Reshuffle in place so every pass visits the examples in a new order.
    random.shuffle(train_examples)
    # Fresh dict per pass; nlp.update fills it with the loss per component.
    losses = {}
    for batch in minibatch(train_examples, size=batch_size):
        nlp.update(batch, drop=dropout_rate, losses=losses)
    print(losses)

{'textcat': 4.765812769532204}
{'textcat': 3.456567259505391}
{'textcat': 3.348786309361458}
{'textcat': 3.2674097856506705}
{'textcat': 3.1324857091531157}
{'textcat': 3.073168136179447}
{'textcat': 3.18490640912205}
{'textcat': 2.918075067922473}
{'textcat': 3.126375346677378}
{'textcat': 2.68063209252432}
{'textcat': 2.3615779983229004}
{'textcat': 2.3671965022804216}
{'textcat': 1.6193219758570194}
{'textcat': 1.410908434074372}
{'textcat': 1.6631450213535572}
{'textcat': 1.3953501990768018}
{'textcat': 1.3742426630319073}
{'textcat': 1.0239166861770173}
{'textcat': 1.4288553542037334}
{'textcat': 1.126635500878372}
{'textcat': 0.7357673591060916}
{'textcat': 0.5093665157378009}
{'textcat': 0.6742810882409955}
{'textcat': 1.0414643063826716}
{'textcat': 0.6792168252310455}
{'textcat': 0.4625592626807844}
{'textcat': 0.8117307572175325}
{'textcat': 0.33867019029686446}
{'textcat': 0.37410118040091755}
{'textcat': 0.37476249645388293}
{'textcat': 0.6973404616889163}
{'textcat': 0.683

In [15]:
# Save the trained pipeline to the 'my_model' directory (path is relative to
# the notebook's working directory).
nlp.to_disk('my_model')

#### Checking the accuracy on the test dataset

In [16]:
# Read the four evaluation workbooks in one pass; each name on the left is
# bound to the DataFrame for the path in the same position on the right.
span, test, tokens, validate = [
    pd.read_excel(workbook_path)
    for workbook_path in (
        'All_Data/span.xlsx',
        'All_Data/test.xlsx',
        'All_Data/tokens.xlsx',
        'All_Data/validate.xlsx',
    )
]

In [17]:
# Add a 'predicted' column to `span`, filled with empty strings for every row.
span['predicted'] = ''

In [18]:
# Reload the pipeline that was saved to 'my_model'.
# NOTE: this rebinds `model`, which earlier in the notebook held the DataFrame
# read from All_Data/model.xlsx; that frame is no longer reachable by this name.
# NOTE(review): spaCy documents from_disk as loading in place and returning
# the same object, so `model` and `nlp` presumably refer to one pipeline; confirm.
model = nlp.from_disk('my_model')