# Multi label Text Classification Snippet Specificity using BERT

## Data Processing

In [21]:
import pandas as pd

In [22]:
dataset = pd.read_excel('3192antibody.xlsx')

In [23]:
dataset.head()

Unnamed: 0,Row ID,Antibody?,Specificity?,PMID,PMCID,SNIPPET
0,2635,"anti-V5 (RRID:AB_307024), anti-flag (RRID:AB_...",Rabbit Anti-Mouse IgG- positive,27725089,PMC5059142,Input lysates and pulldown eluates were analyz...
1,1744,"""specific antibodies","(positive) ""snippet isnt en/augh to kn/aw what",27506200,PMC4977824,ATAD-3 is an evolutionarily conserved AAA-fami...
2,38,Cy3-conjugated mouse monoclonal antibody speci...,(positive) claim,23468629,PMC3585132,"For ookinete conversion assays, blood was take..."
3,54,this antibody,(positive) claim,24251095,PMC3821019,Rat monoclonal antibody against hKIAA1199 was ...
4,55,"the antibodies specific to KIAA1199, CHC, &#94...",(positive) claim,24251095,PMC3821019,Cell homogenate supernatants were separated by...


### Remove redundant information

In [24]:
dataset.drop('Antibody?',axis=1,inplace=True)
dataset.drop('PMID',axis=1, inplace=True)
dataset.drop('PMCID',axis=1,inplace=True)

### How many specificity class right now?

In [25]:
import matplotlib.pyplot as plt

In [26]:
specificity_cases = dict()
count_record = 0

In [27]:
for value in dataset['Specificity?']:
    if pd.isna(value) == False:
        v = value.strip('\n')
        v = v.strip()
        specificity_cases[v] = specificity_cases.get(v, 0) + 1
        count_record += 1

In [28]:
specificity_cases

{'Rabbit Anti-Mouse IgG- positive': 1,
 '(positive) "snippet isnt en/augh to kn/aw what': 1,
 '(positive) claim': 3,
 '2 positive, 2 claim positive': 1,
 'antibody pennetration method': 1,
 'claim': 19,
 'claim (positive)': 2,
 'Claim +ve': 3,
 'claim positive': 217,
 'Claim positive': 2,
 'claim positive, claim nonspecific': 1,
 'claim positive, claim nonspecific, claim positive': 1,
 'claim positive, N/A': 1,
 'claim positive, n/anspecific': 2,
 'claim positive, nonspecific, positive(3)': 1,
 'claim positive, positive.': 2,
 'Claim positive(2), nonspecific (2)': 1,
 'claim postive': 2,
 'claim, positive': 1,
 'Clzim positive': 1,
 'N/A': 3,
 'N/A, Claim positive(2)': 1,
 'N/A, negative': 2,
 'N/A, positive': 3,
 'N/A?': 2,
 'N/A(2), claim positive(2)': 1,
 'n/anspecfic': 1,
 'n/anspecific': 11,
 'n/anspecific (2)  positive': 2,
 'n/anspecific, rest are claim positive': 1,
 'n/at sure': 10,
 'na': 12,
 'negative': 96,
 'negative, N/A': 1,
 'negative, neutral': 1,
 'netural': 1,
 'netu

In [29]:
count_record

2431

Try redunce these cases into 5 labels classes

In [30]:
fivecases = dict()

In [31]:
for key in specificity_cases:
    s = " "
    for k in key.split(' '):
        if "neg" in k or "unspecific" in k or "nonspecific" in k:
            s = s + "negative "
        elif "neu" in k or "netural" in k:
            s = s + "neural "
        elif "unsure" in k or "not sure" in k:
            s = s + "unsure "
        elif "claim" in k or "Claim" in k or "Clzim" in k:
            s = s + "claim "
        elif "pos" in k or "+ve" in k:
            s = s + "positive "
        elif "na" in k or "N/A" in k or "n/a" in k:
            s = s + "unsure "
    s = s.strip()
    fivecases[s] = fivecases.get(s, 0) + 1

In [32]:
fivecases

{'positive': 9,
 'positive unsure unsure': 1,
 'positive claim': 2,
 'positive claim positive': 2,
 '': 6,
 'claim': 1,
 'claim positive': 8,
 'claim positive claim negative': 1,
 'claim positive claim negative claim positive': 1,
 'claim positive unsure': 2,
 'claim positive negative positive': 1,
 'claim positive positive': 1,
 'claim positive negative': 1,
 'unsure': 9,
 'unsure claim positive': 3,
 'unsure negative': 1,
 'unsure positive': 3,
 'negative': 5,
 'negative unsure': 1,
 'negative neural': 1,
 'neural': 3,
 'neural claim positive': 1,
 'negative positive': 1,
 'positive unsure': 3,
 'positive neural': 2,
 'positive negative': 4,
 'unsure positive negative': 1,
 'unsure unsure unsure': 1}

In [33]:
LABELS_COLUMN = ["positive", "negative", "neutral", "unsure", "claim"]

In [34]:
def labels_to_onehot(labels):
    onehot = [0, 0, 0, 0, 0]
    
    for k in labels.split(' '):
        if "neg" in k or "unspecific" in k or "nonspecific" in k:
            onehot[1] = 1
        elif "neu" in k or "netural" in k:
            onehot[2] = 1
        elif "unsure" in k or "not sure" in k:
            onehot[3] = 1
        elif "claim" in k:
            onehot[4] = 1
        elif "pos" in k or "+ve" in k:
            onehot[0] = 1
        elif "na" in k or "N/A" in k or "n/a" in k:
            onehot[3] = 1
    if len(set(onehot)) == 0:
        onehot[3] = 1
    return onehot

### Reconstruct Dataset

In [35]:
import csv

In [36]:
with open('dataset.csv', mode='w') as csv_file:
    fieldnames = ['snippet', 'positive', 'negative', 'neutral', 'unsure', 'claim']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for index, row in dataset.iterrows():
        label = row['Specificity?']
        if pd.isna(row['Specificity?']) == False:
            onehot = labels_to_onehot(label)
            writer.writerow({
                'snippet': row['SNIPPET'], 
                'positive': onehot[0], 
                'negative': onehot[1], 
                'neutral': onehot[2],
                'unsure': onehot[3],
                'claim': onehot[4]
            })

### Dataset (Partition)

In [37]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,snippet,positive,negative,neutral,unsure,claim
0,Input lysates and pulldown eluates were analyz...,1,0,0,0,0
1,ATAD-3 is an evolutionarily conserved AAA-fami...,1,0,0,1,0
2,"For ookinete conversion assays, blood was take...",1,0,0,0,1
3,Rat monoclonal antibody against hKIAA1199 was ...,1,0,0,0,1
4,Cell homogenate supernatants were separated by...,1,0,0,0,1


In [38]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
trainset, validset = train_test_split(df, test_size=0.2, random_state=42)

trainset.to_csv('./trainset.csv',index=False)
validset.to_csv('./validset.csv',index=False)

### Test Set

In [39]:
with open('testset.csv', mode='w') as csv_file:
    fieldnames = ['id', 'snippet']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for index, row in dataset.iterrows():
        label = row['Specificity?']
        if pd.isna(row['Specificity?']) == True:
            writer.writerow({
                'id': row['Row ID'],
                'snippet': row['SNIPPET'], 
            })

In [40]:
df = pd.read_csv('testset.csv')
df.head()

Unnamed: 0,id,snippet
0,594,"Alpha-sma is also expressed on MM, but areas o..."
1,656,Successful universal IAV vaccines not only ind...
2,657,"These cells, via perforin dependent cytotoxici..."
3,701,Normal tonsil and bone marrow was used as posi...
4,729,VectaShield mounting medium was applied and co...
