# Multi label Text Classification Snippet Specificity using BERT

## Data Processing

In [2]:
import pandas as pd

In [138]:
dataset = pd.read_excel('3192antibody.xlsx')

In [139]:
dataset.head()

Unnamed: 0,Row ID,Antibody?,Specificity?,PMID,PMCID,SNIPPET
0,10,"Diatheva, Dahl",negative,23390418,PMC3565217,The Diatheva and Dahl antibodies were two of t...
1,11,"Diatheva, Dahl",negative,23390418,PMC3565217,"For example, in the study of Panx1 knockout mi..."
2,12,"Diatheva, Dahl",negative,23390418,PMC3565217,(2011) where in situ hybridization images of P...
3,13,no,neutral,23390418,PMC3565217,Because Western blots are frequently treated a...
4,14,no,neutral,23390418,PMC3565217,Because Western blots are frequently treated a...


### Remove redundant information

In [140]:
dataset.drop('Row ID',axis=1,inplace=True)
dataset.drop('Antibody?',axis=1,inplace=True)
dataset.drop('PMID',axis=1, inplace=True)
dataset.drop('PMCID',axis=1,inplace=True)

### How many specificity class right now?

In [141]:
import matplotlib.pyplot as plt

In [142]:
specificity_cases = dict()
count_record = 0

In [143]:
for value in dataset['Specificity?']:
    if pd.isna(value) == False:
        v = value.strip('\n')
        v = v.strip()
        list_values.append(v)
        specificity_cases[v] = specificity_classes.get(v, 0) + 1
        count_record += 1

In [144]:
specificity_cases

{'negative': 64,
 'neutral': 148,
 'unsure': 46,
 'positive': 673,
 'na': 13,
 '(positive) claim': 4,
 'claim': 20,
 'unsure (positive and negative)': 2,
 'positive and negative': 5,
 'claim (positive)': 3,
 'positive, claim': 41,
 'postive': 4,
 'positive, neutral': 2,
 'claim, positive': 2,
 'netural ??': 2,
 '(positive) "snippet isnt enough to know what': 2,
 'antibody pennetration method': 2,
 'unsure (snippet not enough need more)': 2,
 'not sure': 11,
 'nonspecific (2)  positive': 3,
 'nonspecific': 6,
 'N/A': 4,
 'positive (2) neutral': 2,
 'Rabbit Anti-Mouse IgG- positive': 2,
 'unspecific': 2,
 'negative, neutral': 2,
 'nonspecfic': 2,
 'positive (2) N/A': 3,
 'postiive': 3,
 'positive?': 3,
 'Claim +ve': 4,
 'Positive': 2,
 'claim positive': 49,
 'neutral(2) claim positive': 2,
 'positve': 3,
 'N/A, positive': 4,
 'unsure, positive': 2,
 'N/A, negative': 3,
 'positive,': 2,
 'specific not sensitive': 3,
 'positive,claim positive(2)': 3,
 'netural': 2}

In [145]:
count_record

1115

Try redunce these cases into 5 labels classes

In [146]:
fivecases = dict()

In [147]:
for key in specificity_cases:
    s = " "
    for k in key.split(' '):
        if "neg" in k or "unspecific" in k or "nonspecific" in k:
            s = s + "negative "
        elif "neu" in k or "netural" in k:
            s = s + "neural "
        elif "unsure" in k or "not sure" in k:
            s = s + "unsure "
        elif "claim" in k:
            s = s + "claim "
        elif "pos" in k or "+ve" in k:
            s = s + "positive "
        elif "na" in k or "N/A" in k or "n/a" in k:
            s = s + "unsure "
    s = s.strip()
    fivecases[s] = fivecases.get(s, 0) + 1

In [148]:
fivecases

{'negative': 3,
 'neural': 3,
 'unsure': 4,
 'positive': 9,
 'positive claim': 2,
 'claim': 1,
 'unsure positive negative': 1,
 'positive negative': 1,
 'claim positive': 4,
 'positive neural': 2,
 '': 5,
 'negative positive': 1,
 'negative neural': 1,
 'positive unsure': 1,
 'neural claim positive': 1,
 'unsure positive': 2,
 'unsure negative': 1}

In [149]:
LABELS_COLUMN = ["positive", "negative", "neutral", "unsure", "claim"]

In [150]:
def labels_to_onehot(labels):
    onehot = [0, 0, 0, 0, 0]
    
    for k in labels.split(' '):
        if "neg" in k or "unspecific" in k or "nonspecific" in k:
            onehot[1] = 1
        elif "neu" in k or "netural" in k:
            onehot[2] = 1
        elif "unsure" in k or "not sure" in k:
            onehot[3] = 1
        elif "claim" in k:
            onehot[4] = 1
        elif "pos" in k or "+ve" in k:
            onehot[0] = 1
        elif "na" in k or "N/A" in k or "n/a" in k:
            onehot[3] = 1
    if len(set(onehot)) == 0:
        onehot[3] = 1
    return onehot

### Reconstruct Dataset

In [151]:
import csv

In [152]:
with open('dataset.csv', mode='w') as csv_file:
    fieldnames = ['snippet', 'positive', 'negative', 'neutral', 'unsure', 'claim']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for index, row in dataset.iterrows():
        label = row['Specificity?']
        if pd.isna(row['Specificity?']) == False:
            onehot = labels_to_onehot(label)
            writer.writerow({
                'snippet': row['SNIPPET'], 
                'positive': onehot[0], 
                'negative': onehot[1], 
                'neutral': onehot[2],
                'unsure': onehot[3],
                'claim': onehot[4]
            })

### Dataset (Partition)

In [3]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,snippet,positive,negative,neutral,unsure,claim
0,The Diatheva and Dahl antibodies were two of t...,0,1,0,0,0
1,"For example, in the study of Panx1 knockout mi...",0,1,0,0,0
2,(2011) where in situ hybridization images of P...,0,1,0,0,0
3,Because Western blots are frequently treated a...,0,0,1,0,0
4,Because Western blots are frequently treated a...,0,0,1,0,0


In [4]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
trainset, validset = train_test_split(dataset, test_size=0.2, random_state=42)

trainset.to_csv('./trainset.csv',index=False)
validset.to_csv('./validset.csv',index=False)

### Test Set

In [None]:
with open('dataset.csv', mode='w') as csv_file:
    fieldnames = ['id', 'snippet']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    for index, row in dataset.iterrows():
        label = row['Specificity?']
        if pd.isna(row['Specificity?']) == True:
            label = 'na'
        onehot = labels_to_onehot(label)
        writer.writerow({
            'snippet': row['SNIPPET'], 
        })