### Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every dataset and model path used below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


### Check the Drive Details

In [None]:
# List the BioBERT checkpoint folder to confirm the model files are reachable on the mounted Drive.
!ls "/content/drive/My Drive/COVID-19/Code_Base/Biobert_pretrained"

bert_config.json			model.ckpt-1000000.meta
model.ckpt-1000000.data-00000-of-00001	pytorch_model.bin
model.ckpt-1000000.index		vocab.txt


#### Install Libraries

In [None]:
!pip install biobert-embedding
!pip install pytorch_pretrained_bert

### Import Libraries

In [None]:
import glob
import pandas as pd
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier 
from biobert_embedding.embedding import BiobertEmbedding

#### Read Datasets

In [None]:
# Locations of the two sentence-level datasets on the mounted Drive.
apiro_csv = '/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/APIRO_Dataset.csv'
covid_csv = '/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/df_covid19_PICO.csv'

# APIRO (CB) dataset: load, then drop the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call; confirm).
df_BA_APIRO = pd.read_csv(apiro_csv)
df_BA_APIRO = df_BA_APIRO.drop(columns='Unnamed: 0', axis=1)

# Covid-19 PICO dataset: loaded as-is.
df_covid19_PICO = pd.read_csv(covid_csv)

#### BioBERT Model Initialization

In [None]:
# Drive folder holding the pretrained BioBERT files (config, vocab, weights) listed earlier in the notebook.
bert_model_path='/content/drive/My Drive/COVID-19/Code_Base/Biobert_pretrained'
# Build the BioBERT wrapper from that folder; later cells call biobert.process_text and biobert.sentence_vector.
biobert = BiobertEmbedding(model_path=bert_model_path)

#### BioBERT Encoding: CB | Covid Datasets


In [None]:

def _add_length_and_filter(frame, max_tokens=512):
    """Add a 'sent_length' column to ``frame`` and return only the rows within the limit.

    'sent_length' is the length of whatever ``biobert.process_text`` returns for the
    row's 'sent' value (presumably the token list; confirm). The column is added to
    ``frame`` in place; the returned frame holds the rows with sent_length <= max_tokens.
    The default of 512 is presumably BERT's maximum sequence length (confirm).
    """
    frame['sent_length'] = frame['sent'].apply(lambda text: len(biobert.process_text(text)))
    within_limit = frame['sent_length'] <= max_tokens
    return frame[within_limit]


# Apply the same length filter to the CB (APIRO) data, then to the Covid-19 data.
df_BA_APIRO = _add_length_and_filter(df_BA_APIRO)
df_covid19_PICO = _add_length_and_filter(df_covid19_PICO)

In [None]:
def custom_sent_vector(cols):
    '''
    Return the BioBERT sentence vector for a single row.

    Intended for ``DataFrame.apply(custom_sent_vector, axis=1)``: ``cols`` is the
    row being processed, and its 'sent' entry is passed to the notebook-level
    ``biobert`` object's ``sentence_vector`` method. The result of that call is
    returned unchanged.

    @Author - Fakhare Alam
    '''
    return biobert.sentence_vector(cols['sent'])

In [None]:
# Encode the APIRO (CB) sentences chunk by chunk, writing one CSV per chunk.
# The chunk size is now derived from the frame that is actually iterated (df_BA_APIRO);
# it was previously computed from df_covid19_PICO, which looks like a copy-paste slip.
# max(1, ...) keeps the range() step valid when the frame has fewer than 100 rows.
chunk_size = max(1, df_BA_APIRO.shape[0] // 100)
# NOTE(review): chunks are written directly under Bert_Data/, while the combine step reads
# from Bert_Data/CB/. Confirm the files are moved there before combining.
for start in range(0, df_BA_APIRO.shape[0], chunk_size):
    print('start set -' ,start)
    # .copy() gives an independent frame, so adding a column does not raise
    # SettingWithCopyWarning or depend on view/copy semantics of the iloc slice.
    df_subset = df_BA_APIRO.iloc[start:start + chunk_size].copy()
    df_subset['sent_embedding']=df_subset[['sent']].apply(custom_sent_vector,axis=1)
    # The file name carries the chunk's starting row offset.
    df_subset.to_csv('/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/APIRO_Dataset_biobert_encoded_'+str(start)+'.csv',index=False)

In [None]:
# Encode the Covid-19 PICO sentences in about 10 chunks, writing one CSV per chunk.
# (The variable keeps its original name, although it sizes the Covid chunks, not the CB ones.)
# max(1, ...) keeps the range() step valid when the frame has fewer than 10 rows.
chunk_size_CB = max(1, df_covid19_PICO.shape[0] // 10)
# NOTE(review): chunks are written directly under Bert_Data/, while the combine step reads
# from Bert_Data/covid19/. Confirm the files are moved there before combining.
for start in range(0, df_covid19_PICO.shape[0], chunk_size_CB):
    print('start set -' ,start)
    # .copy() gives an independent frame, so adding a column does not raise
    # SettingWithCopyWarning or depend on view/copy semantics of the iloc slice.
    df_subset = df_covid19_PICO.iloc[start:start + chunk_size_CB].copy()
    df_subset['sent_embedding']=df_subset[['sent']].apply(custom_sent_vector,axis=1)
    # The file name carries the chunk's starting row offset.
    df_subset.to_csv('/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/df_covid19_PICO_biobert_encoded_'+str(start)+'.csv',index=False)

#### Combine All Encoded Chunk Files Together: CB | Covid

In [36]:
# Merge every per-chunk Covid encoding CSV found under covid_path into one frame and save it.
covid_path=r'/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/covid19'
combined_name = 'df_covid19_PICO_biobert_encoded_combined.csv'

# The combined CSV is written into the same folder that is globbed. It is excluded here so that
# re-running this cell does not read its own previous output and duplicate every row.
# sorted() makes the row order reproducible (lexicographic by path, not numeric by chunk offset).
all_files = sorted(
    path for path in glob.glob(covid_path + "/*.csv")
    if not path.endswith(combined_name)
)
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError('No chunk CSV files found in ' + covid_path)

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all chunks.
df_covid19_PICO_biobert_encoded_combined = pd.concat(li, axis=0, ignore_index=True)
df_covid19_PICO_biobert_encoded_combined.to_csv(covid_path+'/'+combined_name,index=False)

In [None]:
# Preview the first rows of the combined Covid frame.
df_covid19_PICO_biobert_encoded_combined.head()

Unnamed: 0,sent,aimoprc_category,sent_length,sent_embedding
0,this retrospective chart review describes the...,A,51,"tensor([ 2.4645e-01, -2.5895e-01, -1.3125e-01,..."
1,"rhinovirus, the most common cause of upper re...",A,104,"tensor([ 2.7615e-01, -2.4615e-01, 5.5013e-02,..."
2,the human leukocyte antigen (hla) system is w...,A,117,"tensor([ 1.9259e-01, -1.8838e-01, -9.6603e-02,..."
3,an epidemic of a severe acute respiratory syn...,A,79,"tensor([ 1.9417e-01, -2.4222e-01, -1.2005e-01,..."
4,severe acute respiratory syndrome (sars) is a...,A,137,"tensor([ 3.9694e-01, 8.0560e-02, -1.3360e-01,..."


In [None]:
# Merge every per-chunk CB (APIRO) encoding CSV found under CB_path into one frame and save it.
CB_path=r'/content/drive/My Drive/COVID-19/Code_Base/Data/Bert_Data/CB'
combined_name = 'df_APIRO_Dataset_biobert_encoded_combined.csv'

# The combined CSV is written into the same folder that is globbed. It is excluded here so that
# re-running this cell does not read its own previous output and duplicate every row.
# sorted() makes the row order reproducible (lexicographic by path, not numeric by chunk offset).
all_files = sorted(
    path for path in glob.glob(CB_path + "/*.csv")
    if not path.endswith(combined_name)
)
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError('No chunk CSV files found in ' + CB_path)

li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all chunks.
df_APIRO_Dataset_biobert_encoded_combined = pd.concat(li, axis=0, ignore_index=True)
df_APIRO_Dataset_biobert_encoded_combined.to_csv(CB_path+'/'+combined_name,index=False)

In [None]:
# Preview the first rows of the combined CB (APIRO) frame.
df_APIRO_Dataset_biobert_encoded_combined.head()

Unnamed: 0,sent,aimoprc_category,data_category,sent_length,sent_embedding
0,innominate artery aneurysms (iaas) are relati...,A,medline,207.0,"tensor([ 8.6653e-02, -9.0528e-02, -7.9277e-02,..."
1,when treating intracranial aneurysms with ope...,A,medline,174.0,"tensor([ 1.5612e-01, -1.7345e-01, 6.4176e-02,..."
2,to report a novel internal and external blood...,A,medline,359.0,"tensor([ 2.0954e-01, -1.8186e-01, -7.3135e-03,..."
3,image-based hemodynamic simulations have grea...,A,medline,50.0,"tensor([-6.5059e-03, -1.1184e-01, 2.1153e-01,..."
4,intracranial aneurysm rupture is the most dev...,A,medline,105.0,"tensor([ 2.7629e-01, 8.5086e-03, 3.5425e-02,..."


In [None]:
# Show column dtypes and non-null counts of the combined CB (APIRO) frame.
df_APIRO_Dataset_biobert_encoded_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173284 entries, 0 to 173283
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sent              173283 non-null  object 
 1   aimoprc_category  173283 non-null  object 
 2   data_category     173283 non-null  object 
 3   sent_length       173283 non-null  float64
 4   sent_embedding    173283 non-null  object 
dtypes: float64(1), object(4)
memory usage: 6.6+ MB
