# GitHub Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
%cd '/content/drive/MyDrive/CS685'

/content/drive/MyDrive/CS685


In [3]:
TOKEN="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"
USER="smeyerhot"
PROJECT="CS685"

In [4]:
GIT_PATH = "https://" + TOKEN + "@github.com/" + USER + "/" + PROJECT + ".git"

In [5]:
!git pull "{GIT_PATH}" 

From https://github.com/smeyerhot/CS685
 * branch            HEAD       -> FETCH_HEAD
Already up to date.


In [6]:
!git checkout biobert # or -b

M	biobert_embedding.ipynb
M	decoder_only.ipynb
M	gpt_covid.ipynb
Already on 'biobert'


In [31]:
!git add --all
!git config --global user.email "psalm10045@gmail.com"
!git commit -m 'added biobert'
!git push "{GIT_PATH}" 
!git status

# BioBERT Embedding Extraction

## Environment setup

In [None]:
!pip install transformers datasets
# !pip install pytorch_pretrained_bert
# !pip install rouge/requirements.txt
# !pip install rouge-score

In [8]:
import os
import time

import json
import pandas as pd
import seaborn as sns
import numpy as np
import random

import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, GPT2LMHeadModel, \
GPT2DoubleHeadsModel, GPT2TokenizerFast, GPT2Config, BertTokenizer, EncoderDecoderModel
from transformers import AdamW, get_linear_schedule_with_warmup

from datasets import load_dataset

#from helper import format_time, rouge
from data import GPT2Dataset, preprocessing, split_data
from plots import loss_curves

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
torch.manual_seed(42)
np.random.seed(42)
torch.set_default_dtype(torch.float64)

## Data Augmentation

In [10]:
covid_dialog = load_dataset("covid_qa_ucsd", "en", data_dir="/content/drive/MyDrive/CS685" )

Using custom data configuration en-c080136eb0615511
Reusing dataset covid_qa_ucsd (/root/.cache/huggingface/datasets/covid_qa_ucsd/en-c080136eb0615511/1.0.0/2a15b6e8fdc7cee91951d8f20ac2b26ede79fbef988919fbde22dbb97bf4df81)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
covid_df = covid_dialog['train'].to_pandas()

In [12]:
qa_df = preprocessing(covid_df)
text = qa_df.text.copy()

  all_fields = np.transpose(np.array(all_fields).reshape((4, -1)))


['I have cough with no travel history. Is this a symptom of Covid-19?Hello doctor, I get a cough for the last few days, which is heavy during night times. No raise in temperature but feeling tired with no travel history. No contact with any Covid-19 persons. It has been four to five days and has drunk a lot of Benadryl and took Paracetamol too. Doctors have shut the OP so do not know what to do? Please help.Hello, I understand your concern. I just have a few more questions.Does your cough has phlegm? Any other symptoms like difficulty breathing? Any other medical condition such as asthma, hypertension? Are you a smoker? Alcoholic beverage drinker?Thank you doctor,I have phlegm but not a lot. A tiny amount comes out most of the time. I have no difficulty in breathing. No medical conditions and not a smoker nor a drinker.Hi, I would recommend you take n-acetylcysteine 200 mg powder dissolved in water three times a day. You may also nebulize using PNSS (saline nebulizer) three times a day

In [19]:
qa_df.head()

Unnamed: 0,id,encoder,decoder,text
0,0,[I have cough with no travel history. Is this ...,"[Hello, I understand your concern. I just have...",[I have cough with no travel history. Is this ...
1,0,[I have cough with no travel history. Is this ...,"[Hi, I would recommend you take n-acetylcystei...",[I have cough with no travel history. Is this ...
2,1,[I have a little fever with no history of fore...,"[Hello, I can understand your concern.In my op...",[I have a little fever with no history of fore...
3,1,[I have a little fever with no history of fore...,"[Hi, yes, upload in this query only. I will se...",[I have a little fever with no history of fore...
4,1,[I have a little fever with no history of fore...,"[Hi, I can understand your concern. I have gon...",[I have a little fever with no history of fore...


## Data Augmentation with Backtranslation strategy

In [None]:
# Augmentation with Backtranslation strategy

!pip install googletrans==3.1.0a0

In [51]:
import googletrans
from googletrans import Translator

translator = Translator()

In [59]:
def data_augment(dataset):
  augmented = []
  languages = ['de', 'fr', 'ko']

  for example in dataset:
    augmented.append([example]) # include the original example
    example_text = "".join(example)

    for target_language in languages:
      trans = translator.translate(example_text, src='en', dest=target_language)
      #print('trans: ', trans.text)
      backtrans = translator.translate(trans.text, src=target_language, dest='en')
      paraphrase = backtrans.text
      #print('backtrans: ',paraphrase)

      augmented.append([paraphrase])


  return augmented

In [60]:
data_augment(qa_df['encoder'][0])

[['I have cough with no travel history. Is this a symptom of Covid-19?Hello doctor, I get a cough for the last few days, which is heavy during night times. No raise in temperature but feeling tired with no travel history. No contact with any Covid-19 persons. It has been four to five days and has drunk a lot of Benadryl and took Paracetamol too. Doctors have shut the OP so do not know what to do? Please help.'],
 ['I have cough with no travel history. Is this a symptom of Covid-19?Hello doctor, I get a cough for the last few days, which is heavy during night times. No raise in temperature but feeling tired with no travel history. No contact with any Covid-19 persons. It has been four to five days and has drunk a lot of Benadryl and took Paracetamol too. Doctors have shut the OP so do not know what to do? Please help.'],
 ['I have cough with no travel history. Is this a symptom of Covid-19?Hello doctor, I get a cough for the last few days, which is heavy during night times. No raise in 

In [61]:
qa_df_1 = qa_df.copy()

In [62]:
from tqdm import tqdm

In [63]:
tqdm.pandas()

In [64]:
_ =data_augment(qa_df['encoder'])

KeyboardInterrupt: ignored