<a href="https://colab.research.google.com/github/saakolch/procedure_of_extracting_data/blob/main/data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Install third-party packages for this notebook (shell command run via IPython's `!`).
!pip install datasets evaluate transformers sentencepiece

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

# Map each split name to the TSV file that holds its rows.
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}

# The files are tab-separated, so use the "csv" loader with \t (the tab
# character) as the delimiter.
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

In [None]:
# Shuffle the training split with a fixed seed, keep the first 1,000 rows, and
# peek at the first three of them. `drug_dataset` itself is left unchanged.
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))
drug_sample[:3]

In [None]:
# Sanity check: in every split the "Unnamed: 0" column must hold exactly one
# distinct value per row. For each split, print the distinct-value count
# followed by the row count.
for split, dataset in drug_dataset.items():
    # Compute the distinct-value count once and reuse it for both the check
    # and the printout.
    n_unique = len(dataset.unique("Unnamed: 0"))
    n_rows = len(dataset)
    assert n_rows == n_unique, (
        f"{split}: {n_rows} rows but {n_unique} unique 'Unnamed: 0' values"
    )
    print(n_unique)
    print(n_rows)

In [None]:
# Rename the "Unnamed: 0" column to 'patient_id', then display the dataset so
# the new column name can be checked.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name='patient_id'
)
drug_dataset

In [None]:
def lowercase_condition(example):
    """Return a column update holding the row's ``condition`` in lower case."""
    lowered = example["condition"].lower()
    return {"condition": lowered}


# Drop rows without a condition first: lowercase_condition calls .lower() on
# the value, which would fail on None.
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)
drug_dataset = drug_dataset.map(lowercase_condition)

In [None]:
# Show the first three training conditions after lower-casing.
drug_dataset['train']['condition'][:3]

In [None]:
def compute_review_length(example):
    """Return the number of whitespace-separated words in the row's ``review``."""
    words = example["review"].split()
    return {"review_length": len(words)}

In [None]:
# Add the review_length column to the dataset, then inspect the first
# training row to see the new field.
drug_dataset = drug_dataset.map(compute_review_length)

drug_dataset['train'][0]

In [None]:
# Sort the training split by word count and show the first three rows.
drug_dataset["train"].sort("review_length")[:3]

In [None]:
# Keep a reference to the training split as it is before the length filter in
# the next cell, and show how many rows it has.
drug_before = drug_dataset['train']
drug_before.num_rows

In [None]:
# Keep only reviews with more than 30 whitespace-separated words, then show
# the resulting row counts.
drug_dataset =  drug_dataset.filter(lambda x: x['review_length'] > 30)
drug_dataset.num_rows

In [None]:
import html

# Demo: html.unescape converts HTML character references (here &#039;) back
# into the characters they stand for.
text = "I&#039;m a transformer called BERT"
html.unescape(text)

In [None]:
# Row-by-row variant (slower), kept for comparison:
# drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})
# Batched variant: with batched=True the lambda receives a list of reviews per
# call and unescapes each one, which is much faster.
drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the 'bert-base-cased' checkpoint. The tokenize_*
# helper functions in this notebook read this global.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(examples):
    """Tokenize the ``review`` field with the notebook-level ``tokenizer``.

    Truncation is enabled; no explicit ``max_length`` is passed.
    """
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)

In [None]:
def tokenize_and_split(examples):
    """Tokenize ``review`` texts into chunks of at most 128 tokens.

    ``return_overflowing_tokens=True`` asks the tokenizer to return the part
    of a review that exceeds ``max_length`` as additional entries instead of
    discarding it, so the output can hold more entries than there were input
    reviews.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples["review"], **tokenizer_options)

In [None]:
# Tokenize the first three training reviews and list how many token ids each
# returned chunk contains.
result = tokenize_and_split(drug_dataset['train'][:3])
[len(inp) for inp in result['input_ids']]

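The next cell raises an error. Because `return_overflowing_tokens=True` splits long reviews into several chunks, tokenizing a batch of 1,000 reviews (the default batch size of `map`) returns more rows than it was given (1,463 here), so the new tokenizer columns no longer match the length of the dataset's existing columns.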

In [None]:
# NOTE: this call is expected to fail. tokenize_and_split can return more rows
# than the batch it receives, which conflicts with the existing columns.
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

In [None]:
# Compare row counts: tokenized training split vs. original training split.
# NOTE(review): `tokenized_dataset` only exists if the preceding map call succeeded.
len(tokenized_dataset['train']), len(drug_dataset['train'])

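To fix this, we remove the original columns while mapping, so that only the tokenizer's outputs are kept and the row counts no longer have to match.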

In [None]:
# Drop every original column during the map so only the tokenizer's outputs
# are kept; the result may then have a different number of rows than the input.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset['train'].column_names
)

In [None]:
# Compare the number of tokenized rows with the number of original reviews.
len(tokenized_dataset['train']), len(drug_dataset['train'])