In [1]:
import os
import pathlib

import numpy as np
import torch
from datasets import load_dataset

In [2]:
# Notebook-wide configuration constants.
# NOTE(review): none of these are referenced in the cells visible here; they are
# presumably consumed by later model/training cells — confirm before changing.

# Text / vocabulary sizes
VOCAB_SIZE = 10_000
MAX_LEN = 80

# Model dimensions
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256

# Training / run settings
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 5

In [3]:
# Load the full wine-review dataset.
# The data folder can be overridden with the WINE_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original location remains the fallback.
datasets_folder = pathlib.Path(
    os.environ.get(
        "WINE_DATA_DIR",
        r"C:\Users\amrul\programming\deep_learning\dl_projects\Generative_Deep_Learning_2nd_Edition\data",
    )
)
wine_review_filepath = datasets_folder / "wine_reviews" / "winemag-data-130k-v2.json"

# Fail early with a clear message instead of a loader error deep inside `datasets`.
if not wine_review_filepath.is_file():
    raise FileNotFoundError(f"Wine review file not found: {wine_review_filepath}")

# Select the "json" builder explicitly and point it at the review file itself.
# (The second positional argument of load_dataset is the configuration name, not the
# file format, so passing the folder plus 'json' relied on format auto-detection.)
data = load_dataset("json", data_files=str(wine_review_filepath))

In [4]:
# Display the loaded DatasetDict (its splits, column names and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['taster_twitter_handle', 'region_1', 'region_2', 'winery', 'description', 'province', 'title', 'price', 'variety', 'country', 'taster_name', 'points', 'designation'],
        num_rows: 129971
    })
})

In [5]:
# Preview the first three review descriptions.
# Slice the rows first and then pick the column, so only three records are read
# instead of pulling the entire "description" column and slicing it afterwards.
data["train"][:3]["description"]

["Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",
 "This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.",
 'Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.']

In [6]:
# Keep a handle on the "train" split of the raw dataset.
train_ds= data['train']

In [7]:
# Count the distinct values in the "country" column.
# NOTE(review): if the column contains missing entries, None is counted as one of
# the distinct values — confirm whether that is intended.
countries = train_ds["country"]
n_countries = len(set(countries))
print(f"there are {n_countries} countries exist in wine review dataset")

there are 44 countries exist in wine review dataset


In [8]:
# Count the distinct values in the "variety" column.
# NOTE(review): a missing variety (None) would be counted as its own value — confirm.
varieties = train_ds["variety"]
n_varieties = len(set(varieties))
print(f"there are {n_varieties} varieties in wine review dataset")

there are 708 varieties in wine review dataset


In [9]:
def prepare_text_with_country_variety(batch):
    """Build one combined text string per example in a batch.

    Each output string has the form
    ``"<country> : <province> : <variety> : <description>"``.

    Args:
        batch: Mapping with the keys ``'country'``, ``'province'``, ``'variety'``
            and ``'description'``, each holding a sequence of per-example values.
            The sequences are zipped together, so the result is as long as the
            shortest of them.

    Returns:
        A dict with a single key ``"text"`` whose value is the list of combined
        strings.

    Note:
        Values are interpolated with plain f-string formatting, so a missing
        value (``None``) appears in the text as the literal word ``None``.
    """
    field_names = ("country", "province", "variety", "description")
    rows = zip(*(batch[field_name] for field_name in field_names))

    combined_texts = []
    for row in rows:
        combined_texts.append(" : ".join(f"{field}" for field in row))
    return {"text": combined_texts}

In [10]:
# Apply prepare_text_with_country_variety to the dataset in batched mode; the
# function returns a "text" entry for every example.
# NOTE(review): per the datasets documentation, batch_size=None hands the whole
# split to the function as a single batch — confirm the memory cost is acceptable.
wine_ds = data.map(prepare_text_with_country_variety,batched=True, batch_size=None)

Map:   0%|          | 0/129971 [00:00<?, ? examples/s]

In [19]:
# Take the "train" split of the mapped dataset.
# NOTE(review): this rebinds train_ds, which an earlier cell assigned from
# data['train']; re-running cells out of order changes what train_ds refers to.
train_ds = wine_ds["train"]

In [20]:
# Convert the split to a pandas DataFrame for the exploration cells that follow.
df=train_ds.to_pandas()

In [25]:
# Number of reviews per variety among rows with country "US" and province "California".
is_california = (df["country"] == "US") & (df["province"] == "California")
df.loc[is_california, "variety"].value_counts()

variety
Pinot Noir                        6896
Cabernet Sauvignon                5693
Chardonnay                        5183
Zinfandel                         2639
Syrah                             1870
                                  ... 
Tempranillo-Cabernet Sauvignon       1
Syrah-Grenache-Viognier              1
Negroamaro                           1
Moscato Giallo                       1
Alvarinho                            1
Name: count, Length: 194, dtype: int64

In [27]:
# Descriptions of rows with country "US", province "California" and variety "Chardonnay".
is_california_chardonnay = (
    (df["country"] == "US")
    & (df["province"] == "California")
    & (df["variety"] == "Chardonnay")
)
chardonnay_descriptions = df.loc[is_california_chardonnay, "description"]


In [30]:
# Renumber the filtered descriptions 0..n-1.
# reset_index(drop=True) builds a fresh Series rather than overwriting the index of
# the existing object in place, which keeps the cell safe to re-run.
chardonnay_descriptions = chardonnay_descriptions.reset_index(drop=True)

In [39]:
# Show one Chardonnay description that contains the substring "vines".
# The lookup is by index label: 176 is a label in the renumbered Series, not the
# 176th match.
mentions_vines = chardonnay_descriptions.map(lambda description: "vines" in description)
descriptions_with_vines = chardonnay_descriptions[mentions_vines]
descriptions_with_vines.loc[176]

'This voluptuously seductive wine is also elegantly balanced. Citrus, wet stone and sea salt lift the minerality to pronounced levels, accented by a crispness of lemon peel and papaya wrapped in blossoming jasmine. Many of the vines on the site date back nearly 40 years, a demonstration that in age there is beauty.'

In [12]:
# Preview the first two combined "text" strings.
# Slice the rows first and then pick the column, so only two records are read
# instead of pulling the entire "text" column and slicing it afterwards.
# NOTE(review): the saved output under this cell lacks the province field that
# prepare_text_with_country_variety emits, and the execution counts are out of
# order — the output looks stale; re-run after Restart Kernel -> Run All.
wine_ds["train"][:2]["text"]

["Italy : White Blend : Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.",
 "Portugal : Portuguese Red : This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016."]

In [13]:
from transformers import AutoTokenizer

In [14]:
# Name of the pretrained checkpoint whose tokenizer is loaded on the next line.
# NOTE(review): the token ids shown in this notebook's saved outputs (e.g. 22503,
# 28900) exceed VOCAB_SIZE = 10000 from the config cell — confirm which vocabulary
# size the model is meant to use.
model_chkpt = "distilbert-base-uncased"
# Load the tokenizer for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)

In [15]:
# Tokenize the combined text of the example at row index 1.
# Index the row first so only that record is read, instead of pulling the whole
# "text" column and then taking element 1 of it.
tokens = tokenizer(wine_ds["train"][1]["text"])

In [16]:
# Display the tokenizer output for the example.
tokens

{'input_ids': [101, 5978, 1024, 5077, 2417, 1024, 2023, 2003, 22503, 1998, 5909, 2100, 1010, 1037, 4511, 2008, 2003, 5744, 2096, 2145, 14336, 1012, 3813, 9092, 11483, 2015, 2024, 3561, 2041, 2007, 28900, 2417, 10498, 10962, 1998, 4840, 6675, 2007, 5648, 3012, 1012, 2009, 1005, 1055, 2525, 4392, 3085, 1010, 2348, 2009, 2097, 5121, 2022, 2488, 2013, 2355, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [17]:
# Map each token id of the example to its decoded string.
# Ids that occur more than once collapse into a single entry, so this is an
# id -> token lookup rather than the ordered token sequence.
text_tokens = dict(
    (token_id, tokenizer.decode(token_id)) for token_id in tokens["input_ids"]
)

In [18]:
# Display the id -> decoded-token mapping built in the previous cell.
text_tokens

{101: '[CLS]',
 5978: 'portugal',
 1024: ':',
 5077: 'portuguese',
 2417: 'red',
 2023: 'this',
 2003: 'is',
 22503: 'ripe',
 1998: 'and',
 5909: 'fruit',
 2100: '##y',
 1010: ',',
 1037: 'a',
 4511: 'wine',
 2008: 'that',
 5744: 'smooth',
 2096: 'while',
 2145: 'still',
 14336: 'structured',
 1012: '.',
 3813: 'firm',
 9092: 'tan',
 11483: '##nin',
 2015: '##s',
 2024: 'are',
 3561: 'filled',
 2041: 'out',
 2007: 'with',
 28900: 'juicy',
 10498: 'berry',
 10962: 'fruits',
 4840: 'fresh',
 6675: '##ened',
 5648: 'acid',
 3012: '##ity',
 2009: 'it',
 1005: "'",
 1055: 's',
 2525: 'already',
 4392: 'drink',
 3085: '##able',
 2348: 'although',
 2097: 'will',
 5121: 'certainly',
 2022: 'be',
 2488: 'better',
 2013: 'from',
 2355: '2016',
 102: '[SEP]'}