<a href="https://colab.research.google.com/github/sanspareilsmyn/medium/blob/main/seq2seq_with_atis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://towardsdatascience.com/natural-language-understanding-with-sequence-to-sequence-models-e87d41ad258b

In [3]:
# Mount Google Drive inside the Colab runtime at /content/drive; the ATIS
# files used by this notebook are read from a folder under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
ls /content/drive/MyDrive/code/atis

atis.dict.intent.csv  atis.test.pkl          atis.train.pkl
atis.dict.slots.csv   atis.test.query.csv    atis.train.query.csv
atis.dict.vocab.csv   atis.test.slots.csv    atis.train.slots.csv
atis.test.intent.csv  atis.train.intent.csv


In [14]:
import os
import numpy as np
import pickle

In [16]:
# Folder holding the ATIS files: the Drive mount point followed by the
# folder's location inside the drive.
DATA_DIR = '/'.join(('/content/drive', 'MyDrive', 'code', 'atis'))

In [20]:
def load_ds(fname=None, verbose=True):
  """Load one pickled ATIS split.

  Args:
    fname: Path of the pickle file to read. When omitted (None), the file
      'atis.train.pkl' inside DATA_DIR is used.
    verbose: When True, print the file name together with the number of
      samples and the sizes of the token, slot and intent dictionaries.

  Returns:
    The pair (ds, dicts) exactly as it was stored in the pickle. The verbose
    summary reads ds['query'] and dicts['token_ids'], dicts['slot_ids'] and
    dicts['intent_ids'].
  """
  if fname is None:
    # Resolved at call time so a later change of DATA_DIR is honoured.
    # The file name must stay relative: os.path.join drops every earlier
    # component when a later one starts with '/', which made the previous
    # default ('/atis.train.pkl') point at the filesystem root.
    fname = os.path.join(DATA_DIR, 'atis.train.pkl')
  # NOTE(security): pickle.load can execute arbitrary code while decoding;
  # only open dataset files that come from a trusted source.
  with open(fname, 'rb') as stream:
    ds, dicts = pickle.load(stream)
  if verbose:
    print('Done  loading: ', fname)
    print('      samples: {:4d}'.format(len(ds['query'])))
    print('   vocab_size: {:4d}'.format(len(dicts['token_ids'])))
    print('   slot count: {:4d}'.format(len(dicts['slot_ids'])))
    print(' intent count: {:4d}'.format(len(dicts['intent_ids'])))
  return ds,dicts

In [39]:
def load_atis(filename, add_start_end_token=False, verbose=True):
  """Load an ATIS pickle from DATA_DIR and unpack it into lookup tables and per-sample data.

  Args:
    filename: Name of the pickle file, relative to DATA_DIR.
    add_start_end_token: When True, register 'BOS'/'EOS' in the slot
      dictionaries and overwrite the first and last slot label of every
      sample with them.
    verbose: When True, print the loading summary and the details of a few
      randomly chosen samples.

  Returns:
    A 12-tuple:
      t2i, s2i, in2i: the 'token_ids', 'slot_ids' and 'intent_ids'
        dictionaries from the pickle;
      i2t, i2s, i2in: the same dictionaries with keys and values swapped;
      input_tensor: list with the query id sequence of every sample;
      target_tensor: list with the slot id list of every sample;
      query_data: array of query strings with 'BOS'/'EOS' removed;
      intent_data: array of intent names (first intent label of each sample);
      intent_data_label: flattened array of the raw intent labels;
      slot_data: array of slot-label strings without the first and last label.
  """
  # Slot ids assigned to the sentence markers. The values are unchanged from
  # the original hard-coded numbers (they look like the BOS/EOS token ids that
  # open and close every query vector -- TODO confirm against the vocabulary).
  bos_slot_id = 178
  eos_slot_id = 179

  ds, dicts = load_ds(os.path.join(DATA_DIR, filename), verbose)
  t2i, s2i, in2i = map(dicts.get, ['token_ids', 'slot_ids', 'intent_ids'])
  i2t, i2s, i2in = map(lambda d: {d[k]:k for k in d.keys()}, [t2i, s2i, in2i])
  query, slots, intent = map(ds.get, ['query', 'slot_labels', 'intent_labels'])

  if add_start_end_token:
    i2s[bos_slot_id] = 'BOS'
    i2s[eos_slot_id] = 'EOS'
    s2i['BOS'] = bos_slot_id
    s2i['EOS'] = eos_slot_id

  input_tensor = []
  target_tensor = []
  query_data = []
  intent_data = []
  slot_data = []

  # Indices of the samples to print. They are drawn only when something will
  # be printed, and the exclusive upper bound is len(query): the previous
  # bound of len(query)-1 raised for datasets with fewer than two samples
  # and could never select the last sample.
  to_show = set()
  if verbose and len(query) > 0:
    to_show = set(np.random.randint(0, len(query), 5).tolist())

  for i in range(len(query)):
    input_tensor.append(query[i])
    # One slot id per query token, plus the matching label text.
    slot_vector = [slots[i][j] for j in range(len(query[i]))]
    slot_text = [i2s[slot_id] for slot_id in slot_vector]
    # Guarded so that an empty query does not raise IndexError.
    if add_start_end_token and slot_text:
      slot_text[0] = 'BOS'
      slot_vector[0] = bos_slot_id
      slot_text[-1] = 'EOS'
      slot_vector[-1] = eos_slot_id
    target_tensor.append(slot_vector)

    q = ' '.join(map(i2t.get, query[i]))
    query_data.append(q.replace('BOS', '').replace('EOS', ''))
    intent_name = i2in[intent[i][0]]
    intent_data.append(intent_name)

    slot = ' '.join(slot_text)
    if slot_text:
      # Cut off the whole first and last label rather than one character on
      # each side: identical to the old slot[1:-1] when both labels are the
      # single character 'O', and correct for 'BOS'/'EOS' as well. The
      # separating spaces are kept, as in query_data above.
      slot_data.append(slot[len(slot_text[0]):len(slot) - len(slot_text[-1])])
    else:
      slot_data.append('')

    if i in to_show:
      print('Query text:', q)
      print('Query vector: ', query[i])
      print('Intent label: ', intent_name)
      print('Slot text: ', slot)
      print('Slot vector: ', slot_vector)
      print('*'*74)

  query_data = np.array(query_data)
  intent_data = np.array(intent_data)
  slot_data = np.array(slot_data)
  intent_data_label = np.array(intent).flatten()
  return t2i, s2i, in2i, i2t, i2s, i2in, input_tensor, target_tensor, query_data, intent_data, intent_data_label, slot_data

In [40]:
# Training split: lookup tables, id sequences and readable text columns.
(t2i_train, s2i_train, in2i_train,
 i2t_train, i2s_train, i2in_train,
 input_tensor_train, target_tensor_train,
 query_data_train, intent_data_train,
 intent_data_label_train, slot_data_train) = load_atis('atis.train.pkl')

# Test split, unpacked into the same twelve pieces.
(t2i_test, s2i_test, in2i_test,
 i2t_test, i2s_test, i2in_test,
 input_tensor_test, target_tensor_test,
 query_data_test, intent_data_test,
 intent_data_label_test, slot_data_test) = load_atis('atis.test.pkl')


Done  loading:  /content/drive/MyDrive/code/atis/atis.train.pkl
      samples: 4978
   vocab_size:  943
   slot count:  129
 intent count:   26
Query text: BOS i need to book a flight from newark to tampa on april fourth EOS
Query vector:  [178 479 617 851 264 180 428 444 620 851 816 654 227 439 179]
Intent label:  flight
Slot text:  O O O O O O O O B-fromloc.city_name O B-toloc.city_name O B-depart_date.month_name B-depart_date.day_number O
Slot vector:  [128, 128, 128, 128, 128, 128, 128, 128, 48, 128, 78, 128, 28, 27, 128]
**************************************************************************
Query text: BOS does united airline have any flights from dallas to san francisco EOS
Query vector:  [178 376 887 199 463 218 429 444 339 851 739 440 179]
Intent label:  flight
Slot text:  O O B-airline_name I-airline_name O O O O B-fromloc.city_name O B-toloc.city_name I-toloc.city_name O
Slot vector:  [128, 128, 2, 83, 128, 128, 128, 128, 48, 128, 78, 125, 128]
***************************

In [46]:
import pandas as pd

# Show complete cell contents instead of truncating long strings. None is the
# supported "no limit" value; -1 has been deprecated since pandas 1.0 (it
# triggered the warning previously shown under this cell) and is rejected by
# later releases.
pd.set_option('display.max_colwidth', None)
df = pd.DataFrame({'query': query_data_train, 'intent': intent_data_train, 'slot filing': slot_data_train})

# Keep the first example of every intent, in order of first appearance,
# renumbered from 0. This replaces growing an empty frame row by row.
df_small = df.drop_duplicates(subset='intent').reset_index(drop=True)

df_small

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,query,intent,slot filing
0,i want to fly from boston at 838 am and arrive in denver at 1110 in the morning,flight,O O O O O B-fromloc.city_name O B-depart_time.time I-depart_time.time O O O B-toloc.city_name O B-arrive_time.time O O B-arrive_time.period_of_day
1,what is the arrival time in san francisco for the 755 am flight leaving washington,flight_time,O O O B-flight_time I-flight_time O B-fromloc.city_name I-fromloc.city_name O O B-depart_time.time I-depart_time.time O O B-fromloc.city_name
2,cheapest airfare from tacoma to orlando,airfare,B-cost_relative O O B-fromloc.city_name O B-toloc.city_name
3,what kind of aircraft is used on a flight from cleveland to dallas,aircraft,O O O O O O O O O O B-fromloc.city_name O B-toloc.city_name
4,what kind of ground transportation is available in denver,ground_service,O O O O O O O O B-city_name
5,what 's the airport at orlando,airport,O O O O O B-city_name
6,which airline serves denver pittsburgh and atlanta,airline,O O O B-fromloc.city_name B-fromloc.city_name O B-fromloc.city_name
7,how far is it from orlando airport to orlando,distance,O O O O O B-fromloc.airport_name I-fromloc.airport_name O B-toloc.city_name
8,what is fare code h,abbreviation,O O O O B-fare_basis_code
9,how much does the limousine service cost within pittsburgh,ground_fare,O O O O B-transport_type O O O B-city_name


In [None]:
# TODO: continue tomorrow, starting from the "Creating Tensors" step