In [66]:
import pandas as pd
import random
import re
from collections import Counter
import numpy as np

# Load the three ATIS splits (train / validation / test) from the local data folder.
train, val, test = (
    pd.read_csv(path)
    for path in ("data/atis_train.csv", "data/atis_val.csv", "data/atis_test.csv")
)

# Print the number of utterances per intent for each split.
print("Train intents:\n", train["intent"].value_counts(), "\n")
print("Val intents:\n", val["intent"].value_counts(), "\n")
print("Test intents:\n", test["intent"].value_counts(), "\n")



Train intents:
 intent
flight            2932
airfare            339
ground_service     204
airline            125
abbreviation       118
aircraft            65
flight_time         43
quantity            41
flight+airfare      17
distance            16
airport             16
city                15
ground_fare         14
capacity            13
flight_no           10
restriction          5
meal                 5
Name: count, dtype: int64 

Val intents:
 intent
flight            734
airfare            85
ground_service     51
airline            32
abbreviation       29
aircraft           16
flight_time        11
quantity           10
ground_fare         4
city                4
distance            4
airport             4
flight+airfare      4
capacity            3
flight_no           2
restriction         1
meal                1
Name: count, dtype: int64 

Test intents:
 intent
flight            632
airfare            48
airline            38
ground_service     36
abbreviation       33
cap

The test split is missing an intent, but that is expected.

In [67]:
# Month names that tokenize() collapses into the single "<month>" placeholder.
months = [
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
]


def tokenize(text):
  """Normalize an utterance and split it into whitespace-separated tokens.

  The text is lower-cased and stripped, then rewritten by an ordered list of
  regex substitutions:
    1. every whole-word month name becomes "<month>" (note that the word
       "may" is always treated as a month),
    2. every run of digits becomes "<num>",
    3. an apostrophe between two word characters is dropped ("what's" -> "whats"),
    4. any remaining character that is not a word character, whitespace,
       "<" or ">" is removed (the angle brackets are kept so the
       placeholders survive).

  Args:
    text: the raw utterance (a str).

  Returns:
    list[str]: the normalized tokens; empty for blank input.
  """
  # Order matters: placeholders are introduced before punctuation is stripped.
  rules = [(rf"\b{name}\b", "<month>") for name in months]
  rules.extend([
      (r"\d+", "<num>"),  # nums not needed
      (r"(\w)'(\w)", r"\1\2"),
      (r"[^\w\s<>]", ""),
  ])

  cleaned = text.lower().strip()
  for pattern, replacement in rules:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned.split()

def vocab(min_count=1, texts=None):
  """Build the word <-> index vocabulary and return it.

  Index 0 is reserved for "<pad>" and index 1 for "<unk>"; the remaining
  tokens are numbered in sorted (alphabetical) order.

  Args:
    min_count: minimum number of occurrences a token needs in order to be
      included in the vocabulary. Defaults to 1 (keep every token).
    texts: iterable of raw utterances to tokenize. Defaults to the
      notebook-level ``train["text"]`` column.

  Returns:
    tuple[dict, dict]: ``(word_idx, idx_word)`` — the token -> index mapping
    and its inverse. The inverse mapping is also printed, as before.
  """
  if texts is None:
    texts = train["text"]
  counter = Counter(token for sent in texts for token in tokenize(sent))

  pad_token = "<pad>"
  unk_token = "<unk>"
  word_idx = {
      pad_token: 0,
      unk_token: 1,
  }
  for word in sorted(counter):
    # Skip rare tokens, and never re-number the reserved pad/unk entries.
    if counter[word] >= min_count and word not in word_idx:
      word_idx[word] = len(word_idx)
  idx_word = {i: w for w, i in word_idx.items()}
  print(idx_word)
  return word_idx, idx_word

# Capture the mappings so later cells can use them (and so the cell does not
# echo the returned tuple on top of the printed dictionary).
word_idx, idx_word = vocab()

{0: '<pad>', 1: '<unk>', 2: '<month>', 3: '<num>', 4: '<num>s', 5: 'a', 6: 'aa', 7: 'abbreviation', 8: 'abbreviations', 9: 'able', 10: 'about', 11: 'ac', 12: 'actually', 13: 'advertises', 14: 'after', 15: 'afternoon', 16: 'afternoons', 17: 'afterwards', 18: 'again', 19: 'air', 20: 'aircraft', 21: 'airfare', 22: 'airfares', 23: 'airline', 24: 'airlines', 25: 'airplane', 26: 'airplanes', 27: 'airport', 28: 'airports', 29: 'alaska', 30: 'all', 31: 'along', 32: 'also', 33: 'am', 34: 'america', 35: 'american', 36: 'amount', 37: 'an', 38: 'and', 39: 'angeles', 40: 'another', 41: 'any', 42: 'anything', 43: 'anywhere', 44: 'ap', 45: 'ap<num>', 46: 'approximately', 47: 'are', 48: 'area', 49: 'arizona', 50: 'around', 51: 'arrange', 52: 'arrangements', 53: 'arrival', 54: 'arrivals', 55: 'arrive', 56: 'arrives', 57: 'arriving', 58: 'as', 59: 'at', 60: 'atl', 61: 'atlanta', 62: 'available', 63: 'b', 64: 'back', 65: 'baltimore', 66: 'bay', 67: 'be', 68: 'beach', 69: 'before', 70: 'beginning', 71: 'b

In [68]:
def _expand_inflections(groups):
  """Turn (base, variants) pairs into a word -> replacement-candidates table.

  Each base word maps to all of its variants, and each variant maps back to
  the base word only. Insertion order is: base, then its variants, group by
  group.
  """
  table = {}
  for base, variants in groups:
    table[base] = list(variants)
    for variant in variants:
      table[variant] = [base]
  return table


# Replacement candidates used by the augmentation step: each word maps to the
# inflected forms it may be swapped with.
synonyms = _expand_inflections([
    ("airfare", ["airfares"]),
    ("airline", ["airlines"]),
    ("arrive", ["arrives", "arriving"]),
    ("depart", ["departing", "departs"]),
    ("flight", ["flights"]),
    ("trip", ["trips"]),
    ("fare", ["fares"]),
    ("price", ["prices"]),
    ("meal", ["meals"]),
])


In [69]:
def augment_tokens(tokens, ins_, del_, rep_, synonym_map=None,
                   fillers=("show", "list", "view")):
  """Randomly delete, replace and insert tokens to create a noisy variant.

  Uses the global ``random`` module, so seed it beforehand for reproducible
  output. The input list is never modified.

  Args:
    tokens: list of tokens to augment.
    ins_: probability of inserting a filler word in front of each kept token.
    del_: probability of dropping each token (only applied when the sentence
      has more than 3 tokens).
    rep_: probability of swapping a token for one of its synonyms.
    synonym_map: optional dict mapping a token to a list of replacement
      candidates. Defaults to the notebook-level ``synonyms`` table.
    fillers: sequence of words that may be inserted.

  Returns:
    list[str]: a new, augmented token list.
  """
  if synonym_map is None:
    synonym_map = synonyms
  allow_delete = len(tokens) > 3  # no delete for short sent

  kept = []
  for tok in tokens:
    # The random draw is made first (even for short sentences) so the random
    # stream is consumed the same way regardless of sentence length.
    if random.random() < del_ and allow_delete:
      continue
    if tok in synonym_map and random.random() < rep_:
      kept.append(random.choice(synonym_map[tok]))
    else:
      kept.append(tok)

  # Never emit an empty sentence: if every token was dropped, fall back to the
  # original tokens.
  if not kept:
    kept = list(tokens)

  # Build the result in a fresh list. Inserting into the list being indexed
  # (the previous approach) shifted the remaining tokens, so fillers piled up
  # at the front and the trailing positions never received an insertion.
  augmented_tokens = []
  for tok in kept:
    if random.random() < ins_:
      augmented_tokens.append(random.choice(fillers))
    augmented_tokens.append(tok)
  return augmented_tokens

def augment_text(text, ins_, del_, rep_):
  """Tokenize ``text``, augment the tokens, and return them as one string.

  Args:
    text: the raw utterance.
    ins_: insertion probability, forwarded to ``augment_tokens``.
    del_: deletion probability, forwarded to ``augment_tokens``.
    rep_: replacement probability, forwarded to ``augment_tokens``.

  Returns:
    str: the augmented tokens joined by single spaces.
  """
  return " ".join(augment_tokens(tokenize(text), ins_, del_, rep_))



In [70]:
def augmentation(df, frac, ins_, del_, rep_, seed=42):
  """Return ``df`` plus augmented copies of a random sample of its rows.

  Note: this reseeds the global ``random`` module as a side effect.

  Args:
    df: DataFrame with a "text" column. All other columns are carried over
      unchanged onto the augmented rows.
    frac: size of the sample to augment, as a fraction of ``len(df)``. Values
      above 1 sample with replacement.
    ins_: insertion probability, forwarded to ``augment_text``.
    del_: deletion probability, forwarded to ``augment_text``.
    rep_: replacement probability, forwarded to ``augment_text``.
    seed: seed used for both the row sampling and the token-level
      randomness. Defaults to 42.

  Returns:
    DataFrame: the original rows followed by the augmented rows, with a fresh
    0..N-1 index so that labels are unique.
  """
  random.seed(seed)
  n = int(len(df) * frac)
  # Sampling without replacement cannot return more rows than df has.
  sampled = df.sample(n, random_state=seed, replace=n > len(df))
  aug_df = sampled.assign(
      text=[augment_text(original_text, ins_, del_, rep_)
            for original_text in sampled["text"]]
  )
  # ignore_index avoids duplicate index labels between df and aug_df.
  return pd.concat([df, aug_df], ignore_index=True)

In [71]:
# Augment every training row once (frac=1) with a 5% chance per token of each
# edit type, then compare the dataset sizes before and after.
aug_probs = dict(ins_=0.05, del_=0.05, rep_=0.05)
aug_train = augmentation(train, frac=1, **aug_probs)
print("Original train size:", len(train))
print("Augmented train size:", len(aug_train))

Original train size: 3978
Augmented train size: 7956
