# banking77

In [1]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Paths and Variables

In [3]:
dataset_name = "banking77"

In [4]:
input_dir = './data'
output_dir = f'./../../processed/{dataset_name}/'
os.makedirs(output_dir, exist_ok=True)
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

# Get data from huggingface datasets

In [5]:
data = load_dataset("PolyAI/banking77")

Downloading data: 100%|██████████| 298k/298k [00:00<00:00, 551kB/s]
Downloading data: 100%|██████████| 93.9k/93.9k [00:00<00:00, 291kB/s]
Generating train split: 100%|██████████| 10003/10003 [00:00<00:00, 157460.34 examples/s]
Generating test split: 100%|██████████| 3080/3080 [00:00<00:00, 228556.25 examples/s]


In [7]:
len(data['train']['text']), len(data['test']['text'])

(10003, 3080)

In [8]:
train_data = data['train']['text']
train_labels = data['train']['label']
test_data = data['test']['text']
test_labels = data['test']['label']

In [9]:
id_col = "id"
target_col = "label"
text_col = "text"

In [10]:
train_data = pd.DataFrame({id_col: range(len(train_data)), text_col: train_data, target_col: train_labels})
train_data.head()

Unnamed: 0,id,text,label
0,0,I am still waiting on my card?,11
1,1,What can I do if my card still hasn't arrived ...,11
2,2,I have been waiting over a week. Is the card s...,11
3,3,Can I track my card while it is in the process...,11
4,4,"How do I know if I will get my card, or if it ...",11


In [11]:
test_data = pd.DataFrame({id_col: range(len(test_data)), text_col: test_data, target_col: test_labels})
test_data.head()

Unnamed: 0,id,text,label
0,0,How do I locate my card?,11
1,1,"I still have not received my new card, I order...",11
2,2,I ordered a card but it has not arrived. Help ...,11
3,3,Is there a way to know when my card will arrive?,11
4,4,My card has not arrived yet.,11


In [12]:
data = pd.concat([train_data, test_data], axis=0)

In [13]:
# Drop duplicates
train_data.drop_duplicates(subset = [id_col], keep='first', inplace=True)
test_data.drop_duplicates(subset = [id_col], keep='first', inplace=True)

# Shuffle Data

In [14]:
# shuffle data
train_data = train_data.sample(frac=1, random_state=42)
train_data.head()

Unnamed: 0,id,text,label
6883,6883,Is it possible for me to change my PIN number?,21
5836,5836,I'm not sure why my card didn't work,25
8601,8601,I don't think my top up worked,59
2545,2545,Can you explain why my payment was charged a fee?,15
8697,8697,How long does a transfer from a UK account tak...,5


In [15]:
test_key = test_data[[id_col, target_col]].copy()
test_data = test_data.drop(columns=[target_col])

# Utility to Save DF as a zipped file

In [16]:
def save_df_to_zipped_csv(df, ftype=None): 
    if ftype is not None: 
        suffix = f'_{ftype}'
    else: 
        suffix = ''
        
    zipped_f_name = f'{dataset_name}{suffix}.zip'
    archive_f_name = f'{dataset_name}{suffix}.csv'   
    compression_opts = dict(method='zip',
                        archive_name=archive_f_name)      
    df.to_csv(os.path.join(output_dir, zipped_f_name), index=False, compression=compression_opts )

# Save Main Data File

In [17]:
# save as zipped file 
save_df_to_zipped_csv(data)

In [18]:
# zip files
save_df_to_zipped_csv(train_data, "train")
save_df_to_zipped_csv(test_data, "test")
save_df_to_zipped_csv(test_key, "test_key")