In [1]:
import json
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [2]:

def extract_intent(data, target=None):
    """Accumulate the active intents of every non-SYSTEM utterance.

    Each element of ``data`` is read as ``item['turns']``; every turn whose
    ``'speaker'`` is not ``"SYSTEM"`` contributes its ``'utterance'`` text as
    a key, and the ``frame['state']['active_intent']`` values of its frames
    (skipping the placeholder ``'NONE'``) as members of that key's set.
    An utterance seen more than once has its intent sets merged.

    Parameters
    ----------
    data : iterable of dict
        Parsed dialogue records, each with a ``'turns'`` list.
    target : dict, optional
        Mapping ``utterance -> set of intents`` to update in place. When
        omitted, the notebook-level ``intents`` dict is updated, which is
        what the one-argument call has always done.

    Returns
    -------
    None
        The result is delivered by mutating ``target`` (or ``intents``).
    """
    # Only touch the global when no explicit mapping was supplied, so the
    # function can be used (and tested) without any notebook state.
    store = intents if target is None else target

    for item in data:
        for turn in item['turns']:
            if turn['speaker'] == "SYSTEM":
                continue
            active = {
                frame['state']['active_intent']
                for frame in turn['frames']
                if frame['state']['active_intent'] != 'NONE'
            }
            # setdefault registers utterances without any intent as an empty
            # set, matching the original "else: intents[text] = set(...)".
            store.setdefault(turn['utterance'], set()).update(active)


In [3]:
def make_intent_csv(file_list, csv_file_name, data_dir='train'):
    """Build a multi-label intent table from dialogue JSON files.

    Every file is parsed and passed to ``extract_intent``, which fills the
    notebook-level ``intents`` dict (utterance -> set of intents). That dict
    is NOT cleared here, so the caller must reset it before each call.
    The collected utterances are then filtered to those with fewer than
    three intents, utterances without any intent are labelled
    ``['no_intent']``, the table is written to ``csv_file_name`` and read
    back, and one binary indicator column per intent is appended.

    Parameters
    ----------
    file_list : iterable of str
        File names, resolved relative to ``data_dir``.
    csv_file_name : str
        Path of the intermediate CSV written (and re-read) by this function.
    data_dir : str, optional
        Directory containing the files in ``file_list``. Defaults to
        ``'train'``, the location that used to be hard-coded; pass the
        directory the names were listed from when processing another split.

    Returns
    -------
    pandas.DataFrame
        The re-read CSV columns followed by one 0/1 column per intent class.
    """
    for file in file_list:
        with open(os.path.join(data_dir, file), encoding='utf-8') as f:
            dialogues = json.load(f)
        extract_intent(dialogues)

    df = pd.DataFrame(list(intents.items()), columns=['text', 'intents'])
    # Sets have no stable iteration order; sorting keeps the stored label
    # lists identical from run to run.
    df['intents'] = df['intents'].apply(sorted)
    print(len(df))

    # .copy() so the assignments below modify an independent frame rather
    # than a slice of the unfiltered one.
    df = df[df['intents'].apply(len) < 3].copy()
    df['intents'] = df['intents'].apply(lambda labels: labels or ['no_intent'])
    df['intents'] = df['intents'].apply(str)

    # Round-trip through the CSV: this persists the table and yields a
    # fresh 0..n-1 index, which the positional concat below relies on.
    df.to_csv(csv_file_name)
    df = pd.read_csv(csv_file_name)
    df['intents'] = df['intents'].apply(ast.literal_eval)

    multilabel = MultiLabelBinarizer()
    y = multilabel.fit_transform(df['intents'])
    y_df = pd.DataFrame(y, columns=multilabel.classes_)

    return pd.concat([df, y_df], axis=1)



### Train Intents

In [5]:
# make_intent_csv reads the module-level `intents` dict, so start this split
# from an empty one.
intents = {}
data = 'train'
csv_file_name = f"{data}_intents.csv"
file_list = os.listdir(data)
df = make_intent_csv(file_list, csv_file_name)
# Make sure the output directory exists before writing into it.
os.makedirs("data", exist_ok=True)
df.to_csv(f"data/{csv_file_name}")

51244


### Test Intents

In [12]:
# make_intent_csv reads the module-level `intents` dict, so start this split
# from an empty one instead of appending to the previous split's utterances.
intents = {}
data = 'test'
csv_file_name = f"{data}_intents.csv"
file_list = os.listdir(data)
# NOTE(review): this call hands make_intent_csv bare file names only; the
# function decides which directory they are opened from, and as called here
# that is 'train/', not `data`. Verify that the test split is really the one
# being read.
df = make_intent_csv(file_list, csv_file_name)
# Make sure the output directory exists before writing into it.
os.makedirs("data", exist_ok=True)
df.to_csv(f"data/{csv_file_name}")
df

Unnamed: 0.1,Unnamed: 0,text,intents,book_hotel,book_restaurant,book_train,find_attraction,find_bus,find_hospital,find_hotel,find_police,find_restaurant,find_taxi,find_train,no_intent
0,0,i need a place to dine in the center thats exp...,"[find_restaurant, find_hotel]",0,0,0,0,0,0,1,0,1,0,0,0
1,1,"Any sort of food would be fine, as long as it ...","[find_restaurant, find_hotel]",0,0,0,0,0,0,1,0,1,0,0,0
2,2,"Sounds good, could I get that phone number? Al...","[find_restaurant, find_hotel]",0,0,0,0,0,0,1,0,1,0,0,0
3,3,Yes. Can you book it for me?,[find_hotel],0,0,0,0,0,0,1,0,0,0,0,0
4,4,i want to book it for 2 people and 2 nights st...,[book_hotel],1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6797,6838,I would like to visit a cinema of the west side.,[find_attraction],0,0,0,1,0,0,0,0,0,0,0,0
6798,6839,How about a museum then?,[find_attraction],0,0,0,1,0,0,0,0,0,0,0,0
6799,6840,I would prefer a museum. One with free entry i...,[find_attraction],0,0,0,1,0,0,0,0,0,0,0,0
6800,6841,That is fine. Book me a ticket and get me a re...,[find_attraction],0,0,0,1,0,0,0,0,0,0,0,0
