In [8]:
def is_ascii(s):
    """Return True if every character of the string `s` is ASCII.

    The string is encoded straight to ASCII rather than round-tripped through
    UTF-8. That way text which UTF-8 itself cannot encode (for example a lone
    surrogate code point) is reported as non-ASCII instead of escaping as an
    uncaught UnicodeEncodeError.

    Args:
        s: The string to check. The empty string counts as ASCII.

    Returns:
        True when `s` contains only ASCII characters, False otherwise.
    """
    try:
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

def filter(s):
    """Return `s` reduced to its ASCII words, with quotes and colons dashed.

    The input is split on whitespace, every word that `is_ascii` rejects is
    dropped, and the remaining words are re-joined with single spaces (so runs
    of whitespace collapse). Each single quote, double quote and colon is then
    replaced by a hyphen.

    Note: this function shadows the builtin `filter` within this module.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string; empty if no word survives.
    """
    ascii_words = [word for word in s.split() if is_ascii(word)]
    cleaned = " ".join(ascii_words)

    # Presumably these characters are removed because the cleaned text is later
    # embedded unquoted in generated YAML -- confirm against the writers.
    for unsafe_char in ("'", '"', ":"):
        cleaned = cleaned.replace(unsafe_char, "-")

    return cleaned


In [9]:
def load_dataset(raw_data_path):
    """Load the raw Q&A CSV and group its rows into one entry per question.

    The CSV must provide the columns ``questionTitle``, ``questionText`` and
    ``answerText``. Rows sharing a ``questionTitle`` are treated as answers to
    the same question; the question text is taken from the first such row.
    Every stored string is passed through this module's ``filter``.

    Missing values are tolerated instead of raising:
      * rows without a ``questionTitle`` are skipped,
      * a missing ``questionText`` becomes the empty string,
      * missing ``answerText`` cells are left out of the answer list.

    Args:
        raw_data_path: Location of the raw CSV, in any form accepted by
            ``pandas.read_csv``.

    Returns:
        A list of ``(title, text, answers)`` tuples, ordered by each title's
        first appearance in the file; ``answers`` is a list of strings.
    """
    import pandas

    data = pandas.read_csv(raw_data_path)

    dataset = []
    # A single grouping pass replaces re-filtering the whole frame once per
    # title. sort=False keeps first-appearance order (what Series.unique()
    # produced before), and groupby leaves out rows whose title is missing,
    # which used to yield an empty sub-frame and an IndexError on iloc[0].
    for question_title, rows in data.groupby("questionTitle", sort=False):
        question_text = rows["questionText"].iloc[0]
        if pandas.isna(question_text):
            # NaN is a float and has no .split(); fall back to empty text.
            question_text = ""

        answers = [filter(answer) for answer in rows["answerText"].dropna()]
        dataset.append((filter(question_title), filter(question_text), answers))

    return dataset

In [10]:
# Header that opens the generated NLU file.
nlu_pre = (
    'version: "2.0"\n'
    'nlu:\n'
)

# One `faq/{id}` intent entry with two examples; formatted with the keyword
# arguments `id`, `title` and `text`.
nlu = (
    '- intent: faq/{id}\n'
    '  examples: |\n'
    '    - {title}\n'
    '    - {text}\n'
)

# Rules section mapping the `faq` intent to the `utter_faq` action.
nlu_post = (
    'rules:\n'
    '- rule: respond to FAQs\n'
    '  steps:\n'
    '    - intent: faq\n'
    '    - action: utter_faq\n'
)

In [11]:
# Header that opens the generated domain file, up to the `responses:` key.
domain_pre = (
    'version: "2.0"\n'
    'intents:\n'
    '  - faq\n'
    'responses:\n'
)

# Key line for one question's responses; formatted with the keyword `id`.
domain_id = '  utter_faq/{id}:\n'

# One response entry under a key line; formatted with the keyword `answer`.
domain = '    - text: {answer}\n'

In [12]:
def write_nlu(dataset, nlu_path):
    """Render the NLU file for `dataset` and write it to `nlu_path`.

    The output consists of the ``nlu_pre`` header, one ``nlu`` entry per
    question (numbered from 0 in dataset order, with the question's title and
    text as its two examples) and finally the ``nlu_post`` section.

    The module-level templates are only read here, never reassigned. Calling
    the function again (for instance by re-running the notebook cell) therefore
    writes the same file rather than appending to the previous call's output.

    Args:
        dataset: Iterable of ``(title, text, answers)`` tuples; only the first
            two items of each tuple are used.
        nlu_path: Destination file path; an existing file is overwritten.
    """
    parts = [nlu_pre]
    for index, question in enumerate(dataset):
        parts.append(nlu.format(id=index, title=question[0], text=question[1]))
    parts.append(nlu_post)

    # Explicit encoding so the result does not depend on the platform default.
    with open(nlu_path, "w", encoding="utf-8") as file:
        file.write("".join(parts))


In [13]:
def write_domain(dataset, domain_path):
    """Render the domain file for `dataset` and write it to `domain_path`.

    The output consists of the ``domain_pre`` header followed, for each
    question (numbered from 0 in dataset order), by one ``domain_id`` key line
    and one ``domain`` entry per answer of that question.

    The module-level templates are only read here, never reassigned. Calling
    the function again (for instance by re-running the notebook cell) therefore
    writes the same file rather than appending to the previous call's output.

    Args:
        dataset: Iterable of ``(title, text, answers)`` tuples; only the
            answers (third item) of each tuple are used.
        domain_path: Destination file path; an existing file is overwritten.
    """
    parts = [domain_pre]
    for index, question in enumerate(dataset):
        parts.append(domain_id.format(id=index))
        parts.extend(domain.format(answer=answer) for answer in question[2])

    # Explicit encoding so the result does not depend on the platform default.
    with open(domain_path, "w", encoding="utf-8") as file:
        file.write("".join(parts))

In [14]:
def main(raw_data_path="data/raw.csv", nlu_path="data/nlu.yml",
         domain_path="domain.yml"):
    """Convert the raw Q&A CSV into the NLU and domain files.

    Loads the dataset once and hands it to both writers. Called without
    arguments it uses the same paths that were previously hard-coded.

    Args:
        raw_data_path: CSV file to read the questions and answers from.
        nlu_path: Where to write the NLU file.
        domain_path: Where to write the domain file.
    """
    dataset = load_dataset(raw_data_path)
    write_nlu(dataset, nlu_path)
    write_domain(dataset, domain_path)

# Run the conversion as soon as this cell/script is executed.
main()