# Anonymise data

### Removing names/email addresses from the survey data 
#### Save the dataframes to pickle files

In [1]:
import json
import pickle
from pandas.io.json import json_normalize
import os
import logging
import logging.handlers
import pandas as pd
import numpy as np
import glob
from datetime import datetime,timedelta
from collections import Counter
import nltk 
from functools import reduce
import re
from scipy.stats import pointbiserialr
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
matplotlib.style.use('ggplot')
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

In [2]:
def dt_parse(t):
    ''' 
    Function that parses datetime so it can be used to find duration of conversation
    INPUT: a string t representing a message timestamp
    OUTPUT: a string ret representing the message timestamp in the format %Y-%m-%dT%H:%M
    '''
    ret = datetime.strptime(t[0:16],'%Y-%m-%dT%H:%M')
    return ret 

In [3]:
def create_utterance_df(rootDir):
    '''
    Function that traverses subfolders of a root directory 
    where each subfolder corresponds to a conversation with one of the chatbots.
    Each subfolder contains JSON files, each corresponding to a single user message. 
    '''
    
    jsdata = []
    df = pd.DataFrame(pd.np.empty((0, 7)))
    df.columns = ['Username','Duration', 'Chatbot', 'NumberOfTurns', 'Utterance','Word_Count', "Avg_Utt_Len"]

    for subdir, dirs, files in os.walk(rootDir): # each conversation has its own folder of json files
        for dir_ in dirs: # for each conversation
            message_timestamps = []
            user_utterances = []
            char_counts = []
            numberOfTurns = 0
            word_count = 0
            for file in glob.glob(rootDir+dir_+'/'+'*.json'):
                with open(file) as f:
                    data = json.load(f)
                    numberOfTurns += 1
                    message_timestamps.append(dt_parse(data['timestamp']))
                    user_utterances.append(data['text'])
                    for word in (data['text']).split():
                        word_count += 1
                    char_counts.append(len(data['text']) - data['text'].count(' '))
            avg_char_count = np.mean(char_counts)       


            #add post interaction survey (Nasoto)

            #Duration
            maxTime = max(dt for dt in message_timestamps)
            minTime = min(dt for dt in message_timestamps)
            duration =  maxTime - minTime

            #get the information from nasoto inserted
            jsdata.append({'Username': data['from']['name'],
                           'Duration': duration,'Chatbot': data['recipient']['name'], 
                           'NumberOfTurns' : numberOfTurns, 'Utterance': user_utterances, 
                            'Word_Count': word_count, 'Avg_Utt_Len': avg_char_count})



    df =  pd.DataFrame.from_records(jsdata)
    df[:]['Duration'] / np.timedelta64(1, 'm')
    return df  

## Interaction DataFrames

In [4]:
df_nasoto = create_utterance_df('webchat-nasoto-userstudy/')
df_makoto = create_utterance_df('webchat-makoto-userstudy/')
df_makoto


The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



Unnamed: 0,Username,Duration,Chatbot,NumberOfTurns,Utterance,Word_Count,Avg_Utt_Len
0,gavin.morris,00:03:00,Makoto-Bot,24,"[4, ok, Gav, can't go to the library anymore, ...",56,11.416667
1,mark.hartnett,00:01:00,Makoto-Bot,9,"[Hi, Its Practical, 5, yes, nope, Generally al...",14,6.777778
2,conor.beenham,00:03:00,Makoto-Bot,16,"[5, campus, interesting topic and a good balan...",37,12.875
3,sophie.oneill1,00:05:00,Makoto-Bot,19,"[The content is very interesting and relevant,...",89,22.105263
4,amina.khalid,00:01:00,Makoto-Bot,9,"[human computer interaction, they're nice, 6, ...",20,11.444444
5,orla.keating,00:06:00,Makoto-Bot,21,"[yes, Órla, 5, They're all terrible, ok, Game ...",63,12.761905
6,daire.murphy.2,00:04:00,Makoto-Bot,24,"[No, Spatial info systems, No, Yes, I like it,...",44,8.75
7,zachary.oconnor,00:05:00,Makoto-Bot,17,"[Probably deep learning, sure, Yes, I use the ...",114,30.176471
8,kiowa.daly,00:01:00,Makoto-Bot,10,[its easy becasue of there only being one form...,37,15.5
9,jeremiah.wangaruro,00:04:00,Makoto-Bot,20,"[I'm working from home with my internship, Def...",112,23.95


# Survey DataFrames 

In [5]:
PostMakotoDir = 'surveys-userstudy/Post-Interaction Survey (MAKOTO).csv'
df_PostMakoto = pd.read_csv(PostMakotoDir, error_bad_lines=False)
df_PostMakoto['Username'] =  df_PostMakoto['Username'].str.replace(r'@ucdconnect.ie', '')
df_PostMakoto

Unnamed: 0,Timestamp,Username,Describe Makoto,How would you rate your experience with Makoto based on its knowledge?,Expand on your answer (optional),How would you rate your experience with Makoto based on the quality of the conversation?,Expand on your answer (optional).1,How would you rate your experience with Makoto based on the its attitude/personality?,Expand on your answer (optional).2,Did you enjoy your interaction with Makoto?
0,2020/04/11 12:48:02 pm CET,george.ridgway,A web-based chat application,3,I was supposed that it incorporated the corona...,3,"It was more of a one-way conversation, but it ...",4,,"Yes, it was very easy and fast to respond to m..."
1,2020/04/11 1:27:47 pm CET,oisin.quinn,"Makoto is a cool dude, chill but engaged, they...",4,They seemed to know a good bit about life in U...,5,The conversation just felt very engaging and f...,5,"Makoto seems really sound and friendly, they s...",Yes!
2,2020/04/11 11:25:47 pm CET,mohamed.jama,very polite bot. I did not feel like there wer...,5,I did not have an opportunity to ask it questi...,4,it asked appropriate questions and the flow of...,5,polite and eager,yes
3,2020/04/12 2:06:26 pm CET,nikolaj.jasenko,Its cool,4,,3,,5,,Yes
4,2020/04/15 4:59:37 pm CET,kiowa.daly,Mokoto is a chat bot with a cheery personality,3,I dont really have an opinion on its knowledge...,4,Mokoto is very cheery and oddly puts you in a ...,5,Mokoto has a much better personality in compar...,"Yes i did, with more training i think makoto w..."
5,2020/04/15 5:06:13 pm CET,conor.beenham,"Very straight to the point, although struggled...",4,,3,,4,,it was quite pleasant
6,2020/04/15 5:59:16 pm CET,zachary.oconnor,"Seemed similar to Nasoto, but more enthusiastic",5,,5,,5,,"Yes, friendly chat bot!"
7,2020/04/15 6:28:59 pm CET,orla.keating,Bland,1,It didn't seem to have much knowledge about mo...,1,It had a set amount of phrases to say at certa...,1,"There wasn't an attitude or personality, just ...",It was ok
8,2020/04/15 9:58:38 pm CET,daire.murphy.2,Nicer,3,It had knowledgeable questions,5,Felt like an actual conversation,5,Much nicer to talk to then the last bot. Less ...,Yes
9,2020/04/16 2:49:41 pm CET,jeremiah.wangaruro,Pleasant but still quite robotic,4,You can tell it was leading the conversation,4,,4,,Yeah


In [6]:
PostNasotoDir = 'surveys-userstudy/Post-Interaction Survey (NASOTO).csv'
df_PostNasoto = pd.read_csv(PostNasotoDir, error_bad_lines=False)
df_PostNasoto['Username'] =  df_PostNasoto['Username'].str.replace(r'@ucdconnect.ie', '')

## Demographic Survey

In [7]:
PreInteractionPath = 'surveys-userstudy/Pre-Interaction Survey.csv'
df_dem = pd.read_csv(PreInteractionPath,  error_bad_lines=False)
df_dem['Username'] =  df_dem['Username'].str.replace(r'@ucdconnect.ie', '')

In [8]:
Survey4Dir = 'surveys-userstudy/Final Survey .csv'
df_survey4 = pd.read_csv(Survey4Dir)
df_survey4['Username'] =  df_survey4['Username'].str.replace(r'@ucdconnect.ie', '')
df_survey4

Unnamed: 0,Timestamp,Username,Did you notice any differences in your interactions with one chatbot over the other? If so what were they.,Which chatbot did you prefer interacting with?,Explain why you preferred interacting with this chatbot?
0,2020/04/11 12:58:39 pm CET,george.ridgway,"Yes, I felt like the Nasoto allowed for a much...",Nasoto,The conversation I had with Nasoto felt two-si...
1,2020/04/11 1:30:37 pm CET,oisin.quinn,Nasoto felt more formal. It felt like I was do...,Makoto,They were just more fun and relaxed and it was...
2,2020/04/11 11:39:16 pm CET,mohamed.jama,yes. I found the first bot to be more pleasent...,Makoto,it is not rude and the questions it asked allo...
3,2020/04/12 2:12:37 pm CET,nikolaj.jasenko,"Yeah, first one was more friendly",Makoto,More interesting to interact with due to a bri...
4,2020/04/15 5:01:59 pm CET,kiowa.daly,"Yes, the first bot seems to disregard your res...",Makoto,"Makoto is more cheery, nasoto seems to not res..."
5,2020/04/15 5:14:04 pm CET,conor.beenham,"Yes. Makoto was friendlier, and seemed more 'i...",Makoto,"resulted in a better, more interesting convers..."
6,2020/04/15 6:01:50 pm CET,zachary.oconnor,"They asked the same kinds of questions, but Ma...",Makoto,Makoto seemed nicer and more excited to talk t...
7,2020/04/15 6:46:40 pm CET,orla.keating,"Yes I talked to one longer than the other, alt...",Nasoto,It was engaging more in the conversation
8,2020/04/15 10:02:56 pm CET,daire.murphy.2,The second bots personality was much nicer to ...,Makoto,It was sounder to chat to. Better personality
9,2020/04/16 2:50:45 pm CET,jeremiah.wangaruro,Makoto understood more of what i said,Makoto,Better understanding


# Create Dataframes

In [10]:
df_allSurveys =  [df_dem, df_PostMakoto, df_PostNasoto, df_survey4, df_nasoto, df_makoto]


df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Username'],
                                            how='outer'), df_allSurveys)
df_merged


Unnamed: 0,Timestamp_x,Username,Age,Gender,Do you know what a chatbot is?,"If Yes, describe your understanding of what a chatbot is","How often do you use Siri, Alexa, Google Assistant or any other virtual assistant?",I am reserved,I tend to find fault with others,I am helpful and unselfish with others,...,NumberOfTurns_x,Utterance_x,Word_Count_x,Avg_Utt_Len_x,Duration_y,Chatbot_y,NumberOfTurns_y,Utterance_y,Word_Count_y,Avg_Utt_Len_y
0,2020/04/11 12:39:37 pm CET,george.ridgway,18-24,Male,Yes,software used to 'chat' with a user in-place o...,Somewhat,3,2,4,...,19,"[this semester, they seem to be involved with ...",108,25.684211,00:05:00,Makoto-Bot,16,"[5, Alot of my time is spent working, but I en...",124,33.0
1,2020/04/11 1:03:11 pm CET,oisin.quinn,18-24,Male,Yes,A computer program that can talk to me in a ch...,Somewhat,5,2,4,...,27,"[I like the vast majority of them!, I don't li...",162,26.074074,00:03:00,Makoto-Bot,16,"[Hi, Working, working, watching Netflix, sleep...",136,36.375
2,2020/04/11 11:15:03 pm CET,mohamed.jama,18-24,Male,Yes,A chatbot is an account where a client (user) ...,Somewhat,4,2,4,...,19,"[4, my name is mohamed, It was a very necessar...",104,25.105263,00:06:00,Makoto-Bot,21,"[mostly positive, yes, yes, I feel it is a nec...",149,28.714286
3,2020/04/12 2:03:09 pm CET,nikolaj.jasenko,18-24,Male,Yes,Chat with an AI,Daily,2,4,4,...,30,"[A few i said, 4, Yes, Boring and repetitive, ...",76,9.833333,00:01:00,Makoto-Bot,9,"[4, Yes, Very lazy, Relates to my career path,...",17,6.777778
4,2020/04/15 11:26:28 am CET,rahul,18-24,Male,Yes,,Daily,4,1,5,...,14,"[A lot of real life application, Rahul, 4, Spa...",24,7.357143,00:01:00,Makoto-Bot,9,"[Yeah, Rahul, sup, Theyre good, could be bette...",17,8.333333
5,2020/04/15 4:51:11 pm CET,kiowa.daly,18-24,Male,Yes,A application or AI that responds to questions...,Somewhat,2,2,4,...,10,[our work is compared to master students and p...,33,15.3,00:01:00,Makoto-Bot,10,[its easy becasue of there only being one form...,37,15.5
6,2020/04/15 5:01:23 pm CET,conor.beenham,18-24,Male,Yes,a software application used to create a chat c...,Daily,3,4,4,...,23,"[Ethical hacking, unable to access campus, esp...",98,22.217391,00:03:00,Makoto-Bot,16,"[5, campus, interesting topic and a good balan...",37,12.875
7,2020/04/15 5:22:18 pm CET,zachary.oconnor,18-24,Male,Yes,A program that takes questions in the form of...,Somewhat,4,2,5,...,34,[My least favourite is probably Connectionist ...,281,37.558824,00:05:00,Makoto-Bot,17,"[Probably deep learning, sure, Yes, I use the ...",114,30.176471
8,2020/04/15 6:16:24 pm CET,orla.keating,35-39,Female,Yes,a program that replies usually through text to...,Never,4,3,4,...,24,"[I would use some fairly often, They're good, ...",107,18.5,00:06:00,Makoto-Bot,21,"[yes, Órla, 5, They're all terrible, ok, Game ...",63,12.761905
9,2020/04/15 9:39:52 pm CET,daire.murphy.2,18-24,Male,Yes,Software the you can have a conversation simil...,Never,1,4,4,...,19,"[No, Felt it was the right thing to do, Campus...",64,13.0,00:04:00,Makoto-Bot,24,"[No, Spatial info systems, No, Yes, I like it,...",44,8.75


In [11]:
df_merged['Chatbot_binary'] = pd.np.where(df_merged['Which chatbot did you prefer interacting with?'].str.contains("Makoto"), 0, 1)



The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead



# Anonymise username

In [12]:
df_merged.drop(columns=["Username"], inplace=True)
df_merged.to_pickle("./df_merged.pkl")

In [13]:
df_dem.drop(columns=["Username"], inplace= True)
df_dem.to_pickle("./df_dem.pkl")

In [14]:
frames = [df_nasoto, df_makoto]
df_chatbotdata =  pd.concat(frames)
df_chatbotdata.drop(columns=["Username"], inplace= True)
df_chatbotdata.to_pickle("./df_chatbotdata.pkl")