In [None]:
# Imports

import json
import statistics
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
# Module-level tokenizer that splits text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# NLTK resources fetched for this notebook (sentiment lexicon, POS tagger
# model, sentence/word tokenizer model).
for nltk_resource in ('vader_lexicon', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(nltk_resource)

def read_json(fname):
    """Load and return the JSON document stored in the file ``fname``.

    The file is decoded as UTF-8 (the encoding JSON text is exchanged in)
    instead of the platform's locale default, so non-ASCII content reads the
    same on every machine.

    Raises whatever ``open`` / ``json.load`` raise for a missing file or
    malformed JSON.
    """
    with open(fname, encoding="utf-8") as json_file:
        return json.load(json_file)

In [None]:

def process():
    """Compute per-participant conversation features and save them to CSV.

    Reads ``./data/data.csv`` (one row per participant),
    ``./data/chatlogs.csv`` and ``./data/messages.csv``, adds one column per
    feature (see the assignments at the bottom) and writes the result to
    ``./data/data_with_conversation_stats.csv``. Returns nothing.

    Conventions:
      * A row is in the "real person" condition when ``condition`` is
        ``'another person'`` and ``condition_2`` equals the string ``'0'``.
        Every other row is treated as an AI conversation, in which the
        partner's messages are the ones with a missing ``user_id``.
      * Message contents that are not strings (e.g. NaN) are ignored.
      * Averages use ``-1`` as the "no data" sentinel.
      * When the room of a real-person row cannot be resolved, ``NA <workerId>``
        is printed and the feature is ``None`` (``"[]"`` for the sentiment and
        response-time list columns, hence ``-1`` for their averages).
    """
    df = pd.read_csv('./data/data.csv')

    chatlogs = pd.read_csv('./data/chatlogs.csv')
    messages = pd.read_csv('./data/messages.csv')

    sia = SentimentIntensityAnalyzer()

    # Seconds of "writing time" per word deducted from responses to the AI.
    ai_seconds_per_word = 0.6

    def _is_real_person(row):
        """True when the participant chatted with another human."""
        # NOTE(review): compares condition_2 with the *string* '0'; if pandas
        # parses that column as numeric this never matches -- confirm the dtype.
        return row['condition'] == 'another person' and row['condition_2'] == '0'

    def _parse_timestamp(raw):
        """Parse a ``date_added`` value, with or without fractional seconds."""
        try:
            return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S.%f+00")
        except ValueError:
            # Fallback for timestamps that carry no fractional-seconds part.
            return datetime.strptime(raw, "%Y-%m-%d %H:%M:%S+00")

    def _mean_or_sentinel(values):
        """Mean of ``values``, or -1 when there are none."""
        return statistics.mean(values) if len(values) > 0 else -1

    def _sender_contents(row, sender):
        """String message contents for ``sender`` ('human' or 'ai'/partner).

        Returns ``None`` (after printing ``NA <workerId>``) when the row is in
        the real-person condition and its room or usernames cannot be found.
        """
        if not _is_real_person(row):
            partner_rows = pd.isna(messages['user_id'])
            sender_rows = partner_rows if sender == 'ai' else ~partner_rows
            selected = messages[(messages['worker_id'] == row['mturk_id']) & sender_rows]
        else:
            try:
                room_id = list(set(messages[messages['worker_id'] == row['workerId']]['room_id']))[0]
                usernames = list(set(
                    messages[messages['room_id'] == room_id]['participant_username'].values.tolist()))
                # NOTE(review): which username lands at index 0 ("human") vs 1
                # ("partner") follows set iteration order, which is not stable
                # across Python processes for strings -- confirm this mapping.
                user = usernames[1 if sender == 'ai' else 0]
            except IndexError:  # no room for this worker, or fewer than two usernames
                print("NA", row['workerId'])
                return None
            selected = messages[(messages['room_id'] == room_id) & (messages['username'] == user)]

        return [text for text in selected['content'].values.tolist() if type(text) == str]

    def _word_counts(contents):
        """Number of space-separated words in each message."""
        return [len(text.split(" ")) for text in contents]

    def _compound_scores(row, sender):
        """VADER compound score per message; empty when the room is unresolved."""
        contents = _sender_contents(row, sender)
        if contents is None:
            return []
        return [sia.polarity_scores(text)['compound'] for text in contents]

    # --- Row-wise feature builders, one per (feature, sender) pair ---

    def _joined_messages(sender):
        def feature(row):
            contents = _sender_contents(row, sender)
            return None if contents is None else " ".join(contents)
        return feature

    def _message_count(sender):
        def feature(row):
            contents = _sender_contents(row, sender)
            return None if contents is None else len(contents)
        return feature

    def _word_counts_json(sender):
        def feature(row):
            contents = _sender_contents(row, sender)
            return None if contents is None else json.dumps(_word_counts(contents))
        return feature

    def _avg_word_count(sender):
        def feature(row):
            contents = _sender_contents(row, sender)
            return None if contents is None else _mean_or_sentinel(_word_counts(contents))
        return feature

    def _sentiments_json(sender):
        def feature(row):
            return json.dumps(_compound_scores(row, sender))
        return feature

    def _avg_sentiment(sender):
        def feature(row):
            return _mean_or_sentinel(_compound_scores(row, sender))
        return feature

    def get_chatlog(row):
        """Content of the first chatlog stored for this participant, if any."""
        matches = chatlogs[chatlogs['worker_id'] == row['mturk_id']]
        if len(matches) == 0:
            return None
        return matches['content'].values[0]

    def _response_times(row):
        """Seconds the participant took to answer each partner turn.

        Only the first human message after a partner message counts; human
        messages sent before the partner's first message are skipped. In the
        AI condition the AI's assumed writing time is deducted.
        """
        real_person = _is_real_person(row)
        if real_person:
            try:
                own_rows = messages[messages['worker_id'] == row['workerId']]
                room_id = list(set(own_rows['room_id']))[0]
                user = list(set(own_rows['username']))[0]
            except IndexError:  # no messages recorded for this worker
                print("NA", row['workerId'])
                return []
            msgs = messages[messages['room_id'] == room_id]
        else:
            msgs = messages[messages['worker_id'] == row['mturk_id']]

        response_times = []
        last_partner_timestamp = None
        last_partner_message = None
        pair_started = False
        last_message_is_human = False

        for _, msg in msgs.iterrows():
            if real_person:
                from_partner = msg['username'] != user
            else:
                from_partner = pd.isna(msg['user_id'])

            if from_partner:
                pair_started = True
                last_partner_timestamp = _parse_timestamp(msg['date_added'])
                last_partner_message = msg['content']
                last_message_is_human = False
                continue

            if not pair_started:  # human wrote before the partner did
                continue

            if not last_message_is_human:
                response_time = (_parse_timestamp(msg['date_added'])
                                 - last_partner_timestamp).total_seconds()
                if not real_person:
                    # Deduct the AI's writing time; a non-string content
                    # (e.g. NaN) counts as zero words instead of crashing.
                    if isinstance(last_partner_message, str):
                        ai_words = len(last_partner_message.split(" "))
                    else:
                        ai_words = 0
                    response_time -= ai_words * ai_seconds_per_word
                response_times.append(response_time)

            last_message_is_human = True

        return response_times

    def get_response_times(row):
        return json.dumps(_response_times(row))

    def get_avg_response_time(row):
        return _mean_or_sentinel(_response_times(row))

    df['ResponseTimes'] = df.apply(get_response_times, axis=1)
    df['AppendedHumanMessages'] = df.apply(_joined_messages('human'), axis=1)
    df['AppendedAIMessages'] = df.apply(_joined_messages('ai'), axis=1)
    df['FullChatlog'] = df.apply(get_chatlog, axis=1)
    df['HumanMessageCount'] = df.apply(_message_count('human'), axis=1)
    df['AIMessageCount'] = df.apply(_message_count('ai'), axis=1)
    df['HumanWordCounts'] = df.apply(_word_counts_json('human'), axis=1)
    df['AIWordCounts'] = df.apply(_word_counts_json('ai'), axis=1)
    df['AvgHumanWordCount'] = df.apply(_avg_word_count('human'), axis=1)
    df['AvgAIWordCount'] = df.apply(_avg_word_count('ai'), axis=1)
    df['AICompoundSentiments'] = df.apply(_sentiments_json('ai'), axis=1)
    df['HumanCompoundSentiments'] = df.apply(_sentiments_json('human'), axis=1)
    df['AvgAICompoundSentiment'] = df.apply(_avg_sentiment('ai'), axis=1)
    df['AvgHumanCompoundSentiment'] = df.apply(_avg_sentiment('human'), axis=1)
    df['AvgResponseTime'] = df.apply(get_avg_response_time, axis=1)

    df.to_csv('./data/data_with_conversation_stats.csv')


### Generate Conversation Features

In [None]:
# Run the feature pipeline; it writes ./data/data_with_conversation_stats.csv.
process()

### Read Generated File Containing Conversation Features

In [None]:
# Reload the participant table with the conversation features written by process().
# NOTE(review): process() saves the file together with its index, so the
# reloaded frame presumably carries an extra unnamed index column -- confirm.
df = pd.read_csv('./data/data_with_conversation_stats.csv')

# The Code Below Is for Pairwise Analyses (e.g., Sentiment Similarity)

In [None]:
############## IMPORTS ##############

import csv
import statistics
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import rpy2.robjects as R
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr, isinstalled
from sklearn.linear_model import LinearRegression

############## R SETUP ##############

pandas2ri.activate()

utils = importr('utils')
utils.chooseCRANmirror(ind=1) # select the first mirror in the list
# Install the R package only when it is missing, so re-running this cell does
# not hit the network and reinstall it every time.
if not isinstalled('effsize'):
    utils.install_packages('effsize')
effsize = importr('effsize')

base = importr('base')
stats = importr("stats")

############## NLP MODELS ##############

# Alternative spaCy pipeline: "en_core_web_trf".
nlp = spacy.load("en_core_web_lg")

nltk.download('averaged_perceptron_tagger')
# Tagset: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# Tagset: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [None]:
############## READ THE DATA ##############

# Per-sender result lists, keyed by sender type.
results = {"human": [], "ai": [], "all": []}

sender_type = "human" # ai, human, all
label_type = "willing_friend"  # willing_friend, willing_romantic, wtp

# Dataset
# NOTE: this binds a second name to the same DataFrame object as `df` (no copy),
# so columns added through `data_all` also appear on `df`.
data_all = df

# Per-sender message lists.
all_messages = {'ai': [], 'human': []}

In [None]:
############## CONSTRUCT LSM INSTANCE ##############
import math
class LinguisticStyleMatching:
    """Similarity features between a participant and their conversation partner.

    Two families of features are written into ``self.data``:

    * Turn-by-turn similarities (spaCy vector similarity, turn length, VADER
      valence) over adjacent message pairs -- ``calculate_turn_similarities``.
    * Linguistic style matching over part-of-speech tag proportions of the
      whole conversation -- ``tokenize_raw_data`` -> ``count_tokens`` ->
      ``calculate_style_matching`` -> ``get_lsms`` (run in that order).

    All similarity scores are on a 0-100 scale; ``-1`` marks "no data".
    """

    # Participant-level DataFrame the feature columns are written into.
    data = None
    # NLTK VADER sentiment analyzer.
    sia = None
    # Word tokenizer used for turn-length comparison.
    tokenizer = None

    def __init__(self, data=None):
        """Create the analyzer.

        data: participant DataFrame to annotate. When omitted, the notebook
            global ``data_all`` is used (the original behaviour).
        """
        self.data = data_all if data is None else data
        self.sia = SentimentIntensityAnalyzer()
        self.tokenizer = RegexpTokenizer(r'\w+')

    # Return each AI and human message separately, to compare later
    def get_paired_messages(self, messages_path='./data/messages.csv'):
        """Collect adjacent (previous message, reply) pairs per worker.

        Messages are read from ``messages_path`` and processed in file order;
        consecutive rows with the same ``room_id`` form one conversation. Of a
        run of consecutive messages by the same side only the first is kept.

        In a room with a single ``participant_username`` the partner is the
        user named ``'Jessie'`` (the AI); otherwise the second distinct
        ``participant_username`` is treated as the partner ("ai") side.

        Returns ``{worker_id: {'human-ai': [[human_msg, ai_msg], ...],
        'ai-human': [[ai_msg, human_msg], ...]}}`` for every ``worker_id`` that
        appears in a room.
        """
        data = pd.read_csv(messages_path)
        all_pairs = {}

        def participants_of(room):
            # NOTE(review): set iteration order decides which username is
            # index 1 (the "ai" side) in two-person rooms and is not stable
            # across Python processes for strings -- confirm this mapping.
            return list(set(data[data['room_id'] == room]['participant_username'].values.tolist()))

        def save_room(room, pairs):
            # Store the pairs under every worker id seen in the room.
            for worker in list(set(data[data['room_id'] == room]['worker_id'].values.tolist())):
                all_pairs[worker] = pairs

        curr_room = None
        participants = []
        curr_message_pairs = {'human-ai': [], 'ai-human': []}
        last_message = {'sender_type': '', 'content': ''}

        for _, message in data.iterrows():
            if curr_room is None:
                curr_room = message['room_id']
                participants = participants_of(curr_room)

            if curr_room != message['room_id']:  # Room changed, save
                save_room(curr_room, curr_message_pairs)

                curr_room = message['room_id']
                # The participant lookup only depends on the room, so it is
                # done once per room instead of once per message.
                participants = participants_of(curr_room)

                # Reset
                curr_message_pairs = {'human-ai': [], 'ai-human': []}
                last_message = {'sender_type': '', 'content': ''}

            if len(participants) == 0:
                continue

            partner_name = 'Jessie' if len(participants) == 1 else participants[1]
            sender_type = 'ai' if message['username'] == partner_name else 'human'

            # Take only the first message, in case there are consecutive ones
            if last_message['sender_type'] == sender_type:
                continue

            # A previous message from the other side exists: save the pair.
            if last_message['sender_type']:
                pair_key = '{}-{}'.format(last_message['sender_type'], sender_type)
                curr_message_pairs[pair_key].append([last_message['content'], message['content']])

            last_message = {'sender_type': sender_type, 'content': message['content']}

        # Save the final room after the loop. Doing this inside the loop on the
        # last row lost the whole room whenever that row was skipped above.
        if curr_room is not None:
            save_room(curr_room, curr_message_pairs)

        return all_pairs

    # Normalize the data to interval [0, 1], before calculating similarity
    # zi = (xi - min(x)) / (max(x) - min(x))
    def normalize_val(self, num):
        """Map a value from [-1, 1] onto [0, 1]."""
        return (num + 1) / 2

    # Calculate average turn-by-turn similarities and add it to self.data
    def calculate_turn_similarities(self):
        """Add ``HumanAISim*``, ``AIHumanSim*`` and ``AvgSim*`` columns.

        For each worker and each pair direction the mean similarity over all
        message pairs is computed for three measures (``Spacy``,
        ``TurnLength``, ``Valence``); rows are matched on ``mturk_id``.
        A direction with fewer than two usable pairs gets ``-1``.

        Requires the notebook global ``nlp`` (a loaded spaCy pipeline).
        """
        sim_types = ['Spacy', 'TurnLength', 'Valence']

        # Create Columns
        for sim_type in sim_types:
            self.data['HumanAISim{}'.format(sim_type)] = None
            self.data['AIHumanSim{}'.format(sim_type)] = None
            self.data['AvgSim{}'.format(sim_type)] = None

        paired_messages = self.get_paired_messages()

        for worker_id, pairs in paired_messages.items():  # All subjects
            sim = {sim_type: {} for sim_type in sim_types}
            for pair_type, messages in pairs.items():  # 'human-ai' and 'ai-human'
                similarities = {sim_type: [] for sim_type in sim_types}
                for pair in messages:  # Iterate each pair
                    try:
                        first, second = str(pair[0]), str(pair[1])

                        spacy_similarity = 100 * nlp(first).similarity(nlp(second))

                        turn_similarity = self.calculate_sim(
                            len(self.tokenizer.tokenize(first)),
                            len(self.tokenizer.tokenize(second)))

                        # Normalize polarity to [0, 1] before comparing
                        valence_similarity = self.calculate_sim(
                            self.normalize_val(self.sia.polarity_scores(first)['compound']),
                            self.normalize_val(self.sia.polarity_scores(second)['compound']))
                    except Exception as e:
                        print(e)
                        continue

                    # Append only once all three succeeded so the three lists
                    # always describe the same set of pairs.
                    similarities['Spacy'].append(spacy_similarity)
                    similarities['TurnLength'].append(turn_similarity)
                    similarities['Valence'].append(valence_similarity)

                # Calculate mean and std. deviation between pairs
                # NOTE(review): a single pair yields -1 for the mean as well,
                # although its mean is defined -- confirm this is intended.
                for sim_type, values in similarities.items():
                    enough = len(values) > 1
                    sim[sim_type][pair_type] = {
                        'mean': statistics.mean(values) if enough else -1,
                        'sd': statistics.stdev(values) if enough else -1,
                    }

            is_worker = self.data['mturk_id'] == worker_id
            for sim_type in sim_types:
                human_ai_mean = sim[sim_type]['human-ai']['mean']
                ai_human_mean = sim[sim_type]['ai-human']['mean']
                self.data.loc[is_worker, 'HumanAISim{}'.format(sim_type)] = human_ai_mean
                self.data.loc[is_worker, 'AIHumanSim{}'.format(sim_type)] = ai_human_mean
                # NOTE(review): when one direction is the -1 sentinel it is
                # averaged in as if it were a real score -- confirm.
                self.data.loc[is_worker, 'AvgSim{}'.format(sim_type)] = statistics.mean(
                    [ai_human_mean, human_ai_mean])

    def tokenize_raw_data(self):
        """Add ``HumanTags`` / ``AITags``: POS-tagged tokens of the joined messages.

        Reads ``AppendedHumanMessages`` / ``AppendedAIMessages``. Missing
        conversations (NaN, None or any other non-string) become ``""``.
        """
        def tokenize(conversation):
            if not isinstance(conversation, str):
                return ""
            return nltk.pos_tag(nltk.word_tokenize(conversation))

        self.data.loc[:, 'HumanTags'] = self.data.loc[:, 'AppendedHumanMessages'].apply(tokenize)
        self.data.loc[:, 'AITags'] = self.data.loc[:, 'AppendedAIMessages'].apply(tokenize)

    def count_tokens(self):
        """Add ``HumanTokenCounts`` / ``AITokenCounts``: ``{POS tag: count}`` dicts."""
        def count_tokens(tokens):
            # Each token is a (word, tag) pair; count by tag.
            return dict(Counter(token[1] for token in tokens))

        self.data.loc[:, 'HumanTokenCounts'] = self.data.loc[:, 'HumanTags'].apply(count_tokens)
        self.data.loc[:, 'AITokenCounts'] = self.data.loc[:, 'AITags'].apply(count_tokens)

    def calculate_style_matching(self):
        """Add ``StyleMatching``: ``{tag: similarity}`` per row.

        For every tag in ``get_all_token_names()`` the share of that tag among
        all non-"." tags is computed for both sides and compared with
        ``calculate_sim``. A row where either side has no tags gets ``{}``.
        """
        def calculate(human_tags, ai_tags):
            # Ignore punctuation. Work on filtered copies so the dicts stored
            # in the *TokenCounts columns are not modified.
            human_counts = {tag: count for tag, count in human_tags.items() if tag != "."}
            ai_counts = {tag: count for tag, count in ai_tags.items() if tag != "."}

            human_total_tags = sum(human_counts.values())
            ai_total_tags = sum(ai_counts.values())
            matchings = {}
            for k in self.get_all_token_names():  # Calculate only function words
                try:
                    human_prop = human_counts.get(k, 0) / human_total_tags
                    ai_prop = ai_counts.get(k, 0) / ai_total_tags
                except ZeroDivisionError:
                    continue
                matchings[k] = self.calculate_sim(human_prop, ai_prop)

            return matchings

        self.data['StyleMatching'] = self.data.apply(
            lambda x: calculate(x.HumanTokenCounts, x.AITokenCounts), axis=1)

    def get_all_token_names(self):
        """POS tags that take part in style matching."""
        return ['CC', 'DT', 'PRP', 'PRP$', 'WP', 'IN']

    def get_all_tokens(self):
        """Series of ``{tag: similarity}`` with missing tags filled in as 0."""
        tokens = self.get_all_token_names()

        def get_token(row):  # Return tokens
            return {token: row.get(token, 0) for token in tokens}

        return self.data.loc[:, 'StyleMatching'].apply(get_token)

    # Mean of all common tokens
    def get_lsms(self):
        """Add and return ``AvgSimStyleMatching``: mean tag similarity per row.

        Rows without any tag similarity get ``-1``.
        """
        token_names = self.get_all_token_names()

        def get_avg(row):
            avg = []
            for token in token_names:
                try:
                    avg.append(row[token])
                except KeyError:
                    continue

            if len(avg) == 0:
                return -1
            return statistics.mean(avg)

        self.data['AvgSimStyleMatching'] = self.data.loc[:, 'StyleMatching'].apply(get_avg)
        return self.data['AvgSimStyleMatching']

    # Mean LSM among all conversations
    def get_mean_lsm(self):
        """Mean and sample standard deviation of each tag's similarity.

        Returns ``{tag: {'Mean': ..., 'Std. Dev': ...}}``. ``Mean`` is ``-1``
        when no row has the tag; ``Std. Dev`` is ``-1`` with fewer than two
        values (instead of raising).
        """
        tokens = self.get_all_token_names()
        token_avgs = {token: [] for token in tokens}
        for index, row in self.data.iterrows():
            for token in tokens:
                try:
                    token_avgs[token].append(row['StyleMatching'][token])
                except KeyError:
                    continue

        return {
            k: {
                'Mean': sum(v) / len(v) if len(v) > 0 else -1,
                'Std. Dev': statistics.stdev(v) if len(v) > 1 else -1,
            }
            for k, v in token_avgs.items()
        }

    # Returns the similarity between two numbers, i.e., similarity = 1 - (|num1 - num2|/(num1 + num2))
    def calculate_sim(self, num1, num2):
        """Similarity of two numbers as ``100 * (1 - |a - b| / (a + b))``.

        Returns 100 when the two numbers sum to zero.
        """
        if num1 + num2 == 0:
            return 100
        return 100 * (1 - (abs(num1 - num2) / (num1 + num2)))



In [None]:

# Build the analyzer; its constructor binds the notebook-level `data_all` frame as lsm.data.
lsm = LinguisticStyleMatching()

In [None]:
############### Setup For Linguistic Style Matching ###############

# Order matters: each step reads the columns written by the previous one
# (HumanTags/AITags -> *TokenCounts -> StyleMatching -> AvgSimStyleMatching).
lsm.tokenize_raw_data()
lsm.count_tokens()
lsm.calculate_style_matching()
# The return values of the next two calls are discarded here.
lsm.get_mean_lsm()
lsm.get_all_tokens()
lsms = lsm.get_lsms()

In [None]:
# Display the mean and standard deviation of the style-matching score per POS tag.
lsm.get_mean_lsm()

In [None]:
# Re-reads ./data/messages.csv and runs the spaCy pipeline on every message pair.
lsm.calculate_turn_similarities()  # Calculate turn-by-turn similarities

In [None]:
# Display the four averaged similarity features per participant.
lsm.data[['workerId', 'AvgSimSpacy', 'AvgSimTurnLength', 'AvgSimValence', 'AvgSimStyleMatching']]

### Save the Data

In [None]:
# Merge the similarity features into our original dataset.
# `lsm.data` can be the very same DataFrame object as `df` (it is bound via
# `data_all = df`), in which case `df` already holds these columns and a plain
# merge would emit suffixed duplicates (AvgSimSpacy_x / AvgSimSpacy_y, ...).
# Dropping any existing copies first yields one clean column per feature and
# makes this cell safe to re-run.
similarity_columns = ['AvgSimSpacy', 'AvgSimTurnLength', 'AvgSimValence', 'AvgSimStyleMatching']
similarity_features = lsm.data[['workerId'] + similarity_columns]
df = df.drop(columns=similarity_columns, errors='ignore').merge(similarity_features, on='workerId')
df.to_csv("./data/final_data.csv")  ### Save similarity data