# Import

In [1]:
import configparser as cp
import os

import numpy as np
import pandas as pd

# Data

In [2]:
# Read the input/output file paths used by this notebook from the project
# configuration file (resolved relative to the current working directory).
config = cp.RawConfigParser()
# read() silently skips files it cannot open and returns the list of files it
# actually parsed; without this check a missing config.txt only surfaces later
# as a confusing NoSectionError from the first config.get() call.
if not config.read(r'config.txt'):
    raise FileNotFoundError(
        "config.txt could not be read from the current working directory")

# Read for completeness; not referenced again in the cells of this notebook.
proc_file = config.get('main', 'proc_file')
proc_output = config.get('liwc', 'proc_output')

# Input file from the 'corpus' section and its matching output from the
# 'liwc' section, one row set per speaker.
speaker_concat_file = config.get('corpus', 'speaker_concat_file')
speaker_concat_output = config.get('liwc', 'speaker_concat_output')

# Same pair of files, one row set per (speaker, listener) conversation.
conv_concat_file = config.get('corpus', 'conv_concat_file')
conv_concat_output = config.get('liwc', 'conv_concat_output')

In [3]:
# Function-word category columns read from the LIWC output files; together
# they form the style vector compared by the LSM score.
fun_cat_lst = [
    'prep', 'article', 'auxverb', 'adverb',
    'conj', 'ppron', 'ipron', 'negate',
]

In [4]:
# --- Speaker level -----------------------------------------------------------
# Speaker labels from the corpus file, keyed by 'Row ID'.
df_speaker_concat = pd.read_csv(
    speaker_concat_file,
    usecols=['Row ID', 'speaker'],
    index_col='Row ID',
)
# Function-word columns from the matching LIWC output, keyed the same way.
df_speaker_func = pd.read_csv(
    speaker_concat_output,
    usecols=['Row ID', *fun_cat_lst],
    index_col='Row ID',
)
# Attach the function-word columns to each speaker row, then index by speaker.
df_speaker_style = (
    df_speaker_concat
    .join(df_speaker_func, on='Row ID')
    .set_index('speaker')
)

# --- Conversation level ------------------------------------------------------
# Speaker/listener labels from the corpus file, keyed by 'Row ID'.
df_conv_concat = pd.read_csv(
    conv_concat_file,
    usecols=['Row ID', 'speaker', 'listener'],
    index_col='Row ID',
)
# Function-word columns from the matching LIWC output, keyed the same way.
df_conv_func = pd.read_csv(
    conv_concat_output,
    usecols=['Row ID', *fun_cat_lst],
    index_col='Row ID',
)
# Attach the function-word columns, then index by the (speaker, listener) pair.
df_conv_style = (
    df_conv_concat
    .join(df_conv_func, on='Row ID')
    .set_index(['speaker', 'listener'])
)

In [5]:
# Row labels of the two style tables, in row order.
tup_lst = df_conv_style.index.tolist()         # (speaker, listener) tuples
speaker_lst = df_speaker_style.index.tolist()  # speaker labels

# Results

### Generate the LSM score between two vectors

In [6]:
def get_lsm(V1, V2, eps=0.0001):
    """Return the mean per-category style-matching score of two vectors.

    For every pair of corresponding values ``a`` and ``b`` the score is
    ``1 - |a - b| / (a + b + eps)``; the result is the arithmetic mean of
    these scores over all positions.

    Parameters
    ----------
    V1, V2 : iterable of numbers
        The two vectors to compare, position by position. They must have the
        same length.
    eps : float, optional
        Small constant added to the denominator so that a pair of zeros does
        not divide by zero. Defaults to 0.0001.

    Returns
    -------
    numpy.float64
        Mean score. ``nan`` (with a NumPy RuntimeWarning) for empty vectors.

    Raises
    ------
    ValueError
        If the two vectors differ in length. ``zip`` would otherwise drop the
        surplus values silently and compare misaligned categories.
    """
    # Materialise once so generators are supported and lengths can be checked.
    vals1, vals2 = list(V1), list(V2)
    if len(vals1) != len(vals2):
        raise ValueError(
            "get_lsm expects vectors of equal length, got %d and %d"
            % (len(vals1), len(vals2)))

    lsm_lst = [1 - abs(a - b) / (a + b + eps) for a, b in zip(vals1, vals2)]
    return np.mean(lsm_lst)

# Score every (speaker, listener) conversation against the speaker's own
# row in the speaker-level table.
#
# The style vectors are selected by column name (fun_cat_lst) rather than as
# "every column of the row". This keeps both vectors aligned category by
# category even if the two input files order their columns differently, and
# keeps the cell re-runnable: once 'similarity_with_user_baseline' has been
# inserted at position 0, a bare `.loc[tup].tolist()` would include it and
# shift every category by one.
lsm_dct = {}
for tup in tup_lst:
    (speaker, listener) = tup
    V1 = df_conv_style.loc[tup, fun_cat_lst].tolist()
    V2 = df_speaker_style.loc[speaker, fun_cat_lst].tolist()
    lsm_dct[(speaker, listener)] = get_lsm(V1, V2)

# insert() raises if the column already exists, so drop the result of a
# previous run of this cell first.
if 'similarity_with_user_baseline' in df_conv_style.columns:
    df_conv_style = df_conv_style.drop(columns='similarity_with_user_baseline')
# Dicts preserve insertion order, so the values follow the row order of
# df_conv_style (tup_lst was taken from its index).
df_conv_style.insert(0, 'similarity_with_user_baseline', list(lsm_dct.values()))

### Final step: per-speaker mean and standard deviation of the similarity scores

In [7]:
# Keep only the similarity score; the (speaker, listener) index is retained.
df_style = df_conv_style['similarity_with_user_baseline'].to_frame()

# Aggregate the score over each speaker's conversations. pandas' std() uses
# ddof=1 by default, so a speaker with a single conversation gets NaN.
df_style_std = df_style.groupby(level='speaker').std()
df_style_mean = df_style.groupby(level='speaker').mean()

def get_speaker_mean(speaker):
    """Look up the row of ``df_style_mean`` labelled ``speaker``."""
    mean_row = df_style_mean.loc[speaker]
    return mean_row

def get_speaker_std(speaker):
    """Look up the row of ``df_style_std`` labelled ``speaker``."""
    std_row = df_style_std.loc[speaker]
    return std_row

# Turn the (speaker, listener) index into ordinary columns so the speaker
# label can be used as a lookup key.
df_style = df_style.reset_index()

# Broadcast each speaker's aggregate onto that speaker's rows. Series.map with
# a Series argument looks every speaker label up in the aggregate's index and
# yields a plain Series of scalars. The previous apply()-based lookup returned
# one single-element Series per row, which pandas expanded into a DataFrame
# that was then assigned to a single column.
df_style['mean'] = df_style['speaker'].map(df_style_mean['similarity_with_user_baseline'])
df_style['std'] = df_style['speaker'].map(df_style_std['similarity_with_user_baseline'])

df_style = df_style.set_index('speaker')

# Save

In [10]:
# Display the final table: one row per (speaker, listener) pair with the
# similarity score and the speaker's mean and standard deviation.
df_style

Unnamed: 0_level_0,listener,similarity_with_user_baseline,mean,std
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U03RTJ7MJ68,U03S3QBFKED,0.956522,0.781268,0.239533
U03RTJ7MJ68,U03SESRETRN,0.42819,0.781268,0.239533
U03RTJ7MJ68,U0RFXPUFK,0.848507,0.781268,0.239533
U03RTJ7MJ68,U0RMHEX53,0.891854,0.781268,0.239533
U03S3QBFKED,U03RTJ7MJ68,0.87565,0.846398,0.07261
U03S3QBFKED,U03SESRETRN,0.793397,0.846398,0.07261
U03S3QBFKED,U0RFXPUFK,0.93531,0.846398,0.07261
U03S3QBFKED,U0RMHEX53,0.781233,0.846398,0.07261
U03SESRETRN,U03RTJ7MJ68,0.81113,0.83546,0.098369
U03SESRETRN,U03S3QBFKED,0.936008,0.83546,0.098369


In [9]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('analysis', exist_ok=True)
df_style.to_csv('analysis/lsm_matrix.csv')