In [25]:
# Notebook setup: select the Matplotlib backend, install LaTeX-based figure styling,
# then import the analysis libraries.
import matplotlib as mpl
# Select the Tk GUI backend. This is done before the plotting interface is imported below.
# NOTE(review): a GUI backend presumably needs a display; confirm it is wanted in a notebook.
mpl.use('TkAgg')
# Global figure style: LaTeX text rendering, large serif fonts, 15 x 9.3 figures.
# NOTE(review): 'text.latex.preamble' is given as a list of strings here; newer Matplotlib
# releases appear to expect a single string for this key -- confirm against the installed version.
rc_fonts = {"text.usetex": True, "font.size": 30, 'mathtext.default': 'regular', 'axes.titlesize': 33, "axes.labelsize": 33, "legend.fontsize": 30, "xtick.labelsize": 30, "ytick.labelsize": 30, 'figure.titlesize': 33, 'figure.figsize': (15, 9.3), 'text.latex.preamble': [
    r'\usepackage{amsmath,amssymb,bm,physics,lmodern}'], "font.family": "serif", "font.serif": "computer modern roman", }
mpl.rcParams.update(rc_fonts)
# `plt`, `KDE` and `metrics` are not used by any live (non-commented) code visible in this section.
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde as KDE
from sklearn import metrics


def produce_raw_data_sets(frac_for_testing=0.6):
    """
    Produces the training, testing, and validation datasets from the raw Excel sheets.

    Reads 'raw/good.xlsx' and 'raw/bad.xlsx'. In each sheet the first data row is discarded,
    the first column is read as the word index, and every following group of four columns
    ('from', 'to', 'speaker', 'gap') is treated as one conversation. Conversations are stacked
    into a long table and given an 'id' that keeps counting across both files, plus a 'type'
    column holding the file stem ('good' / 'bad').

    The first 31 'good' and first 8 'bad' conversations are reserved for validation. Each
    remaining conversation is randomly assigned, as a whole, to training or testing using
    numpy's global random state (call np.random.seed beforehand for a reproducible split).

    :param frac_for_testing: Float, Fractional divide between train and test. Despite its name
        it is the probability that a non-validation conversation goes to TRAINING, i.e. the
        role previously played by a hard-coded 0.6 that silently ignored this argument.
    :return: Tuple of 3 DataFrames, Training, Testing, Validation.
    """
    n_to_drop = {'good': 31, 'bad': 8}  # The number reserved for validation
    frac_for_train = frac_for_testing  # Honour the caller's value; the default matches the old constant.
    feature_cols = ['from', 'to', 'speaker', 'gap']
    width = len(feature_cols)

    conversation_id = 0  # Keeps counting across files so ids are globally unique.
    train_parts, test_parts, validate_parts = [], [], []
    for f_name, drop_num in n_to_drop.items():
        raw = pd.read_excel('raw/{}.xlsx'.format(f_name))
        body = raw.iloc[1:]  # The first data row is not conversation data.
        words = [int(x) if pd.notnull(x) else x for x in body.iloc[:, 0]]

        # Column 0 is the word index; after it, every `width` columns form one conversation.
        # Positional slicing avoids relabelling columns and the positional-axis form of `drop`,
        # which current pandas no longer accepts.
        n_conversations = (body.shape[1] - 1) // width
        chunks = []
        for k in range(n_conversations):
            start = 1 + k * width
            chunk = body.iloc[:, start:start + width].copy()
            chunk.columns = feature_cols
            chunks.append(chunk.assign(id=conversation_id))
            conversation_id += 1

        df = pd.concat(chunks, ignore_index=True)
        df = df.assign(word=words * len(chunks), type=f_name)

        # Conversations are stacked in order, so the first `drop_num` of them are the leading rows.
        n_validate_rows = len(words) * drop_num
        validate_parts.append(df.iloc[:n_validate_rows])
        df_include = df.iloc[n_validate_rows:]

        # One random draw per conversation, so a conversation is never split between the sets.
        # `isin` always yields a boolean mask, so an empty `df_include` is handled as well.
        train_ids = [cid for cid in df_include['id'].unique()
                     if np.random.random() <= frac_for_train]
        in_train = df_include['id'].isin(train_ids)
        train_parts.append(df_include[in_train])
        test_parts.append(df_include[~in_train])

    df_train, df_test, df_validate = [pd.concat(parts, ignore_index=True)
                                      for parts in (train_parts, test_parts, validate_parts)]
    return df_train, df_test, df_validate


def make_features(df):
    """
    Builds per-segment features for every conversation in a data set.

    Within one conversation (rows sharing an 'id'), consecutive rows with the same 'speaker'
    form one segment ("sentence"). Each segment is summarised as:
      - n_words:  number of rows in the segment,
      - duration: max('from') - min('from') over the segment,
      - gap:      mean of 'gap' over the segment (missing values skipped),
      - type:     the 'type' of the segment's first row.
    Segments with any missing feature, the opening segment (index 0) and segments whose
    duration is not positive are removed.

    :param df: DataFrame with columns 'id', 'speaker', 'from', 'gap' and 'type'.
    :return: Tuple (ids, dfs): the set of conversation ids, and a list with one feature
        DataFrame per id (indexed by 'sentence'), in the iteration order of that set.
    """
    ids = set(df['id'])
    dfs = []
    for conversation_id in ids:
        conversation = df[df['id'] == conversation_id]

        # A new segment starts wherever the speaker differs from the previous row. Adjacent
        # values are compared directly so that non-numeric speaker labels work too.
        speakers = conversation['speaker'].values
        starts_segment = np.zeros(len(speakers), dtype=bool)
        starts_segment[1:] = speakers[1:] != speakers[:-1]
        segment_ids = np.cumsum(starts_segment)  # 0, 0, 1, 1, 2, ... contiguous from zero

        # Groups come back sorted by segment id, so row i of `rows` describes segment i.
        rows = [
            [len(segment),
             np.ptp(segment['from'].values),
             segment['gap'].mean(),
             segment['type'].iloc[0]]
            for _, segment in conversation.groupby(segment_ids)
        ]
        d = pd.DataFrame(rows, columns=['n_words', 'duration', 'gap', 'type'])
        d.index.name = 'sentence'
        d = d.dropna()  # This way of dropping NAN might be a bit too aggressive.
        # We drop the opening sentences as they have zero gaps. If dropna() already removed
        # segment 0 (e.g. its gap is missing) there is nothing left to drop, hence 'ignore'.
        d = d.drop(0, errors='ignore')
        d = d[d['duration'] > 0]  # durations of Zero make no sense.
        dfs.append(d)
    return ids, dfs

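# NOTE: the cells below read d_train, d_test and d_validate. With these calls commented out,
# those names must already exist in the kernel; re-enable the calls to run from a fresh kernel.
#df_train, df_test, df_validate = produce_raw_data_sets()

# _, d_train = make_features(df_train)
# _, d_test = make_features(df_test)
# _, d_validate = make_features(df_validate)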

In [64]:
# Collect the per-split lists of feature frames under their split name.
# d_train, d_test and d_validate must already be defined in the kernel.
dataframes = dict(train=d_train, test=d_test, validate=d_validate)

In [74]:
# Re-index every frame of every split by (conversation, sentence), replacing the list
# entries in place.
# NOTE(review): this needs a 'conversation' column (or index level) on each frame; nothing
# visible in this notebook section creates one -- confirm where it comes from.
for name in dataframes:
    df_array = dataframes[name]
    for i in range(len(df_array)):
        df_array[i] = df_array[i].reset_index().set_index(['conversation', 'sentence'])

In [115]:
# Stack the per-conversation frames of each split into one frame per split.
d_train, d_test, d_validate = (pd.concat(dataframes[split])
                               for split in ('train', 'test', 'validate'))

# Snapshot of the stacked frames, still indexed by (conversation, sentence).
dfs = [d_train, d_test, d_validate]

# Rename the 'sentence' index level to 'segment' on all three splits.
d_train, d_test, d_validate = (
    stacked.reset_index()
           .rename(columns={'sentence': 'segment'})
           .set_index(['conversation', 'segment'])
    for stacked in dfs
)
    

In [72]:
# Write each processed split to CSV under ./processed (file names carry no extension).
for split_frame, target in ((d_train, './processed/d_train'),
                            (d_test, './processed/d_test'),
                            (d_validate, './processed/d_validate')):
    split_frame.to_csv(target)

NameError: name 'd_train' is not defined

In [85]:
# Load the two real recordings, adapt them to the schema make_features expects, and
# pickle the resulting feature table of each one.
child_interview = pd.read_csv('raw/Child_interview.csv')
cbt_session = pd.read_csv('raw/cbt_session.csv')

real_dfs = dict(child_interview=child_interview, cbt_session=cbt_session)

for name in list(real_dfs):
    df = real_dfs[name]
    print(name, 0)

    # The raw frames are adapted in place: canonical column names, one conversation id
    # for the whole recording, and a placeholder label.
    df.rename(columns={'TimeFrom': 'from',
                       'TimeTo': 'to',
                       'Gap between speakers': 'gap',
                       'Speaker': 'speaker'}, inplace=True)
    df['id'] = 1
    df['type'] = 'Unknown'

    # Each recording is a single conversation, so only the first feature frame is kept;
    # the placeholder 'type' column is removed before saving.
    features = make_features(df)[1][0]
    features = features.drop(columns=['type'])
    real_dfs[name] = features

    features.to_pickle('./processed/real/{}'.format(name))


child_interview 0
cbt_session 0
