In [1]:
import os
# Move the working directory up three levels so that paths written relative to that
# directory (such as the corpus path beginning with "convokit/") resolve.
# Guarded so the cell can be re-run safely: once a "convokit" directory is visible from
# the current directory the move has already happened, and repeating it would climb too far.
if not os.path.isdir('convokit'):
    os.chdir('../../..')

In [2]:
import convokit
import pandas as pd
import numpy as np

In [3]:
# Directory of the corpus to load, given relative to the current working directory.
# Alternative corpus location, left commented out:
# CORPUS_DIR = "convokit/thread_generator/annotated-fake-trajectory"
CORPUS_DIR = "convokit/tensor_decomposer/experiments/reddit-trajectory-subset-annotated"

In [4]:
from convokit import Corpus
from convokit import Clusterer
from sklearn.preprocessing import StandardScaler
from convokit import HyperConvo

In [5]:
# Build the Corpus object from the directory named by CORPUS_DIR.
corpus = Corpus(CORPUS_DIR)

## Hyperconvo-20

In [6]:
# Print the corpus-level counts (speakers, utterances, conversations).
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


In [7]:
# Configure a HyperConvo transformer with prefix_len=20; its output is read back later from
# each conversation's metadata under the feat_name key 'hyperconvo-20'.
# invalid_val=-1 is presumably the placeholder written for undefined features — confirm
# against the HyperConvo documentation.
hc20 = HyperConvo(prefix_len=20, feat_name='hyperconvo-20', invalid_val=-1)

In [8]:
# Run the transformer over the corpus; the Corpus it returns is echoed as the cell output.
hc20.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x129f7e290>

In [9]:
# Map each conversation id to its stored 'hyperconvo-20' metadata entry.
data = {
    conversation.id: conversation.meta['hyperconvo-20']
    for conversation in corpus.iter_conversations()
}

In [10]:
# pd.DataFrame(data) puts one conversation per column; transpose so each conversation is a row.
df = pd.DataFrame(data).transpose()

In [11]:
# Total number of NaN cells in the frame: per-column counts, then summed across columns.
# Uses pandas' own reductions rather than np.sum on a DataFrame, which relies on NumPy
# forwarding axis=None to DataFrame.sum.
df.isnull().sum().sum()  # checking for NaN vals

0

In [12]:
# Standardise the hyperconvo-20 feature matrix with scikit-learn's StandardScaler before PCA.
vals = StandardScaler().fit_transform(df.values)

## Hyperconvo-20, PCA (rank 9)

In [13]:
from sklearn.decomposition import PCA

In [14]:
# Rank-9 PCA for the hyperconvo-20 features.
# random_state is pinned because scikit-learn's default solver choice can be a randomized
# SVD, which would otherwise let the projection differ slightly from run to run.
pca = PCA(n_components=9, random_state=0)

In [15]:
# Fit the rank-9 PCA on the standardised hyperconvo-20 matrix and keep the projected rows.
pca_20_9 = pca.fit_transform(vals)

In [16]:
# Score the rank-9 projection with Clusterer.purity.
# 3 groups of group_size=500 matches the 1500 conversations in the corpus; presumably the
# rows are expected to be ordered as three consecutive ground-truth groups — TODO confirm.
Clusterer.purity(pca_20_9, n_clusters=3, actual_num_clusters=3, group_size=500)

0.796

## Hyperconvo-20, PCA (rank 3)

In [17]:
# Rank-3 PCA projection of the standardised hyperconvo-20 matrix.
# random_state is pinned because scikit-learn's default solver choice can be a randomized
# SVD, which would otherwise let the projection differ slightly from run to run.
pca_20_3 = PCA(n_components=3, random_state=0).fit_transform(vals)

In [18]:
# Score the rank-3 projection with Clusterer.purity, using the same settings as for rank 9.
# NOTE(review): if Clusterer.purity clusters with random initialisation, it needs its own
# seed for this number to be reproducible — confirm.
Clusterer.purity(pca_20_3, n_clusters=3, actual_num_clusters=3, group_size=500)

0.8

## Hyperconvo-concat

In [19]:
# For every conversation, merge the per-prefix hyperconvo feature dicts (prefix lengths
# 3 through 20) into one 'concat' dict, suffixing each feature name with its prefix length.
for convo in corpus.iter_conversations():
    convo.meta['concat'] = {
        feat_name + "_" + str(prefix_len): feat_val
        for prefix_len in range(3, 20 + 1)
        for feat_name, feat_val in convo.meta['hyperconvo-{}'.format(prefix_len)].items()
    }

In [20]:
# Spot-check one arbitrarily chosen conversation: number of entries in its merged 'concat' dict.
len(corpus.random_conversation().meta['concat'])

2520

In [21]:
# Sanity check: the 2520 entries reported above divided by 140 features per prefix length
# should equal the 18 prefix lengths covered by range(3, 20 + 1).
2520 / 140

18.0

In [22]:
# Map each conversation id to its merged 'concat' feature dict.
concat_data = {
    conversation.id: conversation.meta['concat']
    for conversation in corpus.iter_conversations()
}

In [23]:
# pd.DataFrame(concat_data) puts one conversation per column; transpose so each is a row.
concat_df = pd.DataFrame(concat_data).transpose()

In [24]:
# Total number of NaN cells in the concatenated frame: per-column counts, then summed.
# Uses pandas' own reductions rather than np.sum on a DataFrame, which relies on NumPy
# forwarding axis=None to DataFrame.sum.
concat_df.isnull().sum().sum()  # NaN values

25548

In [25]:
# Pull the raw values out of the frame as a float64 array so NaNs can be located with np.isnan.
concat_vals = concat_df.values.astype('float64')

In [26]:
# Overwrite every NaN in place with -1 (the same value passed as invalid_val to HyperConvo).
np.putmask(concat_vals, np.isnan(concat_vals), -1)

In [27]:
# Re-select the NaN entries; an empty result confirms that none remain after the fill.
concat_vals[np.isnan(concat_vals)]

array([], dtype=float64)

In [28]:
# Standardise the filled feature matrix with scikit-learn's StandardScaler before PCA.
# NOTE(review): the result is bound to the same name it was computed from, so running this
# cell twice feeds already-scaled data back into the scaler.
concat_vals = StandardScaler().fit_transform(concat_vals)

## Hyperconvo-concat, PCA (rank 9)

In [30]:
# Rank-9 PCA projection of the standardised concatenated features, then its purity score.
# random_state is pinned because scikit-learn's default solver choice can be a randomized
# SVD, which would otherwise let the projection differ slightly from run to run.
# NOTE(review): if Clusterer.purity clusters with random initialisation, it needs its own
# seed for the score to be reproducible — confirm.
pca_concat_9 = PCA(n_components=9, random_state=0).fit_transform(concat_vals)
Clusterer.purity(pca_concat_9, n_clusters=3, actual_num_clusters=3, group_size=500)

0.968

## Hyperconvo-concat, PCA (rank 3)

In [32]:
# Rank-3 PCA projection of the standardised concatenated features, then its purity score.
# random_state is pinned because scikit-learn's default solver choice can be a randomized
# SVD, which would otherwise let the projection differ slightly from run to run.
# NOTE(review): if Clusterer.purity clusters with random initialisation, it needs its own
# seed for the score to be reproducible — confirm.
pca_concat_3 = PCA(n_components=3, random_state=0).fit_transform(concat_vals)
Clusterer.purity(pca_concat_3, n_clusters=3, actual_num_clusters=3, group_size=500)

0.9666666666666667

In [141]:
# Display the rank-9 projection of the concatenated features for visual inspection.
pca_concat_9

array([[ 32.79581832,  -3.67773664,  32.17694763, ...,  -4.78298362,
          0.3103629 ,   5.2829629 ],
       [ 38.95020812,  -9.38072444, -11.99405252, ...,  16.43580309,
        -19.56778948,   0.43145715],
       [ 43.79290604,  -9.99408821,  48.5498135 , ...,   9.79125929,
          9.5262141 ,   9.07122533],
       ...,
       [-25.26482742,  17.22759867,  10.37474599, ...,  19.95641962,
          7.00169012,  -4.68075883],
       [-22.12409456,  14.72896106,  17.70278678, ...,   3.25189752,
          1.03975168, -10.90308218],
       [-37.85369453, -11.91373247,   0.47802821, ...,  -6.33990889,
         -8.263612  ,   1.93409771]])

In [142]:
# Display the rank-3 projection; its columns can be compared against the first three
# columns of the rank-9 projection.
pca_concat_3

array([[ 32.79581832,  -3.67773666,  32.17694791],
       [ 38.95020812,  -9.38072443, -11.99405353],
       [ 43.79290604,  -9.99408824,  48.5498153 ],
       ...,
       [-25.26482742,  17.2275987 ,  10.37474589],
       [-22.12409456,  14.72896112,  17.70278578],
       [-37.85369453, -11.91373245,   0.47802845]])