In [1]:
import os
# Move the working directory three levels up so that the relative CORPUS_DIR
# path used later resolves (presumably to the repository root — confirm).
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel climbs another three levels.
os.chdir('../../..')

In [2]:
import convokit
import pandas as pd
import numpy as np

In [3]:
# Directory passed to Corpus(...) when the corpus is loaded; relative to the
# working directory set by the os.chdir call at the top of the notebook.
CORPUS_DIR = "convokit/thread_generator/annotated-fake-trajectory-4"
# Alternative corpus (disabled); uncomment to run the same analysis on it:
# CORPUS_DIR = "convokit/tensor_decomposer/experiments/reddit-trajectory-subset-annotated"

In [4]:
from convokit import Corpus
from convokit import Clusterer
from sklearn.preprocessing import StandardScaler
from convokit import HyperConvo

In [5]:
# Build the Corpus object from the directory named by CORPUS_DIR.
corpus = Corpus(CORPUS_DIR)

## Hyperconvo-20

In [6]:
# Print the speaker / utterance / conversation counts of the loaded corpus.
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


In [7]:
# HyperConvo transformer restricted to a 20-utterance prefix. Its output is read
# back later from conversation metadata under the 'hyperconvo-20' key.
# invalid_val=-1 is presumably the placeholder for undefined features — confirm
# against the HyperConvo documentation.
hc20 = HyperConvo(
    prefix_len=20,
    feat_name='hyperconvo-20',
    invalid_val=-1,
)

In [8]:
# Run the transformer over the corpus. The call returns a Corpus object, which
# is why this cell echoes a Corpus repr as its output.
hc20.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x129f7e290>

In [6]:
# Map every conversation id to that conversation's 'hyperconvo-20' metadata entry.
data = {
    convo.id: convo.meta['hyperconvo-20']
    for convo in corpus.iter_conversations()
}

In [7]:
# pd.DataFrame(data) puts the conversation ids on the columns; transpose so that
# each row is one conversation and each column one feature key.
df = pd.DataFrame(data).transpose()

In [8]:
# Total count of missing cells across the whole frame; 0 means there are no NaNs.
df.isnull().sum().sum()

0

In [9]:
# Fit a StandardScaler on the feature matrix and keep the transformed array;
# `vals` is the input to both PCA projections of the hyperconvo-20 features.
vals = StandardScaler().fit_transform(df.values)

In [13]:
# Passed as `group_size` to every Clusterer.purity call in this notebook
# (presumably the number of rows in each ground-truth group — confirm).
# NOTE(review): the recorded summary-stats output reports 1500 conversations in
# total; verify that 1500 is the intended per-group size and not the corpus size.
GROUP_SIZE=1500

## Hyperconvo-20, PCA (rank 9)

In [10]:
from sklearn.decomposition import PCA

In [11]:
# Rank-9 PCA model for the standardized hyperconvo-20 features.
# random_state is pinned because svd_solver='auto' may pick the randomized SVD
# solver for a matrix of this size, in which case the components (and anything
# computed from them) would otherwise vary from run to run.
pca = PCA(n_components=9, random_state=0)

In [12]:
# Fit the rank-9 PCA on the standardized features and keep the projected rows.
pca_20_9 = pca.fit_transform(vals)

In [14]:
# Purity score for the rank-9 projection: 3 requested clusters against 3 expected
# groups of GROUP_SIZE rows each (parameter semantics assumed from their names —
# confirm against Clusterer.purity).
Clusterer.purity(pca_20_9, n_clusters=3, actual_num_clusters=3, group_size=GROUP_SIZE)

0.9588888888888889

## Hyperconvo-20, PCA (rank 3)

In [15]:
# Rank-3 PCA projection of the standardized hyperconvo-20 features.
# random_state is pinned because svd_solver='auto' may pick the randomized SVD
# solver for a matrix of this size, which would make the projection vary
# between runs.
pca_20_3 = PCA(n_components=3, random_state=0).fit_transform(vals)

In [17]:
# Purity score for the rank-3 projection, using the same cluster settings as the
# rank-9 run so the two scores are directly comparable.
Clusterer.purity(pca_20_3, n_clusters=3, actual_num_clusters=3, group_size=GROUP_SIZE)

0.7282222222222222

## Hyperconvo-concat

In [18]:
# Build a single 'concat' feature dict per conversation by merging the
# 'hyperconvo-<k>' entries for k = 3..20, suffixing every feature name with
# "_<k>" so that keys from different prefix lengths do not collide.
# NOTE(review): this expects 'hyperconvo-3' .. 'hyperconvo-20' to already be
# present in the conversation metadata (presumably shipped with the annotated
# corpus — confirm).
for convo in corpus.iter_conversations():
    convo.meta['concat'] = dict()
    for prefix_len in range(3, 20+1):
        suffix = "_" + str(prefix_len)
        prefix_feats = convo.meta['hyperconvo-{}'.format(prefix_len)]
        for feat_name, feat_val in prefix_feats.items():
            convo.meta['concat'][feat_name + suffix] = feat_val

In [19]:
# Number of concatenated features on one (randomly chosen) conversation.
len(corpus.random_conversation().meta['concat'])

2520

In [20]:
# Sanity check on the concatenated length printed above: 2520 features at 140
# features per prefix length gives 18, the number of prefix lengths in
# range(3, 20 + 1). (The previous literal, 2660, did not match the printed length.)
2520 / 140

18.0

In [21]:
# Map every conversation id to that conversation's merged 'concat' feature dict.
concat_data = {
    convo.id: convo.meta['concat']
    for convo in corpus.iter_conversations()
}

In [22]:
# pd.DataFrame(concat_data) puts the conversation ids on the columns; transpose
# so that each row is one conversation and each column one suffixed feature.
concat_df = pd.DataFrame(concat_data).transpose()

In [23]:
# Total count of missing cells in the concatenated frame; 0 means there are no NaNs.
concat_df.isnull().sum().sum()

0

In [24]:
# Fresh float64 copy of the frame's values, so the in-place NaN fill that
# follows cannot write through to concat_df.
concat_vals = np.array(concat_df.values, dtype='float64')

In [26]:
# Overwrite any NaN entries with -1 in place. The null count on concat_df was
# 0, so this is expected to leave the array unchanged.
np.copyto(concat_vals, -1, where=np.isnan(concat_vals))

In [27]:
# Show whatever NaN entries remain; an empty array confirms there are none.
np.extract(np.isnan(concat_vals), concat_vals)

array([], dtype=float64)

In [25]:
# Fit a StandardScaler on the concatenated feature matrix and rebind
# `concat_vals` to the transformed array (the unscaled array is no longer
# reachable under this name after the cell runs).
concat_vals = StandardScaler().fit_transform(concat_vals)

## Hyperconvo-concat, PCA (rank 9)

In [26]:
# Rank-9 PCA projection of the standardized concatenated features, followed by
# its cluster-purity score (3 requested clusters against 3 expected groups).
# random_state is pinned because svd_solver='auto' may pick the randomized SVD
# solver for a matrix of this size, which would make the projection vary
# between runs.
pca_concat_9 = PCA(n_components=9, random_state=0).fit_transform(concat_vals)
Clusterer.purity(pca_concat_9, n_clusters=3, actual_num_clusters=3, group_size=GROUP_SIZE)

0.988

## Hyperconvo-concat, PCA (rank 3)

In [27]:
# Rank-3 PCA projection of the standardized concatenated features, followed by
# its cluster-purity score (3 requested clusters against 3 expected groups).
# random_state is pinned because svd_solver='auto' may pick the randomized SVD
# solver for a matrix of this size, which would make the projection vary
# between runs.
pca_concat_3 = PCA(n_components=3, random_state=0).fit_transform(concat_vals)
Clusterer.purity(pca_concat_3, n_clusters=3, actual_num_clusters=3, group_size=GROUP_SIZE)

0.9844444444444445

In [28]:
# Display the rank-9 projection (NumPy abbreviates the printout) for a visual
# comparison with the rank-3 projection shown next.
pca_concat_9

array([[ -6.84884006, -28.82968805,  20.25392391, ...,   0.44204589,
         -1.2714628 ,  -5.04941634],
       [-19.14378096, -19.7768689 ,  11.06637143, ...,   0.30593675,
          4.02062489,   6.03047848],
       [  6.17296591,  -3.26043173,  27.99443101, ...,   5.56132235,
         -6.11911761,   7.80935148],
       ...,
       [-38.25283433,  -8.13848303, -21.42095776, ...,   3.36659203,
          1.33705509,   0.90086804],
       [-23.4830956 ,  29.44835327,  -2.30542456, ...,  -7.65735327,
         -1.24193502,  -7.34615908],
       [-10.18839047,  15.18437324,  -0.88564103, ...,   4.6196634 ,
          2.27286834,  -7.70028564]])

In [29]:
# Display the rank-3 projection; its columns can be compared with the leading
# columns of the rank-9 projection.
pca_concat_3

array([[ -6.84884006, -28.82968805,  20.25392396],
       [-19.14378096, -19.7768689 ,  11.06637143],
       [  6.17296591,  -3.26043176,  27.99443078],
       ...,
       [-38.25283433,  -8.13848305, -21.42095793],
       [-23.4830956 ,  29.4483532 ,  -2.3054249 ],
       [-10.18839047,  15.18437319,  -0.88564125]])