In [1]:
import pandas as pd
import json
import pickle
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys 
sys.path.append('../')
from fast_tsne.fast_tsne import fast_tsne

In [3]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
from eval_canarex import calculate_mse

## Cluster results

### Textrank

In [18]:
# Directory holding the outputs of the clustering run under evaluation.
# Deliberately written without a trailing slash: the templates below insert
# the separator themselves, so a trailing slash here produced paths such as
# 'relatio_test/clust_100//eval_output.csv'. This also matches how `path`
# is spelled in the "Compare clusters" section.
path = 'relatio_test/clust_100'

# Files read from / written to that directory.
cpath = '{}/kmeans_all.pkl'.format(path)
narratives_path = '{}/final_narratives.jsonl'.format(path)
result_path = '{}/results.jsonl'.format(path)
eval_path = '{}/eval_output.csv'.format(path)

# Selects which synthetic data file is loaded
# (synthetic_data/synthetic_test_data_<ctype>.jsonl).
ctype = 'true_text'

### Load scores

In [19]:
# Load the pickled clustering results stored at `cpath`.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
df = pd.read_pickle(cpath)

In [7]:
# Preview the first two rows of the loaded table.
df.head(2)

Unnamed: 0,k,score,dist,labels
0,2,0.109552,"[0.97442937, 0.9733651, 0.94854605, 0.68408346...","[1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
0,3,0.114131,"[0.9483961, 0.9779208, 0.94301593, 0.5396521, ...","[2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [20]:
# Keep only the row(s) with the highest `score`, then unpack the list-valued
# `dist` and `labels` columns into one row per list element and renumber
# the rows from zero.
best_score_mask = df.score.eq(df.score.max())
df = (
    df.loc[best_score_mask]
    .explode(['dist', 'labels'])
    .reset_index(drop=True)
)

In [10]:
# (rows, columns) of `df` after the filter/explode step.
df.shape

(208276, 4)

In [11]:
# Number of distinct values in the `labels` column.
df.labels.nunique()

6

In [12]:
# Preview two rows of the exploded table (one `dist`/`labels` value per row).
df.head(2)

Unnamed: 0,k,score,dist,labels
0,6,0.127249,0.767754,3
1,6,0.127249,0.941476,1


In [13]:
# Re-check (rows, columns) of `df` before it is joined to the narratives.
df.shape

(208276, 4)

### Merge narratives

In [8]:
# Show the resolved location of the narratives file.
narratives_path

'relatio_test/clust_100//final_narratives.jsonl'

In [9]:
# Stream the JSON-lines narratives file in 10,000-row chunks. The reader is
# used as a context manager so the underlying file handle is closed once all
# chunks have been read.
with pd.read_json(narratives_path,
                  lines=True, chunksize=10000) as chunks:
    narrative_chunks = list(chunks)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of chunks).
# As before, `narratives` stays None when the file yields no chunks.
narratives = pd.concat(narrative_chunks, axis=0) if narrative_chunks else None

In [16]:
# (rows, columns) of the concatenated narratives.
narratives.shape

(208276, 8)

In [17]:
# Inspect the index: the column-wise concat that follows aligns rows on it.
narratives.index

RangeIndex(start=0, stop=208276, step=1)

In [21]:
# Put the cluster columns next to the narratives, row by row.
# A column-wise concat aligns on the index and silently pads with NaN when the
# two frames differ in length, so fail loudly instead of producing a
# half-empty table.
assert len(narratives) == len(df), (
    'narratives and cluster assignments differ in length: {} vs {}'
    .format(len(narratives), len(df))
)
final = pd.concat([narratives, df.reset_index(drop = True)],
                  axis = 1)

In [22]:
# Rename the `sentence` column to `text`.
final = final.rename({'sentence': 'text'}, axis='columns')

In [22]:
# (rows, columns) after attaching the cluster columns.
final.shape

(208276, 12)

In [23]:
# Read the synthetic test set selected by `ctype` (JSON-lines, one record per
# line). Note that this rebinds `df`, which previously held the cluster table.
synthetic_file = '{}/synthetic_test_data_{}.jsonl'.format('synthetic_data', ctype)
df = pd.read_json(synthetic_file, orient='records', lines=True)

In [24]:
# One row per distinct combination of the selected synthetic-data columns.
orig_columns = ['id', 'sentence_id', 'text', 'noise', 'synthetic_label', 'time']
orig_text = df.loc[:, orig_columns].drop_duplicates()


In [26]:
# (rows, columns) before merging in the synthetic-data columns.
final.shape

(208276, 12)

In [25]:
# Inner-join on every column name the two frames have in common
# (no explicit `on=` is given, so pandas picks the shared columns).
final = final.merge(orig_text)

In [28]:
# (rows, columns) after the merge; compare with the pre-merge shape.
final.shape

(208276, 16)

In [159]:
# 1 = triangle (18: kevin rudd)
# 2 = square (24: climate change)
# 3 = random wave (63: Indigenous people)

In [29]:
# For every synthetic label, list the three predicted clusters that contain
# the most non-noise rows (`l` is the row count of each pair).
non_noise = final[final.noise == False]
pair_counts = (
    non_noise
    .groupby(['synthetic_label', 'labels'])
    .size()
    .reset_index(name='l')
    .sort_values(['synthetic_label', 'l'], ascending=[True, False])
)
pair_counts.groupby(['synthetic_label']).head(3)

Unnamed: 0,synthetic_label,labels,l
5,1,5,14669
3,1,3,4613
1,1,1,3327
7,2,1,16283
10,2,4,13261
6,2,0,9552
14,3,2,68030
13,3,1,10710
12,3,0,5687


### Calculate MSE

In [32]:
# Show which synthetic data variant is being evaluated.
ctype

'true_text'

In [33]:
# Reload the synthetic test set for `ctype` so the MSE section starts from the
# file on disk rather than from whatever `df` currently holds.
synthetic_template = '{}/synthetic_test_data_{}.jsonl'
df = pd.read_json(
    synthetic_template.format('synthetic_data', ctype),
    orient='records',
    lines=True,
)

In [34]:
# Load the clustering results from `result_path`; this replaces the `final`
# frame assembled in the "Merge narratives" section.
# NOTE(review): no visible cell writes `result_path` - presumably it is
# produced by another step; confirm before relying on a fresh run.
final = pd.read_json(result_path, lines = True, orient='records')

In [72]:
# Distinct cluster labels present in the loaded results.
final.labels.unique()

array([3, 1, 5, 2, 4, 0])

In [26]:
# Score the clustering output against the synthetic data.
# NOTE(review): `calculate_mse` lives in eval_canarex and is not visible here;
# judging by the columns displayed further down (orig_clust, pred_clust,
# intercept, coef, mse, r_sq) it presumably fits one regression per
# (original, predicted) cluster pair - confirm against the helper.
eval_df = calculate_mse(df, final)

In [53]:
# Show where the evaluation table will be written.
eval_path

'relatio_test/clust_100//eval_output.csv'

In [87]:
# Persist the evaluation table as CSV, without the index column.
eval_df.to_csv(eval_path, index = False)

In [None]:
# Display the full evaluation table.
eval_df

In [76]:
# For each original cluster, keep the row(s) whose MSE equals that cluster's
# minimum, i.e. the best-matching predicted cluster(s).
min_mse_per_clust = eval_df.groupby(['orig_clust'])['mse'].transform('min')
eval_df[eval_df.mse == min_mse_per_clust]

Unnamed: 0,orig_clust,pred_clust,intercept,coef,mse,r_sq
2,1,5,-19.232467,3.20669,471.754359,0.977219
10,2,4,-43.677397,5.831577,1444.872357,0.96783
15,3,2,4.528892,2.235056,580.683972,0.936984


### Save output

In [79]:
# Confirm the output location before saving.
eval_path

'relatio_test/clust_100//eval_output.csv'

In [None]:
# Write the evaluation table to CSV without the index column.
# NOTE(review): this repeats the write performed in the "Calculate MSE"
# section; one of the two cells is likely redundant.
eval_df.to_csv(eval_path, index = False)

### Create label mapping

In [80]:
# Pair every original cluster with the predicted cluster that attains its
# lowest MSE, then rename the columns: `labels` is the predicted cluster id,
# `new_labels` the original cluster it is mapped to.
lowest_mse = eval_df.groupby(['orig_clust'])['mse'].transform('min')
label_mapping = (
    eval_df.loc[eval_df.mse == lowest_mse, ['orig_clust', 'pred_clust']]
    .rename(columns={'pred_clust': 'labels', 'orig_clust': 'new_labels'})
)

In [81]:
# Display the best-match mapping (one row per original cluster).
label_mapping

Unnamed: 0,new_labels,labels
5,1,5
10,2,4
14,3,2


In [82]:
# column, new_labels: mapped original cluster assignments (3 clusters)
# column, labels: new cluster assignments (N clusters)

# Map the rest of the (N-3) labels to a single cluster (as it is not considered recovery)
# Used for plotting graphs
#
# The unmatched labels are taken from the label values actually present in
# `final` rather than from range(N), so non-contiguous label ids are handled.
# A plain sorted list replaces the deque, which was never imported and raised
# NameError on a fresh kernel. Descending order reproduces the row order the
# deque's pop() produced.
matched_labels = set(label_mapping.labels.tolist())
pending_labels = sorted(set(final.labels.unique().tolist()) - matched_labels,
                        reverse=True)

# Every unmatched predicted label collapses into cluster 4.
label_mapping_dict = [{'new_labels': 4, 'labels': label} for label in pending_labels]

# Add pending labels (skipped when nothing is left to map, e.g. on a re-run)
if label_mapping_dict:
    label_mapping = pd.concat([label_mapping, pd.DataFrame(label_mapping_dict)], axis = 0)
label_mapping = label_mapping.reset_index(drop = True)

In [83]:
# Narrative cluster assignments mapped to the original synthetic clusters
# Cluster 1 = triangle (kevin rudd)
# Cluster 2 = square (climate change)
# Cluster 3 = random wave (Indigenous people)
# Cluster 4 = Other new cluster identified (we treat this as noise)
label_mapping

Unnamed: 0,new_labels,labels
0,1,5
1,2,4
2,3,2
3,4,3
4,4,1
5,4,0


In [84]:
# Attach `new_labels` to every row via an inner join on the columns shared
# with `label_mapping` (no explicit `on=` is given).
final = final.merge(label_mapping)

### Plot narrative clusters

In [85]:
# Legend text for each mapped cluster id (integer keys, matching the values
# in the `new_labels` column). The spelling of "Indigenous" is corrected here
# because this string is shown verbatim in the plot legend.
cluster_labels = {
    1: 'Cluster 1: Kevin Rudd',          # 18, synthetic label 1, Kevin Rudd
    2: 'Cluster 2: Climate change',      # 24, synthetic label 2, Climate change
    3: 'Cluster 3: Indigenous people',   # 63, synthetic label 3, Indigenous people
    4: 'Cluster 4: New cluster',
}

# Marker colour for each cluster id. Keys are the ids as *strings* because the
# plotting cell looks colours up with `colour[str(...)]`.
colour = {
    '1': '#abd9e9',
    '2': '#fdae61',
    '3': '#D2AB3C',
    '4': '#5ab4ac',
}

In [None]:
# Count the distinct non-noise rows per (time, mapped cluster) pair, then
# order the rows by the cluster's legend text so the legend is in that order.
test = final[final.noise == False]\
[['sentence_id','time', 'text', 'synthetic_label', 'noise', 'new_labels']].drop_duplicates()
test = test.groupby(['time', 'new_labels']).size().reset_index(name='l')
test = test.sort_values(by=['new_labels'], key=lambda x: x.map(cluster_labels))

fig, ax = plt.subplots(figsize=(8,6))
for label in test.new_labels.unique():
    # Select this cluster's rows once instead of re-filtering for each argument.
    subset = test[test.new_labels == label]
    ax.scatter(subset.time.tolist(),
               subset.l.tolist(),
               # All points of one series share a colour, so pass it once
               # rather than building a list of identical entries.
               color=colour[str(label)],
               s=15, label=cluster_labels[label],
               alpha=1, edgecolors='none')

ax.legend(title='Topics', loc=1, prop={'size': 12}, 
          title_fontsize=15, markerscale=2, framealpha=0.5)
ax.set_ylabel("No of sentences", size=14)
ax.set_xlabel("Time", size=14)
ax.grid(False)
plt.show()

### Compare clusters

In [56]:
import pandas as pd

In [89]:
# All clusters
path='relatio_test/clust_40'
eval_df = pd.read_csv('{}/eval_output.csv'.format(path))

In [90]:
# Lowest-MSE predicted cluster(s) for each original cluster in this run.
eval_df.loc[
    lambda d: d.mse == d.groupby(['orig_clust'])['mse'].transform('min')
]

Unnamed: 0,orig_clust,pred_clust,intercept,coef,mse,r_sq
3,1,3,-7.257261,3.467224,378.230393,0.981817
11,2,5,-134.695081,6.384355,6818.049052,0.827567
12,3,0,7.804375,2.296535,555.529785,0.939877


In [91]:
# Evaluation table of the `clust_100` run, for comparison with `clust_40`.
path = 'relatio_test/clust_100'
eval_csv = '{}/eval_output.csv'.format(path)
eval_df = pd.read_csv(eval_csv)

In [92]:
# Lowest-MSE predicted cluster(s) for each original cluster in this run.
is_best_match = eval_df.mse.eq(
    eval_df.groupby(['orig_clust'])['mse'].transform('min')
)
eval_df[is_best_match]

Unnamed: 0,orig_clust,pred_clust,intercept,coef,mse,r_sq
5,1,5,-19.232467,3.20669,471.754359,0.977219
10,2,4,-43.677397,5.831577,1444.872357,0.96783
14,3,2,4.528892,2.235056,580.683972,0.936984
