# Intrinsic Eval (Purity)

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.cluster import SpectralClustering, KMeans
from sklearn import metrics
from nltk.tokenize import word_tokenize
import multiprocessing
import tensorflow as tf
import json

In [2]:
sys.path.append("../")
from rs_helper import DAN, FastTextWrapper, scatter_plot



## Read Data

In [3]:
data_dir = "../datasets/final_datasets/clean_dataset"

In [4]:
def label_map(x:str):
    if x == "clustering":
        return 0
    if x == "prediction":
        return 1
    if x == "pattern_mining":
        return 2

In [5]:
def read_data(_dir:str) -> pd.DataFrame:
    data = {}
    #data["url"] = []
    data["text"] = []
    data["class"] = []
    for root, dirs, files in os.walk(_dir):
        for _dir in dirs: 
            for txt_file in [x for x in os.listdir(os.path.join(root, _dir)) if x.endswith((".txt", ".TXT"))]:
                # Class name = dir name
                class_name = _dir
                #Read File
                file_name = os.path.abspath(os.path.join(root, _dir, txt_file))
                file = open(file_name, "r")
                txt = file.read()
                file.close()
                #data["url"].append(file_name)
                data["text"].append(txt)
                data["class"].append(class_name)
    df = pd.DataFrame.from_dict(data)
    del data
    return df

In [6]:
df = read_data(data_dir).sample(frac=1)

In [7]:
df["label"] = df["class"].apply(lambda x: label_map(x))

In [None]:
df.head()

## Get Embeddings

In [None]:
ft_model = FastTextWrapper("ft_models/fasttext_12/model.joblib")

In [None]:
dan = DAN(ft_model, "DAN_COMBINATIONS/model_layer17/frozen_graph.pb")

In [None]:
df["tokenized"] = df["text"].apply(lambda x: word_tokenize(x))

In [None]:
embeddings = dan.inference_batches(df["tokenized"].tolist())

In [None]:
df["embeddings"] =embeddings

In [None]:
scatter_plot(dan, df["text"].tolist(), df["label"].tolist())

## Clustering

In [None]:
df["embeddings"] = df["embeddings"].apply(lambda x: x[0])

In [None]:
%%time
clustering = SpectralClustering(n_clusters=3,assign_labels="discretize",random_state=0, n_jobs=multiprocessing.cpu_count())
clustering.fit(df["embeddings"].tolist())

In [None]:
clustering.labels_

## Result of Spectral Clustering

In [None]:
scatter_plot(dan,df["text"].tolist(), clustering.labels_)

## Results Kmeans

In [None]:
kmeans = KMeans(n_clusters=3, random_state=0).fit(df["embeddings"].tolist())

In [None]:
scatter_plot(dan, df["text"].tolist(),kmeans.labels_)

## Purity Score

In [9]:
def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

In [None]:
# Spectral Clustering
purity_score(df["label"].tolist(), clustering.labels_)

In [None]:
#Kmeans
purity_score(df["label"].tolist(), kmeans.labels_)

## Calculate Purity Score for every Model

In [8]:
path = "DAN_COMBINATIONS"

In [10]:
dan_models = []

In [11]:
tf.logging.set_verbosity(tf.logging.ERROR)
for model in os.listdir(path):
    if os.path.isdir(os.path.join(path,model)):
        if model != ".ipynb_checkpoints":
            frozen_graph = os.path.join(path, model, "frozen_graph.pb")
            config = os.path.abspath(os.path.join(path,model, "config.json"))
            # Read JSON and Get FT
            with open(config) as _json:
                data = json.load(_json)
            dan_models.append((frozen_graph, config))

In [15]:
%%time
counter = 0
for frozen_graph, config in dan_models[256:]:
     # Read Score
    with open(config, "r") as _json:
            _dict = json.load(_json)

    dan = DAN(FastTextWrapper(_dict["ft_model"]), frozen_graph)

    _df = df
    # Changed to batch inference
    _df["embedding"] = dan.inference_batches(_df["text"].apply(lambda x: word_tokenize(x)).tolist())
    _df["embedding"] = _df["embedding"].apply(lambda x: x[0]) # Only because of batch inference.. 
    #_df["label"] = _df["class"].apply(lambda x: label_map(x))

    #clustering = SpectralClustering(n_clusters=3,assign_labels="discretize",random_state=0, n_jobs=multiprocessing.cpu_count())
    clustering = KMeans(n_clusters=3, n_jobs = multiprocessing.cpu_count())
    clustering.fit(_df["embedding"].tolist())

    ps = purity_score(_df["label"].tolist(), clustering.labels_)
    _dict.update({"purity_score_kmeans": ps})
    with open(config, "w") as _json:
        json.dump(_dict, _json)
    counter +=1
    print(ps)
    print("{}/{}".format(counter, len(dan_models)))

0.9621238628411477
1/479
0.9724457662701189
2/479
0.9619489153254024
3/479
0.9629986004198741
4/479
0.9671973407977607
5/479
0.9640482855143457
6/479
0.9687718684394682
7/479
0.9647480755773268
8/479
0.9627361791462561
9/479
0.9658852344296711
10/479
0.9635234429671099
11/479
0.9645731280615816
12/479
0.9807557732680195
13/479
0.9745451364590623
14/479
0.9749825052484254
15/479
0.9769944016794961
16/479
0.9715710286913926
17/479
0.9778691392582225
18/479
0.9704338698390483
19/479
0.9732330300909727
20/479
0.9776067179846046
21/479
0.9727081875437369
22/479
0.959849545136459
23/479
0.9727956613016095
24/479
0.9696466060181945
25/479
0.9730580825752274
26/479
0.968334499650105
27/479
0.9696466060181945
28/479
0.970258922323303
29/479
0.9670223932820154
30/479
0.9681595521343597
31/479
0.9697340797760672
32/479
0.9633484954513646
33/479
0.9664975507347796
34/479
0.980668299510147
35/479
0.9772568229531141
36/479
0.9776941917424773
37/479
0.9778691392582225
38/479
0.9782190342897131
39/479