In [1]:
import gensim
import nltk
import string
import operator

import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from nltk import word_tokenize, pos_tag
from collections import defaultdict
from copy import deepcopy
from math import floor

# Auxiliary Funcs
_PUNC_ = string.punctuation
CGREEN ="\033[46m"
CEND = '\033[0m'
CRED = "\033[41m"

def line_process(text, tag=False, remove_punc=True, lower=True):
    """
    Auxiliary function that preprocess the given line
    lowercase - strip - remove punc - tokenize - (pos_tag)
    """
    text = text.strip()
    
    if remove_punc:
        text = "".join([char for char in text if char not in _PUNC_])
    text = word_tokenize(text)

    if lower:
        text = [item.lower() for item in text]

    text_tags = None
    if tag:
        text_tags = pos_tag(text)

    return text, text_tags

def file2corpus(file_path, corpus=None, with_tag=False, with_punc=False, lower=True):
    """
    Auxiliary function that read document from a file and write them into a corpus (list_of_docs)
    """
    corpus = corpus if corpus is not None else list()
    with open(file_path, 'r') as f:
        lines = f.readlines()

    new_doc = False
    for eachLine in lines:
        if eachLine == '\n':
            new_doc = False
        else:
            if with_tag:
                _, lineComp = line_process(eachLine, tag=True, remove_punc=operator.not_(with_punc), lower=lower)
            else:
                lineComp, _ = line_process(eachLine, tag=False, remove_punc=operator.not_(with_punc), lower=lower)
            if not new_doc:
                new_doc = True
                corpus.append(lineComp)
                corpus[-1].append('\n')
            else:
                corpus[-1] += lineComp

    return corpus

def concatTokens(token_list):
    """
    Auxiliary function that print the token in a human-friendly fashion
    """
    ans = list()
    for eachToken in token_list:
        if eachToken not in _PUNC_:
            ans.append(" ")
        ans.append(eachToken)

    return "".join(ans[1:])

def model_test(corpus, capital_corpus, w2m, c2w, conf):
    """
    Take original documents, meta and configurations, generate faked doc tokens
    """
    num_bin, tgt_bin, num_rep = conf.num_bin, conf.tgt_bin, conf.num_rep

    fake_corpus = list()
    real_corpus = list()
    for eachDoc, eachCapDoc in zip(corpus, capital_corpus):
        for _ in range(3):
            eachFake, eachReal = gen_fake_once(eachDoc, eachCapDoc, w2m, c2w, num_bin, tgt_bin, num_rep)
            fake_corpus.append(eachFake)
        real_corpus.append(eachReal)

    return fake_corpus, real_corpus

def print_result(result_corpus):
    """
    Auxiliary function that takes a result corpus (Dic or named tuple) and print them line-by-line
    """
    for eachResult in result_corpus.keys():
        print(f'Metric: {eachResult} -- {result_corpus[eachResult]}')
    pass

def write_result(result_corpus, file_path):
    with open(file_path, 'a') as f:
        for eachResult in result_corpus.keys():
            f.write(f'Metric: {eachResult} -- {result_corpus[eachResult]}\n')
    pass

def token2string(token_list):
    """
    Auxiliary function that revert the "work_tokenize" -- Approximate imp.
    """
    ans = list()
    for eachToken in token_list:
        if not isinstance(eachToken, str) or eachToken not in _PUNC_:
            if len(ans) > 0 and ans[-1] in ['(', '\n', '\'']:
                pass
            else:
                ans.append(" ")
            ans.append(str(eachToken))
        else:
            if str(eachToken) == "(":
                ans.append(" ")

            ans.append(str(eachToken))

    return "".join(ans[1:])

def gen_fake_once(doc, cap_doc, w2m, c2w, num_bin, tgt_bin, num_rep):
    assert len(doc) == len(cap_doc), "DOC & CAP_DOC size inconsistent"
    
    # Find concepts of given doc & Get TF-IDF
    concept_candidates = defaultdict(float)

    term_cnt = 0
    for eachToken, tokenProp in pos_tag(doc):
        if eachToken == '\n':
            continue

        if tokenProp == 'NN':
            concept_candidates[eachToken.lower()] += 1
        if eachToken not in _PUNC_:
            term_cnt += 1
    
    concept_list = list()
    for eachConcept in concept_candidates.keys():
        concept_candidates[eachConcept] /= float(term_cnt)
        meta_tuple = w2m.get(eachConcept, None)
        if meta_tuple is None:
            continue

        concept_candidates[eachConcept] *= meta_tuple[1]  # TF-IDF computation
        concept_list.append((eachConcept, meta_tuple[0], concept_candidates[eachConcept]))

    # Sort into bins
    concept_list.sort(key=lambda x: x[2])
    
    # Select & Scale replacement
    if len(concept_list) < num_bin:
        tgt_bin= floor((tgt_bin / num_bin) * len(concept_list))
        num_bin = len(concept_list)
        print(f'Number of cencepts is less than num_bin. Scaling num_bin to {num_bin}, target bin\'s idx to {tgt_bin}')

    num_concepts = len(concept_list)
    split_factor = num_concepts // num_bin
    bin_dict = defaultdict(list)
    for idx in range(num_concepts):
        bin_dict[idx // split_factor].append(concept_list[idx])

    num_bin_concepts = len(bin_dict[tgt_bin])
    if num_bin_concepts < num_rep or num_rep < 0:
        print(f'Number of concepts ({num_bin_concepts}) in the target bin is less than num_rep. Scaling num_rep to {num_bin_concepts}')

    replace_idxs = np.random.permutation(num_bin_concepts)[:min(num_bin_concepts, num_rep)]

    # Build Replacement Mapping
    replace_mapping = dict()
    for eachIdx in replace_idxs:
        c_token = bin_dict[tgt_bin][eachIdx]
        token_nn = c2w[c_token[1]]
        if len(token_nn) < 2:
            replace_mapping[c_token[0]] = c_token[0]
        else:
            tmpIdx = np.random.choice(len(token_nn))
            while token_nn[tmpIdx] == c_token[0]:
                tmpIdx = np.random.choice(len(token_nn))
            replace_mapping[c_token[0]] = token_nn[tmpIdx]

    # Generate fake doc via replacement mapping
    fake_doc = list()
    real_doc = list()
    change_cnt = 0
    for idx in range(len(doc)):
        current_token = replace_mapping.get(doc[idx], None)
        if current_token is None:
            real_doc.append(cap_doc[idx])
            fake_doc.append(cap_doc[idx])
        else:
            real_doc.append(CGREEN + doc[idx] + CEND)
            fake_doc.append(CRED + current_token + CEND)

            if cap_doc[idx].istitle() or cap_doc[idx].isupper():
                fake_doc.append(current_token.capitalize())
            else:
                fake_doc.append(current_token)
    
            change_cnt += 1

    print(f"Change Token Ratio of this doc is : {float(change_cnt) / len(real_doc)}")

    return fake_doc, real_doc

def doc2corpus(data, corpus=None, with_tag=False, with_punc=False, lower=True):
    """
    Auxiliary function that read document from a file and write them into a corpus (list_of_docs)
    """
    for doc in data:
        lines = doc.split('\n')
        new_doc = False
        for eachLine in lines:
            if eachLine == '\n':
                new_doc = False
            else:
                if with_tag:
                    _, lineComp = line_process(eachLine, tag=True, remove_punc=operator.not_(with_punc), lower=lower)
                else:
                    lineComp, _ = line_process(eachLine, tag=False, remove_punc=operator.not_(with_punc), lower=lower)
                if not new_doc:
                    new_doc = True
                    corpus.append(lineComp)
                else:
                    corpus[-1] += ['\n']+ lineComp
    return corpus

    
class DummyConfiguration(object):
    def __init__(self, num_bin, tgt_bin, num_rep):
        self.num_bin = num_bin
        self.tgt_bin = tgt_bin
        self.num_rep = num_rep
        pass    
   

In [2]:
print('Loading test data...')
TEST_PATH = '../data/raw_data/cs_clean/test.txt'   

Idx = [204, 645, 745, 791,  
1120, 1226, 1263, 1578, 1633,  
2203, 2369, 3005, 3357, 202,  
3206, 2938]

with open(TEST_PATH, 'r') as f:    
    test_data = f.read().split('\n\n\n')
test_data = np.array(test_data)[Idx]

Loading test data...


In [3]:
META_PATH = 'outputs/cs_meta_neo.csv'

meta_df = pd.read_csv(META_PATH)
word2meta = dict()
centroid2word = defaultdict(list)

for instance_idx in range(len(meta_df.index)):
    cinstance = meta_df.iloc[instance_idx]
    word2meta[cinstance['Word']] = (cinstance['Centroids'], cinstance['IDF'])
    centroid2word[cinstance['Centroids']].append(cinstance['Word'])
    
doc_corpus = doc2corpus(test_data, list(), with_punc=True)
doc_corpus_capital = doc2corpus(test_data, list(), with_punc=True, lower=False)

In [4]:
num_bin = 2
tgt_bin = 1

conf = DummyConfiguration(num_bin, tgt_bin, -1)

print('Generating fake docs...')
fake_corpus, real_corpus = model_test(doc_corpus, doc_corpus_capital, word2meta, centroid2word, conf)

original_docs = [token2string(item) for item in real_corpus]
generate_docs = [token2string(item) for item in fake_corpus]

Generating fake docs...
Number of concepts (8) in the target bin is less than num_rep. Scaling num_rep to 8
Change Token Ratio of this doc is : 0.08275862068965517
Number of concepts (8) in the target bin is less than num_rep. Scaling num_rep to 8
Change Token Ratio of this doc is : 0.08275862068965517
Number of concepts (8) in the target bin is less than num_rep. Scaling num_rep to 8
Change Token Ratio of this doc is : 0.08275862068965517
Number of concepts (13) in the target bin is less than num_rep. Scaling num_rep to 13
Change Token Ratio of this doc is : 0.15172413793103448
Number of concepts (13) in the target bin is less than num_rep. Scaling num_rep to 13
Change Token Ratio of this doc is : 0.15172413793103448
Number of concepts (13) in the target bin is less than num_rep. Scaling num_rep to 13
Change Token Ratio of this doc is : 0.14482758620689656
Number of concepts (9) in the target bin is less than num_rep. Scaling num_rep to 9
Change Token Ratio of this doc is : 0.09210526

In [5]:
for idx, ori_file in enumerate(original_docs):
    print('-' * 80)
    print(' ' * 36 + 'ORIGINAL')
    print(ori_file)
    for fake_idx in range(3 * (idx), 3 * (idx+1)):
        print('-' * 80)
        print(' ' * 36 + 'FAKE ' + str(fake_idx%3))
        fake = generate_docs[fake_idx]
        print(fake)

--------------------------------------------------------------------------------
                                    ORIGINAL
Normalized [46minformation[0m Distance 
The normalized [46minformation[0m distance is a universal distance measure for objects of all kinds. It is based on [46mkolmogorov[0m [46mcomplexity[0m and thus uncomputable, but there are ways to utilize it. First, compression algorithms can be used to approximate the [46mkolmogorov[0m [46mcomplexity[0m if the objects have a [46mstring[0m representation. Second, for names and abstract concepts, page [46mcount[0m statistics from the World Wide Web can be used. These practical realizations of the normalized [46minformation[0m distance can then be applied to machine learning tasks, expecially clustering, to perform feature-free and parameter-free data mining. This [46mchapter[0m discusses the theoretical foundations of the normalized [46minformation[0m distance and both practical realizations. It presen