In [2]:
import json
import os
import gzip
from collections import Iterable
import configparser

# Text encoding used by default when the file helpers in this notebook open files.
_ENCODING_UTF8 = 'utf-8'

  from collections import Iterable


In [3]:
def read_jsonline(filename, encoding=_ENCODING_UTF8, default=None, is_gzip=False):
    """
    Read a JSON Lines file into a list, one decoded object per line.

    Whitespace-only lines (for example a trailing empty line) are skipped
    instead of being handed to the JSON parser.

    :param filename: source file path
    :param encoding: text encoding used to decode the file
    :param default: value returned when ``filename`` does not exist.
                    If it is None, the exception raised by opening the
                    missing file propagates as usual.
    :param is_gzip: whether the input file is gzip-compressed
    :return: list of decoded objects, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    if not os.path.exists(filename) and default is not None:
        return default
    if is_gzip:
        # gzip.open defaults to binary mode, so text mode must be requested.
        file = gzip.open(filename, 'rt', encoding=encoding)
    else:
        file = open(filename, encoding=encoding)
    # The context manager closes the handle even when a line fails to parse;
    # the previous explicit close() was skipped on any exception.
    with file:
        return [json.loads(line) for line in file if line.strip()]

In [4]:
def write_json(filename, data, serialize_method=None):
    """
    Dump ``data`` to ``filename`` as a single UTF-8 encoded JSON document.

    Non-ASCII characters are written literally (``ensure_ascii=False``)
    instead of being converted to ASCII escape sequences.

    NOTE(review): the file used to be opened in append mode, so calling this
    twice on the same path produced two concatenated documents that
    ``json.load`` cannot parse. It is now truncated on every call; use
    ``append_line`` for line-by-line appending.

    :param filename: destination file path; created or overwritten
    :param data: data to be saved
    :param serialize_method: optional callable passed to ``json.dump`` as
                             ``default`` for objects that are not natively
                             serializable
    :return: None
    """
    with open(filename, 'w', encoding=_ENCODING_UTF8) as f:
        # json.dump treats default=None as "no custom serializer", so a single
        # call covers both cases; `or None` keeps falsy values meaning "unset".
        json.dump(data, f, ensure_ascii=False, default=serialize_method or None)

In [5]:
def append_line(filename, line, encoding=_ENCODING_UTF8):
    """
    Append one line of text, followed by a newline, to a file.

    :param filename: destination file path (opened in append mode)
    :param line: text to append; must be a ``str``
    :param encoding: text encoding used when writing
    :return: None
    :raises TypeError: if ``line`` is not a ``str``
    """
    if isinstance(line, str):
        terminated = line + '\n'
        with open(filename, 'a', encoding=encoding) as sink:
            sink.write(terminated)
        return
    # Checked before the file is touched, so bad input never creates the file.
    raise TypeError('line is not in str type')


In [None]:
# Standard (target) format of the data

In [46]:
# Load kp20k_testing.json and print every field of its first record.
dict_list_kp20k = read_jsonline('/mnt/KeyphraseExpansion/data/raw/kp20k/kp20k_testing.json')
first_record = dict_list_kp20k[0]
for field in first_record:
    print('[' + field + ']', first_record[field], sep='\n')

[abstract]
A feedback vertex set of a graph G is a set S  of its vertices such that the subgraph induced by V(G)?S V ( G ) ? S is a forest. The cardinality of a minimum feedback vertex set of G  is denoted by ?(G) ? ( G ) . A graph G is 2-degenerate  if each subgraph G? G ? of G has a vertex v  such that dG?(v)?2 d G ? ( v ) ? 2 . In this paper, we prove that ?(G)?2n/5 ? ( G ) ? 2 n / 5 for any 2-degenerate n-vertex graph G and moreover, we show that this bound is tight. As a consequence, we derive a polynomial time algorithm, which for a given 2-degenerate n -vertex graph returns its feedback vertex set of cardinality at most 2n/5 2 n / 5 .
[keyword]
feedback vertex set;decycling set;2-degenerate graphs
[title]
A feedback vertex set of 2-degenerate graphs


In [None]:
# Current format of the data

In [6]:
# Load the SemEval records from semeval_testno.json (one JSON object per line).
dict_list = read_jsonline('/mnt/KeyphraseExpansion/data/raw/semeval/semeval_testno.json')

In [7]:
# Preview the second record: each field name, then its value truncated to 200 items.
second_record = dict_list[1]
for field in second_record:
    print('[' + field + ']', second_record[field][:200], sep='\n')

[name]
test_C-14
[title]
Sensor Deployment Strategy for Target Detection
[abstract]
In order to monitor a region for traffic traversal, sensors can be deployed to perform collaborative target detection. Such a sensor network achieves a certain level of detection performance with an a
[fulltext]
1. INTRODUCTION
Recent advances in computing hardware and software are
responsible for the emergence of sensor networks capable of
observing the environment, processing the data and making
decisions b
[keywords]
exposure;sensor number;path exposure;deployment;target detection;target decay;sequential deployment;value fusion;sensor network;random sensor placement;number of sensor;minimum exposure;sensor field;c


In [None]:
# Convert the data

In [79]:
# Rebuild every record with only the abstract / keyword / title fields.
# Backticks and apostrophes are removed from the abstract text.
new_list = [
    {
        "abstract": str(record['abstract']).replace('`', '').replace("'", ''),
        "keyword": record['keywords'],
        "title": record['title'],
    }
    for record in dict_list
]

In [None]:
# Write to file

In [80]:
# Write the converted records as JSON Lines, one object per line.
# The file is opened once in 'w' mode: re-running this cell rewrites the output
# instead of appending another full copy of every record, and the file is no
# longer reopened for each record.
filename = '/mnt/KeyphraseExpansion/data/raw/semeval/semeval_test.json'
with open(filename, 'w', encoding=_ENCODING_UTF8) as f:
    for dic in new_list:
        f.write(json.dumps(dic) + '\n')

In [13]:
# Tidy up the tfidf pred file

## Current format of the tfidf pred file
dict_list = read_jsonline('/mnt/KeyphraseExpansion/data/raw/inspec/inspec_pred_tfidf.json')
second_record = dict_list[1]
for field in second_record:
    print('==============' + field + '===========', second_record[field][:200], sep='\n')

['A', 'new', 'method', 'of', 'systemological', 'analysis', 'coordinated', 'with', 'the', 'procedure', 'ofobject-oriented', 'design.', 'II', 'For', 'pt.I.', 'see', 'Vestn.', 'KhGPU,', 'no.81,', 'p.15-18', '(2000).', 'The', 'paper', 'presents', 'the', 'results', 'of', 'development', 'of', 'an', 'object-oriented', 'systemological', 'method', 'used', 'to', 'design', 'complex', 'systems.', 'A', 'formal', 'system', 'representation,', 'as', 'well', 'as', 'an', 'axiomatics', 'of', 'the', 'calculus', 'of', 'systems', 'as', 'functional', 'flow-type', 'objects', 'based', 'on', 'a', 'Node-Function-Object', 'class', 'hierarchy', 'are', 'proposed.', 'A', 'formalized', 'NFO/UFO', 'analysis', 'algorithm', 'and', 'CASE', 'tools', 'used', 'to', 'support', 'it', 'are', 'considered']
formal system representation;functional flow-type objects;formalized nfo/ufo analysis algorithm;systemological analysis;case tools;object-oriented design;axiomatics;complex systems design
[]
['object', 'calculus of systems', 

In [12]:
## The pred file format we need
dict_list = read_jsonline('/mnt/KeyphraseExpansion/data/jsonl/inspec/inspec_pred.jsonl')
second_record = dict_list[1]
for field in second_record:
    print('===========' + field + '============', second_record[field][:200], sep='\n')

['outlier', 'resistant', 'adaptive', 'matched', 'filtering', 'robust', 'adaptive', 'matched', 'filtering', '(', 'amf', ')', 'whereby', 'outlier', 'data', 'vectors', 'are', 'censored', 'from', 'the', 'covariance', 'matrix', 'estimate', 'is', 'considered', 'in', 'a', 'maximum', 'likelihood', 'estimation', '(', 'mle', ')', 'setting', '.', 'it', 'is', 'known', 'that', 'outlier', 'data', 'vectors', 'whose', 'steering', 'vector', 'is', 'highly', 'correlated', 'with', 'the', 'desired', 'steering', 'vector', ',', 'can', 'significantly', 'degrade', 'the', 'performance', 'of', 'amf', 'algorithms', 'such', 'as', 'sample', 'matrix', 'inversion', '(', 'smi', ')', 'or', 'fast', 'maximum', 'likelihood', '(', 'fml', ')', '.', 'four', 'new', 'algorithms', 'that', 'censor', 'outliers', 'are', 'presented', 'which', 'are', 'derived', 'via', 'approximation', 'to', 'the', 'mle', 'solution', '.', 'two', 'algorithms', 'each', 'are', 'related', 'to', 'using', 'the', 'smi', 'or', 'the', 'fml', 'to', 'estimate',

In [25]:
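# title_and_abstract_tokens: can stay unchanged
# keyword_tokens: split on ';' first, then split each keyphrase on whitespace
# given: unchanged (NOTE: json_jsonl below actually sets given_keyword_tokens to [])
# pred: split into elements first; each element becomes a list, split on whitespace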

def json_jsonl(json_file, jsonl_file):
    """
    Convert a prediction file into the token-list JSON Lines layout.

    Every record read from ``json_file`` is rewritten as follows:

    - ``title_and_abstract_tokens`` is left untouched;
    - ``keyword_tokens`` (a ';'-separated string) becomes a list of token
      lists, one per keyphrase, each split on whitespace;
    - ``given_keyword_tokens`` is set to an empty list;
    - ``pred_keyphrases`` (an iterable of strings) becomes a list of token
      lists, each split on whitespace.

    :param json_file: source path, one JSON object per line
    :param jsonl_file: destination path; created or overwritten
    :return: None
    :raises KeyError: if a record lacks ``keyword_tokens`` or ``pred_keyphrases``
    """
    dict_list = read_jsonline(json_file)
    for dd in dict_list:
        dd['keyword_tokens'] = [phrase.split() for phrase in dd['keyword_tokens'].split(';')]
        dd['given_keyword_tokens'] = []
        dd['pred_keyphrases'] = [phrase.split() for phrase in dd['pred_keyphrases']]

    # All records are converted before the output is opened, so a failure above
    # leaves any existing output file untouched. Opening once in 'w' mode keeps
    # reruns from appending duplicate records and avoids reopening the file for
    # every line.
    with open(jsonl_file, 'w', encoding=_ENCODING_UTF8) as out:
        for dd in dict_list:
            out.write(json.dumps(dd) + '\n')

# Convert the TextRank prediction file of every dataset from raw/ to jsonl/.
datasets = ['inspec', 'krapivin', 'nus', 'semeval', 'kp20k']

for data in datasets:
    json_file, jsonl_file = (
        f'/mnt/KeyphraseExpansion/data/raw/{data}/{data}_pred_textrank.json',
        f'/mnt/KeyphraseExpansion/data/jsonl/{data}/{data}_pred_textrank.jsonl',
    )
    json_jsonl(json_file, jsonl_file)





In [1]:
# Scratch check: range(0) is empty, so the loop body never runs and nothing is printed.
for i in range(0):
    print('222')

In [5]:
# Scratch check: math.floor rounds down, so 0.1 gives 0.
import math
math.floor(0.1)

0