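Reference: source repository of the abbreviation-extraction library (the `abbreviations.schwartz_hearst` module imported below): https://github.com/philgooch/abbreviation-extraction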

In [1]:
from abbreviations import schwartz_hearst as sh

In [2]:
import urllib
import bson

In [3]:
import html2text

In [4]:
import numpy as np
import pandas as pd
import json
import copy
import unicodedata
from collections import defaultdict, Counter, OrderedDict
import re
import pickle
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
import random

In [5]:
# Smoke test of the extractor on a one-sentence example with a single
# "long form (SHORT)" pattern.
pairs = sh.extract_abbreviation_definition_pairs(doc_text='The emergency room (ER) was busy')

In [6]:
# Show the abbreviation -> definition mapping returned for the one-sentence example.
pairs

{'ER': 'emergency room'}

In [7]:
# Hand-made sample mixing clean and noisy patterns: a plain '(ER)', a doubled
# opening parenthesis preceded by asterisks before 'APEC)', '(AVIC)' with no
# space before the parenthesis, a bare '*EOP*' marker, and '(CCP)' after two spaces.
# The backslash at the end of the first line continues the string literal
# without inserting a newline.
text_sample = 'The emergency room (ER) was busy, and the 21th Asia-Pacific Ecomonic Pact **h**((APEC) begins this week. \
Aviation Industry in China(AVIC) starts a new project, *EOP* but Chinese Communist Party  (CCP) wants more.'

In [8]:
# Run the extractor on the noisy sample and display the mapping.
# NOTE(review): in the recorded output ER, APEC and CCP are found but AVIC is not.
pairs = sh.extract_abbreviation_definition_pairs(doc_text=text_sample)
pairs

{'ER': 'emergency room',
 'APEC': 'Asia-Pacific Ecomonic Pact',
 'CCP': 'Chinese Communist Party'}

In [20]:
# Load the pickled corpus (despite the .txt extension the file is opened in
# binary mode and read with pickle).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open("../customized_NER_MoneyFlow/data/corpus_data_v036.txt", "rb") as f:
    corpus = pickle.load(f)

# Scratch notes that had been pasted into this cell as bare text, which made the
# cell a SyntaxError on re-run. Kept verbatim as comments; presumably examples of
# abbreviation surface-form variants to normalise -- TODO confirm:
#     washington dc
#     [d.c., h.u.,bgn]
#     wdc:d.c.  wdc dc   d c  dc

In [21]:
%%time
# For easy-finding of text-segment, generate a dictionary with key: segment_id, value: result
temp = copy.deepcopy(corpus)
discovery_res = dict()
for entry in temp:
    #print(entry)
    seg_id = entry.pop('id')
    discovery_res[seg_id] = entry

CPU times: user 4.49 s, sys: 110 ms, total: 4.6 s
Wall time: 4.61 s


In [23]:
# Peek at one record to see its fields. next(iter(...)) fetches the first value
# without first copying every value of the dict into a list.
next(iter(discovery_res.values()))

{'result_metadata': {'score': 3.75331},
 'extracted_metadata': {'sha1': '485c47d24a33622387d53b5d73dc1fa88287beb2',
  'filename': '101TheEvolvingIsraelChinaRelationshippdf_140.json',
  'file_type': 'json'},
 'filename': '101- The Evolving Israel-China Relationship.pdf',
 'text': 'Table 5.2 114\nIsraeli Companies That Received Chinese Investment\n The Evolving Israel-China Relationship\n Chinese\nIsraeli Entity Company Information Investors Area of Concern\nTnuva Tnuvas share of Israels dairy Bright Food Acquisition led to protests in Israel over food\n market exceeded 70 percent (state-owned) security, as the deal gave a Chinese state-owned\n at the time of acquisition company a 56-percent stake in one of Israels\n (declined since then to about largest food producers; members of the Knesset\n 50 percent). a Economic Affairs Committee opposed the deal.b\nThetaRay Cybersecurity company that Alibaba ThetaRays products aim to detect and prevent\n specializes in the detection cyber intrusio

In [26]:
def get_text_all(source=None, sep=' *EOP* '):
    """Concatenate the text of every segment into one string.

    Newlines and tabs inside each segment are replaced by single spaces, and
    ``sep`` is appended after every segment (including the last one), so the
    result for a non-empty input always ends with ``sep``.

    Parameters
    ----------
    source : iterable of mapping, optional
        Records that each provide a ``'text'`` string. When omitted, the
        values of the notebook-level ``discovery_res`` dict are used; the
        lookup happens at call time, so a rebuilt ``discovery_res`` is
        picked up.
    sep : str
        Marker appended after each segment's text.

    Returns
    -------
    str
        The flattened, separator-terminated texts in iteration order
        (``''`` for an empty ``source``).

    Raises
    ------
    KeyError
        If a record has no ``'text'`` key.
    """
    if source is None:
        # Resolve the default lazily: a default of ``discovery_res.values()``
        # in the signature is evaluated once at definition time and keeps
        # pointing at the old dict if ``discovery_res`` is later reassigned.
        source = discovery_res.values()

    # str.join builds the result in one pass instead of growing a string with +=.
    return ''.join(
        val['text'].replace('\n', ' ').replace('\t', ' ') + sep
        for val in source
    )

In [29]:
# Flatten the whole corpus into one string using the function's defaults
# (all records of discovery_res, each followed by ' *EOP* ').
text = get_text_all()

In [33]:
# Preview the first 2000 characters of the concatenated text.
text[:2000]

'Table 5.2 114 Israeli Companies That Received Chinese Investment  The Evolving Israel-China Relationship  Chinese Israeli Entity Company Information Investors Area of Concern Tnuva Tnuvas share of Israels dairy Bright Food Acquisition led to protests in Israel over food  market exceeded 70 percent (state-owned) security, as the deal gave a Chinese state-owned  at the time of acquisition company a 56-percent stake in one of Israels  (declined since then to about largest food producers; members of the Knesset  50 percent). a Economic Affairs Committee opposed the deal.b ThetaRay Cybersecurity company that Alibaba ThetaRays products aim to detect and prevent  specializes in the detection cyber intrusions from Chinese government actors.  and prevention of advanced  persistent threats, which are  high-level cyber actors, usually  nation-states.c Kaymera Cybersecurity start-up focused GoCapital Kaymeras security products are marketed toward  on mobile devices.d governments as well as busine

In [36]:
# Run the extractor once over the entire concatenated corpus.
# most_common_definition=True presumably keeps, per abbreviation, the definition
# seen most often -- TODO confirm against the library documentation.
pairs = sh.extract_abbreviation_definition_pairs(doc_text=text, most_common_definition=True)

In [37]:
# Display the result of the full-corpus run.
# NOTE(review): the recorded output is an empty dict; the cause is not established here.
pairs

{}

In [38]:
# Length, in characters, of the concatenated corpus string.
len(text)

20807821

In [39]:
# Split the concatenated text back into segments on the '*EOP*' marker.
# NOTE(review): `text` was built with ' *EOP* ' appended after every segment, so the
# pieces keep the surrounding spaces and the final piece is just ' ' (one more
# element than there are segments).
# NOTE(review): the variable name is misspelled ("paragrpahs"); it is left as-is
# because a following cell reads it under this name.
paragrpahs = text.split('*EOP*')

In [40]:
# Number of pieces produced by splitting on '*EOP*'.
len(paragrpahs)

9279

In [49]:
# Short excerpt in which 'AVIC' appears without a parenthesised definition and
# 'ORNL' appears as "long form (SHORT)".
text_sample2 = 'a wholly owned unit of AVIC, funding and access to work with Oak Ridge National Laboratories (ORNL) as part of an incentive package to attract the firm '

In [50]:
# Check that the extractor works on a short excerpt when called with the same
# most_common_definition=True option as the full-corpus run, then show the mapping.
pairs = sh.extract_abbreviation_definition_pairs(doc_text=text_sample2, most_common_definition=True)
pairs

{'ORNL': 'Oak Ridge National Laboratories'}

In [51]:
# Gather the whitespace-flattened text (newlines and tabs turned into spaces) of
# every segment whose source file is the Pointe Bello report.
text_AVIC_1 = [
    segment['text'].replace('\n', ' ').replace('\t', ' ')
    for segment in discovery_res.values()
    if segment['filename'] == '6- Pointe+Bello_Military+Civil+Fusion+Report- Full.pdf'
]

In [56]:
# Join all segments of the single report into one space-separated string and run
# the extractor on it, then show the mapping.
# NOTE(review): the recorded output is an empty dict; the cause is not established here.
text_sample3 = ' '.join(text_AVIC_1)
pairs = sh.extract_abbreviation_definition_pairs(doc_text=text_sample3, most_common_definition=True)
pairs

{}

In [55]:
# Inspect the joined report text that is fed to the extractor.
# NOTE(review): this cell's execution count (55) is lower than that of the cell
# assigning text_sample3 (56), so the cells were run out of order; re-run top to
# bottom on a fresh kernel to confirm the output.
text_sample3

'Pointe  Bello  Promoting regional integration of economic and military development through the signing of  strategic cooperation framework agreements; and  Supporting the going out of Chinas defense industry, to include enhancing cooperation  with foreign governments and promoting Chinas nuclear equipment and technologies. MincanjunBoosting Private Sector Participation Over the last two years, SASTIND and the PLA have issued a number of policies aiming to boost private sector participation in the defense industry. Identified as a priority task in Document 37, opening up Chinas defense market to private sector firmsa concept referred to as mincanjun ( )is designed to acquire rapidly advancing military technologies and innovation spin-on into the defense sector via dual-use technology transfers or developing new products exclusively for military purposes.41 According to now retired LTG Li Andong (), a leading architect of Chinas armament modernization efforts between the early 2000s and