In [11]:
import spacy
from spacy.tokens import Span
from spacy.attrs import ENT_IOB, ENT_TYPE
import numpy
from spacy import displacy
from fpgrowth_py import fpgrowth
import pandas as pd
import re
import json
import seaborn as sns

# Load the small English spaCy pipeline and run it on a sample sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Cell output: label of the first recognised entity ('ORG' in the recorded run).
doc.ents[0].label_
# To list every entity with its character offsets and label instead:
# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

'ORG'

In [2]:
# Entities can be inspected per document (doc.ents) or per token (ent_iob_ / ent_type_).
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# document level: (text, start_char, end_char, label) for every entity span
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level: [text, IOB tag, entity type] of the first two tokens
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [3]:
# Add an entity by hand when the model misses it.
doc = nlp("fb is hiring a new vice president of global policy")
# "Before" reports character offsets (start_char / end_char).
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(

# Create a span for the new entity (token 0 up to, not including, token 1)
fb_ent = Span(doc, 0, 1, label="ORG")
# Snapshot taken before Option 1 runs, so it does not contain fb_ent.
orig_ents = list(doc.ents)

# Both options below are executed one after the other in this cell.
# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc.set_ents([fb_ent], default="unmodified")

# Option 2: Assign a complete list of ents to doc.ents
doc.ents = orig_ents + [fb_ent]

# "After" reports token indices (start / end), unlike the "Before" line.
ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print('After', ents)
# [('fb', 0, 1, 'ORG')] 🎉

Before []
After [('fb', 0, 1, 'ORG')]


In [4]:
# Set entity annotations directly through an attribute array.
# make_doc only tokenizes, so the doc starts without entities.
doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # ()

# One row per token, one column per attribute listed in `header`.
header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc.vocab.strings["GPE"]  # entity type of token 0, as a string-store ID
doc.from_array(header, attr_array)
print("After", doc.ents)  # (London,)

Before ()
After (London,)


In [5]:
# Visualise the entities of a sample sentence with displaCy.
text = "When Sebastian Thrun started working on self-driving cars at the HKPU in 2007, few people outside of the company took him seriously."

# Reuses the pipeline loaded in the first cell.
# nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.render(doc, style="ent")

In [30]:
# Iterate over the sentences of a two-sentence document and print each one.
doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [31]:
# Reload the pipeline without the parser and enable the "senter" component instead;
# the recorded output shows the same two sentences as the previous cell.
# NOTE: this rebinds the notebook-level `nlp`, so every later cell uses the parser-less pipeline.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [12]:
# File locations used by the following cells.
# NOTE(review): these are absolute paths on one Windows machine; the notebook will not
# run elsewhere until they are replaced by paths relative to a configurable data directory.

# JSON file that write_json() appends to by default.
jsfile = r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\Transactions.json"

# Source tables: project titles (CSV), risk abstracts (Excel), stakeholder abstracts (CSV).
project = r"D:\Study\Real Estate\Project\Gg.csv"
risk = r"D:\Study\Real Estate\Risk\Risk New\Risk_Simplified.xlsx"
stake = r"D:\Study\Real Estate\StakeHolder\expansive.csv"

# Workbooks stored in the repository's ExcelData folder.
project1 = r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\newTitle_Project.xlsx"
risk0 = r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\RiskFinal.xlsx"
stake1 = r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\New_StakeHolder_Abstract.xlsx"

In [13]:
# Read the source tables from the paths configured in the previous cell.
pj = pd.read_csv(project, sep=",")
risk1 = pd.read_excel(risk)
stk = pd.read_csv(stake, sep=",")

prj = pd.read_excel(project1)
risk2 = pd.read_excel(risk0)
stk1 = pd.read_excel(stake1)

en = spacy.load('en_core_web_sm')
# Take a private copy: `en.Defaults.stop_words` is a set shared by the language class,
# and `stopwords` is extended in place with `|=` further down the notebook. Without the
# copy those additions would leak into spaCy's own defaults.
stopwords = set(en.Defaults.stop_words)

# get the target text from original dataset to match
nproject = pd.DataFrame(pj["Article Title"])
nrisk = pd.DataFrame(risk1["Abstract"])
nstack = pd.DataFrame(stk["Abstract"])

# Missing abstracts become a fixed placeholder so later string matching never sees NaN.
nrisk["Abstract"] = nrisk["Abstract"].fillna("No Context")

# Lower-case the stakeholder names; bracket access avoids attribute-style column assignment.
stk1["name"] = stk1["name"].str.lower()

  pj = pd.read_csv(project, sep = ",")


In [16]:
def reload_dashes(droped: int, path: str = r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\adjustment.xlsx") -> set:
    """Return the words of the adjustment workbook whose frequency exceeds ``droped``.

    Args:
        droped: exclusive threshold; only rows with ``frequency > droped`` are kept.
        path: Excel workbook to read. It must provide the columns ``frequency`` and
            ``Words``. Defaults to the location that used to be hard-coded here.

    Returns:
        A ``set`` with the ``Words`` values of the kept rows (not a DataFrame, as the
        previous annotation claimed).
    """
    table = pd.read_excel(path)
    kept = table.loc[table["frequency"] > droped, "Words"]
    return set(kept.to_list())

# Extend the stop-word set in place (each `|=` is an in-place union).
# 1) words from the adjustment workbook whose frequency is above 2
stopwords |= reload_dashes(2)
# filout = pd.read_excel(r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\filtered.xlsx")
# stopwords |= set(filout.name.to_list())
# 2) the numbers 1..99 as strings
stopwords |= set([str(num) for num in range(1,100)])

# manually add the words to the stopwords
possiblew = {"connections", "efficacy", "life", "This"}
stopwords |= possiblew

In [17]:
# Pick the ORG names whose frequency is at least 30 ...
fre = pd.read_csv(r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\Examples\ORG_frequency.csv")
chosenone = fre[fre.frequency >= 30]
# ... and drop every name that is itself a stop word; the index is renumbered from 0.
chosenone = chosenone[chosenone.name.apply(lambda x: x not in stopwords)].reset_index(drop = True)

In [23]:
def combination(keyw: str, extracted: list, stop_words=None):
    """Pair every usable neighbour fragment with the keyword it was found next to.

    Duplicate fragments are dropped, fragments whose word part (the fragment with all
    non-word characters removed) is a stop word are discarded, and each survivor is
    rendered as ``fragment + " " + keyw``.

    Args:
        keyw: keyword the fragments were captured around.
        extracted: fragments returned by ``re.findall`` (strings).
        stop_words: optional container of words to reject. When omitted, the
            notebook-level ``stopwords`` set is used, as before.

    Returns:
        List of phrases in order of first occurrence (previously the order was
        arbitrary because the de-duplication went through a ``set``); empty when no
        fragment survives.

    NOTE(review): the fragment is always placed before the keyword, even for fragments
    that were captured after it in the text -- confirm this is intended.
    """
    if stop_words is None:
        stop_words = stopwords
    # dict.fromkeys de-duplicates while keeping first-seen order.
    kept = [
        fragment
        for fragment in dict.fromkeys(extracted)
        if re.sub(r"\W", "", fragment) not in stop_words
    ]
    return [fragment + " " + keyw for fragment in kept]

def regex_match(keyword: str, args: str):
    """Find the words directly before and after ``keyword`` in ``args``.

    Two patterns are applied:
      * before: a word plus one or two non-word characters that is immediately
        followed by ``keyword`` and a whitespace character;
      * after: one or two non-word characters plus a word that immediately follow
        ``keyword``.

    The keyword is escaped before it is placed in the patterns, so names containing
    regex metacharacters (``+``, ``(``, ``.`` ...) are matched literally and the
    look-behind stays fixed-width.

    Args:
        keyword: the name to look for.
        args: the text to search.

    Returns:
        The phrases built by ``combination`` for the "before" fragments followed by
        those for the "after" fragments.
    """
    literal = re.escape(str(keyword))
    before_keyword = rf"(\w+\W\W?)(?=\b{literal}\s)"
    after_keyword = rf"(?<=\b{literal})(\W\W?\w+)"
    preceding = combination(keyword, re.findall(before_keyword, args))
    following = combination(keyword, re.findall(after_keyword, args))
    return preceding + following

def match_sentence(args: str, sent: str):
    """Return every period-delimited sentence of ``sent`` that contains ``args``.

    Args:
        args: keyword to look for; it is matched literally (regex metacharacters are
            escaped) and case-sensitively.
        sent: text to scan. Sentences are the stretches between ``.`` characters.

    Returns:
        List of matching sentences, without the terminating period and with their
        leading whitespace preserved. Sentences that begin or end with the keyword
        are included (the previous pattern required at least one character on both
        sides and silently skipped them).
    """
    if not args:
        # An empty keyword is contained in every sentence.
        return re.findall(r"[^.]+", sent)
    literal = re.escape(args)
    return re.findall(rf"[^.]*{literal}[^.]*", sent)

In [33]:
# Try match_sentence on the first row of nrisk: `sent` is the list of that row's values,
# and its first element is the text that gets searched for the keyword "risk".
args, sent="risk",list(nrisk.iloc[0])
match_sentence(args, sent[0])

[' Thus, a dynamic risk control system is a valuable support for the successful completion of the sleeve grouting process',
 ' This study aims to develop an entropy-based sleeve grouting risk dynamic control system',
 ' Design/methodology/approach First, static risk assessment was conducted through the structured interview survey using the entropy weight method, followed by a dynamic risk control technique, where indicators were simulated through system dynamics containing causal loop diagrams and stock-and-flow diagrams',
 ' Findings Finally, three types of risk control models, namely, tortuous type, stable type and peak loop type, were developed in the entropy-based sleeve grouting risk dynamic control system and simulated using system dynamics in a real case',
 ' Originality/value Compared to traditional sleeve grouting risk management, the developed system enabled dynamic control over time']

In [19]:
# Pre-processing for matching; only applies to risk keyword extraction.
def reduction(args: str, val: str, flags: int = 0):
    """Report whether ``args`` occurs in ``val`` as a whole word.

    Args:
        args: keyword to look for; converted with ``str``. The characters ``(``, ``)``
            and ``+`` are replaced by spaces before the keyword is used as a pattern.
        val: text to search.
        flags: optional ``re`` flags forwarded to ``re.search`` (e.g.
            ``re.IGNORECASE``). The default ``0`` keeps the case-sensitive matching
            this function has always performed.

    Returns:
        ``True`` when ``\\b<keyword>\\b`` is found in ``val``, otherwise ``False``.
        ``False`` is also returned, after printing the offending keyword, when the
        keyword is not a valid pattern or ``val`` is not text. Previously the handler
        itself raised ``TypeError`` because ``flags=`` was passed to ``print``.
    """
    # Same substitution as the original character loop, done in one pass.
    keyword = str(args).translate(str.maketrans("()+", "   "))
    try:
        return re.search(rf"\b{keyword}\b", val, flags=flags) is not None
    except (re.error, TypeError):
        print(f"the currently word is: {keyword}")
        return False
    
def dummy_project(args: object, val: str):
    """Report whether the text form of ``args`` occurs anywhere in ``val``.

    Args:
        args: value to look for; converted with ``str`` (the original computed
            ``str(args)`` but then searched for the raw value, so non-string input
            raised ``TypeError``).
        val: text to search.

    Returns:
        ``True`` when ``str(args)`` is a substring of ``val``. Missing values
        (``None`` or NaN) never match, so that their text form ``"nan"`` cannot hit
        words such as "financial".
    """
    is_nan = isinstance(args, float) and args != args
    if args is None or is_nan:
        return False
    return str(args) in val

def match_attributes(args: str):
    """Collect the phrases that ``regex_match`` finds in one abstract.

    Args:
        args: abstract text (a value from ``nrisk.Abstract``).

    Returns:
        Flat list holding the phrases produced for every name in ``chosenone``;
        names without any phrase contribute nothing.
    """
    casualty = []
#     res = prj["Article Title"].apply(dummy_project, args = (args, ))
#     casualty = prj["Article Title"][res == True].to_list()
#     res = risk2["Abstract"].apply(reduction, args= (args, ))
#     casualty = [*casualty, *risk2.Abstract[res==True].to_list()]
    # One list of phrases per chosen name.
    res = chosenone["name"].apply(regex_match, args=(args,))
    # Drop the empty lists, then convert each list into new rows
    # (attention: here res is a pd.Series, not a pd.DataFrame).
    res = res[res.astype(bool)].explode()
#     res=stk2["stk"].apply(reduction, args=(args,))
#     res=stk2.stk[res==True]
    casualty = [*casualty, *res.to_list()]
#     print(casualty)
    return casualty

def write_json(new_data, filepath=None):
    """Append ``new_data`` to the JSON array stored in ``filepath``.

    Args:
        new_data: JSON-serialisable value to append.
        filepath: file holding a JSON list. Defaults to the notebook-level ``jsfile``,
            looked up when the function is called, so re-running the path cell takes
            effect without re-defining this function.

    The file is read and written as UTF-8. A missing or empty file is treated as an
    empty list. The file is rewritten from scratch, so nothing of the old content can
    remain behind the new payload (the previous ``seek(0)`` + ``dump`` never truncated).

    Raises:
        json.JSONDecodeError: if the existing content is not valid JSON.
        AttributeError: if the stored top-level value is not a list.
    """
    if filepath is None:
        filepath = jsfile
    try:
        with open(filepath, "r", encoding="utf-8") as file:
            raw = file.read()
    except FileNotFoundError:
        raw = ""
    file_data = json.loads(raw) if raw.strip() else []
    file_data.append(new_data)
    # Mode "w" truncates before writing.
    with open(filepath, "w", encoding="utf-8") as file:
        json.dump(file_data, file, indent=4)

# First run over the first 500 rows to find the word frequencies, then delete the words whose frequency is larger than 5.
def extraction(rnum: int = 500):
    """Write the matches of the abstracts in rows ``1`` to ``rnum - 1`` of ``nrisk``.

    Each row's matches (from ``match_attributes``) are appended to the JSON file via
    ``write_json``. A row whose ``Abstract`` is an ``int`` or a ``float`` is printed
    and skipped. Nothing is returned.
    """
    numeric_types = (int, float)
    for row in nrisk.iloc[1:rnum].itertuples():
        abstract = row.Abstract
        if type(abstract) in numeric_types:
            print(row)
        else:
            write_json(match_attributes(abstract))
# extraction(50)
# nrisk.value_counts

Unnamed: 0,name,frequency
0,university,514
1,european,420
2,&,358
3,water,307
4,department,305
...,...,...
214,faculty,30
215,chinese,30
216,councils,30
217,german,30


In [None]:
"this is an example of how a spacy model can be used"
# texts = [
#     "Net income was $9.4 million compared to the prior year of $2.7 million.",
#     "Revenue exceeded twelve billion dollars, with a loss of $1b.",
# ]
#
# nlp = spacy.load("en_core_web_sm")
# for doc in nlp.pipe(texts, disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]):
#     # Do something with the doc here
#     print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Self-defined (custom) functions
from spacy.language import Language


@Language.component("info_integration")
def info_integration(doc):
    """Placeholder pipeline component registered as "info_integration".

    It performs no processing yet and hands the document back unchanged. A spaCy
    pipeline component has to return the ``Doc``; the previous body (``...``)
    returned ``None``, which makes a pipeline containing this component fail.
    """
    return doc


def spacy_org(data):
    """Run the notebook-level ``nlp`` pipeline on ``data`` and list its ORG entities.

    Returns the text of every entity whose label is ``"ORG"``, in document order.
    """
    return [entity.text for entity in nlp(data).ents if entity.label_ == "ORG"]


### LangChain ask
import os
from getpass import getpass

# Never store an API key in the notebook: take it from the environment, or ask for it
# once without echoing it.
# NOTE: the key that used to be hard-coded on this line has been exposed and should be
# revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
from langchain.llms import OpenAI

llm = OpenAI(temperature=0)
# NOTE(review): `stk_sor` is not defined anywhere in the visible part of this notebook --
# confirm which cell creates it.
order = " ".join(stk_sor.loc[0:15, :].Abstract)
text = f"please extract out organization within this word '{stk.Abstract[1]}'"

In [None]:
# fre = pd.read_csv(r"D:\Code Working Area\Python\knowledge-graph-for-stakeholder-risks-detection-in-mega-infrastructure-projects\ExcelData\Examples\ORG_frequency.csv")
# chosenone = fre[fre.frequency >= 30]
# chosenone = chosenone[chosenone.name.apply(lambda x: x not in stopwords)].reset_index(drop = True)
def combination(keyw: str, extracted: list, stop_words=None):
    """Pair every usable neighbour fragment with the keyword it was found next to.

    Duplicate fragments are dropped, fragments whose word part (the fragment with all
    non-word characters removed) is a stop word are discarded, and each survivor is
    rendered as ``fragment + " " + keyw``.

    Args:
        keyw: keyword the fragments were captured around.
        extracted: fragments returned by ``re.findall`` (strings).
        stop_words: optional container of words to reject. When omitted, the
            notebook-level ``stopwords`` set is used, as before.

    Returns:
        List of phrases in order of first occurrence (previously the order was
        arbitrary because the de-duplication went through a ``set``); empty when no
        fragment survives.

    NOTE(review): the fragment is always placed before the keyword, even for fragments
    that were captured after it in the text -- confirm this is intended.
    """
    if stop_words is None:
        stop_words = stopwords
    # dict.fromkeys de-duplicates while keeping first-seen order.
    kept = [
        fragment
        for fragment in dict.fromkeys(extracted)
        if re.sub(r"\W", "", fragment) not in stop_words
    ]
    return [fragment + " " + keyw for fragment in kept]


def regex_match(keyword: str, args: str):
    """Find the words directly before and after ``keyword`` in ``args``.

    Two patterns are applied:
      * before: a word plus one or two non-word characters that is immediately
        followed by ``keyword`` and a whitespace character;
      * after: one or two non-word characters plus a word that immediately follow
        ``keyword``.

    The keyword is escaped before it is placed in the patterns, so names containing
    regex metacharacters (``+``, ``(``, ``.`` ...) are matched literally and the
    look-behind stays fixed-width.

    Args:
        keyword: the name to look for.
        args: the text to search.

    Returns:
        The phrases built by ``combination`` for the "before" fragments followed by
        those for the "after" fragments.
    """
    literal = re.escape(str(keyword))
    before_keyword = rf"(\w+\W\W?)(?=\b{literal}\s)"
    after_keyword = rf"(?<=\b{literal})(\W\W?\w+)"
    preceding = combination(keyword, re.findall(before_keyword, args))
    following = combination(keyword, re.findall(after_keyword, args))
    return preceding + following


# Pre-processing for matching; only applies to risk keyword extraction.
def reduction(args: str, val: str, flags: int = 0):
    """Report whether ``args`` occurs in ``val`` as a whole word.

    Args:
        args: keyword to look for; converted with ``str``. The characters ``(``, ``)``
            and ``+`` are replaced by spaces before the keyword is used as a pattern.
        val: text to search.
        flags: optional ``re`` flags forwarded to ``re.search`` (e.g.
            ``re.IGNORECASE``). The default ``0`` keeps the case-sensitive matching
            this function has always performed.

    Returns:
        ``True`` when ``\\b<keyword>\\b`` is found in ``val``, otherwise ``False``.
        ``False`` is also returned, after printing the offending keyword, when the
        keyword is not a valid pattern or ``val`` is not text. Previously the handler
        itself raised ``TypeError`` because ``flags=`` was passed to ``print``.
    """
    # Same substitution as the original character loop, done in one pass.
    keyword = str(args).translate(str.maketrans("()+", "   "))
    try:
        return re.search(rf"\b{keyword}\b", val, flags=flags) is not None
    except (re.error, TypeError):
        print(f"the currently word is: {keyword}")
        return False


def dummy_project(args: object, val: str):
    """Report whether the text form of ``args`` occurs anywhere in ``val``.

    Args:
        args: value to look for; converted with ``str`` (the original computed
            ``str(args)`` but then searched for the raw value, so non-string input
            raised ``TypeError``).
        val: text to search.

    Returns:
        ``True`` when ``str(args)`` is a substring of ``val``. Missing values
        (``None`` or NaN) never match, so that their text form ``"nan"`` cannot hit
        words such as "financial".
    """
    is_nan = isinstance(args, float) and args != args
    if args is None or is_nan:
        return False
    return str(args) in val


def match_attributes(args: str):
    """Gather the project titles, risk abstracts and stakeholder entries matched to ``args``.

    ``args`` is a value from ``nrisk.Abstract``. A project title is kept when
    ``dummy_project(title, args)`` is true; a risk abstract or stakeholder entry is kept
    when ``reduction(entry, args)`` is true. The result lists titles first, then risk
    abstracts, then stakeholder entries.
    """
    title_hits = prj_sor["Article Title"].apply(dummy_project, args=(args,))
    risk_hits = risk_sor["Abstract"].apply(reduction, args=(args,))
    # Each mask selects the entries of its own Series (a pd.Series, not a pd.DataFrame).
    stk_hits = stk_sor["stk"].apply(reduction, args=(args,))
    return [
        *prj_sor["Article Title"][title_hits == True].to_list(),
        *risk_sor["Abstract"][risk_hits == True].to_list(),
        *stk_sor["stk"][stk_hits == True].to_list(),
    ]


def write_json(new_data, filepath=None):
    """Append ``new_data`` to the JSON array stored in ``filepath``.

    Args:
        new_data: JSON-serialisable value to append.
        filepath: file holding a JSON list. Defaults to the notebook-level ``jsfile``,
            looked up when the function is called, so re-running the path cell takes
            effect without re-defining this function.

    The file is read and written as UTF-8. A missing or empty file is treated as an
    empty list. The file is rewritten from scratch, so nothing of the old content can
    remain behind the new payload (the previous ``seek(0)`` + ``dump`` never truncated).

    Raises:
        json.JSONDecodeError: if the existing content is not valid JSON.
        AttributeError: if the stored top-level value is not a list.
    """
    if filepath is None:
        filepath = jsfile
    try:
        with open(filepath, "r", encoding="utf-8") as file:
            raw = file.read()
    except FileNotFoundError:
        raw = ""
    file_data = json.loads(raw) if raw.strip() else []
    file_data.append(new_data)
    # Mode "w" truncates before writing.
    with open(filepath, "w", encoding="utf-8") as file:
        json.dump(file_data, file, indent=4)


# First run over the first 500 rows to find the word frequencies, then delete the words whose frequency is larger than 5.
def extraction(rnum: tuple = (0, 500)):
    """Write the matches of the abstracts in rows ``rnum[0]`` to ``rnum[1] - 1`` of ``nrisk``.

    Each row's matches (from ``match_attributes``) are appended to the JSON file via
    ``write_json``. A row whose ``Abstract`` is an ``int`` or a ``float`` is printed
    and skipped. Nothing is returned.
    """
    first, last = rnum[0], rnum[1]
    numeric_types = (int, float)
    for row in nrisk.iloc[first:last].itertuples():
        abstract = row.Abstract
        if type(abstract) in numeric_types:
            print(row)
        else:
            write_json(match_attributes(abstract))


# Row ranges of nrisk to process; each tuple is (start, stop) with stop excluded.
interval: list = [(0, 50), (150, 400), (1000, 1500)]

# Run the extraction once per range; every processed row appends one entry to the JSON file.
for items in interval:
    extraction(items)

In [None]:
# This approach did not perform well
# from transformers import AutoTokenizer, AutoModelForTokenClassification
# from transformers import pipeline
#
# tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
# model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
#
# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# Load the "conll2002" dataset through tensorflow_datasets and check the TensorFlow setup.
import tensorflow_datasets as tfds

data = tfds.load("conll2002")
# NOTE: `info` receives the "dev" split here, not dataset metadata.
ds_train, info, ds_test = data["train"], data["dev"], data["test"]
# A bare expression in the middle of a cell is not displayed; only the last one would be.
ds_train
import tensorflow as tf

# GPUs visible to TensorFlow (empty list when none is available).
physic_devices = tf.config.list_physical_devices("GPU")
physic_devices
# tf.config.experimental.set_memory_growth(physic_devices[0], True)
# NOTE(review): duplicate of the import a few lines above.
import tensorflow as tf

print(tf.__version__)

In [1]:
import pandas as pd

In [None]:
# Load the risk workbook from a path relative to the notebook.
# NOTE: an earlier cell binds `risk` to a path string; here the name is rebound to a DataFrame.
risk=pd.read_excel(r"../ExcelData/Source/risk.xlsx", index_col=None)
# risk.head()

In [8]:
# All values of the Abstract column as a plain list, and how many there are.
tar=risk.Abstract.to_list()
print(len(tar))

125097


In [10]:
# Write the first 10,000 abstracts to a text file, separated by blank lines.
# The encoding is set explicitly: without it the platform default is used, which on
# Windows can fail on characters outside the local code page.
with open(r"risk_abstract.txt", "w", encoding="utf-8") as file:
    for val in tar[:10_000]:
        file.write(f"{val}\n\n")