In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from nltk.stem import PorterStemmer
from nltk.tag import CRFTagger
from nltk.tokenize import RegexpTokenizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm

In [2]:
pd.set_option("display.max_colwidth",None)

In [3]:
en_stopwords = nltk.corpus.stopwords.words('english')

In [4]:
en_stemmer = PorterStemmer()

In [5]:
df_dupak_all=pd.read_csv("data/dupak20221010.csv",sep=";")
df_dupak_all.columns

FileNotFoundError: [Errno 2] No such file or directory: 'data/dupak20221010.csv'

In [28]:
tagger=CRFTagger()
tagger.set_model_file(r"model/all_indo_man_tag_corpus_model.crf.tagger")


In [29]:
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()

df_id_stopword=pd.read_csv("data/stopwordbahasa.csv",header=None)
id_stopword=df_id_stopword[0].to_list()

def tokenize_clean(text):
    """Tokenise ``text`` into lower-cased, purely alphabetic tokens.

    The text is split into sentences and then words with NLTK. Each word is
    lower-cased and reduced to its first run of three or more ASCII letters;
    words without such a run (numbers, punctuation, one- or two-letter words)
    are discarded.

    Returns the list of kept tokens, or ``None`` when ``text`` is falsy.
    """
    if not text:
        return None

    kept = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # Only the first alphabetic run of a token survives, so a token
            # such as "vicon/streaming" contributes "vicon" alone.
            match = re.search('[a-zA-Z]{3,}', word.lower())
            if match is not None:
                kept.append(match.group(0))
    return kept

def remove_stopwords(tokenized_text):
    """Drop every token that appears in the module-level ``id_stopword`` list.

    Returns the filtered token list, or ``None`` when ``tokenized_text`` is
    falsy (``None`` or empty).
    """
    if not tokenized_text:
        return None
    return [token for token in tokenized_text if token not in id_stopword]

def stem_text(tokenized_text):
    """Stem each token with the module-level ``stemmer``.

    Returns the list of stems in input order, or ``None`` when
    ``tokenized_text`` is falsy.
    """
    if not tokenized_text:
        return None
    return [stemmer.stem(token) for token in tokenized_text]

def remove_en_stopwords(text):
    """Drop every token that appears in the module-level ``en_stopwords`` list.

    Returns the filtered token list, or ``None`` when ``text`` is falsy.
    """
    if not text:
        return None

    kept = []
    for token in text:
        if token in en_stopwords:
            continue
        kept.append(token)
    return kept

def stem_en_text(text):
    """Stem each token with the module-level ``en_stemmer``.

    Returns the list of stems in input order, or ``None`` when ``text`` is
    falsy.
    """
    if not text:
        return None

    stems = []
    for word in text:
        stems.append(en_stemmer.stem(word))
    return stems

def revome_slash_n(text):
    """Replace newline characters with spaces in every item of ``text``.

    Each item is converted with ``str()`` first, so non-string items come
    back as strings. Returns the new list, or ``None`` when ``text`` is falsy.
    """
    if not text:
        return None

    cleaned = []
    for item in text:
        cleaned.append(str(item).replace("\n", " "))
    return cleaned

def lower_text(text):
    """Lower-case every item of ``text``.

    Each item is converted with ``str()`` first. Returns the new list, or
    ``None`` when ``text`` is falsy.
    """
    if not text:
        return None

    lowered = []
    for item in text:
        lowered.append(str(item).lower())
    return lowered

def make_sentence(arr):
    """Join the strings in ``arr`` with single spaces.

    Returns the joined string, or ``None`` when ``arr`` is falsy.
    """
    return " ".join(arr) if arr else None
    
def text_preprocessing_id(text):
    """Run the Indonesian preprocessing pipeline over ``text``.

    Steps, in order: tokenise and clean, remove ``id_stopword`` entries, stem
    with ``stemmer``, replace newlines, lower-case, and join back into a single
    space-separated string.

    Returns the resulting string, or ``None`` when ``text`` is falsy or every
    token was filtered out along the way.
    """
    if not text:
        return None

    # Each step takes the previous step's output; all of them pass a falsy
    # value through as None.
    pipeline = (
        tokenize_clean,
        remove_stopwords,
        stem_text,
        revome_slash_n,
        lower_text,
        make_sentence,
    )
    value = text
    for step in pipeline:
        value = step(value)
    return value

def text_preprocessing_en(text):
    """Run the English preprocessing pipeline over ``text``.

    Steps, in order: tokenise and clean, remove ``en_stopwords`` entries, stem
    with ``en_stemmer``, replace newlines, lower-case, and join back into a
    single space-separated string.

    Returns the resulting string, or ``None`` when ``text`` is falsy or every
    token was filtered out along the way.
    """
    if not text:
        return None

    # Each step takes the previous step's output; all of them pass a falsy
    # value through as None.
    pipeline = (
        tokenize_clean,
        remove_en_stopwords,
        stem_en_text,
        revome_slash_n,
        lower_text,
        make_sentence,
    )
    value = text
    for step in pipeline:
        value = step(value)
    return value

In [30]:
text_preprocessing_id("pemenuhan permintaan dan layanan teknologi informasi")

'penuh minta layan teknologi informasi'

In [31]:
tagged0=tagger.tag_sents(["melakukan pemenuhan permintaan dan layanan teknologi informasi".split()])
tagged1=tagger.tag_sents([text_preprocessing_id("melakukan pemenuhan permintaan dan layanan teknologi informasi").split()])
tagged0,tagged1


([[('melakukan', 'VB'),
   ('pemenuhan', 'NN'),
   ('permintaan', 'NN'),
   ('dan', 'CC'),
   ('layanan', 'NN'),
   ('teknologi', 'NN'),
   ('informasi', 'NN')]],
 [[('penuh', 'JJ'),
   ('minta', 'VB'),
   ('layan', 'NN'),
   ('teknologi', 'NN'),
   ('informasi', 'NN')]])

In [60]:
# Inspect the tagger's bound `tag` method (the trailing dot was a syntax error).
tagger.tag

<bound method CRFTagger.tag of <nltk.tag.crf.CRFTagger object at 0x7f32b44d7370>>

In [32]:
[pos for i,pos in tagged0[0]]

['VB', 'NN', 'NN', 'CC', 'NN', 'NN', 'NN']

In [16]:
# !pip install neo4j



In [167]:
class Neo4j_Connect:
    """Small wrapper around a Neo4j driver for writing and querying relations.

    The driver is created in ``__init__`` and must be released with
    :meth:`close` when the connection is no longer needed.
    """

    def __init__(self, uri, user, password):
        """Open a driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def print_greeting(self, message):
        """Create a ``Greeting`` node carrying ``message`` and print the reply."""
        with self.driver.session() as session:
            greeting = session.write_transaction(self._create_and_return_greeting, message)
            print(greeting)

    @staticmethod
    def _create_and_return_greeting(tx, message):
        """Transaction function: create the greeting node and return its text."""
        result = tx.run("CREATE (a:Greeting) "
                        "SET a.message = $message "
                        "RETURN a.message + ', from node ' + id(a)", message=message)
        return result.single()[0]

    def add_relation(self, node1,rel,node2,score,idx):
        """Merge two ``NODE`` nodes and link them with a relationship.

        Args:
            node1: ``label`` property of the source node.
            rel: relationship type; also stored as the relationship's ``label``.
            node2: ``label`` property of the target node.
            score: value stored in the relationship's ``score`` property.
            idx: value stored in the relationship's ``idx`` property.

        The created relationship is printed; nothing is returned.
        """
        with self.driver.session() as session:
            res = session.write_transaction(self._add_relation, node1,rel,node2,score,idx)
            print(res)

    @staticmethod
    def _add_relation(tx, node1,rel,node2,score,idx):
        """Transaction function behind :meth:`add_relation`.

        Uses the ``apoc.create.relationship`` procedure because the
        relationship type is supplied as a parameter.
        """
        result = tx.run("MERGE (p:NODE {label:$node1})"
                        "MERGE (d:NODE {label:$node2})"
                        "WITH p,d"
                        " CALL apoc.create.relationship(p,$rel,{label:$rel,score:$score,idx:$idx},d)"
                        "YIELD rel "
                        "RETURN rel",
                       node1=node1,rel=rel,score=score,node2=node2,idx=idx)

        return result.single()[0]

    def query(self, m='',n=''):
        """Find relationships whose end nodes match the given label fragments.

        Args:
            m: substring looked up in the ``label`` of the first node.
            n: substring looked up in the ``label`` of the second node.

        Returns:
            A list of records with the keys ``noun1``, ``relation`` and
            ``noun2``. A pattern is returned when either condition matches.
        """
        # The arguments are forwarded as given; they used to be overwritten
        # with fixed strings before the query ran.
        with self.driver.session() as session:
            return session.read_transaction(self._query, m,n)

    @staticmethod
    def _query(tx, m,n):
        """Transaction function behind :meth:`query`."""
        result = tx.run("match p=(m)-[r]-(n) "
                        "where m.label contains $m or n.label contains $n "
                        "return m as noun1, r as relation, n as noun2",
                       m=m,n=n)

        # Materialise the records here, inside the transaction function, so
        # the caller receives data rather than the result object itself.
        return list(result)



# Connection settings are read from the environment when available so that
# credentials do not have to live in the notebook; the literals are fallbacks
# that keep the previous behaviour when the variables are unset.
n4j = Neo4j_Connect(
    os.environ.get("NEO4J_URI", "bolt://10.242.184.93:7687"),
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "test"),
)

In [34]:
n4j.print_greeting("bismillah")

bismillah, from node 214


In [35]:
n4j.add_relation("ahli pertama","menyusun","kajian teknis",0.5,147)

<Relationship id=410 nodes=(<Node id=212 labels=frozenset() properties={}>, <Node id=213 labels=frozenset() properties={}>) type='menyusun' properties={'idx': 147, 'score': 0.5}>


In [36]:
l=0
s=""
for i,row in df_dupak_all.iterrows():
    l2=len(row.activity_last_part)
    if  l2>l:
        l=l2
        s=row.activity_last_part
s

'menyiapkan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'

In [38]:
txt='Ahli Muda menyiapkan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'
tagged0=tagger.tag_sents([txt.split()])
ori_pos=[pos[-1]+"_"+str(i) for i,pos in enumerate(tagged0[0])]
s=" ".join(ori_pos)
print(s)
re.findall("[NNP_\d]{4,6}\s[NNP_\d]*\s*[NNP_\d]*\s*[^NNP]+\s[NNP_\d]{4,6}\s*[NNP_\d]*\s*[NNP_\d]*\s*",s)
arr=re.findall("[NNP_\d]{4,6}\s[NNP_\d]*\s*[NNP_\d]*\s*[NNP_\d]*\s*",s)
re.findall("(?<=NN_3 NN_4).*?(?=NN_9)",s)


NNP_0 NNP_1 VB_2 NN_3 NN_4 FW_5 FW_6 Z_7 FW_8 NN_9 FW_10 FW_11 CC_12 NN_13 NN_14 NN_15 VB_16


[' FW_5 FW_6 Z_7 FW_8 ']

In [40]:
df_dupak_all.columns

Index(['activities', 'activity_code', 'ak', 'batas_penilaian', 'jenjang',
       'kegiatan', 'nomor_sub_unsur', 'nomor_unsur', 'sub_unsur', 'unsur',
       'activity_cleansed', 'tfidf_vec', 'activity_enriched',
       'activity_enriched_cleansed', 'tfidf_vec_enriched', 'actvity_code_only',
       'activity_last_part', 'activity_code_only', 'evidents'],
      dtype='object')

In [42]:
df_dupak_all.activity_last_part.head(2)

0    melakukan pemenuhan permintaan dan layanan teknologi informasi
1            melakukan pengumpulan informasi mengenai data instansi
Name: activity_last_part, dtype: object

In [43]:
df_dupak_all[df_dupak_all.index==147]["activities"]

147    Tata Kelola dan Tata Laksana Teknologi Informasi.Pengelolaan data (Data management).Melakukan perancangan data model
Name: activities, dtype: object

In [None]:
def get_last_activity(txt):
    """Return the part of ``txt`` after its last ``"."``.

    The whole string is returned when it contains no dot, and an empty string
    when it ends with one.
    """
    _head, _dot, tail = txt.rpartition(".")
    return tail

def complete_sentence_with_subject(row):
    """Prefix the activity with its ``sub_unsur`` when it does not start with a noun.

    The activity text is POS-tagged with the module-level ``tagger``. If the
    first word is tagged ``NN`` or ``NNP`` the text is returned unchanged;
    otherwise ``"<sub_unsur> - <activity>"`` is returned and the change is
    echoed on the current output line.

    Args:
        row: object exposing ``activity_last_part`` and ``sub_unsur``.
    """
    activity = row.activity_last_part
    first_tag = tagger.tag_sents([activity.split()])[0][0][-1]
    if first_tag in ("NN", "NNP"):
        return activity

    completed = row.sub_unsur + " - " + activity
    # "\r" with end='' keeps the progress echo on a single, overwritten line.
    print("\r{}---->{}".format(activity, completed), end='')
    return completed
df_dupak_all.activity_last_part=df_dupak_all.activities.apply(get_last_activity)
df_dupak_all.activity_last_part=df_dupak_all.apply(complete_sentence_with_subject,axis=1)
    

In [46]:
def print_subject(row):
    """Print the activity and its first POS tag when that tag is not a noun.

    Nothing is printed when the first word of ``row.activity_last_part`` is
    tagged ``NN`` or ``NNP``. Always returns ``None``.
    """
    activity = row.activity_last_part
    first_tag = tagger.tag_sents([activity.split()])[0][0][-1]
    if first_tag in ("NN", "NNP"):
        return
    print("{}---->{}".format(first_tag, activity))

df_dupak_all.apply(print_subject,axis=1)
    

0      None
1      None
2      None
3      None
4      None
       ... 
331    None
332    None
333    None
334    None
335    None
Length: 336, dtype: object

In [47]:
df_dupak_all.to_csv("data/dupak_all.csv",sep=";", index=False)

In [48]:
df_dupak_all=pd.read_csv("data/dupak_all.csv",sep=";")

In [228]:
p=tagger.tag_sents(['melakukan instalasi'.split()])
p[0][0][-1]

'VB'

In [235]:
def get_word(txt,ref,preserve_empty_words=False):
    """Turn a string of ``TAG_index`` items back into preprocessed words.

    Args:
        txt: space-separated items of the form ``"<POS>_<index>"``; items that
            do not split into exactly two parts on ``"_"`` are ignored.
        ref: sequence of original words, addressed by ``<index>``.
        preserve_empty_words: when true, a word whose preprocessing yields
            nothing is kept in its original form instead of being dropped.

    Words tagged ``FW`` go through ``text_preprocessing_en``; every other tag
    goes through ``text_preprocessing_id``.

    Returns the space-joined result. Any exception is reported with ``print``
    and an empty string is returned.
    """
    try:
        pieces = []
        for item in txt.split():
            parts = item.split("_")
            if len(parts) != 2:
                continue
            tag, position = parts
            original = ref[int(position)]
            normalise = text_preprocessing_en if tag == "FW" else text_preprocessing_id
            cleaned = normalise(original)
            if cleaned:
                pieces.append(cleaned)
            elif preserve_empty_words:
                pieces.append(original)
        return " ".join(pieces)
    except Exception as x:
        print("ERROR on text {} why?:{}".format(txt,str(x)))
        return ""
    
    
def get_graph_rel(txt, score=0.55, idx=147):
    """Extract ``[noun phrase, relation, noun phrase, score, idx]`` edges from a sentence.

    The sentence is POS-tagged with the module-level ``tagger`` and rewritten
    as a string of ``TAG_position`` items. Runs of items made of the characters
    ``N P F W Z _`` and digits are treated as noun phrases; whatever lies
    between two consecutive noun phrases becomes the relation linking them.
    When the first word is tagged ``VB`` the subject ``"Pegawai"`` is prepended
    so that the leading verb has a left-hand noun phrase.

    Args:
        txt: the sentence to analyse.
        score: value placed in the fourth slot of every edge.
        idx: value placed in the fifth slot of every edge.

    Returns:
        A list of ``[noun1, relation, noun2, score, idx]`` lists; empty when
        ``txt`` is empty or whitespace-only, or when no edge could be built.
    """
    # Nothing to tag: avoid indexing into an empty tag list below.
    if not txt or not txt.strip():
        return []

    tagged0=tagger.tag_sents([txt.split()])
    if tagged0[0][0][-1]=='VB':
        txt='Pegawai '+txt
        tagged0=tagger.tag_sents([txt.split()])
    ori_pos=[pos[-1]+"_"+str(i) for i,pos in enumerate(tagged0[0])]
    ori_word=[pos[0] for pos in tagged0[0]]
    pos_sent=" ".join(ori_pos)
    # Raw string so that \d and \s reach the regex engine without relying on
    # Python passing unknown escape sequences through.
    # NOTE(review): the first item must be followed by whitespace, so a noun
    # phrase consisting of only the last item of the sentence is not matched.
    nnps = re.findall(r"[NPFWZ_\d]{4,6}\s[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*[NPFWZ_\d]*\s*", pos_sent)
    prev=""
    edges=[]
    for nnp in nnps:
        nnp=str(nnp).strip()
        if prev:
            # Everything between the previous and the current noun phrase.
            rel=re.findall("(?<="+prev+").*?(?="+nnp+")",pos_sent)
            if rel:
                rel=rel[0]
                try:
                    # Keep relation words even when preprocessing empties them
                    # (e.g. stop words such as "dan").
                    vb=get_word(rel,ori_word,preserve_empty_words=True)
                    if re.findall(r"\w+",vb):
                        nn1=get_word(prev,ori_word)
                        nn2=get_word(nnp,ori_word)
                        if nn1 and nn2:
                            edges.append([nn1,vb,nn2,score,idx])
                except Exception as x:
                    print("ERR on vb:{}:{}".format(rel,x))
                    # Failed triples are logged for later inspection.
                    with open('data/err.txt','a+') as fo:
                        fo.write(prev+'---'+rel+'---'+nnp+'\n')

        prev=nnp

    return edges
txt='Ahli Muda menyiapkan dan memberikan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'

txt='menyiapkan dan memberikan peralatan video conference'
get_graph_rel(txt)
# n4j.add_relation(nn1,r,nn2,0.5,147)

[['pegawai', 'menyiapkan dan memberikan', 'alat video confer', 0.55, 147]]

In [None]:

def add_dupak_to_neo4j(row):
    """Extract the relations of one activity row and write them to Neo4j.

    When the first word of ``row.activity_last_part`` is not tagged ``NN`` or
    ``NNP``, the sentence is prefixed with ``"Prakom <jenjang> "`` before the
    edges are extracted with ``get_graph_rel``. Every edge is stored through
    ``n4j.add_relation`` with ``row.ak`` as score and the positional index of
    the first ``df_dupak_all`` row sharing this ``activity_code`` as idx.
    Failures on individual edges are printed and do not stop the loop.
    """
    sentence = row.activity_last_part
    first_tag = tagger.tag_sents([sentence.split()])[0][0][-1]
    if first_tag not in ["NNP","NN"]:
        sentence = "Prakom "+row.jenjang+" "+sentence

    # Positional index of the first row carrying this activity code.
    same_code = df_dupak_all['activity_code'] == row.activity_code
    first_label = df_dupak_all.index[same_code][0]
    position = int(df_dupak_all.index.get_loc(first_label))

    credit = row.ak
    for edge in get_graph_rel(sentence):
        try:
            n4j.add_relation(edge[0],edge[1],edge[2],credit,position)
        except Exception as x:
            print("ERROR ADD RELATION:{}====>{}".format(edge,x))
        
#         print("{}-{}-{}-{}".format(node[0],node[1],node[2],ak,idx))
tqdm.pandas()
df_dupak_all.progress_apply(add_dupak_to_neo4j,axis=1)

In [101]:
txt='Ahli Muda menyiapkan dan memberikan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'
res=[]
for x in get_graph_rel(txt):
    print('{}-{}'.format(x[0],x[2]))
    res.append(n4j.query(x[0],x[2]))


ahli muda-alat video confer vicon monitor alat audio video
alat video confer vicon monitor alat audio video-perangkat jaring atur


In [None]:
# !pip install fastapi

In [219]:
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
txt='Ahli Muda menyiapkan dan memberikan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'


from neo4j import GraphDatabase
# Connection settings are read from the environment when available so that
# credentials do not have to live in the notebook; the literals are fallbacks
# that keep the previous behaviour when the variables are unset.
uri=os.environ.get("NEO4J_URI", "bolt://10.242.184.93:7687")
user=os.environ.get("NEO4J_USER", 'neo4j')
password=os.environ.get("NEO4J_PASSWORD", 'test')
driver = GraphDatabase.driver(uri, auth=(user, password))

def query_graph(txt):
    """Look up stored relations that resemble the edges extracted from ``txt``.

    ``txt`` is turned into edges with ``get_graph_rel``. For each edge the
    graph is searched for patterns ``(m)-[r]-(n)`` where the lower-cased label
    of ``m`` contains the edge's first noun phrase or the lower-cased label of
    ``n`` contains its second noun phrase.

    Returns:
        A ``JSONResponse`` whose content is ``{"results": [...]}``; each item
        has the shape ``{"node1": {"label"}, "rel": {"label", "idx", "score"},
        "node2": {"label"}}`` and corresponds to one matched record.
    """
    # The search terms travel as query parameters instead of being pasted into
    # the Cypher text, so quotes in a phrase cannot break or alter the query.
    cypher = ("match p=(m)-[r]-(n) "
              "where toLower(m.label) contains $node1 "
              "or toLower(n.label) contains $node2 "
              "return m,r,n")
    matches=[]
    with driver.session() as session:
        for x in get_graph_rel(txt):
            print(x)
            result = session.run(cypher,
                                 node1=str(x[0]).lower(),
                                 node2=str(x[2]).lower())
            for record in result:
                # A fresh dict per record: reusing one dict made every entry
                # of an edge show the values of its last record.
                matches.append({
                    'node1': {'label': record['m']['label']},
                    'rel': {
                        'label': record['r']['label'],
                        'idx': record['r']['idx'],
                        'score': record['r']['score'],
                    },
                    'node2': {'label': record['n']['label']},
                })
    result={"results":matches}
    json_compatible_item_data = jsonable_encoder(result)
    return JSONResponse(content=json_compatible_item_data)
#     return arr
print('=============')
a=query_graph(txt)
# print('ARR({}):{}'.format(len(a),a))
a
# session.close()
# driver.close()
# n1

['ahli muda', 'menyiapkan dan memberikan', 'alat video confer vicon monitor alat audio video', 0.55, 147]
['alat video confer vicon monitor alat audio video', 'dan', 'perangkat jaring atur', 0.55, 147]


<starlette.responses.JSONResponse at 0x7f32ac55edc0>

In [240]:
txt='melakukan instalasi jaringan komputer'
get_graph_rel(txt)


[['pegawai', 'melakukan', 'instalasi jaring komputer', 0.55, 147]]

In [231]:
import json
txt='Ahli Muda menyiapkan dan memberikan peralatan video conference (vicon/streaming ), monitoring peralatan (audio,\nvideo, dan perangkat jaringan), mengatur layout'
txt='menyusun kajian'
a=query_graph(txt)
b=json.loads(a.body)
for i in b['results']:
    for k,v in i.items():
        print('{}:{}'.format(k,v))