In [1]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np
import string

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from typing import Union

In [2]:
# Words that are always kept by the stopword filter, even when a stopword
# list contains them.
excluded_words = ["tempat", "waktu"]

# Additional stopwords stored in a text file, read as a one-column frame.
txt_stopword = pd.read_csv("stopwords.txt", names=["stopwords"], header=None)

# Final stopword set = NLTK's Indonesian list + two hand-written lists + the
# space-separated words found in the FIRST row of stopwords.txt.
# NOTE(review): only row 0 of the file is used; this assumes the file is a
# single space-separated line - confirm.
NLTK_StopWords = set(
    stopwords.words('indonesian')
    + [
        "detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel",
        "detiksumbar", "detikbali", "detikpapua", "detiksulteng",
        "detikmaluku", "detjatim", "detikcom", "allahumma", "aamiin",
        "allah", "bismillah",
    ]
    + [
        "yg", "dg", "rt", "dgn", "ny", "d", 'klo',
        'kalo', 'amp', 'biar', 'bikin', 'bilang',
        'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
        'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
        'jd', 'jgn', 'sdh', 'aja', 'n', 't',
        'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
        '&amp', 'yah',
    ]
    + txt_stopword["stopwords"][0].split(' ')
)

In [3]:
# def preprocessing(berita):
#     # Preprocessing
#     s = berita.lower()
#     s = s.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
#     s = s.encode('ascii', 'replace').decode('ascii')
#     ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", s).split())
#     s.replace("http://", " ").replace("https://", " ")
#     s = re.sub('\s+', ' ', s)
#     s = s.strip()
#     s = s.translate(str.maketrans("","", string.punctuation))
#     s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
#     tokens = [token for token in s.split(" ") if token != ""]

#     # Stopwords checking
#     T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
#     return T
def preprocessing(berita):
    """Normalise a text and return its non-stopword tokens.

    The input is converted with ``str()``, lower-cased, every character that
    is not an ASCII letter, digit or whitespace is replaced by a space, and
    the result is split on ANY whitespace run (spaces, tabs, newlines, ...).
    Splitting only on the space character would leave tokens such as
    ``"a\\tb"`` glued together, because the regex keeps all whitespace.

    Args:
        berita: The text to clean (any object; it is passed through ``str``).

    Returns:
        list[str]: Tokens in original order. A token is kept when it is in
        ``excluded_words`` or when it is not in ``NLTK_StopWords``.
    """
    s = str(berita).lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # str.split() with no argument splits on every whitespace run and never
    # yields empty strings, so no extra empty-token filter is needed.
    tokens = s.split()
    return [t for t in tokens if (t in excluded_words) or (t not in NLTK_StopWords)]

In [4]:
# Full corpus: read the CSV and drop rows whose 'description' is missing.
df_total = pd.read_csv('corpus/dataset/df_total.csv').dropna(subset=['description'])
print(df_total.info())
print('-' * 90)
# NOTE(review): this loads the *train* pickle although the cell handles the
# full corpus; the printed length (1450) differs from the 1612 rows of
# df_total - confirm this is intended.
document_text = joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1612 entries, 0 to 1611
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1612 non-null   object
 1   date         1612 non-null   object
 2   description  1612 non-null   object
 3   source       1612 non-null   object
dtypes: object(4)
memory usage: 63.0+ KB
None
------------------------------------------------------------------------------------------
1450


In [5]:
# Test split: read the CSV and drop rows whose 'description' is missing.
df_test = pd.read_csv('corpus/dataset/df_test.csv').dropna(subset=['description'])
print(df_test.info())
print('-' * 90)
# Pickled object for the test split (only its length is inspected here).
document_text_test = joblib.load('corpus/model/desc_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 0 to 161
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        162 non-null    object
 1   date         162 non-null    object
 2   description  162 non-null    object
 3   source       162 non-null    object
dtypes: object(4)
memory usage: 6.3+ KB
None
------------------------------------------------------------------------------------------
162


In [6]:
# Train split: read the CSV and drop rows whose 'description' is missing.
df_train = pd.read_csv('corpus/dataset/df_train.csv').dropna(subset=['description'])
print(df_train.info())
print('-' * 90)
# Pickled object for the train split (only its length is inspected here).
document_text_train = joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1450 entries, 0 to 1449
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1450 non-null   object
 1   date         1450 non-null   object
 2   description  1450 non-null   object
 3   source       1450 non-null   object
dtypes: object(4)
memory usage: 56.6+ KB
None
------------------------------------------------------------------------------------------
1450


In [9]:
# Load the bag-of-words (BoW) table for the "what" question from CSV and
# display its first rows as the cell output.
df_bow_what = pd.read_csv("bow/bow_what.csv")
df_bow_what.head()

Unnamed: 0,tingkat setelah parent,parent,similarity
0,0,bencana,"[('musibah', 0.6435083150863647), ('banjir', 0..."
1,1,musibah,"[('kecelakaan', 0.7218754887580872), ('insiden..."
2,1,banjir,"[('longsor', 0.7020419836044312), ('kekeringan..."
3,1,kelaparan,"[('wabah', 0.7254995703697205), ('epidemi', 0...."
4,1,gempa,"[('tsunami', 0.6986302137374878), ('letusan', ..."


In [10]:
# Take the parent term of every BoW row (second column, positional index 1).
bow_list_what = [df_bow_what.iloc[row, 1] for row in range(df_bow_what.shape[0])]

print("What ", bow_list_what[0])

print({'What': len(bow_list_what)})


What  bencana
{'What': 321}


In [11]:
# Find the first-pass documents (uses the train TF-IDF matrix)
def cari_dokpertama(kueriAsli: str):
    """Return the text of the five best-matching documents for a query.

    The query is cleaned with ``preprocessing``, vectorised with the pickled
    TF-IDF vectorizer and compared (cosine similarity) against the pickled
    train TF-IDF matrix. The second-to-last column of ``df_total`` is read
    for the five highest-scoring row positions, best first.

    NOTE(review): row positions come from ``tfidf_train.pkl`` but are looked
    up in ``df_total``; this assumes both share the same row order - confirm.

    Args:
        kueriAsli: The raw query string.

    Returns:
        str: The five retrieved texts joined with ``". "``.
    """
    cleaned_query = " ".join(preprocessing(kueriAsli))

    tfidf_matrix = joblib.load('corpus/matrix/tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')

    query_vec = tfidf_vectorizer.transform([cleaned_query])
    scores = cosine_similarity(tfidf_matrix, query_vec).reshape((-1))

    # argsort is ascending: take the last five positions and reverse them so
    # the most similar document comes first.
    best_rows = scores.argsort()[-5:][::-1]
    return ". ".join([df_total.iloc[row, -2] for row in best_rows])

In [12]:
## Keyword extraction with YAKE
def keyword_yake(hasilSearch: str):
    """Extract keyword candidates from a text with YAKE.

    Two extractors are used (``lan="id"``, ``top=50``): one for 1-grams and
    one for 2-grams. The 1-gram results come first, followed by the 2-gram
    results; scores are discarded. The resulting list is also printed
    between banner lines.

    Args:
        hasilSearch: The text to extract keywords from.

    Returns:
        list[str]: Keyword strings, 1-gram results followed by 2-gram results.
    """
    scored_keywords = []
    for ngram_size in (1, 2):
        extractor = KeywordExtractor(lan="id", n=ngram_size, top=50)
        scored_keywords.extend(extractor.extract_keywords(text=hasilSearch))

    # Each entry is a (keyword, score) pair; keep only the keyword.
    keywordYake = [phrase for phrase, _score in scored_keywords]

    banner = '*' * 120
    print(banner)
    print(banner)
    print("Keyword yake")
    print(keywordYake)
    print(banner)
    print(banner)

    return keywordYake

In [13]:
# Keyword extraction with TF-IDF
def keyword_tfidf(hasilSearch: str, top_n: int = 25):
    """Rank the words of a text by a sentence-level TF-IDF score.

    * TF  = occurrences of the word / total number of words in the text.
    * IDF = log(number of sentences / number of sentences containing the word).

    Sentences come from ``tokenize.sent_tokenize``. Words are lower-cased
    ``\\w`` runs; a word is scored when it is in ``excluded_words`` or not in
    ``NLTK_StopWords``.

    Fixes compared with the previous implementation:
      * the sentence count was computed by testing each *character* of the
        word against the raw (not lower-cased) sentence, which gave wrong
        counts and could be 0, raising ``ZeroDivisionError``;
      * the sentence count was only computed for words occurring more than
        once; it is now computed for every scored word.

    Args:
        hasilSearch: The text to analyse.
        top_n: Maximum number of keywords to return (default 25, the value
            that used to be hard-coded).

    Returns:
        list[str]: Up to ``top_n`` words, highest TF-IDF first. Ties keep the
        order of first appearance in the text. The list is also printed
        between banner lines.
    """
    def _tokens(text):
        # One tokenisation rule for both the full text and each sentence, so
        # that word/sentence membership tests compare like with like.
        return re.sub(r'[^\w]', ' ', text).lower().split()

    total_words = _tokens(hasilSearch)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    # Term frequency over the non-stopword tokens.
    counts = {}
    for word in total_words:
        if (word in excluded_words) or (word not in NLTK_StopWords):
            counts[word] = counts.get(word, 0) + 1
    tf_score = {word: count / total_word_length for word, count in counts.items()}

    # Inverse "document" frequency where each sentence is one document.
    sentence_vocab = [set(_tokens(sentence)) for sentence in total_sentences]
    idf_score = {}
    for word in tf_score:
        sentence_hits = sum(word in vocab for vocab in sentence_vocab)
        # max(..., 1) guards against a tokenizer mismatch producing 0 hits
        # (division by zero) or an empty sentence list (log of zero).
        idf_score[word] = math.log(max(total_sent_len, 1) / max(sentence_hits, 1))

    tf_idf_score = {word: tf_score[word] * idf_score[word] for word in tf_score}

    # sorted() is stable, so equal scores keep first-appearance order.
    ranked = sorted(tf_idf_score.items(), key=itemgetter(1), reverse=True)
    keywordtfidf2 = [word for word, _score in ranked[:top_n]]

    print('*'*120)
    print('*'*120)
    print("Keyword TFIDF")
    print (keywordtfidf2)
    print('*'*120)
    print('*'*120)

    return keywordtfidf2

In [14]:
# Keyword extraction with KeyBERT
def keyword_bert(hasilSearch: str):
    """Extract keyword candidates from a text with the shared KeyBERT model.

    Two extractions are run with ``top_n=50``: one restricted to 1-grams and
    one allowing 1- and 2-grams. The results are interleaved position by
    position (1-gram result, then the (1, 2)-gram result at the same rank).

    The interleaving is driven by the 1-gram list, exactly as before; the
    only change is that a shorter second list no longer raises
    ``IndexError`` - its missing positions are simply skipped. Entries of the
    second list beyond the length of the first are not used.

    Args:
        hasilSearch: The text to extract keywords from.

    Returns:
        list[str]: Interleaved keyword strings (scores are dropped). The list
        is also printed between banner lines.
    """
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 2))

    keywordbert = []
    for position, unigram_hit in enumerate(keyword1):
        # Each hit is a (keyword, score) pair; keep only the keyword.
        keywordbert.append(unigram_hit[0])
        if position < len(keyword2):
            keywordbert.append(keyword2[position][0])

    print('*'*120)
    print('*'*120)
    print("Keyword Bert")
    print (keywordbert)
    print('*'*120)
    print('*'*120)

    return keywordbert

In [15]:
# Rank the merged keyword candidates by embedding similarity to the query
def rangking(keywordGabung, kueriAsli: str, closest_n: int = 80):
    """Order unique keyword candidates by cosine distance to the query.

    Candidates are de-duplicated (first occurrence wins, entries equal to 0
    are skipped), embedded with the shared ``embedder`` and sorted by
    ascending cosine distance to the embedding of ``kueriAsli``.

    Changes compared with the previous implementation:
      * ``cdist`` is imported explicitly; the notebook only imports
        ``scipy.sparse``, so ``scipy.spatial`` was reachable only if another
        library happened to have loaded it;
      * an empty candidate list returns ``[]`` instead of failing inside
        ``cdist``;
      * the cut-off is a parameter (default 80, the former hard-coded value).

    Args:
        keywordGabung: Iterable of candidate keyword strings.
        kueriAsli: The query string the candidates are compared against.
        closest_n: Maximum number of candidates to return.

    Returns:
        list[str]: Up to ``closest_n`` candidates, closest first. The list is
        also printed between banner lines.
    """
    from scipy.spatial.distance import cdist

    # Order-preserving de-duplication.
    kandidatFinalCek = []
    for kandidat in keywordGabung:
        if kandidat != 0 and kandidat not in kandidatFinalCek:
            kandidatFinalCek.append(kandidat)

    kandidatFinalFix = []
    if kandidatFinalCek:
        query_embedding = embedder.encode([kueriAsli])[0]
        candidate_embeddings = embedder.encode(kandidatFinalCek)
        distances = cdist([query_embedding], candidate_embeddings, 'cosine')[0]
        # sorted() is stable: equally distant candidates keep input order.
        order = sorted(range(len(distances)), key=lambda idx: distances[idx])
        kandidatFinalFix = [kandidatFinalCek[idx] for idx in order[:closest_n]]

    print('*'*120)
    print('*'*120)
    print ('Kandidat Final Fix Rank: ', kandidatFinalFix)
    print('*'*120)
    print('*'*120)

    return kandidatFinalFix


In [16]:
# Keyword candidates from the bag-of-words (BoW) list
def keywordCustomBow(bowList, initialQuery: str, closest_n: int = 500):
    """Order unique BoW terms by cosine distance to the query.

    Terms are de-duplicated (first occurrence wins, entries equal to 0 are
    skipped), embedded with the shared ``embedder`` and sorted by ascending
    cosine distance to the embedding of ``initialQuery``.

    Changes compared with the previous implementation:
      * ``cdist`` is imported explicitly; the notebook only imports
        ``scipy.sparse``, so ``scipy.spatial`` was reachable only if another
        library happened to have loaded it;
      * an empty term list returns ``[]`` instead of failing inside ``cdist``;
      * the cut-off is a parameter (default 500, the former hard-coded value).

    Args:
        bowList: Iterable of BoW term strings.
        initialQuery: The query string the terms are compared against.
        closest_n: Maximum number of terms to return.

    Returns:
        list[str]: Up to ``closest_n`` terms, closest first. The list is also
        printed between banner lines.
    """
    from scipy.spatial.distance import cdist

    # Order-preserving de-duplication.
    cekDuplicate = []
    for term in bowList:
        if term != 0 and term not in cekDuplicate:
            cekDuplicate.append(term)

    kandidatFix = []
    if cekDuplicate:
        query_embedding = embedder.encode([initialQuery])[0]
        term_embeddings = embedder.encode(cekDuplicate)
        distances = cdist([query_embedding], term_embeddings, 'cosine')[0]
        # sorted() is stable: equally distant terms keep input order.
        order = sorted(range(len(distances)), key=lambda idx: distances[idx])
        kandidatFix = [cekDuplicate[idx] for idx in order[:closest_n]]

    print('*'*120)
    print('*'*120)
    print("Keyword BoW")
    print ('Kandidat BoW: ', kandidatFix)
    print('*'*120)
    print('*'*120)

    return kandidatFix

In [17]:
# Build the query for the "what" question: clean the natural-language
# question with preprocessing() and join the remaining tokens with spaces.
what_initial_query = "bencana apa yang terjadi dalam berita"

what_query = " ".join(preprocessing(what_initial_query))
print(what_query)


bencana berita


In [18]:
# Prepare w data
def prepareWData(initial_query: str, bow_list):
    """Build the expanded query for one question.

    Steps, in call order:
      1. ``cari_dokpertama`` retrieves the first-pass text for the query.
      2. ``keyword_yake``, ``keyword_tfidf`` and ``keyword_bert`` extract
         candidates from that text (the statistical query expansion).
      3. ``keywordCustomBow`` ranks the BoW terms against the query.
      4. ``rangking`` ranks the merged statistical candidates.
      5. Ranked statistical candidates followed by BoW terms are cleaned with
         ``preprocessing``, flattened to tokens and de-duplicated in order.

    Args:
        initial_query: The query string.
        bow_list: The BoW terms passed to ``keywordCustomBow``.

    Returns:
        list: ``[qeGabungan, qeStatistik, qeBoW, qeGabunganDelimiter]`` where
        ``qeGabungan`` is a one-element list holding the unique tokens joined
        by spaces, ``qeStatistik`` is the result of ``rangking``, ``qeBoW`` is
        the result of ``keywordCustomBow`` and ``qeGabunganDelimiter`` is the
        list of unique tokens.
    """
    hasilSearch = cari_dokpertama(initial_query)

    # Statistical expansion: YAKE, then TF-IDF, then KeyBERT candidates.
    statistical_candidates = [
        *keyword_yake(hasilSearch),
        *keyword_tfidf(hasilSearch),
        *keyword_bert(hasilSearch),
    ]

    # BoW expansion is computed before the statistical candidates are ranked.
    qeBoW = keywordCustomBow(bow_list, initial_query)
    qeStatistik = rangking(statistical_candidates, initial_query)

    # Ranked statistical candidates first, BoW terms second; each phrase is
    # split into cleaned tokens and dict.fromkeys keeps the first occurrence.
    cleaned_phrases = [preprocessing(phrase) for phrase in [*qeStatistik, *qeBoW]]
    qeGabunganDelimiter = list(dict.fromkeys(
        token for phrase_tokens in cleaned_phrases for token in phrase_tokens
    ))
    qeGabungan = [" ".join(qeGabunganDelimiter)]

    print('*'*120)

    return [qeGabungan, qeStatistik, qeBoW, qeGabunganDelimiter]

In [19]:
# Run the query-expansion pipeline for the "what" question and unpack the
# four results into separate names used by the later cells.
whatResultList = prepareWData(what_query, bow_list_what)
print("What")
print()
(
    qeGabunganWhat,
    qeStatistikWhat,
    qeBoWWhat,
    qeGabunganDelimiterWhat,
) = whatResultList[:4]


************************************************************************************************************************
************************************************************************************************************************
Keyword yake
['banjir', 'hujan', 'air', 'kawasan', 'warga', 'jakarta', 'rumah', 'sungai', 'tanggul', 'gempa', 'wib', 'timur', 'kramat', 'jati', 'ciliwung', 'kondisi', 'hek', 'pertigaan', 'juni', 'annisa', 'bogor', 'laut', 'dirasakan', 'getaran', 'rizky', 'barat', 'andi', 'luapan', 'salahuddin', 'puncak', 'video', 'desa', 'jembatan', 'pusat', 'gambas', 'hanyut', 'waspada', 'debit', 'deras', 'katulampa', 'lokasi', 'mengguyur', 'manokwari', 'skala', 'iii', 'mmi', 'benda', 'intensitas', 'pantauan', 'fadhila', 'kramat jati', 'jati jakarta', 'sungai ciliwung', 'hek kramat', 'jakarta timur', 'getaran dirasakan', 'pertigaan hek', 'kawasan pertigaan', 'tanggul kawasan', 'banjir', 'annisa rizky', 'hujan', 'air', 'kawasan', 'warga', 'kondisi tanggul', 'gambas

In [20]:
# Calculation what
def wCalculationWhat():
    """Score every test document against the expanded "what" query and save a CSV.

    For each position ``i`` in ``range(len(document_text_test))`` the text at
    ``df_total.iloc[i, -2]`` is vectorised, compared (cosine similarity) with
    the expanded query ``qeGabunganWhat``, and searched for each token of
    ``qeGabunganDelimiterWhat``. One row per document is written to
    ``result/QE_Stat_V2_testing_result_what_new.csv``.

    Changes compared with the previous implementation:
      * the loop used ``range(0, len(document_text_test)-1)`` and therefore
        skipped the last document; all documents are processed now;
      * the vectorizer pickle is loaded once instead of once per document
        (it is refit on each document anyway);
      * the redundant ``document_result_w`` list is gone.

    NOTE(review): the texts are read from ``df_total`` although the loop is
    sized by the test set - confirm whether ``df_test`` was intended.
    NOTE(review): the vectorizer is refit on each single document, so the
    similarity does not use corpus-level IDF - confirm this is intended.
    NOTE(review): tokens are matched as regex patterns against the raw
    (not lower-cased) text - confirm.

    Returns:
        None. The result is the CSV file written to disk.
    """
    tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
    document_result = []

    for i in range(len(document_text_test)):
        teks = df_total.iloc[i, -2]

        # Fit on this one document, then project the expanded query into the
        # same vocabulary.
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        query_vec_what = tfidf_vectorizer.transform(qeGabunganWhat)
        results_what = cosine_similarity(tfidf_matrix, query_vec_what).reshape((-1))

        # Expanded-query tokens that occur in the document text.
        hasilWhat = [key for key in qeGabunganDelimiterWhat if re.search(key, teks)]

        # The two zeros fill the 'True Positive' / 'True Negative' columns.
        document_result.append([i, qeGabunganWhat, qeStatistikWhat, qeBoWWhat, hasilWhat, results_what, 0, 0, teks])

    writer = pd.DataFrame(document_result, columns=['Data', 'QE Gabungan', 'QE Statistik', 'QE Bow', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative','Skimming News'])
    writer.to_csv('result/QE_Stat_V2_testing_result_what_new.csv', index=False, sep=',')

In [37]:
# Run the "what" evaluation and write its CSV result.
wCalculationWhat()
# The "who" and "when" variants are disabled; their definitions are not
# visible in this part of the notebook.
# wCalculationWho()
# wCalculationWhen()