In [21]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np
import string

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from typing import Union

In [22]:
excluded_words = ["tempat", "waktu"]

NLTK_StopWords = stopwords.words('indonesian')
NLTK_StopWords.extend(["detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel", "detiksumbar", "detikbali", "detikpapua", "detiksulteng", "detikmaluku", "detjatim", "detikcom", "allahumma", "aamiin", "allah", "bismillah"])
NLTK_StopWords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
NLTK_StopWords.extend(txt_stopword["stopwords"][0].split(' '))
NLTK_StopWords = set(NLTK_StopWords)

In [23]:
# def preprocessing(berita):
#     # Preprocessing
#     s = berita.lower()
#     s = s.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
#     s = s.encode('ascii', 'replace').decode('ascii')
#     ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", s).split())
#     s.replace("http://", " ").replace("https://", " ")
#     s = re.sub('\s+', ' ', s)
#     s = s.strip()
#     s = s.translate(str.maketrans("","", string.punctuation))
#     s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
#     tokens = [token for token in s.split(" ") if token != ""]

#     # Stopwords checking
#     T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
#     return T
def preprocessing(berita):
    s = str(berita)
    s = s.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
    return T

In [24]:
from datetime import datetime, timedelta

month_to_number = {
  'jan': '01',
  'feb': '02',
  'mar': '03',
  'apr': '04',
  'mei': '05',
  'jun': '06',
  'jul': '07',
  'agu': '08',
  'sep': '09',
  'okt': '10',
  'nov': '11',
  'des': '12',
}

def getNewsTime(timestamp: str, news: str, source: str):
  if source == "www.detik.com":
    splitted_timestamp = timestamp.split()
    pre_formatted_time = splitted_timestamp[3] + '-' + month_to_number[splitted_timestamp[2].lower()] + '-' + splitted_timestamp[1] + ' ' + splitted_timestamp[4] + ':00'

    date_time_obj = datetime.strptime(pre_formatted_time, '%Y-%m-%d %H:%M:%S')
    
    # 0 sabtu, 1 05, 2 Mar, 3 2022, 4 15:33, 5 WIB
    if "kemarin" in news:
      yesterday = date_time_obj - timedelta(1)
      return yesterday.strftime('%Y-%m-%d %H:%M:%S')
    elif "selumbari" in news:
      daysago = date_time_obj - timedelta(2)
      return daysago.strftime('%Y-%m-%d %H:%M:%S')
    else:
      return date_time_obj
    

In [25]:
df_total = pd.read_csv('corpus/dataset/df_total0.3.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*80)
document_text= joblib.load('corpus/model/document_text_train0.3.pkl')
print(len(document_text))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342 entries, 0 to 1341
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1342 non-null   object
 1   date         1342 non-null   object
 2   description  1342 non-null   object
 3   source       1342 non-null   object
dtypes: object(4)
memory usage: 52.4+ KB
None
--------------------------------------------------------------------------------
939


In [26]:
df_test = pd.read_csv('corpus/dataset/df_test0.3.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*80)
document_text_test= joblib.load('/Users/adi/Desktop/TA_Adi_V2/QE Statistik V2/corpus/model/document_text_test0.3.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 403 entries, 0 to 402
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        403 non-null    object
 1   date         403 non-null    object
 2   description  403 non-null    object
 3   source       403 non-null    object
dtypes: object(4)
memory usage: 15.7+ KB
None
--------------------------------------------------------------------------------
403


In [27]:
df_train = pd.read_csv('corpus/dataset/df_train0.3.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*80)
document_text_train= joblib.load('/Users/adi/Desktop/TA_Adi_V2/QE Statistik V2/corpus/model/document_text_train0.3.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 939 entries, 0 to 938
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        939 non-null    object
 1   date         939 non-null    object
 2   description  939 non-null    object
 3   source       939 non-null    object
dtypes: object(4)
memory usage: 36.7+ KB
None
--------------------------------------------------------------------------------
939


In [28]:
# Load bow dataset
df_bow_what = pd.read_csv("bow/bow_what_filter.csv")
df_bow_what.head()

# df_bow_when = pd.read_csv("bow/bow_when.csv")
# df_bow_when.head()

# df_bow_where = pd.read_csv("bow/bow_where.csv")
# df_bow_where.head()

# df_bow_who = pd.read_csv("bow/bow_who.csv")
# df_bow_who.head()


Unnamed: 0,tingkat setelah parent,parent,similarity
0,1,kriminalitas,"[('kekerasan', 0.6600057482719421), ('kejahata..."
1,2,kekerasan,"[('intimidasi', 0.7123724222183228), ('pemerko..."
2,2,kejahatan,"[('kriminal', 0.7204362154006958), ('kekerasan..."
3,2,pemerkosaan,"[('pelecehan', 0.7989924550056458), ('perkosaa..."
4,2,rasisme,"[('antisemitisme', 0.8157601952552795), ('seks..."


In [29]:
# Ambil parent dari bow
bow_list_what = []
# bow_list_when = []
# bow_list_where = []
# bow_list_who = []

for i in range(0, df_bow_what.shape[0]):
  bow_list_what.append(df_bow_what.iloc[i, 1])

# for i in range(0, df_bow_when.shape[0]):
#   bow_list_when.append(df_bow_when.iloc[i, 1])

# for i in range(0, df_bow_where.shape[0]):
#   bow_list_where.append(df_bow_where.iloc[i, 1])

# for i in range(0, df_bow_who.shape[0]):
#   bow_list_who.append(df_bow_who.iloc[i, 1])

print("What ", bow_list_what[0])
# print("When ", bow_list_when[0])
# print("Where ", bow_list_where[0])
# print("Who ", bow_list_who[0])

print({
  'What': len(bow_list_what),
  # 'When': len(bow_list_when),
  # 'Where': len(bow_list_where),
  # 'Who': len(bow_list_who)
})


What  kriminalitas
{'What': 6995}


In [30]:
# Use data train
def cari_dokpertama(kueriAsli: str):
    kueriPre = preprocessing(kueriAsli)
    kueriPre = " ".join(kueriPre)
    hasilSearch = []
    tfidf_matrix = joblib.load('corpus/matrix/tfidf_train0.3.pkl')
    tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer0.3.pkl')
    query_vec = tfidf_vectorizer.transform([kueriPre])
    results = cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_total.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    
    return hasilSearch

In [31]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch: str) -> list[str]:
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=50)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=50)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    # keywords = k_extractor2.extract_keywords(text=hasilSearch)
    keywords.extend(k_extractor2.extract_keywords(text=hasilSearch))
    keywordYake = [x for x, y in keywords]
    # keywordYake.append(keywords)
    print('*'*120)
    print('*'*120)
    print("Keyword yake")
    print(keywordYake)
    print('*'*120)
    print('*'*120)
    
    return keywordYake

In [32]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch: str) -> list[str]:

    keywordtfidf=[]
    keywordtfidf2=[]

    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 25))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    
    print('*'*120)
    print('*'*120)
    print("Keyword TFIDF")
    print (keywordtfidf2)
    print('*'*120)
    print('*'*120)

    return keywordtfidf2

In [33]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch: str) -> list[str]:

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 2))

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    
    print('*'*120)
    print('*'*120)
    print("Keyword Bert")
    print (keywordbert)
    print('*'*120)
    print('*'*120)
    
    return keywordbert

In [34]:
def rangking (keywordGabung: list[str], kueriAsli: str) -> list[str]:
    kandidatFinalCek=[]
    kandidatFinalFix=[]
    
    for i in keywordGabung:
        if (i not in kandidatFinalCek and i!=0):
            kandidatFinalCek.append(i)
    queries=[kueriAsli]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(kandidatFinalCek)
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 100
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFinalFix.append(kandidatFinalCek[idx])

    print('*'*120)
    print('*'*120)
    print ('Kandidat Final Fix Rank: ', kandidatFinalFix)
    print('*'*120)
    print('*'*120)

    return kandidatFinalFix


In [35]:
def keywordCustomBow(bowList: list[str], initialQuery: str) -> list[str]:
    cekDuplicate = []
    kandidatFix = []

    for i in bowList:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    queries=[initialQuery]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 500
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    
    print('*'*120)
    print('*'*120)
    print("Keyword BoW")
    print ('Kandidat BoW: ', kandidatFix)
    print('*'*120)
    print('*'*120)

    return kandidatFix

In [36]:
# Creating query for what, when, where, who
what_initial_query = "aksi kriminalitas apa yang sebenarnya terjadi"
# when_initial_query = "kapan waktu berita tersebut terjadi"
# where_initial_query = "di daerah mana tempat berita itu terjadi"
# who_initial_query = "siapa nama pelaku, korban, atau orang yang terlibat dalam peristiwa tersebut"

what_query = preprocessing(what_initial_query)
what_query = " ".join(what_query)
print (what_query)

# when_query = preprocessing(when_initial_query)
# when_query = " ".join(when_query)
# print (when_query)

# where_query = preprocessing(where_initial_query)
# where_query = " ".join(where_query)
# print (where_query)

# who_query = preprocessing(who_initial_query)
# who_query = " ".join(who_query)
# print (who_query)


aksi kriminalitas


In [37]:
def prepareWData(initial_query: str, bow_list: list[str]):
    hasilkandidat = []
    keywordGabung = []
    qeGabungan = []
    kueriFix = []

    hasilSearch     = cari_dokpertama(initial_query)
    # (ini yake + tfidf + bert) = qe statistik
    keywordYake     = keyword_yake(hasilSearch) # 20
    keywordtfidf2   = keyword_tfidf(hasilSearch) # 20
    keywordbert     = keyword_bert(hasilSearch) # 20
    # ini qe bow
    keywordBoW      = keywordCustomBow(bow_list, initial_query)

    for keyword1 in keywordYake:
        keywordGabung.append(keyword1)
    for keyword2 in keywordtfidf2:
        keywordGabung.append(keyword2)
    for keyword3 in keywordbert:
        keywordGabung.append(keyword3)  

    # hasilrank = qe statistik
    hasilrank = rangking(keywordGabung, initial_query)
    
    for word1 in hasilrank:
        kueriFix.append(word1)

    for word2 in keywordBoW:
        kueriFix.append(word2)

    for word3 in kueriFix:
        hasilkandidat.append(word3)

    kueriFix = [preprocessing(i) for i in kueriFix]
    
    qeGabunganDelimiter = []

    for word4 in kueriFix:
        for subWord in word4:
            qeGabungan.append(subWord)
            qeGabunganDelimiter.append(subWord)

    qeGabunganDelimiter = list(dict.fromkeys(qeGabunganDelimiter))
    qeGabungan = list(dict.fromkeys(qeGabungan))

    # (hasil ranking + bow) = kandidat final
    qeGabungan = [" ".join(qeGabungan)]

    print('*'*120)
    qeStatistik = hasilrank
    qeBoW = keywordBoW

    return [qeGabungan, qeStatistik, qeBoW, qeGabunganDelimiter]

In [38]:
whatResultList = prepareWData(what_query, bow_list_what)
print("What")
print()
qeGabunganWhat = whatResultList[0]
qeStatistikWhat = whatResultList[1]
qeBoWWhat = whatResultList[2]
qeGabunganDelimiterWhat = whatResultList[3]


************************************************************************************************************************
************************************************************************************************************************
Keyword yake
['pelaku', 'TNI', 'korban', 'motor', 'Selatan', 'warga', 'Dua', 'Jakarta', 'Ari', 'aksi', 'Bandung', 'Baru', 'jalan', 'Kebayoran', 'Senin', 'begal', 'Polres', 'SMPN', 'Prada', 'Gambas', 'Video', 'sepeda', 'salah', 'tas', 'satu', 'Cimahi', 'massa', 'prajurit', 'Semampir', 'rumah', 'Pasar', 'ABC', 'Sabtu', 'diamankan', 'dilakukan', 'tengah', 'polisi', 'anak', 'Beruntung', 'Mulyati', 'anggota', 'sembilan', 'melakukan', 'Minggu', 'diamuk', 'tirinya', 'ujar', 'milik', 'keluarga', 'ponpes', 'Jakarta Selatan', 'pelaku', 'TNI', 'korban', 'motor', 'prajurit TNI', 'Pasar Kebayoran', 'Kebayoran Baru', 'Selatan', 'warga', 'Dua', 'anggota TNI', 'Jakarta', 'Ari', 'aksi', 'Polres Cimahi', 'Bandung', 'Baru', 'jalan', 'dua prajurit', 'Kebayoran', 'Seni

In [19]:
whoResultList = prepareWData(who_query, bow_list_who)
print("Who")
print()
qeGabunganWho = whoResultList[0]
qeStatistikWho = whoResultList[1]
qeBoWWho = whoResultList[2]
qeGabunganDelimiterWho = whoResultList[3]

************************************************************************************************************************
************************************************************************************************************************
Keyword yake
['pelaku', 'korban', 'lantai', 'uang', 'amaq', 'api', 'minimarket', 'brankas', 'begal', 'juta', 'rabu', 'polisi', 'melawan', 'salah', 'senjata', 'gambasvideo', 'setu', 'ruang', 'majikannya', 'berhasil', 'melarikan', 'berinisial', 'tersangka', 'ditangkap', 'oknum', 'kukar', 'surabaya', 'satpol', 'motor', 'balikpapan', 'umur', 'wib', 'miliknya', 'kabur', 'ketiga', 'kawanan', 'sopir', 'amankan', 'mengaku', 'timur', 'dibegal', 'malam', 'rumah', 'berfoyafoya', 'psk', 'gandha', 'polda', 'kombes', 'kasusnya', 'mirzal', 'pelaku', 'korban', 'senjata api', 'api pelaku', 'lantai ruang', 'ruang brankas', 'disuruh lantai', 'mengawasi lantai', 'lantai mengambil', 'bersenjata api', 'api disuruh', 'minimarket raya', 'karyawan minimarket', 'minimarket 

In [20]:
whenResultList = prepareWData(when_query, bow_list_when)
print("When")
print()
qeGabunganWhen = whenResultList[0]
qeStatistikWhen = whenResultList[1]
qeBoWWhen = whenResultList[2]
qeGabunganDelimiterWhen = whenResultList[3]

************************************************************************************************************************
************************************************************************************************************************
Keyword yake
['pelaku', 'korban', 'jakarta', 'karawang', 'bank', 'polisi', 'metro', 'komnas', 'anak', 'jembatan', 'kakak', 'pembunuhan', 'jabar', 'selatan', 'senjata', 'tempat', 'memiliki', 'polres', 'pencurian', 'swasta', 'hrd', 'ridwan', 'mapolres', 'perhatian', 'berusia', 'mengapresiasi', 'bunuh', 'aldi', 'digantung', 'leher', 'membawa', 'kompol', 'minimarket', 'tersangka', 'tajam', 'kepala', 'penyelidikan', 'staf', 'gambasvideo', 'barat', 'reskrim', 'jaya', 'menonton', 'porno', 'pria', 'joko', 'mengungkap', 'mym', 'menyebut', 'keluarga', 'metro jakarta', 'pelaku', 'mapolres karawang', 'korban', 'jakarta selatan', 'karawang komnas', 'karawang anak', 'anak komnas', 'korban jembatan', 'polres metro', 'karawang pelaku', 'anak pelaku', 'jakarta', 'b

In [21]:
whereResultList = prepareWData(where_query, bow_list_where)
print("Where")
print()
qeGabunganWhere = whereResultList[0]
qeStatistikWhere = whereResultList[1]
qeBoWWhere = whereResultList[2]
qeGabunganDelimiterWhere = whereResultList[3]

************************************************************************************************************************
************************************************************************************************************************
Keyword yake
['pelaku', 'mobil', 'bank', 'tersangka', 'pikap', 'polisi', 'korban', 'karawang', 'emas', 'pencurian', 'komnas', 'bantul', 'anak', 'jembatan', 'tkp', 'kambing', 'pajero', 'film', 'nri', 'jabar', 'mengaku', 'pembunuhan', 'kakak', 'perampokan', 'syaprudin', 'warga', 'sport', 'toko', 'beraksi', 'selasa', 'lokasi', 'jawa', 'timur', 'heist', 'money', 'asap', 'penyelidikan', 'ihsan', 'aksinya', 'mapolres', 'kebun', 'lintas', 'terinspirasi', 'berusia', 'mengapresiasi', 'bunuh', 'aldi', 'digantung', 'leher', 'budhi', 'mapolres karawang', 'pelaku', 'mobil pikap', 'karawang komnas', 'karawang anak', 'anak komnas', 'mobil', 'pembunuhan komnas', 'mobil pajero', 'bank', 'tersangka', 'karawang pelaku', 'anak pelaku', 'pajero sport', 'pikap', 'polisi

In [39]:
def wCalculationWhat():
    document_result = []
    document_result_w = []

    for i in range(0, len(document_text_test)-1):
        hasilWhat = []

        teks = df_total.iloc[i, -2]
        tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer0.3.pkl')
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        document_result_w.append(df_total.iloc[i, -2])

        query_vec_what = tfidf_vectorizer.transform(qeGabunganWhat)
        results_what = cosine_similarity(tfidf_matrix, query_vec_what).reshape((-1))

        for key in qeGabunganDelimiterWhat:
            cariW = re.findall(key, document_result_w[i])
            print(cariW)
            if cariW:
                hasilWhat.append(key)

        document_result.append([i, 'what', what_query, qeGabunganWhat, qeStatistikWhat, qeBoWWhat, hasilWhat, results_what, 0, 0, 0, document_result_w[i]])
        # ------------------------------------------------------------------------------------------------------
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'QE Gabungan', 'QE Bow', 'QE Statistik', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative', 'Relevansi Berita', 'Skimming News'])
    writer.to_csv('hasilQE_new/QE_Stat_V2_testing_result_what0.3.csv', index=False, sep=',')

In [23]:
def wCalculationWho():
    document_result = []
    document_result_w = []

    for i in range(0, len(document_text_test)-1):
        hasilWho = []

        teks = df_total.iloc[i, -2]
        tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        document_result_w.append(df_total.iloc[i, -2])

        query_vec_who = tfidf_vectorizer.transform(qeGabunganWho)
        results_who = cosine_similarity(tfidf_matrix, query_vec_who).reshape((-1))

        for key in qeGabunganDelimiterWho:
            cariW = re.findall(key, document_result_w[i])
            if cariW:
                hasilWho.append(key)

        document_result.append([i, 'who', who_query, qeGabunganWho, qeStatistikWho, qeBoWWho, hasilWho, results_who, 0, 0, 0, document_result_w[i]])
        # ------------------------------------------------------------------------------------------------------
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'QE Gabungan', 'QE Bow', 'QE Statistik', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative', 'Relevansi Berita', 'Skimming News'])
    writer.to_csv('hasilQE_new/QE_Stat_V2_testing_result_who.csv', index=False, sep=',')

In [24]:
def wCalculationWhen():
    document_result = []
    document_result_w = []

    for i in range(0, len(document_text_test)-1):
        hasilWhen = []

        teks = df_total.iloc[i, -2]
        date = df_total.iloc[i, -3]
        source = df_total.iloc[i, -1]
        
        # tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
        # tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        # document_result_w.append(df_total.iloc[i, -2])
        # query_vec_when = tfidf_vectorizer.transform(qeGabunganWhen)
        # results_when = cosine_similarity(tfidf_matrix, query_vec_when).reshape((-1))

        # for key in qeGabunganDelimiterWhen:
        #     # key = ""

        #     # if type(a) is list:
        #     #     key = ",".join(a)
        #     # else:
        #     #     key = a

        #     cariW = re.findall(key, document_result_w[i])
        #     if cariW:
        #         hasilWhen.append(key)

        # document_result.append([i, 'when', when_query, qeGabunganWhen, qeStatistikWhen, qeBoWWhen, hasilWhen, results_when, 0, 0, 0, document_result_w[i]])
        extracted_time = getNewsTime(timestamp=date, news=teks, source=source)
        document_result.append([i, 'when', extracted_time, date, teks])
        # ------------------------------------------------------------------------------------------------------
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Extracted Time', 'Original Time', 'Skimming News'])
    writer.to_csv('hasilQE_new/QE_Stat_V2_testing_result_when_time.csv', index=False, sep=',')

    # writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'QE Gabungan', 'QE Bow', 'QE Statistik', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative', 'Relevansi Berita', 'Skimming News'])
    # writer.to_csv('QE_Stat_V2_testing_result_when.csv', index=False, sep=',')

In [25]:
def wCalculationWhere():
    document_result = []
    document_result_w = []

    for i in range(0, len(document_text_test)-1):
        hasilWhere = []

        teks = df_total.iloc[i, -2]
        tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        document_result_w.append(df_total.iloc[i, -2])

        query_vec_where = tfidf_vectorizer.transform(qeGabunganWhere)
        results_where = cosine_similarity(tfidf_matrix, query_vec_where).reshape((-1))

        for key in qeGabunganDelimiterWhere:
            # key = ""

            # if type(a) is list:
            #     key = ",".join(a)
            # else:
            #     key = a
            
            cariW = re.findall(key, document_result_w[i])
            if cariW:
                hasilWhere.append(key)

        document_result.append([i, 'where', where_query, qeGabunganWhere, qeStatistikWhere, qeBoWWhere, hasilWhere, results_where, 0, 0, 0, document_result_w[i]])
        # ------------------------------------------------------------------------------------------------------
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'QE Gabungan', 'QE Bow', 'QE Statistik', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative', 'Relevansi Berita', 'Skimming News'])
    writer.to_csv('hasilQE_new/QE_Stat_V2_testing_result_where.csv', index=False, sep=',')

In [40]:
wCalculationWhat()
# wCalculationWho()
# wCalculationWhen()
# wCalculationWhere()

[]
[]
[]
['pelaku', 'pelaku', 'pelaku', 'pelaku', 'pelaku']
['korban', 'korban']
[]
[]
[]
[]
[]
[]
['begal', 'begal', 'begal', 'begal', 'begal']
[]
[]
[]
[]
[]
[]
['jalan']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['rumah', 'rumah']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['motor']
['sepeda']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['gang', 'gang']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]