# Dependencies: libraries needed for preprocessing

In [1]:
import os, glob, tika, re, string, io
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from tika import parser

# Create the Sastrawi helpers: *stopword removal and stemming*

In [2]:
# Sastrawi stop-word factory; every remover below is derived from it
stopwords_factory = StopWordRemoverFactory()

# Additional stop words, appended to Sastrawi's default list
more_stopword = ['undang', 'pasal', 'ayat', 'microsoft', 'word', 'www', 'bphn', 'go', 'id', 'bphngoid', 'wwwbphngoid']
data = stopwords_factory.get_stop_words() + more_stopword  # default list + custom additions
dictionary = ArrayDictionary(data)
additional_stopwords = StopWordRemover(dictionary)  # remover backed by the merged list

# Remover backed by Sastrawi's default stop-word list only (created once)
stopwords = stopwords_factory.create_stop_word_remover()

# Create the stemmer from StemmerFactory
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

# Create the function `text_preprocessing` for cleaning a document

In [3]:
# define text_preprocessing function
def text_preprocessing (text):
    """Clean raw document text into a space-separated string of stemmed words.

    Steps, in order: lower-case, replace URLs with a space, delete ASCII
    punctuation, remove the custom stop words, stem with Sastrawi, remove
    Sastrawi's default stop words, keep letters only, and drop words of
    three characters or fewer.

    Args:
        text: Raw text to clean, or None when there is no text at all
            (e.g. a document that consists only of images).

    Returns:
        The cleaned text without leading or trailing whitespace, or the
        integer 0 when `text` is None.
    """
    # Nothing to clean: keep the existing sentinel value for callers.
    if text is None:
        return 0

    text = text.lower()  # case folding
    # Replace URLs (http/https, www-prefixed, or bare "domain.tld/...") with a space
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", text)
    # Punctuation is deleted, not replaced by a space, so words joined by a
    # hyphen or slash are fused into a single token.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()  # remove leading and trailing whitespace
    text = additional_stopwords.remove(text)  # remove default + custom stop words
    text = stemmer.stem(text)  # stemming with Sastrawi
    text = stopwords.remove(text)  # remove default stop words again after stemming
    text = re.sub(r'[^A-Za-z]+', ' ', text)  # collapse every non-letter run into one space
    text = re.sub(r'\b\w{1,3}\b', '', text)  # drop words of 1 to 3 characters
    text = re.sub(r'[^A-Za-z]+', ' ', text)  # collapse the gaps left by the dropped words
    # The substitutions above can leave a space at either end; trim it here
    # because the earlier strip() ran before they were applied.
    return text.strip()

# Create a function for exporting the preprocessed text to a `.txt` file

### Plain `.txt` files make the later document-similarity computation more convenient

In [4]:
# define filename processed file .txt
def process_filename (raw_filename, content):
    """Write `repr(content)` to '<raw_filename without .pdf>_processed.txt'.

    Example: process_filename('title.pdf', text) creates 'title_processed.txt'.

    Args:
        raw_filename: Path of the source PDF. Only a trailing '.pdf'
            extension (any letter case) is stripped; the rest of the path
            is left untouched.
        content: Object to export. Its `repr()` is what gets written, so a
            string is stored together with its surrounding quotes.
    """
    # Match a literal, trailing ".pdf" only. An unescaped, unanchored ".pdf"
    # would also delete any character followed by "pdf" elsewhere in the path.
    rm_pdfname = re.sub(r'\.pdf$', '', raw_filename, flags=re.IGNORECASE)
    # The context manager closes the file even if the write raises.
    with open(rm_pdfname + '_processed.txt', 'w') as processed_file:
        processed_file.write(repr(content))

# Clean the documents and export them to `.txt` files

In [5]:
# Loop once per document to extract and preprocess its text
def main_preprocessing_data (input_path):
    """Preprocess every '*.pdf' file found directly inside `input_path`.

    For each PDF: parse it with tika, clean the extracted content with
    `text_preprocessing`, export the result through `process_filename`,
    and print the cleaned text. A separator line is printed once all
    files have been handled.

    Args:
        input_path: Directory that is searched for '*.pdf' files.
    """
    pdf_pattern = os.path.join(input_path, '*.pdf')
    for pdf_path in glob.glob(pdf_pattern):
        print(pdf_path)  # show which file is being handled
        print('***********\nprocessing... please wait\n---------')
        parsed = parser.from_file(pdf_path)  # parse the PDF with the tika parser
        cleaned = text_preprocessing(parsed['content'])  # clean the extracted content
        process_filename(pdf_path, cleaned)  # export the cleaned text to a .txt file
        print(cleaned)
    print('\n============================ \n')
    

# Directory paths containing the input `.pdf` files: *dataset* and *query*

In [6]:
# Input directories (absolute Windows paths; adjust for the local machine)
dataset_path = r'D:\Kuliah\dataset'
query_path = r'D:\Kuliah\query'

# Query preprocessing

In [7]:
# Preprocess every PDF found in `query_path`
main_preprocessing_data(query_path)

D:\Kuliah\query\PERATURAN REKTOR UNIVERSITAS LAMPUNG edit.pdf
***********
processing... please wait
---------


2021-11-23 17:07:11,286 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


atur rektor universitas lampung nomor tahun pedoman barangjasa badan layan umum universitas lampung rahmat tuhan maha rektor universitas lampung timbang rangka penuh atur presiden nomor tahun barangjasa perintah juncto atur presiden tahun ubah atas atur presiden nomor tahun barangjasa perintah universitas lampung badan layan umum perlu susun atur barangjasa dasar timbang bagaimana maksud huruf perlu tetap atur rektor pedoman barangjasa badan layan umum universitas lampung ingat undangundang nomor tahun uang negara lembar negara tahun nomor tambah lembar negara nomor undangundang nomor tahun bendahara negara lembar negara tahun nomor tambah lembar negara nomor undangundang nomor tahun didik tinggi lembar negara republik indonesia tahun nomor tambah lembar negara republik indonesia nomor atur perintah nomor tahun kelola uang badan layan umum bagaimana ubah atur perintah nomor tahun ubah atas atur perintah nomor tahun kelola uang badan layan umum lembar negara tahun nomor tambah lembar ne

# Dataset preprocessing

In [8]:
# Preprocess every PDF found in `dataset_path`
main_preprocessing_data(dataset_path)

D:\Kuliah\dataset\PERTOR PBJ UNNES REV 16_11.pdf
***********
processing... please wait
---------
 atur rektor universitas negeri semarang nomor tahun pedoman barangjasa badan layan umum universitas negeri semarang rahmat tuhan maha rektor universitas negeri semarang timbang rangka penuh atur presiden nomor tahun barangjasa perintah universitas negeri semarang badan layan umum perlu susun atur barangjasa dasar timbang bagaimana maksud huruf perlu tetap atur rektor pedoman barangjasa badan layan umum universitas negeri semarang ingat undangundang nomor tahun uang negara lembar negara tahun nomor tambah lembar negara nomor undangundang nomor tahun bendahara negara lembar negara tahun nomor tambah lembar negara nomor undangundang nomor tahun didik tinggi lembar negara republik indonesia tahun nomor tambah lembar negara republik indonesia nomor atur perintah nomor tahun kelola uang badan layan umum bagaimana ubah atur perintah nomor tahun ubah atas atur perintah nomor tahun kelola uang bada