# Input path directory dataset and query

In [1]:
# dataset path can be adjusted base on requirement
dataset_path = 'D:\\Kuliah\\Dataset_UU_15'

# query path can be adjusted base on requirement
query_path = 'D:\\Kuliah\\query_RUU'

# Indexing (*fetching*) title processed dataset document `.txt`

In [2]:
#fileName list for indexing

import os, glob, re, ntpath #npath for removing parent directory location

queryName =[] # list for fetching all of queryName

for input_file in glob.glob(os.path.join(query_path, '*.txt')):
    input_file = re.sub(r'.txt', '', input_file) #remove .txt string from file for cleaner look
    input_file = ntpath.basename(input_file) # fetch only fileName instead all file directory path
    queryName.append(input_file) # append all input_file into fileName list

print('Query Name:')
print(queryName)

fileName =[] # list for fetching all of fileName

for input_file in glob.glob(os.path.join(dataset_path, '*.txt')):
    input_file = re.sub(r'.txt', '', input_file) #remove .txt string from file for cleaner look
    input_file = ntpath.basename(input_file) # fetch only fileName instead all file directory path
    fileName.append(input_file) # append all input_file into fileName list
print('Dataset Name:')
print(fileName)


Query Name:
['Draf_RUU Pelayanan Publik_15 Juni__processed']
Dataset Name:
['2008-14_processed', '2008-39_processed', '2009-25_processed', '2009-36_processed', '2009-38_processed', '2009-39_processed', '2011-4_processed', '2014-23_processed', '2014-30_processed', '2017-1_processed', '2019-2_processed', '2019-3_processed', '2020-11_processed', '2020-1_processed', '2020-4_processed']


# Create corpus dataset content from all of dataset files
all of the content stored into `raw_values` variable

In [3]:
#make a corpus
import re

raw_values = [] # list for fetching all of file document content
for input_file in glob.glob(os.path.join(dataset_path, '*.txt')):
    with open(input_file, 'r') as txt_doc:
        # append all content to a single list 'raw_values'
        raw_values.append(re.sub(r'[^A-Za-z]+', ' ',txt_doc.read())) # re.sub for filtering document content removing all char except alphabet



# Create DataFrame from list  `fileName, raw_values`

In [4]:
# create dataframe from 2 list
import pandas as pd

# create dataframe from fileName and raw_values as columns and name it 'title' and 'content'
df_raw = pd.DataFrame(list(zip(fileName,raw_values)),columns=['title','content'])

df_raw

df_raw.to_csv('indexing_doc.csv', encoding='utf-8')

# TF IDF matrix

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as py

# Vectorizer to convert a collection of raw documents to a matrix of TF-IDF features
vectorizer = TfidfVectorizer()

# Learn vocabulary and idf, return term-document matrix
# The astype(‘U’) is telling numpy to convert the data to Unicode (essentially a string in python 3)
tfidf = vectorizer.fit_transform(df_raw['content'].values.astype('U'))

# Array mapping from feature integer indices to feature name
words = vectorizer.get_feature_names()

# indexing for each term a tfidf score in dataframe from all of dataset (query not yet included)
# tfidf[0] is indexing the vector from tfidf
# T.todense() is for transpose the array vector and make it into matrix
i = 0
for i in range(tfidf.shape[0]): #the [0] index in tfidf is the document index instead of content
    df_tfidf= pd.DataFrame(tfidf[i].T.todense(), index= words, columns= ["tfidf"])

# sorting by the highest TFIDF score in dataframe for knowing the most valuable term in dataset 
df_tfidf_sorted = df_tfidf.sort_values(by=["tfidf"], ascending=False)

# convert and export dataframe df_tfidf_sorted into .csv file
df_tfidf_sorted.to_csv('TFIDFraw.csv', encoding='utf-8')

df_tfidf_sorted


Unnamed: 0,tfidf
indonesia,0.470700
republik,0.322766
ukraina,0.289899
ministers,0.248484
cabinet,0.248484
...,...
integratofl,0.000000
integrasi,0.000000
integral,0.000000
instrumentasi,0.000000


# Cosine similarity query with dataset

In [6]:
# Instead of using fit_transform, you need to first fit 
# the new document to the TFIDF matrix corpus like this:
queryTFIDF = TfidfVectorizer().fit(words)

# create query_value from processed query doc txt (store query content into query_value variable)
query_value=[]

for query_file in glob.glob(os.path.join(query_path, '*.txt')):
    with open(query_file, 'r') as txt_doc:
        # append all content to a single list 'raw_values'
        query_value.append(re.sub(r'[^A-Za-z]+', ' ',txt_doc.read())) # re.sub for filtering document content removing all char except alphabet

# Now we can 'transform' this vector into that matrix shape by using the transform function:
queryTFIDF = queryTFIDF.transform(query_value)

# As we transformed our query in a tfidf object
# we can calculate the cosine similarity in comparison with 
# our pevious corpora
cosine_similarities = cosine_similarity(queryTFIDF, tfidf).flatten() *100 # 100 will make it a percentage

# create final dataframe from combination of 2 list (fileName and cosine_similarities) with columns (Document and Similarity)
df_final = pd.DataFrame(list(zip(fileName,cosine_similarities)),columns=['Document','Similarity'])

# define new dataframe variable = final_similarities, it use to store sorting descending result from similarity column
final_similarities = df_final.sort_values(by=['Similarity'], ascending=False) #sort similarity value in descending order
final_similarities['Similarity'] = [ '%.2f' % elem for elem in final_similarities['Similarity'] ] #limiting floating number into 2 decimal only for more eligible in Similarity column
final_similarities['Similarity']=final_similarities['Similarity'] + '%' # adding '%' symbol into the end of similarity number

# convert and export dataframe final_similarities into .csv file
final_similarities.to_csv('TFIDFcosineSimilarityResult.csv', encoding='utf-8')

# Function to convert list into string 
def listToString(s): 
    # initialize an empty string
    str1 = " "  
    # return string  
    return (str1.join(s))

print('Similarity result with query "'+listToString(queryName)+'" :')

final_similarities



Similarity result with query "Draf_RUU Pelayanan Publik_15 Juni__processed" :


Unnamed: 0,Document,Similarity
2,2009-25_processed,91.90%
4,2009-38_processed,70.53%
6,2011-4_processed,53.75%
0,2008-14_processed,53.41%
12,2020-11_processed,49.98%
5,2009-39_processed,48.67%
8,2014-30_processed,48.38%
3,2009-36_processed,46.94%
1,2008-39_processed,43.05%
7,2014-23_processed,39.55%
