In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import corpus2csc

from sklearn.neighbors import NearestNeighbors
from joblib import dump, load
import re
import string

In [2]:
df = pd.read_csv("data.csv")

df.head()

Unnamed: 0,prodi,nim,nama,dokumen
0,manajemen,215350030,Andi Adriani Paesal,1 jurnal economos feb um parepare pengaruh mot...
1,manajemen,215350034,Riskianti,efektivitas profitabilitas guna modal kerja pa...
2,manajemen,215350007,Iis Faradillah,1 jurnal economos feb um parepare draft templa...
3,manajemen,215350015,Ardiyanto,1 jurnal economos feb um parepare pengaruh bia...
4,pembangunan,213210027,Junaedi,evaluasi kerja badan rencana bangun daerah bap...


In [3]:
## fungsi untuk menghapus url didalam teks
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)

## fungsi untuk menghapus tanda baca didalam teks
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

## fungsi untuk mengubah teks menjadi huruf kecil
def case_folding(text):
    return text.casefold()

## stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopwords = StopWordRemoverFactory().create_stop_word_remover()

def remove_stopwords(text):
    return stopwords.remove(text)

## stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()

def stemming(text):
    return stemmer.stem(text)

def preprocessing(text):
    text = remove_URL(text)
    text = remove_punct(text)
    text = case_folding(text)
    text = remove_stopwords(text)
    text = stemming(text)

    print(text)
    return text

In [32]:
from tqdm.auto import tqdm as tqdmp
tqdmp.pandas()
import re, unicodedata, string

In [5]:
def text_cleaner(text):
    """
    Function for cleaning text data from unnecessary characters.
    """
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [6]:
dokumen_cleaned = df.dokumen.dropna().reset_index(drop = True)
dokumen_cleaned = dokumen_cleaned.progress_apply(lambda x: text_cleaner(x).split())
dokumen_cleaned.head()

100%|██████| 71/71 [00:00<00:00, 353.44it/s]


0    [jurnal, economos, feb, um, parepare, pengaruh...
1    [efektivitas, profitabilitas, guna, modal, ker...
2    [jurnal, economos, feb, um, parepare, draft, t...
3    [jurnal, economos, feb, um, parepare, pengaruh...
4    [evaluasi, kerja, badan, rencana, bangun, daer...
Name: dokumen, dtype: object

In [7]:
dictionary = Dictionary(dokumen_cleaned)
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())

In [8]:
# the bag of words corpus
corpus_bow = [dictionary.doc2bow(doc) for doc in dokumen_cleaned]

In [9]:
tfidf = TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [10]:
corpus_tfidf_sparse = corpus2csc(corpus_tfidf, num_terms, num_docs=num_docs).T

In [42]:
model = NearestNeighbors(n_neighbors = 5, n_jobs=-1)
model.fit(corpus_tfidf_sparse)

NearestNeighbors(n_jobs=-1)

In [43]:
teks = "KWH (Kilowatt-Hour) meter adalah kumparan tengangan"

test_dokumen = pd.DataFrame({"dokumen": [teks]})
test_dokumen = test_dokumen.dokumen.dropna().reset_index(drop = True)
test_dokumen = test_dokumen.apply(lambda x: text_cleaner(x).split())

# test corpus from created dictionary
test_corpus_bow = [dictionary.doc2bow(doc) for doc in test_dokumen]

# test tfidf values from created tfidf model
test_corpus_tfidf = tfidf[test_corpus_bow]

# test sparse matrix
test_corpus_tfidf_sparse = corpus2csc(test_corpus_tfidf, num_terms).T

distances, indices = model.kneighbors(test_corpus_tfidf_sparse)

df_hasil = df.loc[indices[0]]    
df_hasil['jarak'] = distances[0]
    
df_hasil   

Unnamed: 0,prodi,nim,nama,dokumen,jarak
16,informatika,210280018,Muhalis,putus daya kwh meter guna komunikasi seluler m...,1.095743
22,sipil,207190004,Abd Samad Syamsuddin,,1.405725
56,akk,213240058,Nurul Ilma AKK,perilaku warga masyarakat kelola sampah rumah ...,1.41179
61,epid,213240024,Hasrun EPID,hubung tingkat tahu perilaku jadi karies gigi ...,1.412159
25,elektro,209180044,Aswan,timbang berat harga digital basis mikrokontrol...,1.412412
