In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import corpus2csc

from sklearn.neighbors import NearestNeighbors
from joblib import dump, load
import re
import string

In [2]:
df = pd.read_csv("data.csv")

df.head()

Unnamed: 0,prodi,nim,nama,dokumen
0,manajemen,215350030,Andi Adriani Paesal,1 jurnal economos feb um parepare pengaruh mot...
1,manajemen,215350034,Riskianti,efektivitas profitabilitas guna modal kerja pa...
2,manajemen,215350007,Iis Faradillah,1 jurnal economos feb um parepare draft templa...
3,manajemen,215350015,Ardiyanto,1 jurnal economos feb um parepare pengaruh bia...
4,pembangunan,213210027,Junaedi,evaluasi kerja badan rencana bangun daerah bap...


In [3]:
## fungsi untuk menghapus url didalam teks
def remove_URL(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r"",text)

## fungsi untuk menghapus tanda baca didalam teks
def remove_punct(text):
    translator = str.maketrans("", "", string.punctuation)
    return text.translate(translator)

## fungsi untuk mengubah teks menjadi huruf kecil
def case_folding(text):
    return text.casefold()

## stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

stopwords = StopWordRemoverFactory().create_stop_word_remover()

def remove_stopwords(text):
    return stopwords.remove(text)

## stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

stemmer = StemmerFactory().create_stemmer()

def stemming(text):
    return stemmer.stem(text)

def preprocessing(text):
    text = remove_URL(text)
    text = remove_punct(text)
    text = case_folding(text)
    text = remove_stopwords(text)
    text = stemming(text)

    print(text)
    return text

In [4]:
from tqdm.auto import tqdm as tqdmp
tqdmp.pandas()
import re, unicodedata, string

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def text_cleaner(text):
    """
    Function for cleaning text data from unnecessary characters.
    """
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\r', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

In [6]:
dokumen_cleaned = df.dokumen.dropna().reset_index(drop = True)
dokumen_cleaned = dokumen_cleaned.progress_apply(lambda x: text_cleaner(x).split())
dokumen_cleaned.head()

100%|██████| 71/71 [00:00<00:00, 338.42it/s]


0    [jurnal, economos, feb, um, parepare, pengaruh...
1    [efektivitas, profitabilitas, guna, modal, ker...
2    [jurnal, economos, feb, um, parepare, draft, t...
3    [jurnal, economos, feb, um, parepare, pengaruh...
4    [evaluasi, kerja, badan, rencana, bangun, daer...
Name: dokumen, dtype: object

In [8]:
dictionary = Dictionary(dokumen_cleaned)
num_docs = dictionary.num_docs
num_terms = len(dictionary.keys())

In [9]:
# the bag of words corpus
corpus_bow = [dictionary.doc2bow(doc) for doc in dokumen_cleaned]

In [10]:
tfidf = TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]

In [11]:
corpus_tfidf_sparse = corpus2csc(corpus_tfidf, num_terms, num_docs=num_docs).T

In [12]:
model = NearestNeighbors(n_neighbors = 5, n_jobs=-1)
model.fit(corpus_tfidf_sparse)

NearestNeighbors(n_jobs=-1)

In [20]:
test_dokumen =[ ["jurnal daya kwh meter komunikasi"]]
# test_dokumen.iloc[0] = ['halo'] + test_dokumen.iloc[0]
# test_dokumen.iloc[1] = ['bagus'] + test_dokumen.iloc[1]
# test_dokumen.head()

In [21]:
# test corpus from created dictionary
test_corpus_bow = [dictionary.doc2bow(doc) for doc in test_dokumen]

# test tfidf values from created tfidf model
test_corpus_tfidf = tfidf[test_corpus_bow]

# test sparse matrix
test_corpus_tfidf_sparse = corpus2csc(test_corpus_tfidf, num_terms).T
test_corpus_tfidf_sparse

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [16]:
res = model.kneighbors(test_corpus_tfidf_sparse)
res

(array([[0.        , 1.21541621, 1.28905984, 1.32089544, 1.3442157 ],
        [0.00854718, 1.24147353, 1.2734199 , 1.29504769, 1.31176078]]),
 array([[ 2,  0,  9, 48, 51],
        [ 3,  0, 29, 11,  9]]))