# Mengenal Text Processing: Bag of Words and Stop Word Filtering

In [1]:
# bag of words sebagai representasi text
# menyederhanakan representasi text sebagai sekumpulan kata serta mengabaikan grammar dan posisi tiap kata
# pada kalimat. Text akan dikonversi menjadi lowercase dan tanda baca akan diabaikan

In [30]:
# Toy corpus: three short English sentences, one document per list entry.
# NOTE(review): 'id' in the third sentence looks like a typo for 'is'; it is
# kept verbatim because the recorded outputs in this notebook contain the
# token 'id'.
corpus = [
    'Linux has been around since the mid-1990s.',
    'Linux distributions include the Linux kernel',
    'Linux id one of the most prominent open-source software',
]

# Echo the list so the notebook displays it.
corpus

['Linux has been around since the mid-1990s.',
 'Linux distributions include the Linux kernel',
 'Linux id one of the most prominent open-source software']

In [34]:
# Bag of words with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

# Default settings: no stop-word list is passed here.
vectorizer = CountVectorizer()
# fit_transform builds the vocabulary from corpus and returns the per-document
# term counts; toarray() converts that result to a dense array for display.
vectorized_x = vectorizer.fit_transform(corpus).toarray()
vectorized_x

array([[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]],
      dtype=int64)

In [35]:
# Show the vocabulary learned by the vectorizer; entry k names column k of
# the count matrix produced by the same vectorizer.
vectorizer.get_feature_names_out()

array(['1990s', 'around', 'been', 'distributions', 'has', 'id', 'include',
       'kernel', 'linux', 'mid', 'most', 'of', 'one', 'open', 'prominent',
       'since', 'software', 'source', 'the'], dtype=object)

In [40]:
# Measure the closeness (distance) between documents.
# Every row of vectorized_x is one document's term-count vector; the loop
# prints the Euclidean distance for each unordered pair of documents.

from sklearn.metrics.pairwise import euclidean_distances

for i in range(len(vectorized_x)):
    # Starting at i + 1 visits each pair once and never compares a document
    # with itself.
    for j in range(i + 1, len(vectorized_x)):
        # Slicing one row keeps it 2-D, the shape euclidean_distances expects;
        # the result is a 1x1 matrix holding the distance.
        jarak = euclidean_distances(vectorized_x[i:i + 1], vectorized_x[j:j + 1])
        print(f'jarak dokumen {i+1} dan {j+1}: {jarak[0][0]}')

jarak dokumen 1 dan 2: 3.1622776601683795
jarak dokumen 1 dan 3: 3.7416573867739413
jarak dokumen 2 dan 3: 3.4641016151377544


In [None]:
# stop word filtering pada text
# menyederhanakan representasi text dengan mengabaikan beberapa kata seperti
# determiners (the, a, an)
# auxiliary verbs (do, be, will)
# prepositions (on, in, at)

In [41]:
# Display the corpus again for reference.
corpus

['Linux has been around since the mid-1990s.',
 'Linux distributions include the Linux kernel',
 'Linux id one of the most prominent open-source software']

In [42]:
# Bag of words with stop-word filtering.
# stop_words='english' tells CountVectorizer to drop the words in
# scikit-learn's built-in English stop-word list before counting.
# NOTE: this rebinds `vectorizer` and `vectorized_x`, which an earlier cell
# also assigns, so cells that read them depend on which cell ran last.
vectorizer = CountVectorizer(stop_words='english')
vectorized_x = vectorizer.fit_transform(corpus).toarray()
vectorized_x

array([[1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1]], dtype=int64)

In [45]:
# Show the vocabulary of the current vectorizer; entry k names column k of
# the count matrix produced by the same vectorizer.
vectorizer.get_feature_names_out()

array(['1990s', 'distributions', 'id', 'include', 'kernel', 'linux',
       'mid', 'open', 'prominent', 'software', 'source'], dtype=object)