# Lab 05 - TF-IDF

In [None]:
#Release: 1.2011.2701

You will learn how to:
1. Calculate TF-IDF using TfidfVectorizer
2. View data in pandas DataFrame

<br>
 
***If you use Google Colab, install sastrawi package***

In [None]:
!pip install sastrawi

Collecting sastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4b/bab676953da3103003730b8fcdfadbdd20f333d4add10af949dd5c51e6ed/Sastrawi-1.0.1-py2.py3-none-any.whl (209kB)
[K     |█▋                              | 10kB 15.3MB/s eta 0:00:01[K     |███▏                            | 20kB 12.3MB/s eta 0:00:01[K     |████▊                           | 30kB 8.5MB/s eta 0:00:01[K     |██████▎                         | 40kB 7.5MB/s eta 0:00:01[K     |███████▉                        | 51kB 4.4MB/s eta 0:00:01[K     |█████████▍                      | 61kB 4.8MB/s eta 0:00:01[K     |███████████                     | 71kB 5.0MB/s eta 0:00:01[K     |████████████▌                   | 81kB 5.3MB/s eta 0:00:01[K     |██████████████                  | 92kB 5.7MB/s eta 0:00:01[K     |███████████████▋                | 102kB 5.5MB/s eta 0:00:01[K     |█████████████████▏              | 112kB 5.5MB/s eta 0:00:01[K     |██████████████████▊             | 122kB 5.5MB/s

<br>

#### Import required library

In [None]:
import re

import nltk
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

<br>
 
***If you use Google Colab, download the NLTK stopwords and punkt packages***

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the sentence/word tokenizer from
# 'punkt_tab' instead of 'punkt'; fetch both so the lab runs on either.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Indonesian stopword list from the NLTK 'stopwords' corpus.
stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
# Number of stopwords in the list.
len(stopwords)

758

In [None]:
# Preview only the first entries instead of dumping the whole list.
stopwords[:10]

<br>

#### Preprocessing functions from previous labs

In [None]:
def tokenize_clean(text):
    """Split ``text`` into lowercase tokens and drop tokens without letters.

    The text is split into sentences and then into words with NLTK, and every
    token is lowercased. Tokens that contain no ASCII letter (numbers,
    punctuation marks, ...) are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # Keep only the tokens that contain at least one letter.
    return [tok for tok in lowered if re.search('[a-zA-Z]', tok)]

In [None]:
def remove_stopwords(tokenized_text, stop_words=None):
    """Return the tokens of ``tokenized_text`` that are not stopwords.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to filter; their order is preserved.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stopwords`` list.

    Returns
    -------
    list of str
        The tokens that are not in ``stop_words``.
    """
    if stop_words is None:
        stop_words = stopwords

    # A set gives constant-time membership tests; the previous code scanned
    # the whole stopword list once for every token.
    stop_set = set(stop_words)
    return [token for token in tokenized_text if token not in stop_set]

In [None]:
_DEFAULT_STEMMER = None


def _get_default_stemmer():
    """Create the Sastrawi stemmer on first use and reuse it afterwards."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


def stemming_text(tokenized_text, stemmer=None):
    """Stem every token of ``tokenized_text``.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to stem; their order is preserved.
    stemmer : object with a ``stem(token)`` method, optional
        Stemmer to use. Defaults to a Sastrawi stemmer that is built once and
        shared between calls, so the factory is not re-created for every
        document when this function runs inside a vectorizer's tokenizer.

    Returns
    -------
    list of str
        The stemmed tokens.
    """
    if stemmer is None:
        stemmer = _get_default_stemmer()

    return [stemmer.stem(token) for token in tokenized_text]

In [None]:
def text_preprocessing(text):
    """Run the full preprocessing pipeline on a raw string.

    The text is tokenized and cleaned, stopwords are removed, and the
    remaining tokens are stemmed. Returns the list of stemmed tokens.
    """
    tokens = tokenize_clean(text)
    tokens = remove_stopwords(tokens)
    return stemming_text(tokens)

<br>

## Small Dataset

<br>

### Step 01 - Create dataset

In [None]:
# Toy corpus: four tiny documents with overlapping words, so the TF and IDF
# effects are easy to follow by hand.
corpus = [
     'kucing kucing kucing hitam putih belang',
     'tikus belang',
     'tikus hitam',
     'tikus tikus tikus'
]

In [None]:
# Number of documents in the corpus.
len(corpus)

4

In [None]:
# Third document (indices start at 0).
corpus[2]

'tikus hitam'

### Step 02 - Compute TF-IDF

In [None]:
# Learn the vocabulary and IDF weights from the corpus, then turn every
# document into one TF-IDF row of the sparse matrix X.
vectorizer = TfidfVectorizer(use_idf=True)
X = vectorizer.fit_transform(corpus)

### Step 03 - View Result

Get List of Words

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
print(vectorizer.get_feature_names_out().tolist())

['belang', 'hitam', 'kucing', 'putih', 'tikus']


In [None]:
# Mapping from each term to its column index in X.
print(vectorizer.vocabulary_)

{'kucing': 2, 'hitam': 1, 'putih': 3, 'belang': 0, 'tikus': 4}


View TF-IDF Result

In [None]:
# (number of documents, vocabulary size)
print(X.shape)

(4, 5)


In [None]:
# Dense view of the TF-IDF matrix: one row per document, one column per term.
print(X.toarray())

[[0.23513012 0.23513012 0.89469821 0.29823274 0.        ]
 [0.77722116 0.         0.         0.         0.62922751]
 [0.         0.77722116 0.         0.         0.62922751]
 [0.         0.         0.         0.         1.        ]]


View First Sentence

In [None]:
# Raw text of the first document.
corpus[0]

'kucing kucing kucing hitam putih belang'

In [None]:
# Non-zero entries of the first row, printed as (row, column) weight.
print(X[0])

  (0, 0)	0.2351301157996824
  (0, 3)	0.2982327375202219
  (0, 1)	0.2351301157996824
  (0, 2)	0.8946982125606657


In [None]:
# The same row as a dense array, zeros included.
print(X[0].toarray())

[[0.23513012 0.23513012 0.89469821 0.29823274 0.        ]]


View Second Sentence

In [None]:
# Non-zero TF-IDF entries of the second document ('tikus belang').
print(X[1])

  (0, 4)	0.6292275146695526
  (0, 0)	0.7772211620785797


In [None]:
# TF-IDF weights of the second document as a one-column table, one row per
# vocabulary term. toarray() yields a plain ndarray (todense() returns the
# legacy np.matrix type); get_feature_names() was removed in scikit-learn 1.2.
df = pd.DataFrame(X[1].T.toarray(), index=vectorizer.get_feature_names_out(), columns=["TF-IDF"])
df.sort_values(by=['TF-IDF'])

Unnamed: 0,TF-IDF
hitam,0.0
kucing,0.0
putih,0.0
tikus,0.629228
belang,0.777221


<br>

View IDF

In [None]:
# IDF weight of every vocabulary term, indexed by the term itself.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names_out(), columns=["idf"])

# sort ascending
df_idf.sort_values(by=['idf'])

Unnamed: 0,idf
tikus,1.223144
belang,1.510826
hitam,1.510826
kucing,1.916291
putih,1.916291


<br>

### Step 04 - Compute TF-IDF for a new sentence

In [None]:
# Vectorize an unseen sentence with the already-fitted vocabulary and IDF weights.
str1 = 'kambing hitam'
response = vectorizer.transform([str1])

# get_feature_names() was removed in scikit-learn 1.2; tolist() keeps a plain list.
feature_names = vectorizer.get_feature_names_out().tolist()

In [None]:
# Vocabulary terms of the small-corpus vectorizer, in column order.
feature_names

['belang', 'hitam', 'kucing', 'putih', 'tikus']

In [None]:
# 'kambing' is not in the fitted vocabulary, so only 'hitam' gets a weight.
response.toarray()

array([[0., 1., 0., 0., 0.]])

<br>

## Bigger Dataset

### Step 01 - Create dataset

In [40]:
# Five short Indonesian sentences; each one is treated as a separate document.
files = [
    "Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.",
    "Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.",
    "Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.",
    "Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.",
    "Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam",
]

### Step 02 - Corpus preparation

In [41]:
# Give every document a synthetic key: "file0", "file1", ...
token_dict = {"file" + str(idx): text for idx, text in enumerate(files)}

token_dict

{'file0': 'Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.',
 'file1': 'Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.',
 'file2': 'Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.',
 'file3': 'Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.',
 'file4': 'Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam'}

In [42]:
# The raw documents, in insertion order.
token_dict.values()

dict_values(['Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.', 'Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.', 'Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.', 'Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.', 'Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam'])

In [43]:
# Look up a single document by its generated key.
token_dict['file0']

'Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.'

### Step 03 - Compute TF-IDF

In [44]:
# Build the TF-IDF vectorizer on top of the custom preprocessing pipeline.
tfidf = TfidfVectorizer(max_df=0.8,             # drop terms that appear in more than 80% of the documents
                        min_df=0.2,             # drop terms that appear in fewer than 20% of the documents
                        max_features=200000,    # keep at most 200,000 terms, ranked by term frequency across the corpus
                        stop_words = stopwords, # stopwords list
                        use_idf=True,           # enable inverse-document-frequency reweighting
                        tokenizer=text_preprocessing, # replace the default tokenization step with text_preprocessing
                        token_pattern=None,     # unused when a tokenizer is given; None avoids scikit-learn's warning
                        ngram_range=(1,3))      # extract unigrams, bigrams and trigrams


# Learn the vocabulary/IDF weights and vectorize the documents in one pass.
tfs = tfidf.fit_transform(token_dict.values())

For detail TfidfVectorizer documentation visit: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

### Step 04 - View Result

Let's check the shape. We should have 5 rows (5 docs) and 96 columns (96 unique terms: unigrams, bigrams and trigrams):

In [45]:
# (number of documents, number of extracted terms)
tfs.shape

(5, 96)

<br>

Inspect the first document vector

In [46]:
# Non-zero TF-IDF entries of the first document, as (row, column) weight.
print(tfs[0])

  (0, 11)	0.17500574860015006
  (0, 8)	0.17500574860015006
  (0, 24)	0.17500574860015006
  (0, 48)	0.17500574860015006
  (0, 86)	0.17500574860015006
  (0, 5)	0.17500574860015006
  (0, 17)	0.17500574860015006
  (0, 65)	0.17500574860015006
  (0, 36)	0.17500574860015006
  (0, 39)	0.17500574860015006
  (0, 29)	0.17500574860015006
  (0, 10)	0.17500574860015006
  (0, 7)	0.17500574860015006
  (0, 23)	0.17500574860015006
  (0, 47)	0.17500574860015006
  (0, 85)	0.17500574860015006
  (0, 4)	0.17500574860015006
  (0, 16)	0.17500574860015006
  (0, 64)	0.17500574860015006
  (0, 35)	0.17500574860015006
  (0, 38)	0.17500574860015006
  (0, 92)	0.17500574860015006
  (0, 28)	0.17500574860015006
  (0, 9)	0.17500574860015006
  (0, 6)	0.17500574860015006
  (0, 22)	0.17500574860015006
  (0, 46)	0.17500574860015006
  (0, 84)	0.17500574860015006
  (0, 3)	0.17500574860015006
  (0, 15)	0.17500574860015006
  (0, 63)	0.1411935360448027
  (0, 34)	0.17500574860015006
  (0, 37)	0.17500574860015006


View the list of features

In [47]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# convert to a plain list so later indexing and printing behave as before.
feature_names = tfidf.get_feature_names_out().tolist()

In [48]:
# Vocabulary size; matches the second dimension of tfs.shape.
print(len(feature_names))

96


In [49]:
# The terms include unigrams, bigrams and trigrams because ngram_range=(1,3).
print(feature_names)

['ajar', 'ajar tanggung', 'ajar tanggung orang', 'alas', 'alas rumput', 'alas rumput lapang', 'anyam', 'anyam bambu', 'anyam bambu genggam', 'bambu', 'bambu genggam', 'bambu genggam tang', 'bantu', 'bantu siaga', 'bantu siaga jam', 'duduk', 'duduk alas', 'duduk alas rumput', 'dunia', 'dunia rawan', 'dunia rawan ajar', 'erupsi', 'fokus', 'fokus anyam', 'fokus anyam bambu', 'gelut', 'gelut dunia', 'gelut dunia rawan', 'genggam', 'genggam tang', 'jam', 'juta', 'juta pesona', 'juta pesona sedot', 'kaum', 'kaum perempuan', 'kaum perempuan duduk', 'kelompok', 'kelompok kaum', 'kelompok kaum perempuan', 'komitmen', 'komitmen bantu', 'komitmen bantu siaga', 'koordinasi', 'koordinasi lapor', 'koordinasi lapor posko', 'lapang', 'lapang fokus', 'lapang fokus anyam', 'lapor', 'lapor posko', 'lapor posko rawan', 'lombok', 'lombok milik', 'lombok milik juta', 'masyarakat', 'masyarakat ratap', 'masyarakat ratap erupsi', 'milik', 'milik juta', 'milik juta pesona', 'orang', 'orang wisata', 'perempuan',

In [50]:
# IDF weight of every extracted term, indexed by the term itself
df_idf = pd.DataFrame(tfidf.idf_, index=feature_names,columns=["idf"])
 
# sort ascending (the lowest IDF belongs to terms shared by the most documents)
df_idf.sort_values(by=['idf'])

Unnamed: 0,idf
rawan,1.693147
perempuan,1.693147
orang,1.693147
ajar,2.098612
perhati,2.098612
...,...
gelut dunia rawan,2.098612
gelut dunia,2.098612
gelut,2.098612
tanggung orang,2.098612


### Step 05 - New sentence TF-IDF transformation

In [51]:
# Vectorize an unseen sentence with the fitted vocabulary and IDF weights.
str1 = 'Di kejauhan tampak seorang relawan pria dari Lombok sedang berjalan.'
response = tfidf.transform([str1])

# List every vocabulary term found in the sentence next to its TF-IDF weight.
_, matched_cols = response.nonzero()
for col in matched_cols:
    print(feature_names[col], ' - ', response[0, col])

rawan  -  0.6279137616509933
lombok  -  0.7782829228046183


In [53]:
# Sparse view of the same result: (row, column) weight.
print(response[0])

  (0, 79)	0.6279137616509933
  (0, 52)	0.7782829228046183


In [54]:
# Preprocessed tokens of the sentence. Only 'rawan' and 'lombok' received a
# weight above; the other stems are not in the fitted vocabulary.
print (text_preprocessing(str1))

['jauh', 'rawan', 'pria', 'lombok', 'jalan']


<br>
<br>


#### Revision History:
Release: 1.1907.1601
- Initial release

Release: 1.1909.0901
- Install sastrawi package to support Google Colab
- Reorganize code

Release: 1.2011.2701
- Create a small sample to make it easier to understand