# Lab 05 - TF-IDF

In [None]:
#Release: 1.2107.2401

You will learn how to:
1. Calculate TF-IDF using TfidfVectorizer
2. View data in pandas DataFrame

<br>
 
***If you use Google Colab, install the Sastrawi package***

In [None]:
!pip install sastrawi

<br>

#### Import required libraries

In [None]:
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

<br>
 
***If you use Google Colab, download the stopwords and punkt packages***

In [None]:
# Download the NLTK resources used below: the stopword lists and the Punkt
# sentence/word tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# fetching both keeps the tokenization cells working across NLTK versions.
nltk.download('punkt_tab')

In [None]:
# Indonesian stopword list taken from the NLTK 'stopwords' corpus.
stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
# Number of entries in the stopword list.
len(stopwords)

In [None]:
# Display the full stopword list.
stopwords

<br>

#### Preprocessing functions from previous labs

In [None]:
def tokenize_clean(text):
    """Split ``text`` into lowercase tokens and keep only those with a letter.

    The text is split into sentences and each sentence into words with NLTK;
    every token is lowercased. Tokens that contain no ASCII letter (pure
    numbers, punctuation, ...) are discarded.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]')

    # Sentence split first, then word split, lowercasing as we go.
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # Drop tokens made up only of digits / punctuation.
    return [tok for tok in lowered if has_letter.search(tok)]

In [None]:
def remove_stopwords(tokenized_text, stop_words=None):
    """Return the tokens of ``tokenized_text`` that are not stopwords.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to filter.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` list
        is used, which matches the previous behaviour.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates kept).
    """
    if stop_words is None:
        stop_words = stopwords

    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once per token.
    blocked = set(stop_words)

    return [token for token in tokenized_text if token not in blocked]

In [None]:
def stemming_text(tokenized_text):
    """Stem every token with the Sastrawi stemmer.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stemmed token per input token, in the same order.
    """
    # This function is used as the vectorizer's tokenizer, so it runs once
    # per document. Build the stemmer on first use and keep it on the
    # function object instead of repeating the factory setup on every call.
    stemmer = getattr(stemming_text, "_stemmer", None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemming_text._stemmer = stemmer

    return [stemmer.stem(token) for token in tokenized_text]

In [None]:
def text_preprocessing(text):
    """Run the full pipeline on ``text``: tokenize/clean, drop stopwords, stem.

    Returns the list of stemmed tokens.
    """
    return stemming_text(remove_stopwords(tokenize_clean(text)))

<br>

## Small Dataset

<br>

### Step 01 - Create dataset

In [None]:
# Toy corpus: four short documents built from the words
# kucing, hitam, putih, belang and tikus.
dataset = [
     'kucing kucing kucing hitam putih belang',
     'tikus belang',
     'tikus hitam',
     'tikus tikus tikus'
]

In [None]:
# Number of documents in the toy corpus.
len(dataset)

In [None]:
# Third document (indexing is zero-based).
dataset[2]

### Step 02 - Compute TF-IDF

In [None]:
# Perform TF-IDF vectorization: fit the vectorizer on the corpus and
# transform the same documents into a document-term matrix in one step.
vectorizer = TfidfVectorizer(use_idf=True)
result_tfidf = vectorizer.fit_transform(dataset)

### Step 03 - View Result

Get List of Words

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
print(vectorizer.get_feature_names_out())

In [None]:
# Mapping from each term to its column index in the TF-IDF matrix.
print(vectorizer.vocabulary_)

View TF-IDF Result

In [None]:
# (number of documents, number of vocabulary terms)
print(result_tfidf.shape)

In [None]:
# Inspect the container type returned by fit_transform
# (expected to be a SciPy sparse matrix; confirm from the output).
type(result_tfidf)

In [None]:
# Dense view of the whole TF-IDF matrix (fine here because the corpus is tiny).
print(result_tfidf.toarray())

View First Sentence

In [None]:
# First document of the toy corpus.
dataset[0]

In [None]:
# Printing the row directly shows only its stored entries and their positions.
print(result_tfidf[0])

In [None]:
# Same row as a dense array, with one value per vocabulary term.
print(result_tfidf[0].toarray())

View Second Sentence

In [None]:
# Stored entries of the second document's TF-IDF vector.
print(result_tfidf[1])

In [None]:
# Second document's vector as a dense array.
print(result_tfidf[1].toarray())

In [None]:
# The second document itself, for comparison with its vector.
dataset[1]

In [None]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement.
vectorizer.get_feature_names_out()

In [None]:
import pandas as pd

# TF-IDF weights of the second document, one row per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
df = pd.DataFrame(
    result_tfidf[1].T.todense(),
    index=vectorizer.get_feature_names_out(),
    columns=["TF-IDF"],
)
df.sort_values(by=['TF-IDF'])

<br>

View IDF

In [None]:
# IDF weight of every vocabulary term, indexed by the term itself.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names_out(), columns=["idf"])

# sort ascending
df_idf.sort_values(by=['idf'])

<br>

### Step 04 - Compute TF-IDF with new sentence

In [None]:
# Vectorize an unseen sentence with the already-fitted vectorizer
# (transform only; the vocabulary and IDF weights are not re-learned).
new_text = 'kambing hitam'
result_tfidf = vectorizer.transform([new_text])

# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Vocabulary terms, in the column order of the TF-IDF matrix.
feature_names

In [None]:
# Dense TF-IDF vector of the new sentence, one value per vocabulary term.
result_tfidf.toarray()

<br>

## Bigger Dataset

### Step 01 - Create dataset

In [None]:
# Five Indonesian sentences, each treated as one document of the corpus.
files = [
    "Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.",
    "Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.",
    "Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.",
    "Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.",
    "Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam",
]

### Step 02 - Corpus preparation

In [None]:
# Key each document by a synthetic name: "file0", "file1", ...
token_dict = {"file" + str(idx): doc for idx, doc in enumerate(files)}

token_dict

In [None]:
# The raw documents stored in the dictionary.
token_dict.values()

In [None]:
# Text of the first document.
token_dict['file0']

### Step 03 - Compute TF-IDF

In [None]:
# Build the TF-IDF vectorizer on top of the custom preprocessing pipeline.
tfidf = TfidfVectorizer(
    # ignore terms that appear in more than 80% of the documents
    max_df=0.8,
    # ignore terms that appear in fewer than 20% of the documents
    min_df=0.2,
    # keep at most 200,000 terms, ranked by term frequency across the corpus
    max_features=200000,
    # stopword list
    stop_words=stopwords,
    # enable inverse-document-frequency reweighting
    use_idf=True,
    # replace the default tokenization step with text_preprocessing
    tokenizer=text_preprocessing,
    # extract unigrams, bigrams and trigrams
    ngram_range=(1, 3),
)

# Fit on the corpus and return its document-term TF-IDF matrix.
tfs = tfidf.fit_transform(token_dict.values())

For detail TfidfVectorizer documentation visit: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

### Step 04 - View Result

Let's check the shape. We should have 5 rows (5 docs) and 96 columns (96 unique features; because `ngram_range=(1,3)`, these are unigrams, bigrams and trigrams rather than single words):

In [None]:
# (number of documents, number of features)
tfs.shape

<br>

Inspect the first document vector

In [None]:
# Stored entries of the first document's TF-IDF vector.
print(tfs[0])

View the list of features

In [None]:
# Feature (n-gram) names in column order. get_feature_names() was removed
# in scikit-learn 1.2; get_feature_names_out() is its replacement.
feature_names = tfidf.get_feature_names_out()

In [None]:
# Number of features kept by the vectorizer.
print(len(feature_names))

In [None]:
# All feature names, in the column order of the TF-IDF matrix.
print(feature_names)

In [None]:
# IDF weight of every feature learned by the vectorizer, indexed by feature name
df_idf = pd.DataFrame(tfidf.idf_, index=feature_names,columns=["idf"])
 
# sort ascending by IDF value
df_idf.sort_values(by=['idf'])

### Step 05 - New sentence TF-IDF transformation

In [None]:
# Vectorize an unseen sentence with the already-fitted vectorizer.
str1 = 'Di kejauhan tampak seorang relawan pria dari Lombok sedang berjalan.'
response = tfidf.transform([str1])

# Print every feature with a non-zero weight in the sentence, with that weight.
nonzero_cols = response.nonzero()[1]
for col in nonzero_cols:
    weight = response[0, col]
    print(feature_names[col], ' - ', weight)

In [None]:
# Stored entries of the new sentence's TF-IDF vector.
print(response[0])

In [None]:
# Tokens produced by the preprocessing pipeline for the new sentence.
print (text_preprocessing(str1))

<br>
<br>


#### Revision History:
Release: 1.1907.1601
- Initial release

Release: 1.1909.0901
- Install sastrawi package to support Google Colab
- Reorganize code

Release: 1.2011.2701
- Create small sample to make easier to understand

Release: 1.2107.2401
- Tidy up the code