# Lab 05 - TF-IDF


In [None]:
#Release: 1.1907.1602

#### Import required libraries

In [None]:
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

#### Preprocessing functions from previous labs

In [None]:
def tokenize_clean(text):
    """Split *text* into lowercase word tokens and keep those containing a letter.

    The text is split into sentences with ``nltk.sent_tokenize`` and every
    sentence into words with ``nltk.word_tokenize``. Tokens that contain no
    ASCII letter at all (for example pure numbers or punctuation) are dropped.
    """
    # tokenization: walk sentence by sentence, lowercasing every word
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # drop tokens made up only of digits, punctuation or other non-letters
    has_letter = re.compile('[a-zA-Z]').search
    return [token for token in lowered if has_letter(token)]

In [None]:
# Indonesian stopword list from the NLTK corpus; read by remove_stopwords()
# and also passed to the TfidfVectorizer as stop_words.
stopwords = nltk.corpus.stopwords.words('indonesian')

In [None]:
def remove_stopwords(tokenized_text):
    """Return the tokens of *tokenized_text* that are not stopwords.

    Membership is checked against the module-level ``stopwords`` list and the
    order of the remaining tokens is preserved.
    """
    return [token for token in tokenized_text if token not in stopwords]

In [None]:
_stemmer = None  # shared Sastrawi stemmer, created on first use


def _get_stemmer():
    """Return the shared Sastrawi stemmer, creating it on the first call."""
    global _stemmer
    if _stemmer is None:
        _stemmer = StemmerFactory().create_stemmer()
    return _stemmer


def stemming_text(tokenized_text):
    """Stem every token of *tokenized_text* with the Sastrawi stemmer.

    Returns a new list holding one stemmed string per input token, in the
    same order as the input.
    """
    # Reuse a single stemmer instead of building a factory and stemmer on
    # every call: this function runs each time the preprocessing pipeline is
    # invoked (e.g. as the vectorizer's tokenizer), and the setup work is the
    # same every time.
    stemmer = _get_stemmer()
    return [stemmer.stem(token) for token in tokenized_text]

In [None]:
def text_preprocessing(text):
    """Run the full preprocessing pipeline on *text*.

    The steps are applied in order: ``tokenize_clean`` (tokenize and filter),
    ``remove_stopwords`` and finally ``stemming_text``, whose result is
    returned.
    """
    return stemming_text(remove_stopwords(tokenize_clean(text)))
    

#### Step 01 - Create dataset

In [None]:
# The five sample sentences that make up the corpus, one document per entry.
files = [
    "Sekelompok ibu dan kaum perempuan duduk beralaskan rumput lapangan sambil fokus menganyam bambu yang ia genggam ditangan.",
    "Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.",
    "Lombok memang memiliki sejuta pesona yang mampu menyedot perhatian orang untuk datang berwisata.",
    "Perempuan yang bergelut di dunia kerelawanan akan belajar caranya bertanggung jawab bagi sendiri dan orang lain.",
    "Kami berkoordinasi dan melapor pada posko relawan, kami berkomitmen  siap membantu dengan siaga 24 jam",
]

#### Step 02 - Corpus preparation

In [None]:
# Prepare the corpus: key each document by "file0", "file1", ... in list order.
token_dict = {"file" + str(idx): doc for idx, doc in enumerate(files)}

token_dict

In [None]:
# Show the document texts stored in token_dict (the input the vectorizer is fit on).
token_dict.values()

In [None]:
# Perform TF-IDF vectorization: configure the vectorizer with the custom
# preprocessing pipeline as tokenizer, then fit it on the corpus documents.
tfidf = TfidfVectorizer(
    tokenizer=text_preprocessing,
    stop_words=stopwords,
    ngram_range=(1, 3),
    min_df=0.2,
    max_df=0.8,
    max_features=200000,
    use_idf=True,
)

# Learn the vocabulary and weights, and get the document-term matrix back.
tfs = tfidf.fit_transform(token_dict.values())

In [None]:
# Print the shape of the matrix returned by fit_transform.
print(tfs.shape)

In [None]:
# Print the TF-IDF matrix itself.
print(tfs)

In [None]:
import pandas as pd

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on old releases.
if hasattr(tfidf, "get_feature_names_out"):
    idf_terms = list(tfidf.get_feature_names_out())
else:
    idf_terms = tfidf.get_feature_names()

# print idf values: one row per vocabulary term, labelled by the term
df_idf = pd.DataFrame(tfidf.idf_, index=idf_terms, columns=["idf"])

# sort ascending
df_idf.sort_values(by=['idf'])

#### Step 03 - TF-IDF Transformation

In [None]:
# Transform a new, unseen sentence with the vectorizer fitted above.
str1 = 'Di kejauhan tampak seorang relawan pria dari Lombok sedang berjalan.'
response = tfidf.transform([str1])

# show result
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on old releases.
# The result is kept as a plain list so it indexes and prints as before.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# print each term with a non-zero weight in the transformed sentence
for col in response.nonzero()[1]:
    print (feature_names[col], ' - ', response[0, col])
    

In [None]:
# Print the tokens the preprocessing pipeline produces for str1.
print (text_preprocessing(str1))

In [None]:
# Print the feature names obtained from the fitted vectorizer.
print(feature_names)

### Using a Smaller Corpus

In [None]:
# Small corpus: one full sentence plus two short documents.
text = [
    "Sebagian besar masyarakat rupanya tak mau melewatkan waktu begitu  saja untuk meratapi erupsi.",
    "Masyarakat tak mau erupsi",
    "Rupanya waktu erupsi",
]

In [None]:
# create the transform: a TF-IDF vectorizer that tokenizes with
# text_preprocessing; no other options are passed
vectorizer = TfidfVectorizer(tokenizer=text_preprocessing)

# tokenize the documents in `text` and build the vocabulary
# (the fitted vocabulary_ and idf_ are inspected in the next cell)
vectorizer.fit(text)

In [None]:
# summarize the fitted vectorizer: its vocabulary_, then its idf_ values
print(vectorizer.vocabulary_)
print(vectorizer.idf_)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back only on old releases.
if hasattr(vectorizer, "get_feature_names_out"):
    small_corpus_terms = list(vectorizer.get_feature_names_out())
else:
    small_corpus_terms = vectorizer.get_feature_names()

# print idf values: one row per vocabulary term, labelled by the term
df_idf = pd.DataFrame(vectorizer.idf_, index=small_corpus_terms, columns=["idf"])

# sort ascending
df_idf.sort_values(by=['idf'])

In [None]:
# encode the first document of `text` with the fitted vectorizer
vector = vectorizer.transform([text[0]])
# summarize the encoded vector: its shape, then its values as an array
print(vector.shape)
print(vector.toarray())