# Text Summarization With TF-IDF

NIM: 215314087

# Step 1 - Import Libraries

In [None]:
!pip install PySastrawi

In [None]:
# import library
import nltk
import os
import re
import math
import operator
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict

# Fetch the NLTK resources used below by the tagger, lemmatizer, tokenizers
# and stopword list.
for _resource in ('averaged_perceptron_tagger', 'wordnet', 'punkt', 'stopwords'):
    nltk.download(_resource)

# Indonesian stopword set and one shared WordNet lemmatizer instance.
wordlemmatizer = WordNetLemmatizer()
Stopwords = set(stopwords.words('indonesian'))

# Step 2 - Functions to clean text and calculate TF-IDF

In [None]:
# Lemmatization helper
def lemmatize_words(words):
    """Return a new list holding the lemmatized form of every item in ``words``.

    Each item is passed to the module-level ``wordlemmatizer``; the input
    order is preserved and the input itself is not modified.
    """
    return [wordlemmatizer.lemmatize(token) for token in words]

In [None]:
# Special-character removal helper
def remove_special_characters(text):
    """Return ``text`` without any character other than a-z, A-Z, 0-9 or whitespace.

    Removed characters are dropped outright (not replaced by a space), so
    the pieces on either side of e.g. a hyphen end up joined together.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [None]:
# Word-frequency helper
def freq(words):
    """Count how often each word occurs in ``words``, ignoring case.

    Returns a ``defaultdict(int)`` keyed by the lowercased word, so looking
    up a word that never occurred yields 0.
    """
    counts = defaultdict(int)
    for token in words:
        counts[token.lower()] += 1
    return counts

In [None]:
# POS-tagging helper
def pos_tagging(text):
    """Return the tokens of ``text`` whose POS tag starts with ``NN`` or ``VB``.

    ``text`` is split on whitespace and tagged with ``nltk.pos_tag``; tokens
    are returned in their original order and original casing.
    """
    # NOTE(review): pos_tag is called without a language argument while the
    # stopword list in this file is Indonesian; confirm the tagger suits the input.
    tagged_tokens = nltk.pos_tag(text.split())
    wanted_prefixes = ('NN', 'VB')
    return [token for token, label in tagged_tokens if label.startswith(wanted_prefixes)]

In [None]:
# TF score function
def tf_score(word, sentence):
    """Return the term frequency of ``word`` within ``sentence``.

    The sentence is split on whitespace and the share of tokens equal to
    ``word`` is returned. Matching is case-insensitive, because callers pass
    a lowercased word together with a sentence that keeps its original
    capitalisation.

    Args:
        word: The term to count.
        sentence: The sentence text to count it in.

    Returns:
        Occurrences of ``word`` divided by the number of tokens, or ``0.0``
        when the sentence contains no tokens at all.
    """
    tokens = sentence.lower().split()
    if not tokens:
        # An empty or whitespace-only sentence would otherwise divide by zero.
        return 0.0
    return tokens.count(word.lower()) / len(tokens)

In [None]:
# IDF score function
def idf_score(no_of_sentences, word, sentences):
    """Return the inverse document frequency of ``word`` over ``sentences``.

    A sentence counts as containing the word only when the word equals one
    of its whitespace-separated tokens, compared case-insensitively after
    stripping every character other than a-z, A-Z, 0-9 and whitespace.
    Plain substring matching would also count "di" inside "dilakukan".

    Args:
        no_of_sentences: Total number of sentences (the numerator).
        word: The term to look for.
        sentences: Iterable of sentence strings.

    Returns:
        ``log10(no_of_sentences / (matching_sentences + 1))``. Because of
        the ``+ 1``, a word found in every sentence gets a negative score.
        Returns ``0.0`` when ``no_of_sentences`` is not positive, where the
        logarithm would be undefined.
    """
    if no_of_sentences <= 0:
        return 0.0
    target = word.lower()
    no_of_sentence_containing_word = sum(
        1
        for sentence in sentences
        if target in re.sub(r'[^a-zA-Z0-9\s]', '', sentence).lower().split()
    )
    return math.log10(no_of_sentences / (no_of_sentence_containing_word + 1))

In [None]:
# TF-IDF product helper
def tf_idf_score(tf, idf):
    """Return the TF-IDF weight, i.e. ``tf`` multiplied by ``idf``."""
    weight = tf * idf
    return weight

In [None]:
# Per-word TF-IDF helper
def word_tfidf(word, sentences, sentence):
    """Return the TF-IDF weight of ``word`` for one sentence.

    The term frequency is taken from ``sentence`` and the inverse document
    frequency from the whole ``sentences`` collection, whose length is used
    as the sentence count.
    """
    return tf_idf_score(
        tf_score(word, sentence),
        idf_score(len(sentences), word, sentences),
    )

In [None]:
# Sentence-importance helper
def sentence_importance(sentence, dict_freq, sentences):
    """Return the summed TF-IDF weight of the scored words in ``sentence``.

    The sentence is stripped of special characters and reduced to the tokens
    kept by ``pos_tagging``. Tokens that are stopwords or a single character
    long are skipped; the rest are lowercased, lemmatized and scored with
    ``word_tfidf`` against ``sentences``.

    ``dict_freq`` is accepted but not used by this function. The result is
    the integer 0 when no token qualifies.
    """
    cleaned = remove_special_characters(sentence)
    total = 0
    for candidate in pos_tagging(cleaned):
        lowered = candidate.lower()
        if lowered in Stopwords or len(candidate) <= 1:
            continue
        lemma = wordlemmatizer.lemmatize(lowered)
        total += word_tfidf(lemma, sentences, cleaned)
    return total

# Step 3 - Text Preprocessing

In [None]:
# Read the document that is going to be summarized
file_path = '/content/drive/MyDrive/Colab Notebooks/PI/Dataset Project PI/dev.05.txt'
# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the dataset file is UTF-8; confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Split the raw text into sentences first, then strip special characters
# and digits from the text itself
tokenized_sentence = sent_tokenize(text)
text = re.sub(r'\d+', '', remove_special_characters(text))

In [None]:
# Tokenize the cleaned text into words, drop stopwords and one-character
# tokens, lowercase what remains and lemmatize it
tokenized_words_with_stopwords = word_tokenize(text)
tokenized_words = lemmatize_words([
    token.lower()
    for token in tokenized_words_with_stopwords
    if len(token) > 1 and token.lower() not in Stopwords
])

# Step 4 - Modelling using TF-IDF

In [None]:
# Count how often each preprocessed word occurs
word_freq = freq(tokenized_words)

In [None]:
# Ask the user what percentage of the document's sentences to keep
input_user = int(input('Percentage of information to retain(in percent):'))

# Clamp to 0-100: a negative value would otherwise become a negative slice
# bound when the top sentences are selected and pick the wrong sentences.
input_user = max(0, min(100, input_user))

# Number of sentences to keep in the summary, rounded down. Any non-zero
# percentage keeps at least one sentence so that a small percentage on a
# short document does not silently produce an empty summary.
no_of_sentences = input_user * len(tokenized_sentence) // 100
if input_user > 0 and tokenized_sentence:
    no_of_sentences = max(1, no_of_sentences)

In [None]:
# Score every sentence (numbered from 1), then order the
# (sentence number, score) pairs from highest to lowest score
sentence_with_importance = [
    (position, sentence_importance(sentence_text, word_freq, tokenized_sentence))
    for position, sentence_text in enumerate(tokenized_sentence, start=1)
]
sentence_with_importance.sort(key=operator.itemgetter(1), reverse=True)

In [None]:
# Take the numbers of the top-scoring sentences and put them back into
# document order
summary_sentences = sorted(
    ranked_pair[0] for ranked_pair in sentence_with_importance[:no_of_sentences]
)

# Build the summary by joining the selected sentences with single spaces
# (sentence numbers are 1-based, list indices 0-based)
summary = " ".join([tokenized_sentence[number - 1] for number in summary_sentences])

In [None]:
# Print the summary under a heading, then show its length in characters
print("\nSummary:", summary, sep="\n")
len(summary)

In [None]:
# `summary` is already a single joined string, so reuse it as-is instead of
# rebuilding it character by character. The fallback keeps the previous
# flattening for an iterable of words / word lists.
if isinstance(summary, str):
    summary_all = summary
else:
    summary_all = ''.join([' '.join(kata) if isinstance(kata, list) else kata for kata in summary])

In [None]:
# Print the unified summary text, then show its length in characters
print(summary_all)
len(summary_all)

In [None]:
import pandas as pd

# Wrap the summary in a one-row DataFrame with a single 'Summary' column
df = pd.DataFrame({'Summary': [summary]})

# Write the DataFrame as CSV text, without the index column, to a .txt file
df.to_csv('summary-dev.05.txt', index=False)

# Step 5 - Model Evaluation with ROUGE

In [None]:
!pip install rouge

In [None]:
from rouge import Rouge

def evaluate_summary(generated_summary, reference_summary):
    """Score a generated summary against a reference with ROUGE and print the result.

    Args:
        generated_summary: The summary text to evaluate (passed as hypothesis).
        reference_summary: The reference summary text.

    Returns:
        The object returned by ``Rouge.get_scores(..., avg=True)``. It is
        indexed here by 'rouge-1', 'rouge-2' and 'rouge-l', each entry
        providing 'p', 'r' and 'f' values.
    """
    # Score the hypothesis/reference pair with a fresh evaluator
    scores = Rouge().get_scores(generated_summary, reference_summary, avg=True)

    # Look up all three metric entries before anything is printed
    rouge_1, rouge_2, rouge_l = (
        scores[metric] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )

    # One line per metric: precision, recall and F1
    print(f"ROUGE-1: Precision: {rouge_1['p']}, Recall: {rouge_1['r']}, F1: {rouge_1['f']}")
    print(f"ROUGE-2: Precision: {rouge_2['p']}, Recall: {rouge_2['r']}, F1: {rouge_2['f']}")
    print(f"ROUGE-L: Precision: {rouge_l['p']}, Recall: {rouge_l['r']}, F1: {rouge_l['f']}")

    return scores

# Example usage: evaluate a hard-coded generated summary against a
# hard-coded reference summary. Both texts are literals defined here; the
# `summary` variable computed earlier in this file is not used.
reference_summary = """
Menurut Cheong banyak perusahaan yang sedianya memilih Singapura sebagai kantor regional, namun memutuskan pindah ke negara lain. Ini dilakukan demi menangkap peluang-peluang yang muncuk akibat populasi generasi muda dan tumbuhnya konsumsi negara-negara tujuan investasi di Asia Tenggara.
Salah satu contoh, yaitu Crestar Education Group, perusahaan asal Singapura yang melakukan ekspansi ke Indonesia. Manajemen Crestar menilai, potensi data beli penduduk Indonesia yang berusia relatif muda menarik untuk digarap, termasuk masyarakat berpenghasilan menengah yang jumlah banyak.
"""
generated_summary = """
Ini dilakukan demi menangkap peluang-peluang yang muncuk akibat populasi generasi muda dan tumbuhnya konsumsi negara-negara tujuan investasi di Asia Tenggara.
Manajemen Crestar menilai, potensi data beli penduduk Indonesia yang berusia relatif muda menarik untuk digarap, termasuk masyarakat berpenghasilan menengah yang jumlah banyak.
"""

# Prints the ROUGE-1 / ROUGE-2 / ROUGE-L lines and keeps the returned scores
scores = evaluate_summary(generated_summary, reference_summary)
