In [1]:
import re
from pprint import pprint
import string 

import numpy as np
import nltk
from nltk import sent_tokenize, word_tokenize

from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords

from bs4 import BeautifulSoup
import pandas as pd
import requests

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory


In [2]:
def scrap_data(url: str) -> tuple:
    site = requests.get(url)
    soup = BeautifulSoup(site.text, 'html.parser')
    
    sentences = []
    
    text_element = soup.find_all('div', {'class' : 'content'})
    
    for sentence in text_element:
        sentences.append(sentence.text)
#     coba stopword disini nanti
    return sentences

In [3]:
class GetMatrix:
    
    def get_symmetric_matrix(matrix):
        """
        Get Symmetric matrix
        :param matrix:
        :return: matrix
        """
        return matrix + matrix.T - np.diag(matrix.diagonal())

    def core_cosine_similarity(vector1, vector2):
        """
        measure cosine similarity between two vectors
        :param vector1:
        :param vector2:
        :return: 0 < cosine similarity value < 1
        """
        return 1 - cosine_distance(vector1, vector2)

In [4]:
MULTIPLE_WHITESPACE_PATTERN = re.compile(r"\s+", re.UNICODE)

def normalize_whitespace(text):
    return MULTIPLE_WHITESPACE_PATTERN.sub(_replace_whitespace, text)

def _replace_whitespace(match):
    text = match.group()
    if "\n" in text or "\r" in text or "\\n" in text:
        return "\n"
    else:
        return " "

def is_blank(string):
    return not string or string.isspace()

In [5]:
class TextRank4Sentences():
    def __init__(self):
        self.damping = 0.85  # damping coefficient, usually is .85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 100  # iteration steps
        self.text_str = None
        self.sentences = None
        self.pr_vector = None

    def _sentence_similarity(self, sent1, sent2, stop_words=None):
#         if stopwords is None:
#             stopwords = []
        
        stoplist = []
        stoplist = set(stopwords.words('indonesian'))
        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            if w in stoplist:
                continue
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            if w in stoplist:
                continue
            vector2[all_words.index(w)] += 1

        return GetMatrix.core_cosine_similarity(vector1, vector2)

    def _build_similarity_matrix(self, sentences, stop_words=None):
        # create an empty similarity matrix
        sm = np.zeros([len(sentences), len(sentences)])

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:
                    continue

                sm[idx1][idx2] = self._sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

        # Get Symmeric matrix
        sm = GetMatrix.get_symmetric_matrix(sm)

        # Normalize matrix by column
        norm = np.sum(sm, axis=0)
        sm_norm = np.divide(sm, norm, where=norm != 0)  # this is to ignore the 0 element in norm

        return sm_norm

    def _run_page_rank(self, similarity_matrix):

        pr_vector = np.array([1] * len(similarity_matrix))

        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr_vector = (1 - self.damping) + self.damping * np.matmul(similarity_matrix, pr_vector)
            if abs(previous_pr - sum(pr_vector)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr_vector)

        return pr_vector

    def _get_sentence(self, index):

        try:
            return self.sentences[index]
        except IndexError:
            return ""

    def get_top_sentences(self, number=5):

        top_sentences = {}
        top_sent = {}

        if self.pr_vector is not None:

            sorted_pr = np.argsort(self.pr_vector)
            sorted_pr = list(sorted_pr)
            sorted_pr.reverse()

            index = 0
            for epoch in range(number):
                print (str(sorted_pr[index]) + " : " + str(self.pr_vector[sorted_pr[index]]))
                sent = self.sentences[sorted_pr[index]]
                sent = normalize_whitespace(sent)
#                 top_sentences[] = self.sentences[sorted_pr[index]]
                top_sentences[sent] = self.pr_vector[sorted_pr[index]]
                index += 1

        return str(top_sentences)
    
    def analyze(self, text):
        self.text_str = text.lower()
        self.sentences = sent_tokenize(self.text_str)

        tokenized_sentences = [word_tokenize(sent) for sent in self.sentences]
        
        stop_words = []
        stop_words = stopwords.words("indonesian")
        sen_new = [sen for sen in tokenized_sentences if sen not in stop_words]
                
        similarity_matrix = self._build_similarity_matrix(sen_new, stop_words)


        self.pr_vector = self._run_page_rank(similarity_matrix)
        
        print("Nilai Similarity : ", similarity_matrix)
        print("Hasil Ekstraksi Algoritma: ",self.pr_vector)
        print(sen_new)
#         print("Sentences :", len(sen_new))
#         print(tokenized_sentences)

In [6]:
#Stagging
def stagging_text(text):
    text_str = sent_tokenize(text)
    pecahan = [word_tokenize(sent) for sent in text_str]
    return pecahan


#lowercase
def text_lowercase(text): 
    return text.lower()

# Remove numbers 
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 

# remove punctuation 
# def remove_punctuation(text): 
#     translator = str.maketrans('', '', string.punctuation) 
#     return text.translate(translator)

# remove whitespace from text 
def remove_whitespace(text): 
    return  " ".join(text.split()) 

# remove stopwords function 
def remove_stopwords(text):
    
    stopword_file = open("stopword.txt", "r") #ambil data stopword
    
    lots_of_stopwords = []
    for line in stopword_file.readlines():
        lots_of_stopwords.append(str(line.strip()))
    
    stop_words = set(lots_of_stopwords) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return filtered_text 

def steeming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    for kata in text:
        katadasar = stemmer.stem(kata)
    
    return katadasar


def pecahkalimat(text):
    kalimatakhir = re.compile('[.]')
    doc = text.split('.')
    
    return doc

In [7]:
import nltk
from newspaper import Article

def get_title(url):
    gtitle = Article(url)
    gtitle.download()
    gtitle.parse()
    gtitle.title
    
    return gtitle.title
    

In [8]:
#url = 'https://www.mashara.id/bagaimana-agar-selalu-bisa-istiqomah-dalam-hal-ibadah-agar-selalu-mendapatkan-ridho-allah-ta-ala'
url = 'https://www.mashara.id/apakah-ada-kaitan-kejadian-bencana-alam-yang-terus-terjadi-dengan-pilihan-politik-pemimpin-di-suatu-daerah'
#url = 'https://www.mashara.id/adakah-cara-untuk-menjauhkan-diri-dari-hal-yg-buruk'
sentence = str(scrap_data(url))

In [9]:
# Preprocessing

lower = text_lowercase(sentence)
rnumber = remove_numbers(lower)
#punctuation = remove_punctuation(rnumber)
white_space = remove_whitespace(rnumber)
stopword_list = remove_stopwords(white_space)

In [10]:
print(stopword_list)

['[', "'katakanlah", 'jika', 'gempa', 'di', 'lombok', 'palu', 'dan', 'yang', 'lainnya', 'memang', 'benar-benar', 'terjadi', 'akibat', 'dari', 'pilihan', 'politik', 'pemimpinnya', '.', 'berarti', 'samalah', 'dengan', 'masyarakat', 'arab', 'zaman', 'rasulullah', 'yang', 'mengaitkan', 'fenomena', 'gerhana', 'matahari', 'dengan', 'kematian', 'ibrahim', 'putra', 'rasulullah', 'saw', 'yang', 'meninggal', 'akibat', 'sakit', 'keras.masyarakat', 'arab', 'juga', 'sebagian', 'kaum', 'muslimin', 'menganggap', 'gerhana', 'matahari', 'terjadi', 'akibat', 'kematian', 'ibrahim', '.', 'mendengar', 'hal', 'nabi', 'muhammad', 'saw', 'lalu', 'bersabda', '“', 'matahari', 'dan', 'bulan', 'adalah', 'tanda', 'kebesaran', 'allah', 'yang', 'tidak', 'ada', 'hubungannya', 'dengan', 'kematian', 'atau', 'hidup', 'seseorang', '.', 'kalau', 'kalian', 'melihat', 'hal', 'itu', 'maka', 'berlindunglah', 'kepada', 'allah', 'dengan', 'dzikir', 'dan', 'doa', '.', '(', 'bukhari', 'dan', 'muslim', ')', '\\xaallah', 'punya', '

In [11]:
new_sentence = ' '.join(stopword_list)
print (new_sentence)

[ 'katakanlah jika gempa di lombok palu dan yang lainnya memang benar-benar terjadi akibat dari pilihan politik pemimpinnya . berarti samalah dengan masyarakat arab zaman rasulullah yang mengaitkan fenomena gerhana matahari dengan kematian ibrahim putra rasulullah saw yang meninggal akibat sakit keras.masyarakat arab juga sebagian kaum muslimin menganggap gerhana matahari terjadi akibat kematian ibrahim . mendengar hal nabi muhammad saw lalu bersabda “ matahari dan bulan adalah tanda kebesaran allah yang tidak ada hubungannya dengan kematian atau hidup seseorang . kalau kalian melihat hal itu maka berlindunglah kepada allah dengan dzikir dan doa . ( bukhari dan muslim ) \xaallah punya kuasa atas terjadinya fenomena gerhana matahari yang bersamaan dengan kematian ibrahim . sama halnya juga allah punya kuasa atas gempa yang mengguncang lombok juga daerah lainnya di indonesia . sebagai manusia tak berhak menjatuhkan penghakiman pada apa-apa yang tidak paham atasnya . penghakiman penilaian

In [13]:
pecah_kalimat = stagging_text(new_sentence)


In [14]:
textrank = TextRank4Sentences() 
text = textrank.analyze(str(new_sentence))
text = textrank.get_top_sentences(5)

Nilai Similarity :  [[0.         0.04451431 0.01622479 ... 0.02232309 0.01829217 0.01646596]
 [0.03370991 0.         0.0498198  ... 0.01142421 0.00936132 0.00421336]
 [0.01946242 0.07891544 0.         ... 0.01978731 0.01621427 0.02189326]
 ...
 [0.04204364 0.02841281 0.03106811 ... 0.         0.10508051 0.03152991]
 [0.02752402 0.01860055 0.02033885 ... 0.08395044 0.         0.04128234]
 [0.02362649 0.00798331 0.02618815 ... 0.02402088 0.03936677 0.        ]]
Hasil Ekstraksi Algoritma:  [0.83438358 0.72353672 0.99549612 1.26331074 0.88679322 1.13335638
 0.84690409 1.14320466 0.99532671 1.203075   0.93643205 0.94212058
 1.08056176 1.13676601 1.07070875 1.08426997 1.23512935 0.90714207
 1.278174   0.80983316 0.8858505  0.51909782 0.74100277 0.64967723
 1.42390944 0.5163842  1.29543023 0.95389984 1.07647749 1.32524916
 1.04961605 1.16177745 0.77213955 0.73519202 0.68142115 1.42635768
 1.16177745 1.11821505]
[['[', "'katakanlah", 'jika', 'gempa', 'di', 'lombok', 'palu', 'dan', 'yang', 'lai

In [137]:
similarity_matrix = textrank._build_similarity_matrix(pecah_kalimat)
vector = textrank._run_page_rank(similarity_matrix)

In [138]:
print("Similarity Matrix: ", similarity_matrix)
print("Textrank: ", vector)

Similarity Matrix:  [[0.         0.04451431 0.01622479 ... 0.02232309 0.01829217 0.01646596]
 [0.03370991 0.         0.0498198  ... 0.01142421 0.00936132 0.00421336]
 [0.01946242 0.07891544 0.         ... 0.01978731 0.01621427 0.02189326]
 ...
 [0.04204364 0.02841281 0.03106811 ... 0.         0.10508051 0.03152991]
 [0.02752402 0.01860055 0.02033885 ... 0.08395044 0.         0.04128234]
 [0.02362649 0.00798331 0.02618815 ... 0.02402088 0.03936677 0.        ]]
Textrank:  [0.83438358 0.72353672 0.99549612 1.26331074 0.88679322 1.13335638
 0.84690409 1.14320466 0.99532671 1.203075   0.93643205 0.94212058
 1.08056176 1.13676601 1.07070875 1.08426997 1.23512935 0.90714207
 1.278174   0.80983316 0.8858505  0.51909782 0.74100277 0.64967723
 1.42390944 0.5163842  1.29543023 0.95389984 1.07647749 1.32524916
 1.04961605 1.16177745 0.77213955 0.73519202 0.68142115 1.42635768
 1.16177745 1.11821505]


In [15]:
print(text)

{"'bisa dilihat dari dua sisi .": 1.4263576757534167, 'rasulullah menjawab ya .': 1.4239094430787484, 'jadi bencana alam itu hak mutlak allah mau ada sebab atau tidaknya .': 1.3252491600391694, 'rasulullah menjawab mereka terkena musibah yang sama sebagaimana yang lain namun kelak mereka mendapatkan ampunan allah dan ridha-nya ” ( .': 1.2954302326721745, 'bencana alam merupakan kebesaran allah juga ujian dan peringatan ( bukan adzab ) .': 1.2781739970000818}
