# Load Data
---

In [43]:
import pandas as pd
from dotenv import load_dotenv
import os
import psycopg2

# Load environment variables
load_dotenv()

# Database connection details
db_host = os.getenv('DB_HOST')
db_port = os.getenv('DB_PORT')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_name = os.getenv('DB_NAME')

def load_data_from_db(query):
    """Load data from PostgreSQL database using a SQL query."""
    try:
        # Establish a connection to the database
        with psycopg2.connect(
            host=db_host,
            port=db_port,
            dbname=db_name,
            user=db_user,
            password=db_password
        ) as conn:
            # Execute the query and load data into a DataFrame
            df = pd.read_sql_query(query, conn)
            
        return df
    except psycopg2.OperationalError as e:
        print(f"OperationalError connecting to the database: {e}")
    except Exception as e:
        print(f"Error connecting to the database: {e}")

    # Return an empty DataFrame in case of error
    return pd.DataFrame()

# load data
query = 'SELECT * FROM "jpku"."summarize_ai_7days";'
df = load_data_from_db(query)

df.head()

  df = pd.read_sql_query(query, conn)


Unnamed: 0,date,text
0,2024-10-01,"kenapa cuaca hari ini enak banget, ya kayaknya pas banget buat ke taman, biar bisa menikmati udara segar sambil curhat sama temanteman. . saya lagi belajar python, wow, seru banget kalo ada tips atau trik buat newbie kayak saya, pls share ya bantuin banget . cuaca sejuk hari ini bikin pengin ngumpul sambil ngopi. bagaimana kalo kita ketemuan dan ceritacerita seru pasti asyik banget tadi lihat orang jogging, langsung terinspirasi bikin pengin jadi lebih aktif. kamu juga suka olahraga, kan hari ini saya baca buku pengembangan diri, wow, makin semangat ada buku lain yang wajib dibaca pengin belajar terus malam ini rencananya mau makan pizzza pizza itu makanan fav saya kamu juga suka pizza topping favorit kamu apa baru pulang dari pasar, beli bahan fresh buat masak. akhir pekan ini mau masak apa, ya sudah ada resep seru buat dicoba"
1,2024-10-08,"hari ini pas banget buat piknik siapa mau ikut rasanya asyik kalo bisa ngumpul dan bersenangsenang di luar weekend kemarin seru habis saya habisin waktu bareng keluarga. bagaimana weekend kamu pasti ada cerita seru juga kemarin menonton film yang gila asik ada rekomendasi film lain yang seru pengin menonton yang bikin mengakak atau bawa perasaan begitu. rencananya mau coba resep baru buat sarapan. kamu ada resep enak yang gampang bantuin dong, butuh ideide fresh cuaca hangat ini bikin pengin jalanjalan ke tempat baru ada tempat seru yang kamu tau saya suka explore hal baru tadi ketemu teman lama, nostalgia kapan kita bisa ketemu lagi kangen sama momen seru yang kita lewatin bareng cek ini , tempat ini super cocok buat piknik sudah pernah kesana pasti seru buat relax"
2,2024-10-15,"baca info lebih lanjut di tentang pengembangan diri banyak hal seru yang bisa dipelajari kemarin menonton film terbaru di , dan itu benarbenar seru plot twistnya bikin saya shocked, pengin menonton lagi sekarang lagi mencoba hidup lebih sehat dengan olahraga. kamu olahraga apa, nih pengennya bugar kayak kamu eh, kenapa kamu enggak balas pesan saya penasaran deh kamu sibuk sama apa. ayo, jangan lupa balas, ya makan es krim pas cuaca panas itu enak siapa mau ajak saya makan es krim saya suka semua rasa, hehe kapan kita bisa kumpul lagi sudah kangen banget sama kamu rasanya lama banget enggak ketemu, yuk kita atur waktu kemarin menonton film yang bikin baper siapa mau menonton bareng lagi minggu depan yuk kita atur jadwalnya"


# Extraction Text (GNN Model)
---

In [42]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool, global_add_pool, global_max_pool
import networkx as nx
import spacy
from transformers import BertTokenizer, BertModel
import numpy as np

# Load multilingual model for spaCy
nlp = spacy.load("xx_ent_wiki_sm")
nlp.add_pipe('sentencizer')

class SimpleGNN(torch.nn.Module):
    """
    A simple Graph Neural Network (GNN) model that combines 
    Graph Convolutional Networks (GCN) and Graph Attention Networks (GAT).

    Args:
        in_channels (int): Number of input features for each node.
        out_channels (int): Number of output features for each node.
        activation (str): Activation function to use ('relu' or 'tanh').
        aggregation (str): Aggregation method for pooling ('mean', 'sum', or 'max').
    """
    
    def __init__(self, in_channels, out_channels, activation='relu', aggregation='mean'):
        super(SimpleGNN, self).__init__()
        
        # First layer: GCN
        self.conv1 = GCNConv(in_channels, 16)
        
        # Second layer: GAT
        self.conv2 = GATConv(16, 16, heads=1)
        
        # Third layer: GAT
        self.conv3 = GATConv(16, 16, heads=1)
        
        # Output layer
        self.final_conv = GCNConv(16, out_channels)
        
        # Store activation and aggregation type
        self.activation = activation
        self.aggregation = aggregation

    def forward(self, x, edge_index, batch):
        """
        Forward pass for the GNN model.

        Args:
            x (torch.Tensor): Node feature matrix.
            edge_index (torch.Tensor): Graph connectivity in COO format.
            batch (torch.Tensor): Batch vector for batching.

        Returns:
            torch.Tensor: Output node features after passing through the GNN layers.
        """
        # First layer
        h = self.conv1(x, edge_index)
        h = self._apply_activation(h)

        # Second layer
        h = self.conv2(h, edge_index)
        h = self._apply_activation(h)

        # Third layer
        h = self.conv3(h, edge_index)
        h = self._apply_activation(h)

        # Global aggregation function
        if self.aggregation == 'mean':
            h = global_mean_pool(h, batch)
            
        elif self.aggregation == 'sum':
            h = global_add_pool(h, batch)
            
        elif self.aggregation == 'max':
            h = global_max_pool(h, batch)
            
        else:
            raise ValueError("Invalid aggregation type. Choose 'mean', 'sum', or 'max'.")

        # Output
        h = self.final_conv(h, edge_index)
        return h

    def _apply_activation(self, h):
        """
        Applies the specified activation function.

        Args:
            h (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Activated tensor.
        """
        if self.activation == 'relu':
            return F.relu(h)
        
        elif self.activation == 'tanh':
            return F.tanh(h)
        else:
            raise ValueError("Invalid activation type. Choose 'relu' or 'tanh'.")

class TextSummarizer:
    """
    A text summarization class that utilizes BERT for embeddings 
    and builds a graph-based representation of text.

    Attributes:
        tokenizer (BertTokenizer): BERT tokenizer for sentence encoding.
        model (BertModel): BERT model for generating embeddings.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.model = BertModel.from_pretrained('bert-base-multilingual-cased')

    @staticmethod
    def extract_sentences(text):
        """
        Extracts sentences from the input text.

        Args:
            text (str): Input text.

        Returns:
            list: A list of sentences extracted from the text.
        """
        return [sent.text.strip() for sent in nlp(text).sents]

    @staticmethod
    def extract_entities(text):
        """
        Extracts named entities from the input text.

        Args:
            text (str): Input text.

        Returns:
            list: A list of tuples containing entities and their labels.
        """
        doc = nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]

    @staticmethod
    def normalize_entity(entity):
        """
        Normalizes an entity by converting it to lowercase and stripping whitespace.

        Args:
            entity (str): The entity to normalize.

        Returns:
            str: The normalized entity.
        """
        return entity.lower().strip()

    def build_graph(self, sentences):
        """
        Builds a graph from the list of sentences, where sentences and entities are nodes 
        and relationships are edges.

        Args:
            sentences (list): A list of sentences.

        Returns:
            networkx.Graph: The constructed graph.
        """
        graph = nx.Graph()
        entity_map = {}

        for sentence in sentences:
            entities = self.extract_entities(sentence)
            if not entities:
                continue

            normalized_entities = [(self.normalize_entity(ent[0]), ent[1]) for ent in entities]

            # Add node for the sentence
            graph.add_node(sentence, type='sentence')

            for entity in normalized_entities:
                # Add node for the entity
                graph.add_node(entity[0], type='entity')

                # Add edge between sentence and entity
                graph.add_edge(sentence, entity[0], type='contains')

                if entity[0] in entity_map:
                    entity_map[entity[0]].add(sentence)
                else:
                    entity_map[entity[0]] = {sentence}

            # Connect entities to each other
            for i in range(len(normalized_entities)):
                for j in range(i + 1, len(normalized_entities)):
                    graph.add_edge(normalized_entities[i][0], normalized_entities[j][0], type='related')

            # Add syntactic relationships
            doc = nlp(sentence)
            for token in doc:
                if token.dep_ in {'nsubj', 'dobj', 'pobj'}:
                    graph.add_edge(sentence, token.text, type='syntactic')

                # Identify and connect temporal entities
                if token.ent_type_ in {'DATE', 'TIME', 'EVENT'}:
                    graph.add_node(token.text, type='temporal')
                    graph.add_edge(sentence, token.text, type='temporal')

        return graph

    def extract_bert_embeddings(self, sentences):
        """
        Extracts BERT embeddings for the given sentences.

        Args:
            sentences (list): A list of sentences.

        Returns:
            list: A list of numpy arrays representing the embeddings for each sentence.
        """
        embeddings = []
        for sentence in sentences:
            inputs = self.tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
            with torch.no_grad():
                outputs = self.model(**inputs)
            embeddings.append(outputs.last_hidden_state[0][0].numpy())
        return embeddings

    def select_important_words(self, selected_sentences, embeddings):
        """
        Selects important words from the selected sentences based on embeddings.

        Args:
            selected_sentences (list): List of selected sentences.
            embeddings (list): Corresponding embeddings for the selected sentences.

        Returns:
            list: List of important words from the selected sentences.
        """
        important_words = []
        for i, sentence in enumerate(selected_sentences):
            sentence_embedding = embeddings[i]
            words = sentence.split()
            word_embeddings = self.extract_bert_embeddings(words)

            distances = [np.linalg.norm(word_emb - sentence_embedding) for word_emb in word_embeddings]
            closest_word_index = np.argmin(distances)
            important_words.append(words[closest_word_index])
            
        return important_words

    def summarizer(self, text, num_sentences=5, graph_algorithm='rank'):
        """
        Summarizes the input text by extracting important sentences.

        Args:
            text (str): Input text to summarize.
            num_sentences (int): Number of sentences to include in the summary.
            graph_algorithm (str): Algorithm to use for ranking sentences ('rank', 'closeness', 'hits', or 'degree').

        Returns:
            tuple: A tuple containing a list of selected sentences and a list of important words.
        """
        # Ekstrak kalimat dari teks input
        sentences = self.extract_sentences(text)
        
        # Bangun graf dari kalimat yang diekstrak
        graph = self.build_graph(sentences)

        # Jika graf tidak memiliki node, kembalikan hasil kosong
        if graph.number_of_nodes() == 0:
            return [], ""

        # Ekstrak embedding BERT untuk kalimat
        embeddings = self.extract_bert_embeddings(sentences)
        
        # Inisialisasi skor untuk setiap kalimat
        sentence_scores = {sentence: 0 for sentence in sentences}

        if graph_algorithm == 'rank':
            scores = nx.pagerank(graph)
            for sentence in sentences:
                sentence_scores[sentence] = sum(scores.get(entity[0], 0) for entity in self.extract_entities(sentence))

        elif graph_algorithm == 'closeness':
            scores = nx.degree_centrality(graph)
            for sentence in sentences:
                sentence_scores[sentence] = scores.get(sentence, 0)

        elif graph_algorithm == 'hits':
            hubs, authorities = nx.hits(graph)
            for sentence in sentences:
                sentence_scores[sentence] = sum(authorities.get(entity[0], 0) for entity in self.extract_entities(sentence))

        elif graph_algorithm == 'degree':
            scores = dict(graph.degree())
            for sentence in sentences:
                sentence_scores[sentence] = sum(scores.get(entity[0], 0) for entity in self.extract_entities(sentence))

        else:
            raise ValueError("Invalid graph_algorithm. Use 'rank', 'closeness', 'hits', or 'degree'.")

        for i, sentence in enumerate(sentences):
            embedding_score = np.linalg.norm(embeddings[i])  # Hitung norma dari embedding
            sentence_scores[sentence] += embedding_score * 0.1  # Tambahkan kontribusi embedding ke skor

        # Urutkan kalimat berdasarkan skor
        ranked_sentences = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)

        selected_sentences = []  # Daftar kalimat terpilih
        selected_indices = set()  # Set untuk melacak indeks yang sudah dipilih

        for sentence, score in ranked_sentences:
            idx = sentences.index(sentence)  # Ambil indeks kalimat
            
            # Pastikan tidak ada kalimat bertetangga yang dipilih
            if not any(abs(idx - si) < 2 for si in selected_indices):
            
                selected_sentences.append(sentence)  
            
                if len(selected_sentences) == num_sentences: 
                    break

        # Pilih kata penting dari kalimat terpilih
        important_words = self.select_important_words(selected_sentences, embeddings)

        summary = " ".join(selected_sentences).strip('"')
        
        # Kembalikan kalimat terpilih dan kata penting
        return summary, important_words



In [4]:
summarizer_extraction = TextSummarizer()
summary_gnn, important_words_extraction = summarizer_extraction.summarizer(cleaned_text, num_sentences=10)

In [5]:
summary_gnn

'Telemedicine memungkinkan akses baik terhadap layanan kesehatan, mengurangi beban sistem kesehatan, mempercepat waktu respon terhadap masalah kesehatan. Dengan memanfaatkan machine learning, dokter memperoleh wawasan tentang riwayat kesehatan pasien merancang perawatan personal. Media sosial telah menjadi sumber informasi utama bagi banyak orang, meskipun sering kali informasi disebarkan tidak selalu akurat terpercaya. Sistem kerja hybrid juga mulai populer, mana karyawan memilih bekerja kantor rumah sesuai kebutuhan mereka. Selain itu, munculnya fenomena hoaks disinformasi media sosial menuntut pengguna cermat memilah informasi. Dengan semua perkembangan ini, jelas bahwa teknologi terus memainkan peran sentral kehidupan seharihari kita. Oleh karena itu, penting bagi individu perusahaan memahami cara melindungi informasi menggunakan langkahlangkah keamanan tepat, enkripsi data pelatihan keamanan siber bagi karyawan. Misalnya, platform Zoom Microsoft Teams memungkinkan rapat virtual me

In [6]:
important_words_extraction

['kesehatan.',
 'pasien',
 'disebarkan',
 'populer,',
 'disinformasi',
 'kehidupan',
 'melindungi',
 'kolaborasi',
 'menerapkan',
 'berkonsultasi']

# Bart Model

In [26]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load model dan tokenizer
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

def summarize_text(text, max_length=100):
    """
    Function to summarize the input text using BART.

    Parameters:
    text (str): The text to summarize.
    max_length (int): Maximum length of the summary.

    Returns:
    str: The summarized text.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

summary_bart = summarize_text(cleaned_text)
summary_bart

'Teknologi telah mengubah cara berkomunikasi era digital signifikan. Media sosial memainkan peran penting menyebarkan informasi, memberikan platform bagi individu berbagi pemikiran. Banyak orang menghabiskan waktu berjamjam platform Facebook, Instagram, Twitter.'

# Evaluation
---

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_score(text1, text2):
    '''Calculate Cosine Similarity between two texts.'''
    # Pastikan input adalah string
    text1 = ' '.join(text1) if isinstance(text1, list) else text1
    text2 = ' '.join(text2) if isinstance(text2, list) else text2

    # Convert both texts into vectors using CountVectorizer
    vectorizer = CountVectorizer().fit_transform([text1, text2])
    vectors = vectorizer.toarray()

    # Calculate and return the cosine similarity between the two vectors
    cosine_sim = cosine_similarity(vectors)
    
    return cosine_sim[0][1]  

In [9]:
# cosine_similarity_score
cosine_gnn = cosine_similarity_score(cleaned_sentences, summary_gnn)
cosine_bart = cosine_similarity_score(cleaned_sentences, summary_bart)

print('Cosine GNN', cosine_gnn)
print('Cosine Bert', cosine_bart)

Cosine GNN 0.7586218172151918
Cosine Bert 0.4232334732850844


In [19]:
from rouge_score import rouge_scorer

def evaluate_summary(reference_summary, generated_summary):
    """
    Evaluates the generated summary against the reference summary using ROUGE.

    Args:
        reference_summary (str): The original summary that serves as a reference.
        generated_summary (str): The summary generated by the model.

    Returns:
        dict: A dictionary containing ROUGE-2 scores formatted to two decimal places.
    """
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)

    # Format nilai ROUGE-2 ke dua angka di belakang koma
    return {
        'rouge2': {
            'precision': round(scores['rouge2'].precision, 2),
            'recall': round(scores['rouge2'].recall, 2),
            'fmeasure': round(scores['rouge2'].fmeasure, 2)
        }
    }


**GNN**


In [20]:
reference = cleaned_text
generated_gnn = summary_gnn

rouge_scores_gnn = evaluate_summary(reference, generated_gnn)

rouge_scores_gnn

{'rouge2': {'precision': 0.94, 'recall': 0.3, 'fmeasure': 0.46}}

In [21]:
reference = cleaned_text
generated_bart = summary_bart
rouge_scores_bart = evaluate_summary(reference, generated_bart)

rouge_scores_bart

{'rouge2': {'precision': 0.93, 'recall': 0.06, 'fmeasure': 0.11}}

# Collact Summarize

In [23]:
import pandas as pd

# Menggabungkan semua kata penting menjadi satu string
combined_important_words = ', '.join(important_words_extraction)

data = {
    'Summarize GNN': summary_gnn, 
    'Important Words GNN': combined_important_words,  
    'Summarize Bart' : summary_bart,
    'Metric Rough Bart' : rouge_scores_bart,
    'Metric Rough GNN' : rouge_scores_gnn,
    'Metric Cosine GNN' : cosine_gnn,
    'Metric Cosine Bart' : cosine_bart
    
}

df = pd.DataFrame(data)

# Index secara default sudah mulai dari 0
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Summarize GNN,Important Words GNN,Summarize Bart,Metric Rough Bart,Metric Rough GNN,Metric Cosine GNN,Metric Cosine Bart
0,Telemedicine memungkinkan akses baik terhadap ...,"kesehatan., pasien, disebarkan, populer,, disi...",Teknologi telah mengubah cara berkomunikasi er...,"{'precision': 0.93, 'recall': 0.06, 'fmeasure'...","{'precision': 0.94, 'recall': 0.3, 'fmeasure':...",0.758622,0.423233
