In [23]:
from flask import Flask, request, render_template
from bs4 import BeautifulSoup
import requests
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
import io
import base64
import nltk
from tqdm import tqdm


In [2]:

nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akhyar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\akhyar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Inisialisasi Flask
app = Flask(__name__)

# Load stopwords
stop_words = stopwords.words('indonesian')

# Simpan stopwords ke file
with open('stopwords.txt', 'w') as f:
    for item in stop_words:
        f.write("%s\n" % item)


In [59]:
# Fungsi untuk mengambil konten berita dari URL
def scrape_news(url):
    isi = []
    judul = []
    response = requests.get(url)
    if response.status_code == 200:
        article_full = BeautifulSoup(response.content, "html.parser")
        judul_artikel = article_full.find("h1", class_="mb-4 text-32 font-extrabold").text.strip()
        artikel_element = article_full.find("div", class_="detail-text")
        artikel_teks = [p.get_text(strip=True) for p in artikel_element.find_all("p")]
        artikel_content = "\n".join(artikel_teks)
        isi.append(artikel_content)
        judul.append(judul_artikel)
    return { "isi": isi} if isi else None


In [5]:

# Fungsi pembersihan teks
def cleansing(text):
    text = re.sub(r'[\s]+', ' ', text)
    text = text.encode('ascii', 'ignore').decode('utf-8')
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = re.sub(r'\b-\b', ' ', text)
    text = re.sub(r'[^\w\s]+', ' ', text)
    text = text.replace('\n', '')
    return text


In [6]:

# Fungsi untuk menghapus stopword
def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)


In [7]:

# Fungsi stemming
def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    return stemmer.stem(text)

# Fungsi utama untuk preprocessing teks
def preprocess_text(text):
    clean_text = cleansing(text)
    return clean_text  # Hanya cleansing tanpa stopword dan stemming



In [8]:

# Fungsi utama untuk ringkasan dan visualisasi graf
def summarize_and_visualize(content):
    # Tokenisasi kalimat
    kalimat = sent_tokenize(content)
    
    # Preprocessing teks
    preprocessed_text = preprocess_text(content)
    kalimat_preprocessing = sent_tokenize(preprocessed_text)
    
    # TF-IDF dan cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(kalimat_preprocessing)
    cossim_prep = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Analisis jaringan dengan NetworkX
    G = nx.DiGraph()
    for i in range(len(cossim_prep)):
        G.add_node(i)
        for j in range(len(cossim_prep)):
            if cossim_prep[i][j] > 0.1 and i != j:
                G.add_edge(i, j)
                
    # Hitung closeness centrality dan buat ringkasan
    closeness_scores = nx.closeness_centrality(G)
    sorted_closeness = sorted(closeness_scores.items(), key=lambda x: x[1], reverse=True)
    ringkasan = " ".join(kalimat[node] for node, _ in sorted_closeness[:3])

    # Visualisasi graf
    pos = nx.spring_layout(G, k=2)
    plt.figure(figsize=(10, 8))
    nx.draw_networkx_nodes(G, pos, node_size=500, node_color='b')
    nx.draw_networkx_edges(G, pos, edge_color='red', arrows=True)
    nx.draw_networkx_labels(G, pos, font_size=10)
    plt.title("Graph Representation of Sentence Similarity")

    # Simpan grafik ke dalam format base64
    img = io.BytesIO()
    plt.savefig(img, format='png')
    img.seek(0)
    graph_url = base64.b64encode(img.getvalue()).decode()
    plt.close()

    return ringkasan, graph_url


In [17]:
def clean_text(text):
	text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text) # Menghapus https* and www*
	text = re.sub(r'@[^\s]+', ' ', text) # Menghapus username
	text = re.sub(r'[\s]+', ' ', text) # Menghapus tambahan spasi
	text = re.sub(r'#([^\s]+)', ' ', text) # Menghapus hashtags
	text = re.sub(r"[^a-zA-Z :\.]", "", text) # Menghapus tanda baca
	text = re.sub(r'\d', ' ', text) # Menghapus angka
	text = text.lower()
	text = text.encode('ascii','ignore').decode('utf-8') #Menghapus ASCII dan unicode
	text = re.sub(r'[^\x00-\x7f]',r'', text)
	text = text.replace('\n','') #Menghapus baris baru
	text = text.strip()
	return text

def clean_stopword(tokens):
	listStopword =  set(stopwords.words('indonesian'))
	filtered_words = [word for word in tokens if word.lower() not in listStopword]
	return filtered_words

In [52]:
def preprocess_text(content):
	result = {}
	for i, text in enumerate(tqdm(content)):
		cleaned_text = clean_text(text)
		tokens = word_tokenize(cleaned_text)
		cleaned_stopword = clean_stopword(tokens)
		result[i] = ' '.join(cleaned_stopword)
	return result

In [60]:
scrapped = scrape_news("https://www.cnbcindonesia.com/news/20241119114410-4-589340/peternak-nangis-susu-impor-bebas-pajak-kemendag-janji-buka-opsi-ini")

In [61]:
prepos = preprocess_text(scrapped["isi"])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 173.53it/s]


In [62]:
print(prepos)

{0: 'jakarta cnbc indonesia kebijakan pembebasan bea masuk produk susu impor australia selandia new zealand disebutsebut biang kerok harga susu produksi peternak negeri turun . peternak berharap pemerintah mengubah perundingan dagang berjalan aseanaustralianew zealand free trade area aanzfta . menanggapi kepala badan kebijakan perdagangan bk perdag kementerian perdagangan kemendag fajarini puntodewi membuka peluang mereview meninjau ulang perjanjian perdagangan bebas free trade agreementfta negara . punto fta pengkajian ulang sesuai jadwal evaluasi disepakati . fta kitab suci diubah . jepang masanya direview masanya . sekian direview contohnya jepang punto ditemui hotel borobudur jakarta selasa . evaluasi fta salah langkah diambil kebijakan perdagangan merugikan peternak lokal menjaga keseimbangan keterbukaan pasar perlindungan sektor domestik . peternak sapi perah jawa timur jawa protes mandi susu membuang susu perah akibat terserap industri pengolahan susu ips . pemicu marahnya peter

In [64]:
def summarize_and_visualize(content):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_preprocessing = tfidf_vectorizer.fit_transform(content)
    terms = tfidf_vectorizer.get_feature_names_out()
    tfidf_preprocessing = pd.DataFrame(data=tfidf_preprocessing.toarray(), columns=terms)
    cossim_prep = cosine_similarity(tfidf_preprocessing, tfidf_preprocessing)
    similarity_matrix = pd.DataFrame(cossim_prep, 
                                     index=range(len(content)), 
                                     columns=range(len(content)))
    G_preprocessing = nx.DiGraph()
    for i in range(len(cossim_prep)):
        G_preprocessing.add_node(i)
    
    for i in range(len(cossim_prep)):
        for j in range(len(cossim_prep)):
            similarity_preprocessing = cossim_prep[i][j]
            if similarity_preprocessing > 0.1 and i != j:
                G_preprocessing.add_edge(i, j)
    
    pos = nx.spring_layout(G_preprocessing, k=2)
    closeness_preprocessing = nx.closeness_centrality(G_preprocessing)
    sorted_closeness_preprocessing = sorted(closeness_preprocessing.items(), key=lambda x: x[1], reverse=True)
    ringkasan_closeness_preprocessing = ""
    print("Tiga Node Tertinggi Closeness Centrality Menggunakan Preprocessing:")
    for node, closeness_preprocessing in sorted_closeness_preprocessing[:3]:
        top_sentence = kalimat[node]
        ringkasan_closeness_preprocessing += top_sentence + " "
        print(f"Node {node}: Closeness Centrality = {closeness_preprocessing:.4f}")
        print(f"Kalimat: {top_sentence}\n")

In [65]:
hasil = summarize_and_visualize(prepos)

AttributeError: 'int' object has no attribute 'lower'

In [None]:


# Route untuk halaman utama
@app.route("/", methods=["GET", "POST"])
def index():
    ringkasan = None
    graph_url = None
    artikel = None
    if request.method == "POST":
        url_input = request.form.get("url")
        if url_input:
            # Scraping konten artikel
            artikel = scrape_news(url_input)
            if artikel:
                # Analisis dan ringkasan
                print(" ".join(artikel["isi"]))
                ringkasan, graph_url = summarize_and_visualize(" ".join(artikel["isi"]))
            else:
                ringkasan = "Gagal mengambil konten artikel."

    return render_template("summary.html", artikel=artikel, ringkasan=ringkasan, graph_url=graph_url)

# Menjalankan server Flask
if __name__ == "__main__":
    app.run(debug=True)
