<a href="https://colab.research.google.com/github/shadi159/Cloud-Computing-Project/blob/main/HW2/SaveKeyWordsToDatabase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 spacy scikit-learn
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import requests
from bs4 import BeautifulSoup
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the small English spaCy pipeline once at module level;
# lemmatize_and_filter uses this shared `nlp` object for every document.
nlp = spacy.load("en_core_web_sm")


In [None]:
def fetch_url_text(url: str) -> str:
    """Download a web page and return the text of its <p> elements.

    The request is sent with a desktop-browser User-Agent header and a
    20-second timeout. The text of each paragraph is extracted with its
    inner pieces joined by a single space and surrounding whitespace
    stripped; paragraphs are then joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The paragraph texts, one per line.

    Raises:
        Whatever ``requests.get`` raises for connection problems, and the
        error raised by ``raise_for_status`` for an HTTP error status.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    response = requests.get(url, headers={"User-Agent": user_agent}, timeout=20)
    # An HTTP error status is turned into an exception here.
    response.raise_for_status()

    page = BeautifulSoup(response.text, "html.parser")

    # Only the <p> elements are collected.
    lines = []
    for paragraph in page.find_all("p"):
        lines.append(paragraph.get_text(" ", strip=True))
    return "\n".join(lines)


In [None]:
# Article pages to index (all under frontiersin.org/journals/plant-science).
# The indexing loop assigns page IDs p1, p2, ... in this list order.
urls = [
    "https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2019.00567/full",
    "https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2020.00331/full",
    "https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2017.01156/full",
    "https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2019.00352/full",
    "https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2017.01423/full"
]

In [None]:
# Lowercase English function words to exclude from the index.
# Passed to lemmatize_and_filter, which compares each token's lowercased
# lemma (not its surface form) against this set.
custom_stopwords = {
    "about", "above", "after", "again", "all", "am",
    "among", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below",
    "between", "both", "but", "by", "can", "did", "do",
    "does", "doing", "down", "during", "each", "few",
    "for", "from", "further", "had", "has", "have",
    "having", "he", "her", "here", "hers", "him", "himself",
    "his", "how", "if", "in", "into", "is", "it", "its",
    "itself", "just", "me", "more", "most", "my", "myself",
    "no", "nor", "not", "now", "of", "off", "on", "once",
    "only", "or", "other", "our", "ours", "out", "over",
    "own", "same", "she", "should", "so", "some", "such",
    "than", "that", "the", "their", "them", "themselves",
    "then", "there", "these", "they", "this", "those",
    "through", "to", "too", "under", "until", "up", "very",
    "was", "we", "were", "what", "when", "where", "which",
    "while", "who", "whom", "why", "with", "you", "your",
    "yours", "yourself", "yourselves"
}


In [None]:
def lemmatize_and_filter(text: str, stopwords):
    """Return the lowercased lemmas of ``text`` that pass the keyword filters.

    The text is parsed with the module-level ``nlp`` pipeline. A token's
    lemma is kept only when all of the following hold:

    * the lowercased lemma is not in ``stopwords``;
    * the token consists of alphabetic characters only (``token.is_alpha``);
    * the token's part of speech is NOUN, VERB or ADJ.

    Args:
        text: Raw text to analyse.
        stopwords: Container of lowercase lemmas to discard.

    Returns:
        A list of lemmas in document order; repeated lemmas are kept.
    """
    allowed_pos = {"NOUN", "VERB", "ADJ"}
    kept = []

    for token in nlp(text):
        lemma = token.lemma_.lower()
        # Checks short-circuit in this order: stopword, alphabetic, part of speech.
        if lemma not in stopwords and token.is_alpha and token.pos_ in allowed_pos:
            kept.append(lemma)

    return kept


In [None]:
from collections import Counter

# Inverted index that is later uploaded as one JSON document.
index_data = {
    "pages": {},  # mapping: page_id -> {title, url}
    "terms": {}   # mapping: term -> {term, DocIDs, tf_per_doc}
}

# Number of highest-scoring terms kept per article.
top_n = 20

for count, url in enumerate(urls, start=1):
    try:
        doc_text = fetch_url_text(url)
    except Exception as e:
        # Best effort: a failed download still registers the page below,
        # it just contributes no terms.
        print("Error fetching URL:", e)
        doc_text = ""

    # Internal document ID made only of characters that are safe in keys.
    doc_id_name = f"p{count}"

    # Register the page itself.
    index_data["pages"][doc_id_name] = {
        "title": f"Article {count}",
        "url": url
    }

    # Lemmatize once; the same tokens feed both the raw counts (tf_per_doc)
    # and the vectorizer, so the spaCy pipeline is not run twice per article.
    tokens = lemmatize_and_filter(doc_text, custom_stopwords)
    if not tokens:
        # TfidfVectorizer raises ValueError on an empty vocabulary, so a page
        # with no usable text (e.g. after a failed fetch) is skipped here.
        print(f"Article {count}: no usable text, skipping keyword extraction")
        continue

    tf_counter = Counter(tokens)

    # The document is already tokenized, so the analyzer just passes it through.
    # NOTE(review): the vectorizer is fitted on one document at a time, so
    # every term gets the same IDF and the scores are effectively normalized
    # term frequencies. Fit on the whole corpus if real IDF weighting is wanted.
    vectorizer = TfidfVectorizer(analyzer=lambda doc_tokens: doc_tokens)
    tfidf_matrix = vectorizer.fit_transform([tokens])
    feature_names = vectorizer.get_feature_names_out()

    # Single-row matrix -> flat score vector aligned with feature_names.
    vector = tfidf_matrix[0].toarray().flatten()
    top_indices = np.argsort(vector)[::-1][:top_n]

    print("==============================================")
    print(f"Article {count} key words: ")

    for idx in top_indices:
        term = str(feature_names[idx])
        score = vector[idx]
        print(term, "→", score)

        # Create the term entry on first sight.
        entry = index_data["terms"].setdefault(
            term,
            {"term": term, "DocIDs": [], "tf_per_doc": {}},
        )

        # Add this document to the posting list (no duplicates).
        if doc_id_name not in entry["DocIDs"]:
            entry["DocIDs"].append(doc_id_name)

        # Raw number of occurrences of the term in this document.
        entry["tf_per_doc"][doc_id_name] = int(tf_counter.get(term, 0))



Article 1 key words: 
virus → 0.6134304007112499
cassava → 0.3798909080546827
symptom → 0.2864751109920558
resistance → 0.20551475353777915
infection → 0.199287033733604
variety → 0.17437615451690353
plant → 0.15569299510437815
leaf → 0.1338959757897652
line → 0.11832667627932739
root → 0.11832667627932739
disease → 0.11832667627932739
streak → 0.11209895647515226
isolate → 0.10275737676888957
infect → 0.10275737676888957
use → 0.09030193716053933
full → 0.07161877774801395
text → 0.07161877774801395
become → 0.06850491784592638
germplasm → 0.06539105794383882
figure → 0.06227719804175126
Article 2 key words: 
protein → 0.3485028534363978
rice → 0.2724658672320928
full → 0.2534566206810166
coa → 0.2471202051639912
text → 0.2471202051639912
acyl → 0.23444737412994035
cell → 0.23127916637142765
localization → 0.22494275085440224
bind → 0.2154381275788641
transgenic → 0.19326067326927515
plant → 0.1647468034426608
root → 0.14256934913307184
subcellular → 0.129896518099021
treatment → 0.12

In [None]:
# Base URL that save_index_to_firebase builds its "<base>/<path>.json" target from.
# NOTE(review): "My Key" looks like a redacted placeholder — set the real
# database URL before running, and avoid committing it to the notebook.
FIREBASE_URL = "My Key"

def save_index_to_firebase(index_data, path="plant_disease_index_tf", timeout=20):
    """Upload ``index_data`` as JSON with an HTTP PUT to ``<FIREBASE_URL>/<path>.json``.

    Failures are reported on stdout instead of being raised, so the notebook
    keeps running.

    Args:
        index_data: JSON-serializable object to store.
        path: Location under ``FIREBASE_URL`` (without the ``.json`` suffix).
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Returns:
        None. The outcome is printed.
    """
    # Tolerate a trailing slash on the base URL so the path has no "//".
    url = f"{FIREBASE_URL.rstrip('/')}/{path}.json"

    try:
        response = requests.put(url, json=index_data, timeout=timeout)
        response.raise_for_status()
    except Exception as e:
        print("Error saving to Firebase:", e)
        # HTTP errors carry the server's reply, which usually explains the failure.
        if hasattr(e, "response") and e.response is not None:
            print("Firebase said:", e.response.text)
        return

    print("Saved to Firebase successfully!")
    # The upload already succeeded; a reply that is not valid JSON must not be
    # reported as a save error, so fall back to the raw body.
    try:
        print("Response:", response.json())
    except ValueError:
        print("Response:", response.text)

# Upload the index built above to the default path ("plant_disease_index_tf").
save_index_to_firebase(index_data)

Saved to Firebase successfully!
Response: {'pages': {'p1': {'title': 'Article 1', 'url': 'https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2019.00567/full'}, 'p2': {'title': 'Article 2', 'url': 'https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2020.00331/full'}, 'p3': {'title': 'Article 3', 'url': 'https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2017.01156/full'}, 'p4': {'title': 'Article 4', 'url': 'https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2019.00352/full'}, 'p5': {'title': 'Article 5', 'url': 'https://www.frontiersin.org/journals/plant-science/articles/10.3389/fpls.2017.01423/full'}}, 'terms': {'acyl': {'DocIDs': ['p2'], 'term': 'acyl', 'tf_per_doc': {'p2': 74}}, 'analysis': {'DocIDs': ['p3'], 'term': 'analysis', 'tf_per_doc': {'p3': 25}}, 'banana': {'DocIDs': ['p4'], 'term': 'banana', 'tf_per_doc': {'p4': 37}}, 'become': {'DocIDs': ['p1'], 'term': 'become', 'tf_per_doc': {'p1': 2