# **Web scraper: using Google API**

**This Colab notebook contains the code that uses the Google search API (via the `googlesearch` package) to collect the top 5 links from a Google search and store them in CSV format.**

In [None]:
# Show the Python interpreter version string of the running kernel, so the
# environment this notebook was executed in is visible alongside its outputs.
import sys
sys.version

'3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]'

In [None]:
# Import the `search` function used by the cells below. The print only runs
# if the import succeeded, so it acts as a quick availability check.
# NOTE(review): the keyword arguments used later (tld, stop, pause) presumably
# belong to one specific distribution of `googlesearch` - confirm which package
# is installed and pin its version in a dedicated install cell.
from googlesearch import search
print("Googlesearch package installed successfully!")

Googlesearch package installed successfully!


In [None]:
# Example search phrase: a disease label followed by the topic of interest
query = "Bell_pepper leaf spot Cure"

# Run the search with the same options as before, one option per line
results = search(
    query,
    tld="com",
    lang="en",
    stop=5,
    pause=2,
)

# Print each returned URL on its own line
for result in results:
    print(result)


https://extension.wvu.edu/files/50d2bf3d-125f-4b45-9c03-487959bd8344/893x595?cb=0862b40bec913b574f1f05f79f2805c7&sa=X&ved=2ahUKEwiCltC5m7aNAxU4RjABHR2UAZoQ_B16BAgLEAI
https://extension.wvu.edu/lawn-gardening-pests/plant-disease/fruit-vegetable-diseases/bacterial-leaf-spot-of-pepper
https://www.gardeningknowhow.com/edible/vegetables/pepper/bacterial-leaf-spot-on-peppers.htm
https://extension.umd.edu/resource/bacterial-leaf-spot-peppers
https://peppergeek.com/spots-on-pepper-leaves/


In [None]:
import yaml
import csv

# 1. Load the YAML
# Explicit UTF-8 so reading does not depend on the platform's default encoding.
with open('combined.yaml', 'r', encoding='utf-8') as yf:
    # safe_load gives None for an empty document; fall back to an empty
    # mapping so the .get() below does not raise AttributeError.
    data = yaml.safe_load(yf) or {}

# 2. Extract the list of disease names
# A missing or empty 'names' entry yields no diseases. If 'names' is a
# mapping (e.g. index -> name), keep its values: iterating the mapping
# directly would write the keys to the CSV instead of the names.
names_entry = data.get('names') or []
if isinstance(names_entry, dict):
    diseases = list(names_entry.values())
else:
    diseases = list(names_entry)

# 3. Write to CSV with two rows per disease
# newline='' is what the csv module expects for files it writes to; UTF-8
# matches the default encoding pandas uses when this file is read back.
with open('diseases.csv', 'w', newline='', encoding='utf-8') as cf:
    writer = csv.writer(cf)
    # Header row
    writer.writerow(['disease', 'keyword'])
    # Two rows per disease, in order: one 'about', one 'cure'
    writer.writerows(
        [disease, keyword]
        for disease in diseases
        for keyword in ('about', 'cure')
    )

In [None]:
import pandas as pd
import os
from urllib.parse import urlparse

# Input table of (disease, keyword) pairs and the enriched output table.
CSV_PATH = 'diseases.csv'
OUTPUT_CSV = 'diseases_with_links.csv'
# Number of links to keep for each query.
NUM_RESULTS = 5
# URL path extensions (lower-case, with the dot) that are rejected.
FILTER_EXTS = (
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg', '.pdf',
)


def is_valid_url(url: str) -> bool:
    """
    Tell whether `url` should be kept as a search result.

    Only the path component of the URL is inspected, so a query string or
    fragment after the file name does not hide the extension. The comparison
    is case-insensitive.

    Returns True unless the path's extension is one of FILTER_EXTS.
    """
    # e.g. "/Apple-scab-advanced-leaves-Grabowski.jpg" -> suffix ".jpg"
    _, suffix = os.path.splitext(urlparse(url).path)
    return suffix.lower() not in FILTER_EXTS

def fetch_links(q, n=NUM_RESULTS):
    """
    Collect up to `n` search-result URLs for the query `q`.

    URLs rejected by `is_valid_url` are skipped. The search is asked for
    `n*2` hits so that some can be discarded, which means fewer than `n`
    links may be returned when many hits are rejected.

    Parameters:
        q: query string handed to `search`.
        n: maximum number of links to return (defaults to NUM_RESULTS).

    Returns:
        A list of accepted URLs in the order `search` produced them;
        an empty list when `n` is zero or negative.

    Exceptions raised by `search` are not caught here and propagate to
    the caller.
    """
    # Nothing was requested. Without this guard the loop below appends the
    # first accepted URL before comparing len(found) with n, so a
    # non-positive n could still return one link.
    if n <= 0:
        return []

    found = []
    for url in search(q, tld='com', lang='en', stop=n*2, pause=2.0):
        if not is_valid_url(url):
            continue
        found.append(url)
        # Stop iterating as soon as enough links have been collected.
        if len(found) >= n:
            break
    return found

# Read back the disease/keyword table stored at CSV_PATH
df = pd.read_csv(CSV_PATH)

# One search per row; the query text is "<disease> <keyword>"
df['links'] = df.apply(
    lambda row: fetch_links("{} {}".format(row['disease'], row['keyword'])),
    axis=1,
)

# Write the table, now including the links column, without the index.
# NOTE(review): each cell of 'links' is a Python list, so the CSV stores its
# string form - confirm that whatever reads OUTPUT_CSV expects that.
df.to_csv(OUTPUT_CSV, index=False)

# Preview the first six rows
df.head(6)

Unnamed: 0,disease,keyword,links
0,Apple Scab Leaf,about,[https://www.rhs.org.uk/disease/apple-and-pear...
1,Apple Scab Leaf,cure,[https://extension.umn.edu/plant-diseases/appl...
2,Apple leaf,about,"[https://en.wikipedia.org/wiki/Apple, https://..."
3,Apple leaf,cure,[https://www.rhs.org.uk/fruit/apples/tree-prob...
4,Apple rust leaf,about,[https://extension.okstate.edu/fact-sheets/ced...
5,Apple rust leaf,cure,[https://extension.umn.edu/plant-diseases/ceda...
