In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://archive.ics.uci.edu/datasets?search=heart+disease"
headers = {"User-Agent": "Mozilla/5.0"}

try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    dataset_list = []
    # UCI now uses <a> tags with specific classes for dataset names
    links = soup.find_all('a', class_='link-hover link-primary')
    
    for link in links[:10]:
        title = link.get_text().strip()
        href = "https://archive.ics.uci.edu" + link['href']
        dataset_list.append({"Dataset Name": title, "Link": href})
        print(f"Found: {title} -> {href}")

    
    df = pd.DataFrame(dataset_list)
    df.to_csv("uci_scraped_links.csv", index=False)
    print("\nCSV created: uci_scraped_links.csv")

except requests.exceptions.RequestException as e:
    print(f"Network error: {e}. Check your connection or try a different URL.")


CSV created: uci_scraped_links.csv


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define target sites and their output filenames
targets = [
    {"url": "https://catalog.data.gov/dataset/?tags=heart-disease", "file": "govt_data.csv"},
    {"url": "https://www.mayoclinic.org/diseases-conditions/heart-disease/symptoms-causes/syc-20353118", "file": "medical_data.csv"},
    {"url": "https://data.gov.in/keywords/heart", "file": "research_data.csv"}
]

# The 13 attributes from your heart.doc to look for
attributes = [
    'age', 'sex', 'chest pain', 'blood pressure', 'cholestoral', 
    'sugar', 'electrocardiographic', 'heart rate', 'angina', 
    'oldpeak', 'slope', 'vessels', 'thal'
]

def extract_and_save(target):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # Request with 10s timeout to bypass local DNS lags
        response = requests.get(target["url"], headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        extracted_data = []
        # Searching through list items, table cells, and paragraphs
        for tag in soup.find_all(['li', 'td', 'p']):
            text = tag.get_text().strip().lower()
            # If the text contains one of our 13 attributes, save it [cite: 99, 100, 101, 102]
            if any(attr in text for attr in attributes):
                extracted_data.append({"Source": target["url"], "Content": text})
        
        # Convert the list of findings into a CSV
        if extracted_data:
            df = pd.DataFrame(extracted_data)
            df.to_csv(target["file"], index=False)
            print(f"Successfully saved: {target['file']}")
        else:
            print(f"No specific heart attributes found at {target['url']}")
            
    except Exception as e:
        print(f"Failed to scrape {target['url']}: {e}")

# Run the process one by one
for t in targets:
    extract_and_save(t)

print("\n--- Round 1 Data Collection Phase Complete ---")

Successfully saved: govt_data.csv
Successfully saved: medical_data.csv
No specific heart attributes found at https://data.gov.in/keywords/heart

--- Round 1 Data Collection Phase Complete ---


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529) 
  
# data (as pandas dataframes) 
X = early_stage_diabetes_risk_prediction.data.features 
y = early_stage_diabetes_risk_prediction.data.targets 
  
# metadata 
print(early_stage_diabetes_risk_prediction.metadata) 
  
# variable information 
print(early_stage_diabetes_risk_prediction.variables)

{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'ID': 397, 'type': 'NATIVE', 'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yas

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. FIXED URL (Removed double https)
url = "https://archive.ics.uci.edu/datasets?search=heart+disease"
headers = {"User-Agent": "Mozilla/5.0"}

try:
    # 2. Added 10-second timeout
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # 3. Updated tags for the new UCI layout
    dataset_list = []
    # UCI now uses <a> tags with specific classes for dataset names
    links = soup.find_all('a', class_='link-hover link-primary')
    
    for link in links[:10]:
        title = link.get_text().strip()
        href = "https://archive.ics.uci.edu" + link['href']
        dataset_list.append({"Dataset Name": title, "Link": href})
        print(f"Found: {title} -> {href}")

    # 4. Save for Round 1 submission
    df = pd.DataFrame(dataset_list)
    df.to_csv("uci_scraped_links.csv", index=False)
    print("\nCSV created: uci_scraped_links.csv")

except requests.exceptions.RequestException as e:
    print(f"Network error: {e}. Check your connection or try a different URL.")


CSV created: uci_scraped_links.csv
