In [1]:
import pandas as pd

# Load the uploaded files
govt_data = pd.read_csv('govt_data.csv')
medical_data = pd.read_csv('medical_data.csv')

# Inspect the data
print("Govt Data Head:")
print(govt_data.head())
print("\nGovt Data Info:")
print(govt_data.info())

print("\nMedical Data Head:")
print(medical_data.head())
print("\nMedical Data Info:")
print(medical_data.info())

Govt Data Head:
                                              Source  \
0  https://catalog.data.gov/dataset/?tags=heart-d...   
1  https://catalog.data.gov/dataset/?tags=heart-d...   
2  https://catalog.data.gov/dataset/?tags=heart-d...   
3  https://catalog.data.gov/dataset/?tags=heart-d...   
4  https://catalog.data.gov/dataset/?tags=heart-d...   

                                             Content  
0  federal\n\n\n\nheart disease mortality data am...  
1  federal\n\n\n\nrates and trends in heart disea...  
2  federal\n\n\n\nheart disease mortality data am...  
3  county\n\n\n\nmortality rates\n\n\n\nlake coun...  
4  federal\n\n\n\nheart disease mortality data am...  

Govt Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Source   13 non-null     object
 1   Content  13 non-null     object
dtypes: object(2)
memory usage: 340.0+ bytes
None


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://archive.ics.uci.edu/datasets?search=heart+disease"
headers = {"User-Agent": "Mozilla/5.0"}

try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    dataset_list = []
    links = soup.find_all('a', class_='link-hover link-primary')
    
    for link in links[:10]:
        title = link.get_text().strip()
        href = "https://archive.ics.uci.edu" + link['href']
        dataset_list.append({"Dataset Name": title, "Link": href})
        print(f"Found: {title} -> {href}")

    df = pd.DataFrame(dataset_list)
    df.to_csv("uci_scraped_links.csv", index=False)
    print("\nCSV created: uci_scraped_links.csv")

except requests.exceptions.RequestException as e:
    print(f"Network error: {e}. Check your connection or try a different URL.")


CSV created: uci_scraped_links.csv


In [5]:
import geopandas as gpd
import pandas as pd

# Load the shapefile
# This requires all the uploaded files (.shp, .shx, .dbf, .prj) to be in one folder
counties = gpd.read_file('tl_2018_us_county.shp')

# Basic Preprocessing:
# 1. Simplify geometry to make the file faster to plot
counties['geometry'] = counties.simplify(tolerance=0.01)

# 2. Extract specific columns (GEOID is the standard FIPS code)
county_attributes = counties[['GEOID', 'NAME', 'STATEFP', 'geometry']]

# 3. Handle coordinate systems (Set to standard WGS84 for mapping)
counties = counties.to_crs(epsg=4326)

print(counties.head())

  STATEFP COUNTYFP  COUNTYNS  GEOID       NAME          NAMELSAD LSAD CLASSFP  \
0      31      039  00835841  31039     Cuming     Cuming County   06      H1   
1      53      069  01513275  53069  Wahkiakum  Wahkiakum County   06      H1   
2      35      011  00933054  35011    De Baca    De Baca County   06      H1   
3      31      109  00835876  31109  Lancaster  Lancaster County   06      H1   
4      31      129  00835886  31129   Nuckolls   Nuckolls County   06      H1   

   MTFCC CSAFP CBSAFP METDIVFP FUNCSTAT       ALAND    AWATER     INTPTLAT  \
0  G4020  None   None     None        A  1477652222  10690952  +41.9158651   
1  G4020  None   None     None        A   680956809  61588406  +46.2946377   
2  G4020  None   None     None        A  6016819484  29089486  +34.3592729   
3  G4020   339  30700     None        A  2169287528  22832516  +40.7835474   
4  G4020  None   None     None        A  1489645187   1718484  +40.1764918   

       INTPTLON                             