# Data Extraction Demo
This notebook demonstrates the core workflow for extracting data from several kinds of sources into a pandas DataFrame, following project conventions.
- All data extraction functions return a DataFrame.
- Use function-based modules for each data source.
- Document new workflows and keep code modular.

## Extracting Data from CSV Files
Use the function below to load CSV data into a DataFrame.

In [1]:
# Path to the sample CSV file.
# NOTE(review): no other cell visible in this notebook reads `sample_data_path`;
# the CSV loading cell assigns its own `csv_path` with the same value — confirm
# which of the two is meant to be the single source of truth.
sample_data_path = "cereal.csv"

In [None]:
import pandas as pd

def load_csv(path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents exactly as parsed by ``pd.read_csv``. No
        cleaning is applied: duplicates and missing values are kept.
    """
    df = pd.read_csv(path)
    # Extension point for source-specific cleaning/transformation, e.g.
    # dropping duplicate rows, filling missing values, filtering rows on a
    # column threshold, or deriving flag columns. Nothing is applied here so
    # callers always receive the raw parsed data.
    return df

In [4]:
# Load the sample CSV through load_csv and preview the first rows.
# NOTE(review): the recorded output of this cell includes an 'Unnamed: 0'
# column, which suggests the file stores a saved index — confirm whether it
# should be dropped or used as the index.
csv_path = 'cereal.csv'
df_csv = load_csv(csv_path)
df_csv.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


## Extracting Data from Excel Files
Use the function below to load Excel data into a DataFrame.

In [8]:
def load_excel(path, sheet_name=0, engine=None):
    """Load a sheet of an Excel workbook into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the workbook; handed straight to ``pd.read_excel``.
    sheet_name : int or str, default 0
        Sheet to read, by position or by name. Per the pandas documentation,
        passing a list or ``None`` makes ``pd.read_excel`` return a dict of
        DataFrames instead of a single DataFrame.
    engine : str or None, default None
        Excel reader backend forwarded to ``pd.read_excel`` (for example
        ``'openpyxl'``). ``None`` lets pandas pick one from the file itself;
        set it explicitly when pandas reports that the file format cannot be
        determined.

    Returns
    -------
    pandas.DataFrame
        The sheet's contents exactly as parsed by ``pd.read_excel``.
    """
    df = pd.read_excel(path, sheet_name=sheet_name, engine=engine)
    # ...additional cleaning/transformation...
    return df

In [9]:
# Load the first sheet of the sample workbook and preview the first rows.
# NOTE(review): the recorded output of this cell is a ValueError saying the
# Excel file format cannot be determined — verify that 'cereal.xlsx' is a
# genuine Excel workbook (not, say, a renamed CSV) before re-running.
excel_path = 'cereal.xlsx'
df_excel = load_excel(excel_path)
df_excel.head()

ValueError: Excel file format cannot be determined, you must specify an engine manually.

## Extracting Data from APIs
Use the function below to load data from a REST API into a DataFrame.

In [None]:
import requests

def load_api(url, timeout=30):
    """Fetch JSON from a REST endpoint and load it into a pandas DataFrame.

    Parameters
    ----------
    url : str
        Endpoint to GET.
    timeout : float or tuple, default 30
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled server would block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the decoded JSON body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of building a frame from an error payload.
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data)
    # ...additional cleaning/transformation...
    return df

In [None]:
# Fetch the endpoint through load_api and preview the first rows.
# NOTE(review): 'api.example.com' is presumably a placeholder host — replace
# it with a real endpoint before running this cell.
api_url = 'https://api.example.com/data'
df_api = load_api(api_url)
df_api.head()

## Extracting Data from JSON Files
Use the function below to load data from a local JSON file into a DataFrame.

In [None]:
import json

def load_json(path):
    """Load a local JSON file into a pandas DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the decoded JSON document.
    """
    # JSON text is UTF-8 by specification; opening with the platform default
    # encoding can corrupt or reject non-ASCII content on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    # ...additional cleaning/transformation...
    return df

In [None]:
# Load the sample JSON file through load_json and preview the first rows.
json_path = 'sample_data.json'
df_json = load_json(json_path)
df_json.head()

## Extracting Data from MongoDB Atlas (NoSQL)
Use the function below to load data from a MongoDB Atlas collection into a DataFrame.

In [None]:
from pymongo import MongoClient

def load_mongodb(uri, db_name, collection_name):
    """Load every document of a MongoDB collection into a pandas DataFrame.

    Parameters
    ----------
    uri : str
        MongoDB connection string passed to ``MongoClient``.
    db_name : str
        Name of the database to read from.
    collection_name : str
        Name of the collection to read from.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the list of documents returned by an
        unfiltered ``find()``.
    """
    client = MongoClient(uri)
    try:
        collection = client[db_name][collection_name]
        # Materialise the cursor while the client is still open.
        data = list(collection.find())
    finally:
        # Always release the client's connections, even if the query fails.
        client.close()
    df = pd.DataFrame(data)
    # ...additional cleaning/transformation...
    return df

In [None]:
import os

# The connection string embeds credentials, so take it from the environment
# rather than pasting real values into the notebook. The fallback is the
# original template and must be filled in before it can connect.
mongodb_uri = os.environ.get(
    'MONGODB_URI',
    'mongodb+srv://<username>:<password>@cluster0.mongodb.net/',
)
db_name = 'sample_db'
collection_name = 'sample_collection'
df_mongo = load_mongodb(mongodb_uri, db_name, collection_name)
df_mongo.head()

## Extracting Data from MySQL (SQL-based)
Use the function below to load data from a MySQL database into a DataFrame.

In [None]:
import mysql.connector

def load_mysql(host, user, password, database, query):
    """Run a SQL query against a MySQL database and return a pandas DataFrame.

    Parameters
    ----------
    host, user, password, database : str
        Connection settings forwarded to ``mysql.connector.connect``.
    query : str
        SQL text executed through ``pd.read_sql``.

    Returns
    -------
    pandas.DataFrame
        The query result as returned by ``pd.read_sql``.

    Notes
    -----
    NOTE(review): the pandas documentation lists SQLAlchemy connectables and
    sqlite3 connections as the supported inputs to ``read_sql``; a raw
    mysql.connector connection may emit a UserWarning — confirm that this is
    acceptable for the project.
    """
    conn = mysql.connector.connect(
        host=host,
        user=user,
        password=password,
        database=database
    )
    try:
        df = pd.read_sql(query, conn)
    finally:
        # Close the connection even when the query raises.
        conn.close()
    # ...additional cleaning/transformation...
    return df

In [None]:
import os
from getpass import getpass

# Connection settings come from the environment so that credentials are never
# committed with the notebook; non-secret settings fall back to local defaults.
mysql_host = os.environ.get('MYSQL_HOST', 'localhost')
mysql_user = os.environ.get('MYSQL_USER', 'root')
# Prompt interactively when MYSQL_PASSWORD is not set instead of hardcoding a password.
mysql_password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')
mysql_database = os.environ.get('MYSQL_DATABASE', 'sample_db')
mysql_query = 'SELECT * FROM sample_table'
df_mysql = load_mysql(mysql_host, mysql_user, mysql_password, mysql_database, mysql_query)
df_mysql.head()

## Extracting Data via Web Scraping
Use the function below to scrape product names and prices from a shop page with BeautifulSoup and return them as a DataFrame.

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_shop(base_url, timeout=30):
    """Scrape product names and prices from a shop page into a DataFrame.

    Parameters
    ----------
    base_url : str
        URL of the page to fetch.
    timeout : float or tuple, default 30
        Seconds to wait for the server, forwarded to ``requests.get`` so a
        stalled site cannot block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per element matching ``.product``, with columns ``name`` and
        ``price``. A field whose element is absent from a product is recorded
        as missing rather than aborting the scrape. The two columns are
        present even when no product matches.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(base_url, timeout=timeout)
    # Do not parse an error page as if it were the shop listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    products = []
    for product in soup.select('.product'):  # Example selector
        name_tag = product.select_one('.woocommerce-loop-product__title')
        price_tag = product.select_one('.price')
        # select_one returns None when nothing matches; guard before reading .text.
        products.append({
            'name': name_tag.text if name_tag is not None else None,
            'price': price_tag.text if price_tag is not None else None,
        })
    # Pin the columns so an empty result still exposes 'name' and 'price'.
    df = pd.DataFrame(products, columns=['name', 'price'])
    # ...additional cleaning/transformation...
    return df

In [None]:
# Scrape the shop page through scrape_shop and preview the first rows.
base_url = 'https://scrapeme.live/shop/'
df_shop = scrape_shop(base_url)
df_shop.head()