In [1]:
import os
# Switch the kernel's working directory to the project folder, then echo
# the directory actually in effect so the run log records it.
os.chdir(
    r"C:\Users\LENOVE\Desktop\Data Engineers\Python\Daily_crypto_pipeline_project"
)
print("📂 Current working directory:", os.getcwd())


📂 Current working directory: C:\Users\LENOVE\Desktop\Data Engineers\Python\Daily_crypto_pipeline_project


In [2]:
import pandas as pd
import requests
from datetime import date
import os

# === CONFIGURATION ===
# Root folder for everything the pipeline writes. Set the
# CRYPTO_PIPELINE_DIR environment variable to relocate the output (another
# machine, CI, a scratch directory); when it is unset the original project
# path is used, so existing runs behave exactly as before.
base_folder = os.environ.get(
    "CRYPTO_PIPELINE_DIR",
    r"C:\Users\LENOVE\Desktop\Data Engineers\Python\Daily_crypto_pipeline_project",
)

# Unmodified scrape results and their cleaned counterparts are kept apart.
raw_folder = os.path.join(base_folder, "raw_data")
cleaned_folder = os.path.join(base_folder, "cleaned_data")

# exist_ok=True keeps this cell safe to re-run.
os.makedirs(raw_folder, exist_ok=True)
os.makedirs(cleaned_folder, exist_ok=True)


# === STEP 1: SCRAPE REAL DATA ===
def fetch_data(timeout=30):
    """Download the CoinMarketCap front page and return its first HTML table.

    The parsed table is also written, unmodified, to
    ``raw_data_<today>.csv`` inside ``raw_folder``.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    pandas.DataFrame
        The first table found on the page, or an empty DataFrame when the
        request fails, the status code is not 200, or the page contains no
        parsable table.
    """
    # Local import keeps this function self-contained; StringIO is needed
    # because passing literal HTML text to pd.read_html is deprecated.
    from io import StringIO

    print("\n🔄 Fetching live crypto data from CoinMarketCap...")

    url = "https://coinmarketcap.com/"
    headers = {"User-Agent": "Mozilla/5.0"}  # Prevent blocking

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, DNS failures and timeouts end up here; report
        # them and return the same "no data" value as a bad status code.
        print(f"❌ Failed to fetch webpage: {exc}")
        return pd.DataFrame()

    if response.status_code != 200:
        print("❌ Failed to fetch webpage.")
        return pd.DataFrame()

    # Extract tables from HTML. pd.read_html raises ValueError (rather than
    # returning an empty list) when the document has no tables.
    try:
        tables = pd.read_html(StringIO(response.text))
    except ValueError:
        tables = []

    if not tables:
        print("⚠️ No tables found on the page.")
        return pd.DataFrame()

    df = tables[0]
    print(f"📦 Rows fetched: {len(df)}")

    # Save RAW data exactly as scraped, before any cleaning.
    raw_path = os.path.join(raw_folder, f"raw_data_{date.today()}.csv")
    df.to_csv(raw_path, index=False)
    print(f"✅ Raw data saved to: {raw_path}")

    print("🪣 Data Preview:")
    print(df.head(), "\n")

    return df


# === STEP 2: CLEAN DATA ===
def clean_data(df, output_dir=None):
    """Clean a scraped table and write it to a dated CSV file.

    Cleaning steps, in order:

    1. Column labels are converted to text, stripped of surrounding
       whitespace, and embedded newlines are replaced with ``_``.
    2. Columns that contain no values at all are dropped. The scraped page
       yields such a column first (``Unnamed: 0``); using it as the key
       column would otherwise discard every row.
    3. Rows with no value in the first remaining column are dropped.
    4. Duplicate rows are dropped and the index is renumbered from 0.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean (typically the result of ``fetch_data``).
    output_dir : str, optional
        Directory that receives ``cleaned_data_<today>.csv``. Defaults to
        the module-level ``cleaned_folder``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. An empty DataFrame is returned, and nothing is
        written, when there is no data to clean.
    """
    if df.empty:
        print("⚠️ No data to clean. Skipping cleaning step.")
        return pd.DataFrame()

    # Work on a copy so the caller's frame keeps its original column labels.
    cleaned = df.copy()
    # str() guards against non-string labels (e.g. integer column names).
    cleaned.columns = [str(col).strip().replace('\n', '_') for col in cleaned.columns]

    # Drop columns with no data before choosing the key column.
    cleaned = cleaned.dropna(axis="columns", how="all")
    if cleaned.empty:
        # Every column was blank, so there is nothing meaningful to save.
        print("⚠️ No data to clean. Skipping cleaning step.")
        return cleaned

    cleaned = cleaned.dropna(subset=[cleaned.columns[0]])
    cleaned = cleaned.drop_duplicates().reset_index(drop=True)

    # Save CLEANED data
    target_dir = cleaned_folder if output_dir is None else output_dir
    cleaned_path = os.path.join(target_dir, f"cleaned_data_{date.today()}.csv")
    cleaned.to_csv(cleaned_path, index=False)
    print(f"🧹 Cleaned data saved to: {cleaned_path}")

    return cleaned


# === RUN PIPELINE ===
if __name__ == "__main__":
    # Opening banner marks the start of a run in the cell output.
    print("\n🚀 Running Daily Crypto Web Scraping Pipeline...\n")

    # Stage 1 scrapes the table; stage 2 cleans it and writes the CSV.
    # raw_df is kept as a name so the scraped frame can be inspected later.
    raw_df = fetch_data()
    clean_data(df=raw_df)

    # Closing banner; printed whether or not any rows were fetched.
    print("\n🎯 Pipeline complete!\n")



🚀 Running Daily Crypto Web Scraping Pipeline...


🔄 Fetching live crypto data from CoinMarketCap...
📦 Rows fetched: 100
✅ Raw data saved to: C:\Users\LENOVE\Desktop\Data Engineers\Python\Daily_crypto_pipeline_project\raw_data\raw_data_2025-10-19.csv
🪣 Data Preview:
   Unnamed: 0    #         Name        Price   1h %  24h %    7d %  \
0         NaN  1.0   BitcoinBTC  $108,489.34  0.34%  1.31%   3.53%   
1         NaN  2.0  EthereumETH    $3,991.77  0.48%  2.95%   0.05%   
2         NaN  3.0   TetherUSDT        $1.00  0.01%  0.00%   0.00%   
3         NaN  4.0       BNBBNB    $1,121.07  0.69%  2.47%  11.04%   
4         NaN  5.0       XRPXRP        $2.41  0.49%  2.23%   1.72%   

                 Market Cap              Volume(24h) Circulating Supply  ...  \
0  $2.16T$2,163,044,222,526   $40,386,162,702372.23K         19.93M BTC  ...   
1  $470.08B$470,082,882,300     $28,277,473,3857.08M        117.75M ETH  ...   
2  $181.93B$181,933,520,862  $100,440,271,364100.40B       181.86B USDT 

  tables = pd.read_html(response.text)
