# Data Edge Internal Data - Complete Overview & EDA
This notebook provides a comprehensive overview of all CSV files in the Data Edge Internal Data folder, allowing you to understand the entire dataset structure before diving into individual file analysis.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import warnings
# Silence all warnings so cell output stays readable.
# NOTE(review): this blanket filter also hides pandas/matplotlib deprecation
# warnings; narrow or remove it when investigating unexpected behaviour.
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)  # never truncate wide frames
pd.set_option('display.max_rows', 100)

# Matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8'; older
# releases only ship it as 'seaborn'. Use whichever name this installation
# provides so the cell does not raise OSError on a fresh kernel. If neither
# is registered, the default Matplotlib style is kept.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Set Data Directory and List All Files
# The folder can be redirected without editing the notebook by setting the
# DATA_EDGE_DIR environment variable; the original location stays the default.
# data_dir is kept as a plain string so existing os.path.join(...) calls work.
data_dir = os.environ.get(
    "DATA_EDGE_DIR",
    r"c:\Users\guine\Documents\BC#4\Hyper_python\BI project\Data Edge Internal Data",
)

# Fail early with an actionable message instead of a bare listdir traceback.
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Data directory not found: {data_dir!r}. "
        "Set the DATA_EDGE_DIR environment variable to the folder that holds the CSV files."
    )

# Get all CSV files. os.listdir() returns entries in arbitrary order, so the
# names are sorted to keep the numbering below (and every later loop over
# csv_files) identical from run to run.
csv_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.csv'))
print(f"Found {len(csv_files)} CSV files:")
for i, file in enumerate(csv_files, 1):
    file_path = os.path.join(data_dir, file)
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    print(f"{i:2d}. {file:<40} ({size_mb:.2f} MB)")

# Categorize files by their naming-convention prefix
fact_tables = [f for f in csv_files if f.startswith('fct_')]
dim_tables = [f for f in csv_files if f.startswith('dim_')]
staging_tables = [f for f in csv_files if f.startswith('stg_')]
# Anything that matches none of the prefixes would otherwise vanish from the
# overview, making the category counts disagree with the total above.
other_tables = [f for f in csv_files if not f.startswith(('fct_', 'dim_', 'stg_'))]

print("\n📊 Data Architecture Overview:")
print(f"   • Fact Tables: {len(fact_tables)} files")
print(f"   • Dimension Tables: {len(dim_tables)} files")
print(f"   • Staging Tables: {len(staging_tables)} files")
if other_tables:
    print(f"   • Other (no fct_/dim_/stg_ prefix): {len(other_tables)} files")

In [None]:
# Quick Load and Overview of All Files
ROW_COUNT_CHUNKSIZE = 100_000  # rows parsed per chunk while counting


def _count_csv_rows(path, chunksize=ROW_COUNT_CHUNKSIZE):
    """Return the number of data rows in the CSV file at ``path``.

    The file is parsed chunk by chunk, so at most ``chunksize`` rows are held
    in memory at once instead of materialising the whole file just to take
    its length. The reader is used as a context manager so the file handle is
    closed even if parsing fails part-way through.
    """
    with pd.read_csv(path, chunksize=chunksize) as reader:
        return sum(len(chunk) for chunk in reader)


file_overview = []
failed_files = []  # names of files that could not be read, reported at the end

for file in csv_files:
    file_path = os.path.join(data_dir, file)
    try:
        # Load just the first few rows to get structure
        df_sample = pd.read_csv(file_path, nrows=5)

        file_info = {
            'filename': file,
            'rows': _count_csv_rows(file_path),
            'columns': len(df_sample.columns),
            'column_names': list(df_sample.columns),
            'size_mb': os.path.getsize(file_path) / (1024 * 1024),
            'first_row_sample': df_sample.iloc[0].to_dict() if len(df_sample) > 0 else {}
        }
        file_overview.append(file_info)
        print(f"✓ Loaded {file}")

    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # overview of the others, but the failure is logged and remembered.
        failed_files.append(file)
        print(f"✗ Error loading {file}: {e}")

# Create summary DataFrame. The columns are passed explicitly so the frame
# keeps its schema even when no file loaded; otherwise the sort below would
# raise KeyError on an empty, column-less frame.
summary_columns = ['File', 'Rows', 'Columns', 'Size (MB)', 'Column Names']
summary_df = pd.DataFrame([{
    'File': info['filename'],
    'Rows': info['rows'],
    'Columns': info['columns'],
    'Size (MB)': round(info['size_mb'], 2),
    # Preview only the first three column names to keep the table narrow
    'Column Names': ', '.join(info['column_names'][:3]) + ('...' if len(info['column_names']) > 3 else '')
} for info in file_overview], columns=summary_columns)

print(f"\n📋 Files Summary:")
display(summary_df.sort_values('Size (MB)', ascending=False))

if failed_files:
    print(f"\n⚠ {len(failed_files)} file(s) could not be loaded: {', '.join(failed_files)}")