In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def generate_comprehensive_ecommerce_data(n_customers=5000, n_products=200,
                                          n_orders=50000, n_behavior_logs=100000,
                                          seed=42):
    """Generate a reproducible set of simulated e-commerce tables.

    Builds customer, product, time and region dimension tables, an order
    fact table and a behavior log table. All randomness comes from NumPy's
    global random state, which is re-seeded on every call (the stdlib
    ``random`` module is seeded as well).

    Args:
        n_customers: Number of customer rows (must be >= 1).
        n_products: Target number of products. Every subcategory receives
            ``n_products // (number of categories * subcategories in its
            category)`` items, so the realized count can be lower than the
            target when the division is not exact (200 yields exactly 200).
        n_orders: Number of order rows (must be >= 1).
        n_behavior_logs: Number of behavior log rows (must be >= 0).
        seed: Seed passed to ``np.random.seed`` and ``random.seed``.

    Returns:
        dict mapping table name to ``pandas.DataFrame`` with the keys
        ``customers``, ``products``, ``time_dim``, ``regions``, ``orders``
        and ``behavior_logs``. ``customers`` carries the per-customer
        aggregates ``total_orders`` (int), ``total_spent``,
        ``last_login_date`` and ``last_order_date``.

    Raises:
        ValueError: If a size argument is out of range or ``n_products`` is
            too small to produce any product.
    """
    if n_customers < 1 or n_orders < 1 or n_behavior_logs < 0:
        raise ValueError(
            "n_customers and n_orders must be >= 1 and n_behavior_logs must be >= 0"
        )

    # Fix the random seeds so repeated runs produce the same data.
    np.random.seed(seed)
    random.seed(seed)

    # 1. Customer dimension table, including user-profile fields.
    # NOTE: the column order below fixes the order of random draws; keep it
    # stable so a given seed keeps producing the same data.
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'customer_name': [f'User_{i}' for i in range(1, n_customers + 1)],
        'gender': np.random.choice(['Male', 'Female'], n_customers, p=[0.48, 0.52]),
        'birth_year': np.random.randint(1970, 2005, n_customers),
        'registration_date': pd.to_datetime([datetime(2020,1,1) + timedelta(days=np.random.randint(0, 1460)) for _ in range(n_customers)]),
        'email': [f'user{i}@example.com' for i in range(1, n_customers + 1)],
        'phone': [f'1{np.random.randint(300,999):03d}{np.random.randint(1000,9999):04d}' for _ in range(n_customers)],
        'country': np.random.choice(['USA', 'Canada', 'UK', 'Australia'], n_customers, p=[0.6, 0.2, 0.15, 0.05]),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Toronto', 'London', 'Sydney'], n_customers),
        'zip_code': [f'{np.random.randint(10000, 99999)}' for _ in range(n_customers)],
        'registration_channel': np.random.choice(['Web', 'Mobile App', 'Social Media', 'Referral'], n_customers, p=[0.4, 0.35, 0.15, 0.1]),
        'loyalty_tier': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], n_customers, p=[0.5, 0.3, 0.15, 0.05]),
        'preferred_category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books', 'Sports'], n_customers),
        'avg_order_value_segment': np.random.choice(['Low', 'Medium', 'High'], n_customers, p=[0.6, 0.3, 0.1]),
        # Placeholders; filled from the order aggregates in step 6.
        'last_login_date': None,
        'total_orders': 0,
        'total_spent': 0.0
    })

    # 2. Product dimension table.
    categories = {
        'Electronics': ['Smartphone', 'Laptop', 'Tablet', 'Headphones', 'Smartwatch'],
        'Clothing': ['T-Shirt', 'Jeans', 'Dress', 'Jacket', 'Shoes'],
        'Home': ['Furniture', 'Kitchenware', 'Decor', 'Lighting', 'Bedding'],
        'Books': ['Fiction', 'Non-Fiction', 'Academic', 'Children', 'Cookbook'],
        'Sports': ['Equipment', 'Apparel', 'Footwear', 'Accessories']
    }

    products_data = []
    product_id = 1
    for category, subcategories in categories.items():
        for subcategory in subcategories:
            for _ in range(n_products // (len(categories) * len(subcategories))):
                products_data.append({
                    'product_id': product_id,
                    'product_name': f'{subcategory} Model {product_id}',
                    'category': category,
                    'subcategory': subcategory,
                    'brand': np.random.choice(['BrandA', 'BrandB', 'BrandC', 'BrandD', 'BrandE']),
                    'price': round(np.random.uniform(10, 500), 2),
                    'cost_price': round(np.random.uniform(5, 300), 2),
                    'stock_quantity': np.random.randint(0, 1000),
                    'supplier': f'Supplier_{np.random.randint(1, 20)}',
                    'rating': round(np.random.uniform(3.0, 5.0), 1),
                    'review_count': np.random.randint(0, 500),
                    'created_date': pd.to_datetime(datetime(2021,1,1) + timedelta(days=np.random.randint(0, 1000))),
                    'is_active': np.random.choice([True, False], p=[0.9, 0.1])
                })
                product_id += 1

    if not products_data:
        raise ValueError("n_products is too small to generate any product")

    products = pd.DataFrame(products_data)

    # 3. Time dimension table (one row per calendar day).
    dates = pd.date_range('2022-01-01', '2024-12-31', freq='D')
    time_dim = pd.DataFrame({
        'date': dates,
        'day': dates.day,
        'month': dates.month,
        'month_name': dates.strftime('%B'),
        'quarter': dates.quarter,
        'year': dates.year,
        'day_of_week': dates.dayofweek,
        'day_name': dates.strftime('%A'),
        'is_weekend': (dates.dayofweek >= 5).astype(int),
        'is_holiday': np.random.choice([0, 1], len(dates), p=[0.9, 0.1])
    })

    # 4. Region dimension table.
    regions = pd.DataFrame({
        'region_id': range(1, 7),
        'region_name': ['North America', 'Europe', 'Asia Pacific', 'South America', 'Africa', 'Middle East'],
        'region_manager': ['Manager_A', 'Manager_B', 'Manager_C', 'Manager_D', 'Manager_E', 'Manager_F']
    })

    # 5. Order fact table (also carries per-order behavior metrics).
    # Resolve prices through a dict built once instead of scanning the
    # products frame for every order; this consumes no random numbers, so
    # the generated values are unaffected.
    price_by_product = {row['product_id']: row['price'] for row in products_data}
    n_catalog = len(products)
    orders_data = []

    for i in range(n_orders):
        customer_id = np.random.randint(1, n_customers + 1)
        product_id = np.random.randint(1, n_catalog + 1)
        order_date = pd.to_datetime(datetime(2023,1,1) + timedelta(days=np.random.randint(0, 730)))
        quantity = np.random.randint(1, 6)
        price = price_by_product[product_id]
        amount = quantity * price

        # Simulated browsing behavior for this order.
        browsing_duration = np.random.randint(10, 1800)  # seconds
        click_count = np.random.randint(1, 20)

        orders_data.append({
            'order_id': i + 1,
            'customer_id': customer_id,
            'product_id': product_id,
            'order_date': order_date,
            'quantity': quantity,
            'unit_price': price,
            'amount': amount,
            'region_id': np.random.randint(1, 7),
            'payment_method': np.random.choice(['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay']),
            'shipping_method': np.random.choice(['Standard', 'Express', 'Next Day']),
            'order_status': np.random.choice(['Completed', 'Shipped', 'Processing', 'Cancelled'], p=[0.7, 0.15, 0.1, 0.05]),
            'browsing_duration_seconds': browsing_duration,
            'click_count': click_count,
            'add_to_cart_count': np.random.randint(0, 5),
            'wishlist_added': np.random.choice([0, 1], p=[0.7, 0.3]),
            'discount_applied': round(np.random.uniform(0, 0.3), 2),
            'customer_rating': np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.2, 0.4, 0.25]),
            'return_requested': np.random.choice([0, 1], p=[0.95, 0.05])
        })

    orders = pd.DataFrame(orders_data)

    # 6. Fold per-customer order aggregates back into the customer table.
    customer_stats = orders.groupby('customer_id').agg({
        'order_id': 'count',
        'amount': 'sum',
        'order_date': 'max'
    }).reset_index()

    # The *_new names keep the aggregates from clashing with the placeholder
    # columns during the merge.
    customer_stats.columns = ['customer_id', 'total_orders_new', 'total_spent_new', 'last_order_date']

    customers = customers.merge(customer_stats, on='customer_id', how='left')

    # Customers without orders come out of the left merge as NaN; cast the
    # count back to int because NaN forces the column to float.
    customers['total_orders'] = customers['total_orders_new'].fillna(0).astype(int)
    customers['total_spent'] = customers['total_spent_new'].fillna(0.0)
    customers['last_login_date'] = customers['last_order_date']  # simplification: reuse last order date

    # Drop the temporary columns.
    customers = customers.drop(['total_orders_new', 'total_spent_new'], axis=1)

    # 7. User behavior log table (intended for recommendation algorithms).
    behavior_logs = pd.DataFrame({
        'log_id': range(1, n_behavior_logs + 1),
        'customer_id': np.random.randint(1, n_customers + 1, n_behavior_logs),
        'product_id': np.random.randint(1, len(products) + 1, n_behavior_logs),
        'behavior_type': np.random.choice(['view', 'click', 'add_to_cart', 'purchase', 'wishlist'],
                                        n_behavior_logs, p=[0.4, 0.3, 0.15, 0.1, 0.05]),
        'timestamp': pd.to_datetime([datetime(2024,1,1) + timedelta(seconds=np.random.randint(0, 31536000))
                                   for _ in range(n_behavior_logs)]),
        'session_id': [f'session_{np.random.randint(1, 10000)}' for _ in range(n_behavior_logs)],
        'device_type': np.random.choice(['Desktop', 'Mobile', 'Tablet'], n_behavior_logs),
        'browser': np.random.choice(['Chrome', 'Safari', 'Firefox', 'Edge'], n_behavior_logs)
    })

    return {
        'customers': customers,
        'products': products,
        'time_dim': time_dim,
        'regions': regions,
        'orders': orders,
        'behavior_logs': behavior_logs
    }

# Build all simulated tables.
data_dict = generate_comprehensive_ecommerce_data()

# Confirm generation succeeded by reporting each table's row count.
print("数据生成成功！")
for table_name in data_dict:
    table_data = data_dict[table_name]
    print(f"{table_name}: {len(table_data)} 行")

数据生成成功！
customers: 5000 行
products: 200 行
time_dim: 1096 行
regions: 6 行
orders: 50000 行
behavior_logs: 100000 行


In [4]:
!pip install pymysql

Collecting pymysql
  Downloading pymysql-1.1.2-py3-none-any.whl.metadata (4.3 kB)
Downloading pymysql-1.1.2-py3-none-any.whl (45 kB)
Installing collected packages: pymysql
Successfully installed pymysql-1.1.2


In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from sqlalchemy import create_engine, text
import pymysql

def create_database_connection():
    """Create a SQLAlchemy engine for the MySQL database and verify it connects.

    The connection settings are placeholder values defined in ``db_config``
    below; replace them with your own. The user name and password are
    URL-escaped before being placed in the connection URL, so characters
    such as ``@``, ``:`` or ``/`` in a password do not corrupt the URL.

    Returns:
        The engine if a test connection could be opened, otherwise ``None``
        (the failure reason is printed).
    """
    from urllib.parse import quote_plus

    db_config = {
        'host': 'localhost',
        'port': 628,
        'user': 'root',  # replace with your user name
        'password': '021015',  # replace with your password
        'database': 'ecommerce_portfolio'
    }

    user = quote_plus(db_config['user'])
    password = quote_plus(db_config['password'])
    url = (
        f"mysql+pymysql://{user}:{password}"
        f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
    )

    engine = None
    try:
        engine = create_engine(url)
        # create_engine() is lazy; opening a connection is the actual test.
        with engine.connect():
            print("✅ 数据库连接成功!")
        return engine
    except Exception as e:
        # Callers rely on None to signal failure, so report instead of
        # raising, but release whatever the engine's pool may hold first.
        if engine is not None:
            engine.dispose()
        print(f"❌ 数据库连接失败: {e}")
        return None

def import_data_to_mysql(data_dict, engine):
    """Append the generated tables to the database, dimension tables first.

    Args:
        data_dict: Mapping of table name to DataFrame. Only the six known
            table names are imported; names missing from the mapping are
            skipped.
        engine: Connection object handed to ``DataFrame.to_sql`` as ``con``.

    A failure while importing one table is printed and does not stop the
    remaining tables from being imported.
    """
    # Dimension tables are written before the fact tables.
    load_sequence = ('regions', 'time_dim', 'products', 'customers', 'orders', 'behavior_logs')

    # Append to the existing tables, sending multi-row INSERTs in chunks.
    write_options = {
        'if_exists': 'append',
        'index': False,
        'method': 'multi',
        'chunksize': 1000,
    }

    for table_name in load_sequence:
        if table_name not in data_dict:
            continue

        frame = data_dict[table_name]
        try:
            frame.to_sql(name=table_name, con=engine, **write_options)
            print(f"✅ 成功导入表: {table_name}, 数据量: {len(frame)}")
        except Exception as e:
            print(f"❌ 导入表 {table_name} 失败: {e}")

def generate_comprehensive_ecommerce_data(n_customers=5000, n_products=200,
                                          n_orders=50000, n_behavior_logs=100000,
                                          seed=42):
    """Generate a reproducible set of simulated e-commerce tables.

    Builds customer, product, time and region dimension tables, an order
    fact table and a behavior log table. All randomness comes from NumPy's
    global random state, which is re-seeded on every call (the stdlib
    ``random`` module is seeded as well). The customer table keeps only the
    columns intended for the MySQL ``customers`` table; temporary
    aggregation columns are dropped before returning.

    Args:
        n_customers: Number of customer rows (must be >= 1).
        n_products: Target number of products. Every subcategory receives
            ``n_products // (number of categories * subcategories in its
            category)`` items, so the realized count can be lower than the
            target when the division is not exact (200 yields exactly 200).
        n_orders: Number of order rows (must be >= 1).
        n_behavior_logs: Number of behavior log rows (must be >= 0).
        seed: Seed passed to ``np.random.seed`` and ``random.seed``.

    Returns:
        dict mapping table name to ``pandas.DataFrame`` with the keys
        ``customers``, ``products``, ``time_dim``, ``regions``, ``orders``
        and ``behavior_logs``.

    Raises:
        ValueError: If a size argument is out of range or ``n_products`` is
            too small to produce any product.
    """
    if n_customers < 1 or n_orders < 1 or n_behavior_logs < 0:
        raise ValueError(
            "n_customers and n_orders must be >= 1 and n_behavior_logs must be >= 0"
        )

    # Fix the random seeds so repeated runs produce the same data.
    np.random.seed(seed)
    random.seed(seed)

    # 1. Customer dimension table.
    # NOTE: the column order below fixes the order of random draws; keep it
    # stable so a given seed keeps producing the same data.
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'customer_name': [f'User_{i}' for i in range(1, n_customers + 1)],
        'gender': np.random.choice(['Male', 'Female'], n_customers, p=[0.48, 0.52]),
        'birth_year': np.random.randint(1970, 2005, n_customers),
        'registration_date': pd.to_datetime([datetime(2020,1,1) + timedelta(days=np.random.randint(0, 1460)) for _ in range(n_customers)]),
        'email': [f'user{i}@example.com' for i in range(1, n_customers + 1)],
        'phone': [f'1{np.random.randint(300,999):03d}{np.random.randint(1000,9999):04d}' for _ in range(n_customers)],
        'country': np.random.choice(['USA', 'Canada', 'UK', 'Australia'], n_customers, p=[0.6, 0.2, 0.15, 0.05]),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Toronto', 'London', 'Sydney'], n_customers),
        'zip_code': [f'{np.random.randint(10000, 99999)}' for _ in range(n_customers)],
        'registration_channel': np.random.choice(['Web', 'Mobile App', 'Social Media', 'Referral'], n_customers, p=[0.4, 0.35, 0.15, 0.1]),
        'loyalty_tier': np.random.choice(['Bronze', 'Silver', 'Gold', 'Platinum'], n_customers, p=[0.5, 0.3, 0.15, 0.05]),
        'preferred_category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Books', 'Sports'], n_customers),
        'avg_order_value_segment': np.random.choice(['Low', 'Medium', 'High'], n_customers, p=[0.6, 0.3, 0.1]),
        # Placeholders; filled from the order aggregates in step 6.
        'last_login_date': None,
        'total_orders': 0,
        'total_spent': 0.0
    })

    # 2. Product dimension table.
    categories = {
        'Electronics': ['Smartphone', 'Laptop', 'Tablet', 'Headphones', 'Smartwatch'],
        'Clothing': ['T-Shirt', 'Jeans', 'Dress', 'Jacket', 'Shoes'],
        'Home': ['Furniture', 'Kitchenware', 'Decor', 'Lighting', 'Bedding'],
        'Books': ['Fiction', 'Non-Fiction', 'Academic', 'Children', 'Cookbook'],
        'Sports': ['Equipment', 'Apparel', 'Footwear', 'Accessories']
    }

    products_data = []
    product_id = 1
    for category, subcategories in categories.items():
        for subcategory in subcategories:
            for _ in range(n_products // (len(categories) * len(subcategories))):
                products_data.append({
                    'product_id': product_id,
                    'product_name': f'{subcategory} Model {product_id}',
                    'category': category,
                    'subcategory': subcategory,
                    'brand': np.random.choice(['BrandA', 'BrandB', 'BrandC', 'BrandD', 'BrandE']),
                    'price': round(np.random.uniform(10, 500), 2),
                    'cost_price': round(np.random.uniform(5, 300), 2),
                    'stock_quantity': np.random.randint(0, 1000),
                    'supplier': f'Supplier_{np.random.randint(1, 20)}',
                    'rating': round(np.random.uniform(3.0, 5.0), 1),
                    'review_count': np.random.randint(0, 500),
                    'created_date': pd.to_datetime(datetime(2021,1,1) + timedelta(days=np.random.randint(0, 1000))),
                    'is_active': np.random.choice([True, False], p=[0.9, 0.1])
                })
                product_id += 1

    if not products_data:
        raise ValueError("n_products is too small to generate any product")

    products = pd.DataFrame(products_data)

    # 3. Time dimension table (one row per calendar day).
    dates = pd.date_range('2022-01-01', '2024-12-31', freq='D')
    time_dim = pd.DataFrame({
        'date': dates,
        'day': dates.day,
        'month': dates.month,
        'month_name': dates.strftime('%B'),
        'quarter': dates.quarter,
        'year': dates.year,
        'day_of_week': dates.dayofweek,
        'day_name': dates.strftime('%A'),
        'is_weekend': (dates.dayofweek >= 5).astype(int),
        'is_holiday': np.random.choice([0, 1], len(dates), p=[0.9, 0.1])
    })

    # 4. Region dimension table.
    regions = pd.DataFrame({
        'region_id': range(1, 7),
        'region_name': ['North America', 'Europe', 'Asia Pacific', 'South America', 'Africa', 'Middle East'],
        'region_manager': ['Manager_A', 'Manager_B', 'Manager_C', 'Manager_D', 'Manager_E', 'Manager_F']
    })

    # 5. Order fact table (also carries per-order behavior metrics).
    # Resolve prices through a dict built once instead of scanning the
    # products frame for every order; this consumes no random numbers, so
    # the generated values are unaffected.
    price_by_product = {row['product_id']: row['price'] for row in products_data}
    n_catalog = len(products)
    orders_data = []

    for i in range(n_orders):
        customer_id = np.random.randint(1, n_customers + 1)
        product_id = np.random.randint(1, n_catalog + 1)
        order_date = pd.to_datetime(datetime(2023,1,1) + timedelta(days=np.random.randint(0, 730)))
        quantity = np.random.randint(1, 6)
        price = price_by_product[product_id]
        amount = quantity * price

        # Simulated browsing behavior for this order.
        browsing_duration = np.random.randint(10, 1800)  # seconds
        click_count = np.random.randint(1, 20)

        orders_data.append({
            'order_id': i + 1,
            'customer_id': customer_id,
            'product_id': product_id,
            'order_date': order_date,
            'quantity': quantity,
            'unit_price': price,
            'amount': amount,
            'region_id': np.random.randint(1, 7),
            'payment_method': np.random.choice(['Credit Card', 'PayPal', 'Apple Pay', 'Google Pay']),
            'shipping_method': np.random.choice(['Standard', 'Express', 'Next Day']),
            'order_status': np.random.choice(['Completed', 'Shipped', 'Processing', 'Cancelled'], p=[0.7, 0.15, 0.1, 0.05]),
            'browsing_duration_seconds': browsing_duration,
            'click_count': click_count,
            'add_to_cart_count': np.random.randint(0, 5),
            'wishlist_added': np.random.choice([0, 1], p=[0.7, 0.3]),
            'discount_applied': round(np.random.uniform(0, 0.3), 2),
            'customer_rating': np.random.choice([1, 2, 3, 4, 5], p=[0.05, 0.1, 0.2, 0.4, 0.25]),
            'return_requested': np.random.choice([0, 1], p=[0.95, 0.05])
        })

    orders = pd.DataFrame(orders_data)

    # 6. Fold per-customer order aggregates back into the customer table.
    customer_stats = orders.groupby('customer_id').agg({
        'order_id': 'count',
        'amount': 'sum',
        'order_date': 'max'
    }).reset_index()

    # The *_new names keep the aggregates from clashing with the placeholder
    # columns during the merge.
    customer_stats.columns = ['customer_id', 'total_orders_new', 'total_spent_new', 'last_order_date']

    customers = customers.merge(customer_stats, on='customer_id', how='left')

    # Customers without orders come out of the left merge as NaN; cast the
    # count back to int because NaN forces the column to float.
    customers['total_orders'] = customers['total_orders_new'].fillna(0).astype(int)
    customers['total_spent'] = customers['total_spent_new'].fillna(0.0)
    customers['last_login_date'] = customers['last_order_date']  # reuse the last order date as last login

    # Drop the temporary columns so only the intended table columns remain.
    customers = customers.drop(['total_orders_new', 'total_spent_new', 'last_order_date'], axis=1)

    # 7. User behavior log table (intended for recommendation algorithms).
    behavior_logs = pd.DataFrame({
        'log_id': range(1, n_behavior_logs + 1),
        'customer_id': np.random.randint(1, n_customers + 1, n_behavior_logs),
        'product_id': np.random.randint(1, len(products) + 1, n_behavior_logs),
        'behavior_type': np.random.choice(['view', 'click', 'add_to_cart', 'purchase', 'wishlist'],
                                        n_behavior_logs, p=[0.4, 0.3, 0.15, 0.1, 0.05]),
        'timestamp': pd.to_datetime([datetime(2024,1,1) + timedelta(seconds=np.random.randint(0, 31536000))
                                   for _ in range(n_behavior_logs)]),
        'session_id': [f'session_{np.random.randint(1, 10000)}' for _ in range(n_behavior_logs)],
        'device_type': np.random.choice(['Desktop', 'Mobile', 'Tablet'], n_behavior_logs),
        'browser': np.random.choice(['Chrome', 'Safari', 'Firefox', 'Edge'], n_behavior_logs)
    })

    return {
        'customers': customers,
        'products': products,
        'time_dim': time_dim,
        'regions': regions,
        'orders': orders,
        'behavior_logs': behavior_logs
    }

def generate_and_import_data():
    """Generate the simulated tables and import them into MySQL.

    Prints the column list of every generated table, opens the database
    connection and, if that succeeds, imports all tables. The engine's
    connection pool is released once the import has finished, whether or
    not it raised.
    """
    print("开始生成模拟数据...")
    data_dict = generate_comprehensive_ecommerce_data()

    # Show each table's columns so they can be checked against the MySQL tables.
    print("\n验证数据列结构:")
    for table_name, df in data_dict.items():
        print(f"{table_name} 列: {list(df.columns)}")

    print("\n连接数据库...")
    engine = create_database_connection()

    # create_database_connection() returns None when the connection failed.
    if engine is None:
        print("无法连接数据库，请检查配置")
        return

    try:
        print("开始导入数据到MySQL...")
        import_data_to_mysql(data_dict, engine)
        print("🎉 数据导入完成!")
    finally:
        # Close pooled connections instead of leaving them to garbage collection.
        engine.dispose()

# Run data generation and import when this code is executed as the main module.
if __name__ == "__main__":
    generate_and_import_data()

开始生成模拟数据...

验证数据列结构:
customers 列: ['customer_id', 'customer_name', 'gender', 'birth_year', 'registration_date', 'email', 'phone', 'country', 'city', 'zip_code', 'registration_channel', 'loyalty_tier', 'preferred_category', 'avg_order_value_segment', 'last_login_date', 'total_orders', 'total_spent']
products 列: ['product_id', 'product_name', 'category', 'subcategory', 'brand', 'price', 'cost_price', 'stock_quantity', 'supplier', 'rating', 'review_count', 'created_date', 'is_active']
time_dim 列: ['date', 'day', 'month', 'month_name', 'quarter', 'year', 'day_of_week', 'day_name', 'is_weekend', 'is_holiday']
regions 列: ['region_id', 'region_name', 'region_manager']
orders 列: ['order_id', 'customer_id', 'product_id', 'order_date', 'quantity', 'unit_price', 'amount', 'region_id', 'payment_method', 'shipping_method', 'order_status', 'browsing_duration_seconds', 'click_count', 'add_to_cart_count', 'wishlist_added', 'discount_applied', 'customer_rating', 'return_requested']
behavior_logs 列: ['