In [1]:
%pip install h5py

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import duckdb
import numpy as np
import h5py
import pandas as pd
from tqdm import tqdm
import os

# Paths
master_path = r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\aggregator\data\user_period_master_complete_with_authored_scores.parquet"
factor_data_path = r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\aggregator\data\prescoringRaterModelOutput_3dim.tsv"
output_path = r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\user_factor_analysis\data\tensordata\user_timeseries_tensor.h5"

print("üöÄ Creating 3D tensor dataset (High Memory Mode - 20GB)...")
print("=" * 60)

# Connect to DuckDB with high memory
con = duckdb.connect()
con.execute("PRAGMA memory_limit='18GB';")  # Leave 2GB for other operations

# Step 1: Load entire dataset into memory at once
print("üìä Step 1: Loading complete dataset...")
print("‚ö° Using high-memory mode for maximum speed...")

full_data = con.execute(f"""
    SELECT * FROM '{master_path}' ORDER BY userId, period_start
""").fetchdf()

print(f"‚úÖ Loaded {len(full_data):,} records")

# Get dimensions
unique_users = full_data['userId'].unique()
unique_periods = sorted(full_data['period_start'].unique())
feature_columns = [col for col in full_data.columns if col not in ['userId', 'period_start', 'period_end']]

n_users = len(unique_users)
n_timesteps = len(unique_periods)
n_features = len(feature_columns)

print(f"Tensor dimensions: {n_users:,} √ó {n_timesteps} √ó {n_features}")
print(f"Total elements: {n_users * n_timesteps * n_features:,}")

# Step 2: Load factor1 labels (all at once)
print("\nüéØ Step 2: Loading factor1 labels...")
factor_df = pd.read_csv(factor_data_path, sep='\t')

# Filter out rows where internalRaterFactor1 is null
factor_df_clean = factor_df.dropna(subset=['internalRaterFactor1'])
factor_dict = dict(zip(factor_df_clean['raterParticipantId'], factor_df_clean['internalRaterFactor1']))

print(f"Total factor records: {len(factor_df):,}")
print(f"Records with non-null factor1: {len(factor_df_clean):,}")
print(f"Null factor1 percentage: {((len(factor_df) - len(factor_df_clean)) / len(factor_df) * 100):.1f}%")

# Create labels array
user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
labels = np.full(n_users, np.nan, dtype=np.float32)
labeled_count = 0

for user_id, factor1 in factor_dict.items():
    if user_id in user_to_idx:
        labels[user_to_idx[user_id]] = factor1
        labeled_count += 1

print(f"Labels assigned: {labeled_count:,} users ({labeled_count/n_users*100:.1f}%)")

# Step 3: Create tensor in memory (fastest approach)
print("\nüîß Step 3: Building tensor in memory...")
tensor = np.full((n_users, n_timesteps, n_features), np.nan, dtype=np.float32)

# Create period mapping
period_to_idx = {period: idx for idx, period in enumerate(unique_periods)}

# Vectorized approach - group by user and period for batch processing
print("‚ö° Using vectorized processing...")

# Process all data at once using pandas pivot operations
for i, (user_id, user_data) in enumerate(tqdm(full_data.groupby('userId'), desc="Processing users")):
    user_idx = user_to_idx[user_id]
    
    for _, row in user_data.iterrows():
        period_idx = period_to_idx[row['period_start']]
        
        # Extract feature values with proper null handling
        feature_values = np.full(n_features, np.nan, dtype=np.float32)
        for j, col in enumerate(feature_columns):
            val = row[col]
            if pd.notna(val) and val is not pd.NA:
                try:
                    feature_values[j] = float(val)
                except (ValueError, TypeError):
                    feature_values[j] = np.nan
            else:
                feature_values[j] = np.nan
        
        tensor[user_idx, period_idx, :] = feature_values

# Step 4: Data quality analysis
print("\nüìà Step 4: Data quality analysis...")
total_elements = tensor.size
missing_elements = np.sum(np.isnan(tensor))
fill_rate = (total_elements - missing_elements) / total_elements

print(f"Total tensor elements: {total_elements:,}")
print(f"Missing elements: {missing_elements:,}")
print(f"Fill rate: {fill_rate:.1%}")

# Step 5: Save to HDF5 (all at once - fastest)
print("\nüíæ Step 5: Saving to HDF5...")
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with h5py.File(output_path, 'w') as f:
    # Save tensor (single write operation)
    print("  üíΩ Writing tensor...")
    f.create_dataset('tensor', data=tensor, compression='gzip', compression_opts=6)
    
    # Save metadata
    print("  üìã Writing metadata...")
    user_ids_bytes = [str(uid).encode('utf-8') for uid in unique_users]
    f.create_dataset('user_ids', data=user_ids_bytes)
    f.create_dataset('factor1_labels', data=labels)
    
    period_strings = [str(p).encode('utf-8') for p in unique_periods]
    f.create_dataset('periods', data=period_strings)
    
    feature_names_bytes = [name.encode('utf-8') for name in feature_columns]
    f.create_dataset('feature_names', data=feature_names_bytes)
    
    # Attributes
    f.attrs['n_users'] = n_users
    f.attrs['n_timesteps'] = n_timesteps
    f.attrs['n_features'] = n_features
    f.attrs['fill_rate'] = fill_rate
    f.attrs['labeled_users'] = labeled_count
    f.attrs['creation_date'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    f.attrs['memory_mode'] = 'high_memory_20gb'

con.close()

print("\n‚úÖ High-speed tensor creation complete!")
print("=" * 60)
print(f"üìÅ File: {output_path}")
print(f"üìä Size: {os.path.getsize(output_path) / (1024**2):.1f} MB")
print(f"üè∑Ô∏è Labeled users: {labeled_count:,}/{n_users:,}")
print(f"‚ö° Processing mode: High-memory (20GB)")

# Verification
with h5py.File(output_path, 'r') as f:
    print(f"\nüîç Verification:")
    print(f"  Tensor shape: {f['tensor'].shape}")
    print(f"  Tensor dtype: {f['tensor'].dtype}")
    print(f"  First user: {f['user_ids'][0].decode('utf-8')}")
    print(f"  Labels: {np.sum(~np.isnan(f['factor1_labels'][:]))} non-null")
    print(f"  Fill rate: {f.attrs['fill_rate']:.1%}")

print("\nüéØ Ready for high-performance time series ML!")

üöÄ Creating 3D tensor dataset (High Memory Mode - 20GB)...
üìä Step 1: Loading complete dataset...
‚ö° Using high-memory mode for maximum speed...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

‚úÖ Loaded 21,020,118 records
Tensor dimensions: 1,279,178 √ó 72 √ó 68
Total elements: 6,262,855,488

üéØ Step 2: Loading factor1 labels...
Tensor dimensions: 1,279,178 √ó 72 √ó 68
Total elements: 6,262,855,488

üéØ Step 2: Loading factor1 labels...
Labels assigned: 422,979 users (33.1%)

üîß Step 3: Building tensor in memory...
Labels assigned: 422,979 users (33.1%)

üîß Step 3: Building tensor in memory...
‚ö° Using vectorized processing...
‚ö° Using vectorized processing...


Processing users:   0%|          | 0/1279178 [00:27<?, ?it/s]



TypeError: float() argument must be a string or a real number, not 'NAType'