# Merge Block Data
Merge block_top_services.csv with ml_block_master.csv to create a clean dataset with block_id, block_name, and top services

In [None]:
---
## Create Clean Block Top Services CSV
# Generate final CSV with only: block_id, block_name, service_name, rank_in_block

In [15]:
# Create clean CSV with block_id, block_name, service_name, rank_in_block
print("="*60)
print("CREATING CLEAN BLOCK TOP SERVICES CSV")
print("="*60)

# Reload the original files
block_top_services = pd.read_csv(os.path.join(DATA_DIR, "block_top_services.csv"), encoding='latin-1')
ml_block_master = pd.read_csv(os.path.join(DATA_DIR, "ml_block_master.csv"), encoding='latin-1')

print(f"\nOriginal block_top_services shape: {block_top_services.shape}")
print(f"Columns: {block_top_services.columns.tolist()}")

# First, merge to get block_id from ml_block_master
# block_top_services has block_mun_id
# ml_block_master has block_muni_id
merged = pd.merge(
    block_top_services,
    ml_block_master[['block_muni_id', 'block_muni_name']],
    left_on='block_mun_id',
    right_on='block_muni_id',
    how='inner'
)

print(f"\nAfter merge shape: {merged.shape}")

# Now aggregate by block (not BSK) - sum usage_count by block and service
block_service_agg = merged.groupby(['block_muni_id', 'block_muni_name', 'service_name']).agg(
    total_usage=('usage_count', 'sum'),
    total_customers=('unique_customers', 'sum')
).reset_index()

print(f"\nAfter aggregating by block and service: {block_service_agg.shape}")

# Rank services within each block
block_service_agg['rank_in_block'] = block_service_agg.groupby('block_muni_id')[
    'total_usage'
].rank(ascending=False, method='dense').astype(int)

# Filter to top 6 per block
top_6_per_block = block_service_agg[block_service_agg['rank_in_block'] <= 6].copy()

# Rename columns to match required format
top_6_per_block = top_6_per_block.rename(columns={
    'block_muni_id': 'block_id',
    'block_muni_name': 'block_name'
})

# Select only the required 4 columns
clean_df = top_6_per_block[['block_id', 'block_name', 'service_name', 'rank_in_block']].copy()

# Sort by block_id and rank
clean_df = clean_df.sort_values(['block_id', 'rank_in_block'])

print(f"\n‚úì Clean DataFrame created")
print(f"  Shape: {clean_df.shape}")
print(f"  Columns: {clean_df.columns.tolist()}")
print(f"  Unique blocks: {clean_df['block_id'].nunique()}")

# Verify no duplicates
duplicates = clean_df.groupby(['block_id', 'rank_in_block']).size()
duplicates = duplicates[duplicates > 1]
if len(duplicates) > 0:
    print(f"\n‚ö†Ô∏è Warning: Found {len(duplicates)} block-rank combinations with duplicates")
else:
    print(f"\n‚úì No duplicates - each block has exactly one service per rank")

print(f"\nFirst 12 rows (showing 2 blocks with top 6 services each):")
print(clean_df.head(12))

CREATING CLEAN BLOCK TOP SERVICES CSV

Original block_top_services shape: (21958, 8)
Columns: ['bsk_id', 'bsk_name_x', 'block_mun_id', 'service_id', 'usage_count', 'unique_customers', 'service_name', 'rank_in_block']

After merge shape: (21958, 10)

After aggregating by block and service: (8565, 5)

‚úì Clean DataFrame created
  Shape: (2866, 4)
  Columns: ['block_id', 'block_name', 'service_name', 'rank_in_block']
  Unique blocks: 474


First 12 rows (showing 2 blocks with top 6 services each):
    block_id     block_name  \
0          1   ALIPURDUAR I   
4          1   ALIPURDUAR I   
9          1   ALIPURDUAR I   
7          1   ALIPURDUAR I   
2          1   ALIPURDUAR I   
1          1   ALIPURDUAR I   
20         2  ALIPURDUAR II   
23         2  ALIPURDUAR II   
14         2  ALIPURDUAR II   
29         2  ALIPURDUAR II   
25         2  ALIPURDUAR II   
26         2  ALIPURDUAR II   

                                         service_name  rank_in_block  
0                 Applic

In [16]:
# Save the clean CSV
clean_output_file = os.path.join(DATA_DIR, "block_wise_top_services.csv")
clean_df.to_csv(clean_output_file, index=False, encoding='utf-8')

print("="*60)
print("SAVED CLEAN CSV")
print("="*60)
print(f"\n‚úÖ Successfully saved to: {clean_output_file}")
print(f"\nFile structure:")
print(f"  - Headers: {', '.join(clean_df.columns.tolist())}")
print(f"  - Total rows: {len(clean_df)}")
print(f"  - Unique blocks: {clean_df['block_id'].nunique()}")
print(f"  - Services per block: Top 6")

# Show sample for first 3 blocks
print("\n" + "="*60)
print("SAMPLE DATA - First 3 Blocks")
print("="*60)

for block_id in clean_df['block_id'].unique()[:3]:
    block_data = clean_df[clean_df['block_id'] == block_id]
    block_name = block_data['block_name'].iloc[0]
    
    print(f"\nüìç Block ID: {block_id} - {block_name}")
    print("-" * 60)
    for _, row in block_data.iterrows():
        print(f"  {row['rank_in_block']}. {row['service_name']}")

SAVED CLEAN CSV

‚úÖ Successfully saved to: C:\SysReco\data\block_wise_top_services.csv

File structure:
  - Headers: block_id, block_name, service_name, rank_in_block
  - Total rows: 2866
  - Unique blocks: 474
  - Services per block: Top 6

SAMPLE DATA - First 3 Blocks

üìç Block ID: 1 - ALIPURDUAR I
------------------------------------------------------------
  1. Application for Income Certificates
  2. Apply - Payment of WBSEDCL Electricity Bill
  3. Caste Certificate
  4. Apply - Submission of Form for Electoral Roll
  5. Apply - Land Revenue (Khajna)
  6. Apply - GP Certificate

üìç Block ID: 2 - ALIPURDUAR II
------------------------------------------------------------
  1. Apply - Payment of WBSEDCL Electricity Bill
  2. Apply - eKYC Seeding of Aadhaar with Digital Ration Card
  3. Application for Income Certificates
  4. Search - Bangla Awas Yojana
  5. Caste Certificate
  6. Payment History - WBSEDCL

üìç Block ID: 3 - FALAKATA
--------------------------------------------