In [13]:
import pandas as pd
import os

# Define the data directory
data_dir = '../data'

# Read the CSV files with encoding specified
try:
    ml_provision = pd.read_csv(os.path.join(data_dir, 'ml_provision.csv'), encoding='latin1')
    new_ml_provision = pd.read_csv(os.path.join(data_dir, 'new_ml_provision.csv'), encoding='latin1')
    print("Files loaded successfully with latin1 encoding")
except UnicodeDecodeError:
    # Try with Windows-1252 encoding
    ml_provision = pd.read_csv(os.path.join(data_dir, 'ml_provision.csv'), encoding='cp1252')
    new_ml_provision = pd.read_csv(os.path.join(data_dir, 'new_ml_provision.csv'), encoding='cp1252')
    print("Files loaded successfully with cp1252 encoding")

# Display basic info
print(f"\nml_provision shape: {ml_provision.shape}")
print(f"new_ml_provision shape: {new_ml_provision.shape}")

Files loaded successfully with latin1 encoding

ml_provision shape: (5140830, 9)
new_ml_provision shape: (11486353, 9)


In [14]:
# Union (concatenate) both datasets
combined_provision = pd.concat([ml_provision, new_ml_provision], ignore_index=True)

# Display information about the combined dataset
print("Combined Dataset:")
print(f"Shape: {combined_provision.shape}")
print(f"Total rows: {len(combined_provision)}")
print(f"\nRows from ml_provision: {len(ml_provision)}")
print(f"Rows from new_ml_provision: {len(new_ml_provision)}")

# Check for duplicates if any
print(f"\nDuplicate rows: {combined_provision.duplicated().sum()}")

# Display first few rows
print("\nFirst few rows:")
print(combined_provision.head())

Combined Dataset:
Shape: (16627183, 9)
Total rows: 16627183

Rows from ml_provision: 5140830
Rows from new_ml_provision: 11486353

Duplicate rows: 0

First few rows:
   bsk_id                 bsk_name   customer_id        customer_name  \
0     222  HARISCHANDRAPUR - I BDO  GRPD_7087933        Sayema khatun   
1     882                 GAZNA GP  GRPF_9903885        PRADIP MONDAL   
2    1036              RAMNAGAR GP  GRPB_9333501        HEMONTA GHOSH   
3    1036              RAMNAGAR GP  GRPB_9630402  SURJA NARAYAN MITRA   
4    1036              RAMNAGAR GP  GRPB_8842011          SUMAN BAGDI   

   customer_phone  service_id  \
0      9609645980         352   
1      9932181232         357   
2      9475801203         392   
3      8293588703         392   
4      9001345678         392   

                                        service_name            prov_date  \
0  Apply - eKYC Seeding of Aadhaar with Digital R...  31/03/2025 20:39:54   
1                          Payment History

In [9]:
# Define the list of services to filter
target_services = [
    "Application for Aikyasree Scholarship",
    "Search - Sabuj Sathi Scheme Information",
    "Search - Sikshashree Information",
    "Apply - Aadhaar & Mobile Link with Digital Ration Card",
    "Apply - Swami Vivekananda Scholarship",
    "Search - Mass Education & Library Related Schemes Information",
    "Apply - Educational Loan Application",
    "Application for EWS Certificates",
    "Search - Admit Card download & Printing for JEXPO",
    "Apply - Post-Matric Scholarship Under Talent Support Programme (TSP) Application",
    "Apply - Merit Scholarship Examination (NMMSE)",
    "Search - Entrance Result of Admission in Government ITI",
    "Search - Entrance Result of JEXPO",
    "Apply - National Talent Search Examination (NTSE)",
    "Apply - Choice Filling of Admission in Government ITI",
    "Apply - Counselling of Choice Filling Up for JEXPO",
    "Search - Counselling Result of Admission in Government ITI",
    "Search - Counselling for Result for JEXPO",
    "Online fee submission for JEXPO",
    "Online fee submission in Government ITI",
    "Apply - Provisional Admission for JEXPO",
    "Apply - Registration of Candidates for JEXPO",
    "Apply - Scholarship of Students with Disabilities",
    "Apply - Student Credit Card",
    "Online Admission in Government ITIs",
    "Search - Discover Your Eligible Schemes",
    "Search - Application Status of Kanyasree Prakalpa"
]

# Filter the combined dataset for these services
# Assuming the service name column is named 'service_name' or similar - adjust column name as needed
# First, let's check what columns exist
print("Available columns:")
print(combined_provision.columns.tolist())
print("\n")

# Adjust the column name below based on the actual column name in your dataset
service_column = 'service_name'  # Change this to the actual column name

# Filter and group by service name
filtered_services = combined_provision[combined_provision[service_column].isin(target_services)]

# Group by service name and count occurrences
service_counts = filtered_services.groupby(service_column).size().reset_index(name='count')

# Sort by count in descending order
service_counts_sorted = service_counts.sort_values('count', ascending=False)

# Display the results
print("Service Counts (Sorted by Count - Descending):")
print(service_counts_sorted)
print(f"\nTotal services: {len(service_counts_sorted)}")
print(f"Total records: {service_counts_sorted['count'].sum()}")

# Save to CSV
output_file = os.path.join(data_dir, 'under18_service_counts.csv')
service_counts_sorted.to_csv(output_file, index=False)
print(f"\nCSV saved to: {output_file}")

Available columns:
['bsk_id', 'bsk_name', 'customer_id', 'customer_name', 'customer_phone', 'service_id', 'service_name', 'prov_date', 'docket_no']


Service Counts (Sorted by Count - Descending):
                                         service_name    count
2   Apply - Aadhaar & Mobile Link with Digital Rat...  1206696
1                    Application for EWS Certificates    29868
12              Apply - Swami Vivekananda Scholarship    24962
25                   Search - Sikshashree Information    23806
24            Search - Sabuj Sathi Scheme Information    22779
0               Application for Aikyasree Scholarship    21536
17  Search - Application Status of Kanyasree Prakalpa    19528
11                        Apply - Student Credit Card    14562
23  Search - Mass Education & Library Related Sche...     3902
18  Search - Counselling Result of Admission in Go...     2399
6       Apply - Merit Scholarship Examination (NMMSE)     2223
10  Apply - Scholarship of Students with Disabi

In [10]:
# Get top 8 services
top_8_services = service_counts_sorted.head(8)

# Display the top 8
print("Top 8 Services (Sorted by Count - Descending):")
print(top_8_services)
print(f"\nTotal records in top 8: {top_8_services['count'].sum()}")

# Save top 8 to CSV
output_file = os.path.join(data_dir, 'under18_top_services.csv')
top_8_services.to_csv(output_file, index=False)
print(f"\nTop 8 CSV saved to: {output_file}")

Top 8 Services (Sorted by Count - Descending):
                                         service_name    count
2   Apply - Aadhaar & Mobile Link with Digital Rat...  1206696
1                    Application for EWS Certificates    29868
12              Apply - Swami Vivekananda Scholarship    24962
25                   Search - Sikshashree Information    23806
24            Search - Sabuj Sathi Scheme Information    22779
0               Application for Aikyasree Scholarship    21536
17  Search - Application Status of Kanyasree Prakalpa    19528
11                        Apply - Student Credit Card    14562

Total records in top 8: 1363737

Top 8 CSV saved to: ../data\under18_top_services.csv


In [12]:
# Define the list of services to filter for above 60 users
above_60_services = [
    "Apply - Krishak Bandhu Scheme",
    "Search - Taposali Bandhu Pension Scheme",
    "Apply - Building and other Construction Workers Pension Scheme",
    "Search - Subscription of Building and other Construction Workers Pension Scheme Information",
    "Search - Registration of Transport Workers Pension Scheme Information",
    "Apply - Samajik Suraksha Yojana Application",
    "Info - Jai Johar Pension Scheme Application",
    "Info - Old Age Pension Scheme",
    "Information on widow pension under Jai Bangla",
    "Search - Gitanjali Scheme Information",
    "Search - Working Women Hostel Information",
    "According Academic Recognition to the Special Schools organised by NGOs",
    "Search - NSAP under Jai Bangla"
]

# Filter the combined dataset for these services
filtered_above_60 = combined_provision[combined_provision['service_name'].isin(above_60_services)]

# Group by service name and count occurrences
above_60_counts = filtered_above_60.groupby('service_name').size().reset_index(name='count')

# Sort by count in descending order
above_60_counts_sorted = above_60_counts.sort_values('count', ascending=False)

# Display the results
print("Above 60 Service Counts (Sorted by Count - Descending):")
print(above_60_counts_sorted)
print(f"\nTotal services: {len(above_60_counts_sorted)}")
print(f"Total records: {above_60_counts_sorted['count'].sum()}")

# Get top 5 services for the CSV
top_5_above_60 = above_60_counts_sorted.head(5)

# Save top 5 to CSV
output_file = os.path.join(data_dir, 'above60_top_services.csv')
top_5_above_60.to_csv(output_file, index=False)
print(f"\nTop 5 CSV saved to: {output_file}")
print(f"\nTop 5 services:")
print(top_5_above_60)


Above 60 Service Counts (Sorted by Count - Descending):
                                        service_name   count
3                      Info - Old Age Pension Scheme  144311
1                      Apply - Krishak Bandhu Scheme   98079
8            Search - Taposali Bandhu Pension Scheme   26576
5                     Search - NSAP under Jai Bangla   14091
7  Search - Subscription of Building and other Co...    8383
6  Search - Registration of Transport Workers Pen...    8058
2        Info - Jai Johar Pension Scheme Application    7330
0  Apply - Building and other Construction Worker...    7119
4              Search - Gitanjali Scheme Information    6818
9          Search - Working Women Hostel Information    1466

Total services: 10
Total records: 322231

Top 5 CSV saved to: ../data\above60_top_services.csv

Top 5 services:
                                        service_name   count
3                      Info - Old Age Pension Scheme  144311
1                      Apply - Krishak