In [41]:
import os
import pandas as pd
from datetime import datetime
import json
import gc


#47729 number of records for kharar with status ACTIVE

folder_path = '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/'

# read active properties & needed columns
property_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/eg_pt_property.csv',
    usecols=['id', 'propertyid', 'tenantid', 'createdtime', 'additionaldetails', 'ownershipcategory', 'status', 'usagecategory']
)
property_df = property_df[property_df['status'] == 'ACTIVE'].copy()
# property_df = property_df[property_df['propertyid'] == 'PT-1503-2007991'].copy()

# read units
unit_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/eg_pt_unit.csv',
    usecols=['propertyid', 'occupancytype']
)

# read demand
demand_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/egbs_demand_v1.csv',
    usecols=['id', 'taxperiodfrom', 'taxperiodto', 'consumercode', 'status']
)
demand_df = demand_df[demand_df['status'] == 'ACTIVE'].copy()

# read demand details (memory‑efficient, in chunks)
all_chunks = []
needed_cols = ['demandid', 'taxamount', 'collectionamount', 'taxheadcode']
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        print(f'Loading: {file_path}')
        chunk = pd.read_csv(file_path, usecols=needed_cols)
        all_chunks.append(chunk)
demand_details_df = pd.concat(all_chunks, ignore_index=True)
del all_chunks; gc.collect()

print("✅ Loaded data")

Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_4.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_34.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_79.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_38.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_6.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_60.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_44.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_83.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_107.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/kharar/ouput_demand_detail/output_89.csv
Loading: /home/prerna/Punjab/punjab-data-

In [42]:
# join pt and unit
joined_pt_unit = property_df.merge(unit_df, left_on='id', right_on='propertyid', how='left', suffixes=('_property', '_unit'))
del property_df, unit_df; gc.collect()
joined_pt_unit.head()
print(demand_df['id'].nunique())

287351


In [43]:
# join demand and demand details
joined_demand = demand_df.merge(demand_details_df, left_on='id', right_on='demandid', how='left', suffixes=('_demand', '_detail'))
print(joined_demand['id'].nunique())
del demand_details_df, demand_df; gc.collect()
joined_demand.head()

287351


Unnamed: 0,id,consumercode,taxperiodfrom,taxperiodto,status,demandid,taxheadcode,taxamount,collectionamount
0,21745,PT-1503-011887,1522540800000,1554076799000,ACTIVE,21745,PT_OWNER_EXEMPTION,0.0,0.0
1,21745,PT-1503-011887,1522540800000,1554076799000,ACTIVE,21745,PT_TIME_REBATE,0.0,0.0
2,21745,PT-1503-011887,1522540800000,1554076799000,ACTIVE,21745,PT_UNIT_USAGE_EXEMPTION,0.0,0.0
3,21745,PT-1503-011887,1522540800000,1554076799000,ACTIVE,21745,PT_TAX,4500.0,4500.0
4,21745,PT-1503-011887,1522540800000,1554076799000,ACTIVE,21745,PT_FIRE_CESS,0.0,0.0


In [44]:
# import pytz
# print(joined_demand.head(10))
# print(joined_demand['taxperiodfrom'].head(10))

# # If not already datetime, convert
# joined_demand['taxperiodfrom'] = pd.to_datetime(joined_demand['taxperiodfrom'], unit='ms', utc=True)
# joined_demand['taxperiodto'] = pd.to_datetime(joined_demand['taxperiodto'], unit='ms', utc=True)

# print(joined_demand['taxperiodfrom'].head(10))

# # Convert to IST (Asia/Kolkata)
# ist = pytz.timezone('Asia/Kolkata')
# joined_demand['taxperiodfrom'] = joined_demand['taxperiodfrom'].dt.tz_convert(ist)
# joined_demand['taxperiodto'] = joined_demand['taxperiodto'].dt.tz_convert(ist)


# def get_fy(date):
#     if date.month >= 4:
#         fy_start = date.year
#         fy_end = date.year + 1
#     else:
#         fy_start = date.year - 1
#         fy_end = date.year
#     return f"{fy_start}-{str(fy_end)[-2:]}"
    
# joined_demand['fy'] = joined_demand['taxperiodfrom'].apply(get_fy)

# result = joined_demand.groupby('consumercode')['fy'].agg(['min', 'max']).reset_index()
# result.rename(columns={'min': 'earliest_fy', 'max': 'latest_fy'}, inplace=True)

# print(result)
import pytz

# Correct: parse as datetime from milliseconds since epoch
joined_demand['taxperiodfrom'] = pd.to_datetime(joined_demand['taxperiodfrom'], unit='ms', utc=True)
joined_demand['taxperiodto'] = pd.to_datetime(joined_demand['taxperiodto'], unit='ms', utc=True)

# Convert to IST (Asia/Kolkata)
ist = pytz.timezone('Asia/Kolkata')
joined_demand['taxperiodfrom'] = joined_demand['taxperiodfrom'].dt.tz_convert(ist)
joined_demand['taxperiodto'] = joined_demand['taxperiodto'].dt.tz_convert(ist)

# Financial year calculation
def get_fy(date):
    if date.month >= 4:
        fy_start = date.year
        fy_end = date.year + 1
    else:
        fy_start = date.year - 1
        fy_end = date.year
    return f"{fy_start}-{str(fy_end)[-2:]}"

joined_demand['fy'] = joined_demand['taxperiodfrom'].apply(get_fy)

# Group by consumercode
result = joined_demand.groupby('consumercode')['fy'].agg(['min', 'max']).reset_index()
result.rename(columns={'min': 'earliest_fy', 'max': 'latest_fy'}, inplace=True)

print(result)


         consumercode earliest_fy latest_fy
0      PT-1503-005758     2018-19   2024-25
1      PT-1503-005764     2018-19   2024-25
2      PT-1503-005947     2018-19   2024-25
3      PT-1503-005994     2018-19   2025-26
4      PT-1503-005998     2018-19   2025-26
...               ...         ...       ...
47091  PT-1503-999930     2019-20   2024-25
47092  PT-1503-999931     2020-21   2024-25
47093  PT-1503-999932     2020-21   2024-25
47094  PT-1503-999933     2020-21   2024-25
47095  PT-1503-999982     2020-21   2024-25

[47096 rows x 3 columns]


In [45]:
# Merge latest_fy onto joined_demand by consumercode
joined = joined_demand.merge(
    result[['consumercode', 'latest_fy']],
    on='consumercode',
    how='left'
)

latest_demand = joined[joined['fy'] == joined['latest_fy']]

demand_sum = latest_demand.groupby('consumercode')['taxamount'].sum().reset_index()
demand_sum.rename(columns={'taxamount':'latest_fy_taxamount'}, inplace=True)

result = result.merge(demand_sum, on='consumercode', how='left')

print(result)

         consumercode earliest_fy latest_fy  latest_fy_taxamount
0      PT-1503-005758     2018-19   2024-25                912.0
1      PT-1503-005764     2018-19   2024-25                372.0
2      PT-1503-005947     2018-19   2024-25                671.0
3      PT-1503-005994     2018-19   2025-26                452.0
4      PT-1503-005998     2018-19   2025-26               1057.0
...               ...         ...       ...                  ...
47091  PT-1503-999930     2019-20   2024-25                  0.0
47092  PT-1503-999931     2020-21   2024-25                  0.0
47093  PT-1503-999932     2020-21   2024-25               6706.0
47094  PT-1503-999933     2020-21   2024-25              16200.0
47095  PT-1503-999982     2020-21   2024-25                246.0

[47096 rows x 4 columns]


In [46]:
#Calculating the tax amount(demand) of current year
target_fy = "2025-26"
current_fy_demand = joined_demand[joined_demand['fy'] == target_fy]

df_fy_sum = current_fy_demand.groupby('consumercode')['taxamount'].sum().reset_index()
df_fy_sum.rename(columns={'taxamount': 'current_fy_taxamount'}, inplace=True)

all_consumercodes = pd.DataFrame(joined_demand['consumercode'].unique(), columns=['consumercode'])

final = all_consumercodes.merge(df_fy_sum, on='consumercode', how='left')
final['current_fy_taxamount'] = final['current_fy_taxamount'].fillna(0)

result = result.merge(final, on='consumercode', how='left')
result['current_fy_taxamount'] = result['current_fy_taxamount'].fillna(0)

print(result)

         consumercode earliest_fy latest_fy  latest_fy_taxamount  \
0      PT-1503-005758     2018-19   2024-25                912.0   
1      PT-1503-005764     2018-19   2024-25                372.0   
2      PT-1503-005947     2018-19   2024-25                671.0   
3      PT-1503-005994     2018-19   2025-26                452.0   
4      PT-1503-005998     2018-19   2025-26               1057.0   
...               ...         ...       ...                  ...   
47091  PT-1503-999930     2019-20   2024-25                  0.0   
47092  PT-1503-999931     2020-21   2024-25                  0.0   
47093  PT-1503-999932     2020-21   2024-25               6706.0   
47094  PT-1503-999933     2020-21   2024-25              16200.0   
47095  PT-1503-999982     2020-21   2024-25                246.0   

       current_fy_taxamount  
0                       0.0  
1                       0.0  
2                       0.0  
3                     452.0  
4                    1057.0  
...

In [47]:
# Fiscal years before the current FY
arrear_demand = joined_demand[joined_demand['fy'] < "2025-26"]

agg = arrear_demand.groupby('consumercode').agg(
    arrear_taxamount_sum=('taxamount', 'sum'),
    arrear_collectionamount_sum=('collectionamount', 'sum')
).reset_index()

agg['arrear_years_demand_generated'] = (
    agg['arrear_taxamount_sum'] - agg['arrear_collectionamount_sum']
)

result = result.merge(
    agg[['consumercode', 'arrear_years_demand_generated']],
    on='consumercode', how='left'
)
result['arrear_years_demand_generated'] = result['arrear_years_demand_generated'].fillna(0)

print(result)

         consumercode earliest_fy latest_fy  latest_fy_taxamount  \
0      PT-1503-005758     2018-19   2024-25                912.0   
1      PT-1503-005764     2018-19   2024-25                372.0   
2      PT-1503-005947     2018-19   2024-25                671.0   
3      PT-1503-005994     2018-19   2025-26                452.0   
4      PT-1503-005998     2018-19   2025-26               1057.0   
...               ...         ...       ...                  ...   
47091  PT-1503-999930     2019-20   2024-25                  0.0   
47092  PT-1503-999931     2020-21   2024-25                  0.0   
47093  PT-1503-999932     2020-21   2024-25               6706.0   
47094  PT-1503-999933     2020-21   2024-25              16200.0   
47095  PT-1503-999982     2020-21   2024-25                246.0   

       current_fy_taxamount  arrear_years_demand_generated  
0                       0.0                         3632.0  
1                       0.0                         1373.0  


In [48]:
relevant_codes = ['PT_TIME_PENALTY', 'PT_TIME_INTEREST']
filtered = joined_demand[joined_demand['taxheadcode'].isin(relevant_codes)]

grouped = (
    filtered.groupby(['consumercode', 'taxheadcode'])['taxamount']
    .sum()
    .unstack(fill_value=0)  # Puts taxheadcodes as columns, fills missing with 0
    .reset_index()
)

grouped = grouped[['consumercode', 'PT_TIME_PENALTY', 'PT_TIME_INTEREST']]
grouped = grouped.fillna(0)

result = result.merge(grouped, on='consumercode', how='left')
result[['PT_TIME_PENALTY', 'PT_TIME_INTEREST']] = result[['PT_TIME_PENALTY', 'PT_TIME_INTEREST']].fillna(0)

print(result)

         consumercode earliest_fy latest_fy  latest_fy_taxamount  \
0      PT-1503-005758     2018-19   2024-25                912.0   
1      PT-1503-005764     2018-19   2024-25                372.0   
2      PT-1503-005947     2018-19   2024-25                671.0   
3      PT-1503-005994     2018-19   2025-26                452.0   
4      PT-1503-005998     2018-19   2025-26               1057.0   
...               ...         ...       ...                  ...   
47091  PT-1503-999930     2019-20   2024-25                  0.0   
47092  PT-1503-999931     2020-21   2024-25                  0.0   
47093  PT-1503-999932     2020-21   2024-25               6706.0   
47094  PT-1503-999933     2020-21   2024-25              16200.0   
47095  PT-1503-999982     2020-21   2024-25                246.0   

       current_fy_taxamount  arrear_years_demand_generated  PT_TIME_PENALTY  \
0                       0.0                         3632.0           351.72   
1                       0

In [49]:
unit_all_columns_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/eg_pt_unit_all_columns.csv'
)
print(unit_all_columns_df)

                                          id   tenantid  \
0       3ad7b225-df76-4bae-9de8-f5de857896f5  pb.kharar   
1       9b11da22-085b-4b59-822c-743864c230ff  pb.kharar   
2       0d5c6bf6-35d2-484d-a821-2718a7b61166  pb.kharar   
3       d8cceb6e-e4de-4eab-b4dc-ca19ef8b0f61  pb.kharar   
4       b607b756-e2b6-467f-911a-4bbae377614a  pb.kharar   
...                                      ...        ...   
132819  671c6a7c-e345-4b54-8804-958307bc6852  pb.kharar   
132820  6ad3135f-7d3d-44a1-8b2c-329dbe22dcff  pb.kharar   
132821  880450d7-8765-43e2-8efe-c417177bcfe2  pb.kharar   
132822  3c4709e7-db6a-4450-96aa-b5a505df1045  pb.kharar   
132823  c21031ae-53af-4558-801e-a110588dc0f5  pb.kharar   

                                  propertyid  floorno unittype  \
0       899ec7ef-075c-41a8-8b71-9d46b6dbf36f        1      NaN   
1       5f3ada86-8162-4cbd-a4d3-eded20f5ac4f        0      NaN   
2       5f3ada86-8162-4cbd-a4d3-eded20f5ac4f        0      NaN   
3       5f3ada86-8162-4cbd-

In [50]:

# read active properties & needed columns
property_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/eg_pt_property.csv',
    usecols=['id', 'propertyid', 'tenantid', 'createdtime', 'additionaldetails', 'ownershipcategory', 'status', 'usagecategory', 'propertytype']
)
property_df = property_df[property_df['status'] == 'ACTIVE'].copy()
# Merge properties and units by property id
merged = property_df.merge(unit_all_columns_df, left_on='id', right_on='propertyid', suffixes=('_property', '_unit'))

def classify_ownership(occupancies):
    unique_types = set(occupancies)
    if 'RENTED' in unique_types:
        if len(unique_types) > 1:
            return 'Mixed'
        else:
            return 'Tenant'
    if 'SELFOCCUPIED' in unique_types:
        # If only SELFOCCUPIED or SELFOCCUPIED + UNOCCUPIED
        return 'Owner'
    if 'UNOCCUPIED' in unique_types:
        return 'Owner'
    # fallback
    return None

# Find occupancytypes per property id
ownership = (
    merged.groupby('propertyid_property')['occupancytype']
    .apply(classify_ownership)
    .reset_index()
    .rename(columns={'occupancytype': 'Owned_Rented'})
)

property_df = property_df.merge(ownership, left_on='propertyid', right_on = 'propertyid_property', how='left')

print(property_df)


                                         id       propertyid   tenantid  \
0      3949d5ff-bf33-4d9c-ab4b-90f0b3393eb0  PT-1503-2032754  pb.kharar   
1      9dd602f3-761b-4928-bec1-ea97141af1ef  PT-1503-2025080  pb.kharar   
2      cb3beec6-1fb7-4d53-9194-292119386cc7  PT-1503-1544579  pb.kharar   
3      dfe6ca3c-06d5-463e-a3ff-5c504f4fd965  PT-1503-2110143  pb.kharar   
4      5893f98c-c156-4da2-bc19-5104aad93497  PT-1503-2110145  pb.kharar   
...                                     ...              ...        ...   
46832  616f468e-9f9c-43fd-b83e-3618cef07d2a   PT-1503-086007  pb.kharar   
46833  da124d4b-550c-4860-865c-1693f09acf2f  PT-1503-1994060  pb.kharar   
46834  fe051f92-aafe-4490-b70e-acec827ced30  PT-1503-1994078  pb.kharar   
46835  65eae68a-d39d-47d8-ae47-411633e83d08   PT-1503-635396  pb.kharar   
46836  04853ec9-a6c9-4ffa-a00e-1c723a53b8bd  PT-1503-1994085  pb.kharar   

       status                 propertytype          ownershipcategory  \
0      ACTIVE       BUILTU

In [51]:
# # def clean_numeric(series):
# #     # Replace 'NULL' strings and NaNs with 0, then convert to float
# #     return pd.to_numeric(series.replace('NULL', 0), errors='coerce').fillna(0)

# # unit_all_columns_df['builtuparea'] = clean_numeric(unit_all_columns_df['builtuparea'])
# # unit_all_columns_df['plintharea'] = clean_numeric(unit_all_columns_df['plintharea'])

# # area_summary = (
# #     unit_all_columns_df.groupby('propertyid')
# #     .agg(
# #         total_builtup_area=('builtuparea', 'sum'),
# #         total_plinth_area=('plintharea', 'sum')
# #     )
# #     .reset_index()
# # )

# # property_df = property_df.merge(area_summary, left_on='id', right_on='propertyid', how='left')
# # print(property_df)
# # property_df['total_builtup_area'] = property_df['total_builtup_area'].fillna(0)
# # property_df['total_plinth_area'] = property_df['total_plinth_area'].fillna(0)
# total_plinth_area

def clean_numeric(series):
    # Replace 'NULL' strings and NaNs with 0, then convert to float
    return pd.to_numeric(series.replace('NULL', 0), errors='coerce').fillna(0)

merged['builtuparea'] = clean_numeric(merged['builtuparea'])
merged['plintharea'] = clean_numeric(merged['plintharea'])

area_summary = (
    merged.groupby('propertyid_property', as_index=False)
    .agg(
        total_builtup_area=('builtuparea', 'sum'),
        total_plinth_area=('plintharea', 'sum')
    )
)
# for col in ['total_builtup_area', 'total_plinth_area']:
#     if col in property_df.columns:
#         property_df = property_df.drop(col, axis=1)

property_df = property_df.merge(area_summary, left_on='propertyid' ,right_on='propertyid_property', how='left')
property_df['total_builtup_area'] = property_df['total_builtup_area'].fillna(0)
property_df['total_plinth_area'] = property_df['total_plinth_area'].fillna(0)

print(property_df)

                                         id       propertyid   tenantid  \
0      3949d5ff-bf33-4d9c-ab4b-90f0b3393eb0  PT-1503-2032754  pb.kharar   
1      9dd602f3-761b-4928-bec1-ea97141af1ef  PT-1503-2025080  pb.kharar   
2      cb3beec6-1fb7-4d53-9194-292119386cc7  PT-1503-1544579  pb.kharar   
3      dfe6ca3c-06d5-463e-a3ff-5c504f4fd965  PT-1503-2110143  pb.kharar   
4      5893f98c-c156-4da2-bc19-5104aad93497  PT-1503-2110145  pb.kharar   
...                                     ...              ...        ...   
46832  616f468e-9f9c-43fd-b83e-3618cef07d2a   PT-1503-086007  pb.kharar   
46833  da124d4b-550c-4860-865c-1693f09acf2f  PT-1503-1994060  pb.kharar   
46834  fe051f92-aafe-4490-b70e-acec827ced30  PT-1503-1994078  pb.kharar   
46835  65eae68a-d39d-47d8-ae47-411633e83d08   PT-1503-635396  pb.kharar   
46836  04853ec9-a6c9-4ffa-a00e-1c723a53b8bd  PT-1503-1994085  pb.kharar   

       status                 propertytype          ownershipcategory  \
0      ACTIVE       BUILTU

In [52]:
# for col in ['total_builtup_area_x', 'total_plinth_area_x', 'total_builtup_area_y', 'total_plinth_area_y', 'propertyid_property_y', 'propertyid_property', 'total_builtup_area', 'total_plinth_area']:
#     if col in property_df.columns:
#         property_df = property_df.drop(col, axis=1)

# print(property_df)

In [53]:
property_result_merged = property_df.merge(
    result,
    left_on='propertyid',
    right_on='consumercode',
    how='left'
)

print(property_result_merged)

                                         id       propertyid   tenantid  \
0      3949d5ff-bf33-4d9c-ab4b-90f0b3393eb0  PT-1503-2032754  pb.kharar   
1      9dd602f3-761b-4928-bec1-ea97141af1ef  PT-1503-2025080  pb.kharar   
2      cb3beec6-1fb7-4d53-9194-292119386cc7  PT-1503-1544579  pb.kharar   
3      dfe6ca3c-06d5-463e-a3ff-5c504f4fd965  PT-1503-2110143  pb.kharar   
4      5893f98c-c156-4da2-bc19-5104aad93497  PT-1503-2110145  pb.kharar   
...                                     ...              ...        ...   
46832  616f468e-9f9c-43fd-b83e-3618cef07d2a   PT-1503-086007  pb.kharar   
46833  da124d4b-550c-4860-865c-1693f09acf2f  PT-1503-1994060  pb.kharar   
46834  fe051f92-aafe-4490-b70e-acec827ced30  PT-1503-1994078  pb.kharar   
46835  65eae68a-d39d-47d8-ae47-411633e83d08   PT-1503-635396  pb.kharar   
46836  04853ec9-a6c9-4ffa-a00e-1c723a53b8bd  PT-1503-1994085  pb.kharar   

       status                 propertytype          ownershipcategory  \
0      ACTIVE       BUILTU

In [23]:
property_result_merged.to_csv('result_with_areas.csv', index= False)

In [55]:
pt_owner_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/eg_pt_owner.csv',
    usecols=['propertyid', 'ownertype']
)
print(pt_owner_df.shape)

exempt_types = {'FREEDOMFIGHTER', 'WIDOW'}

# For each propertyid, check if ANY owner has an exempt type
exempt_flags = (
    pt_owner_df.groupby('propertyid')['ownertype']
    .apply(lambda x: any(o in exempt_types for o in x))
    .reset_index()
    .rename(columns={'ownertype': 'Is property exempted'})
)

property_result_merged = property_result_merged.merge(
    exempt_flags,
    left_on='id',
    right_on='propertyid',
    how='left'
)

# Convert True/False to 'Yes'/'No', fill missing (i.e., no owners) as 'No'
property_result_merged['Is property exempted'] = (
    property_result_merged['Is property exempted'].fillna(False)
    .map({True: 'Yes', False: 'No'})
)


(72985, 2)


In [56]:
property_result_merged.head()

Unnamed: 0,id,propertyid_x,tenantid,status,propertytype,ownershipcategory,usagecategory,createdtime,additionaldetails,propertyid_property_x,...,consumercode,earliest_fy,latest_fy,latest_fy_taxamount,current_fy_taxamount,arrear_years_demand_generated,PT_TIME_PENALTY,PT_TIME_INTEREST,propertyid_y,Is property exempted
0,3949d5ff-bf33-4d9c-ab4b-90f0b3393eb0,PT-1503-2032754,pb.kharar,ACTIVE,BUILTUP.SHAREDPROPERTY,INDIVIDUAL.SINGLEOWNER,RESIDENTIAL,1734679839992,"{""remarks"": ""CORRECTION"", ""marketValue"": ""0"", ...",PT-1503-2032754,...,PT-1503-2032754,2014-15,2024-25,303.0,0.0,0.0,504.56,2321.03,3949d5ff-bf33-4d9c-ab4b-90f0b3393eb0,No
1,9dd602f3-761b-4928-bec1-ea97141af1ef,PT-1503-2025080,pb.kharar,ACTIVE,BUILTUP.SHAREDPROPERTY,INDIVIDUAL.MULTIPLEOWNERS,RESIDENTIAL,1732783076067,"{""yearConstruction"": null}",PT-1503-2025080,...,PT-1503-2025080,2014-15,2024-25,218.0,0.0,0.0,362.92,1649.94,9dd602f3-761b-4928-bec1-ea97141af1ef,No
2,cb3beec6-1fb7-4d53-9194-292119386cc7,PT-1503-1544579,pb.kharar,ACTIVE,BUILTUP.INDEPENDENTPROPERTY,INDIVIDUAL.SINGLEOWNER,RESIDENTIAL,1732680694531,"{""remarks"": ""property sale deed "", ""marketValu...",PT-1503-1544579,...,PT-1503-1544579,2016-17,2024-25,299.0,0.0,0.0,318.05,895.85,cb3beec6-1fb7-4d53-9194-292119386cc7,No
3,dfe6ca3c-06d5-463e-a3ff-5c504f4fd965,PT-1503-2110143,pb.kharar,ACTIVE,BUILTUP.SHAREDPROPERTY,INDIVIDUAL.SINGLEOWNER,RESIDENTIAL,1748845358814,"{""yearConstruction"": null}",PT-1503-2110143,...,PT-1503-2110143,2021-22,2025-26,184.0,184.0,0.0,0.0,0.0,dfe6ca3c-06d5-463e-a3ff-5c504f4fd965,No
4,5893f98c-c156-4da2-bc19-5104aad93497,PT-1503-2110145,pb.kharar,ACTIVE,BUILTUP.INDEPENDENTPROPERTY,INDIVIDUAL.SINGLEOWNER,RESIDENTIAL,1748845407024,"{""yearConstruction"": null}",PT-1503-2110145,...,PT-1503-2110145,2014-15,2025-26,0.0,0.0,0.0,0.0,0.0,5893f98c-c156-4da2-bc19-5104aad93497,Yes


In [59]:
# build final report: rename & reorder
report = property_result_merged.rename(columns={
    'tenantid': 'ULB',
    'propertyid_x': 'Property ID',
    'usagecategory': 'Usage',
    'createdtime': 'Date of Creation of the Property in the System',
    'additionaldetails': 'Date of Construction of the Property',
    'ownershipcategory': 'Ownership Type',
    'Owned_Rented': 'Owned_Rented (Owner/ Rented/ Mixed)',
    'earliest_fy': 'Earliest Financial Year for which Demand was Generated',
    'latest_fy': 'Latest Financial Year for which Demand was Generated',
    'latest_fy_taxamount': 'Latest Demand Generated [in Rs.]',
    'current_fy_taxamount': 'Current Years Demand Generated [in Rs.]',
    'PT_TIME_PENALTY': 'Penalty',
    'PT_TIME_INTEREST': 'Interest',
    'arrear_years_demand_generated': 'Arrear Years Demand Generated [in Rs.]',
    'propertytype' : 'Property Type[Building/ Vacant]',
    'total_builtup_area' : 'Total Builtup Area [Sum of all units/ floors]',
    'total_plinth_area' : 'Total Plinth Area [Sum of all units/ floors]', 
    'Is property exempted' : 'Is Property Exempted [Yes/ No]'
}).copy()

def epoch_to_custom_date(epoch_ms):
    return datetime.fromtimestamp(epoch_ms / 1000).strftime('%d-%b-%Y') if pd.notna(epoch_ms) else None

def get_year_construction(val):
    if pd.isna(val): return None
    try: return json.loads(val).get('yearConstruction')
    except: return None

# format ULB & dates
report['ULB'] = report['ULB'].str.split('.').str[1].str.capitalize()
report['Date of Creation of the Property in the System'] = report['Date of Creation of the Property in the System'].apply(epoch_to_custom_date)
report['Date of Construction of the Property'] = report['Date of Construction of the Property'].apply(get_year_construction)

# then keep only those columns, in the exact order you want:
final_report = report[
    [
        'ULB',
        'Property ID',
        'Usage',
        'Date of Creation of the Property in the System',
        'Date of Construction of the Property',
        'Ownership Type',
        'Is Property Exempted [Yes/ No]',
        'Owned_Rented (Owner/ Rented/ Mixed)',
        'Earliest Financial Year for which Demand was Generated',
        'Latest Financial Year for which Demand was Generated',
        'Latest Demand Generated [in Rs.]',
        'Current Years Demand Generated [in Rs.]',
        'Penalty',
        'Interest',
        'Arrear Years Demand Generated [in Rs.]',
        'Property Type[Building/ Vacant]',
        'Total Builtup Area [Sum of all units/ floors]',
        'Total Plinth Area [Sum of all units/ floors]'
    ]
].copy()

print("✅ Writing CSV")
final_report.to_csv('Punjab_Data_Analysis_Kharar.csv', index=False)
print(f"🎉 Done! CSV generated with {len(final_report)} properties")

✅ Writing CSV
🎉 Done! CSV generated with 46837 properties


In [32]:
# joined_demand[joined_demand['consumercode'] == 'PT-1503-1994032']
joined_demand[joined_demand['consumercode'].str.contains('PT-1503-011887', na=False)]

# joined_demand.head()
print(joined_demand.shape)

print(joined_demand['id'].nunique())

(3490242, 10)
285210


In [33]:
demand_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/kharar/egbs_demand_v1.csv',
    usecols=['id', 'taxperiodfrom', 'taxperiodto', 'consumercode', 'status']
)
demand_df = demand_df[demand_df['status'] == 'ACTIVE'].copy()

print(demand_df['id'].nunique())

287351
