In [1]:
import os
import pandas as pd
from datetime import datetime
import json
import gc

# Folder holding the demand-detail CSV exports (one or more files).
folder_path_demanddetails = '/home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/output_demand_details/'

# Read properties (only the columns used later) and keep the ACTIVE ones.
property_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/eg_pt_property.csv',
    usecols=['id', 'propertyid', 'tenantid', 'createdtime', 'additionaldetails', 'ownershipcategory', 'status', 'usagecategory']
)
property_df = property_df[property_df['status'] == 'ACTIVE'].copy()

# read units (currently disabled)
# unit_df = pd.read_csv(
#     '/home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/eg_pt_unit.csv',
#     usecols=['propertyid', 'occupancytype']
# )

# Read demands; consumercode is forced to str so the codes stay textual.
demand_df = pd.read_csv(
    '/home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/egbs_demand_v1.csv',
    dtype={"consumercode": str},
    low_memory=False,
    usecols=['id', 'taxperiodfrom', 'taxperiodto', 'consumercode', 'status', 'businessservice']
)
# Keep only ACTIVE demands of the PT business service (single mask, single copy).
demand_df = demand_df[
    (demand_df['status'] == 'ACTIVE') & (demand_df['businessservice'] == 'PT')
].copy()

# Read every demand-detail CSV in the folder and concatenate them.
# os.listdir returns names in arbitrary order, so the listing is sorted to make
# the row order of demand_details_df reproducible between runs and machines.
all_chunks = []
needed_cols = ['demandid', 'taxamount', 'collectionamount', 'taxheadcode']
for filename in sorted(os.listdir(folder_path_demanddetails)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path_demanddetails, filename)
        print(f'Loading: {file_path}')
        chunk = pd.read_csv(file_path, usecols=needed_cols)
        all_chunks.append(chunk)
demand_details_df = pd.concat(all_chunks, ignore_index=True)
# Drop the per-file frames once they have been concatenated.
del all_chunks; gc.collect()

print("✅ Loaded data")

Loading: /home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/output_demand_details/output_1.csv
Loading: /home/prerna/Punjab/punjab-data-prod-analysis/srihargobindpur/output_demand_details/output_0.csv
✅ Loaded data


In [2]:
# Row counts, one per line: active properties, active PT demands, demand-detail rows.
# (unit_df is not counted because its loading is commented out.)
print(
    len(property_df),
    len(demand_df),
    len(demand_details_df),
    sep='\n',
)

1528
4596
91995


In [3]:
# Left-join the detail rows onto their parent demand (demand.id == detail.demandid),
# so every demand is kept even when it has no detail rows.
joined_demand = pd.merge(
    demand_df,
    demand_details_df,
    how='left',
    left_on='id',
    right_on='demandid',
    suffixes=('_demand', '_detail'),
)
# Number of distinct demands present after the join.
print(joined_demand['id'].nunique())

# The two source frames are no longer needed once joined.
del demand_details_df, demand_df
gc.collect()
joined_demand.head()

4596


Unnamed: 0,id,consumercode,businessservice,taxperiodfrom,taxperiodto,status,demandid,taxheadcode,taxamount,collectionamount
0,17580,PT-808-009413,PT,1522540800000,1554076799000,ACTIVE,17580,PT_OWNER_EXEMPTION,0.0,0.0
1,17580,PT-808-009413,PT,1522540800000,1554076799000,ACTIVE,17580,PT_TIME_REBATE,0.0,0.0
2,17580,PT-808-009413,PT,1522540800000,1554076799000,ACTIVE,17580,PT_UNIT_USAGE_EXEMPTION,0.0,0.0
3,17580,PT-808-009413,PT,1522540800000,1554076799000,ACTIVE,17580,PT_ROUNDOFF,-0.27,-0.27
4,17580,PT-808-009413,PT,1522540800000,1554076799000,ACTIVE,17580,PT_TIME_INTEREST,0.0,0.0


In [4]:
import pytz

# Tax-period bounds are parsed as milliseconds since the epoch (UTC)
# and then converted to IST (Asia/Kolkata).
ist = pytz.timezone('Asia/Kolkata')
joined_demand['taxperiodfrom'] = (
    pd.to_datetime(joined_demand['taxperiodfrom'], unit='ms', utc=True)
    .dt.tz_convert(ist)
)
joined_demand['taxperiodto'] = (
    pd.to_datetime(joined_demand['taxperiodto'], unit='ms', utc=True)
    .dt.tz_convert(ist)
)

# Financial year calculation
def get_fy(date):
    """Return the April-to-March financial year label for ``date``.

    Parameters
    ----------
    date : object exposing ``.year`` and ``.month`` (e.g. datetime, pandas Timestamp)

    Returns
    -------
    str
        ``"<start year>-<last two digits of end year>"``, e.g. ``"2018-19"``.
        Months April and later belong to the FY starting that calendar year;
        January to March belong to the FY that started the previous year.
    """
    start_year = date.year if date.month >= 4 else date.year - 1
    end_suffix = str(start_year + 1)[-2:]
    return f"{start_year}-{end_suffix}"

# Label every joined row with the financial year of its tax-period start.
joined_demand['fy'] = joined_demand['taxperiodfrom'].apply(get_fy)

# Earliest and latest FY label per consumercode.
fy_by_consumer = joined_demand.groupby('consumercode')['fy']
result = fy_by_consumer.agg(earliest_fy='min', latest_fy='max').reset_index()

print(result)

      consumercode earliest_fy latest_fy
0    PT-808-003622     2018-19   2023-24
1    PT-808-004348     2014-15   2025-26
2    PT-808-004360     2015-16   2025-26
3    PT-808-004439     2018-19   2025-26
4    PT-808-004457     2018-19   2025-26
..             ...         ...       ...
556  PT-808-990360     2014-15   2023-24
557  PT-808-995234     2015-16   2023-24
558  PT-808-995256     2015-16   2023-24
559  PT-808-995270     2020-21   2023-24
560  PT-808-997331     2014-15   2023-24

[561 rows x 3 columns]


In [5]:
# Attach each consumer's latest FY to its demand rows (adds a 'latest_fy' column).
joined = joined_demand.merge(
    result[['consumercode', 'latest_fy']],
    on='consumercode',
    how='left'
)

# Keep only the rows that belong to that consumer's latest FY.
latest_demand = joined[joined['fy'] == joined['latest_fy']]

# One row per consumercode, one column per taxheadcode, summed taxamount.
# NOTE(review): rows without a taxheadcode (demands with no detail rows) are
# presumably dropped by the pivot, leaving NaN after the merge below - confirm.
pivoted = latest_demand.pivot_table(
    index='consumercode',
    columns='taxheadcode',
    values='taxamount',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Formula:
# PT_TAX + PT_CANCER_CESS + PT_FIRE_CESS + PT_ROUNDOFF
#   - (|PT_OWNER_EXEMPTION| + |PT_UNIT_USAGE_EXEMPTION|)
# reindex guarantees that every tax head exists (missing ones become 0).
# The previous `pivoted.get(head, 0).abs()` raised AttributeError whenever an
# exemption head was absent, because the default was a plain int.
latest_heads = pivoted.reindex(
    columns=[
        'PT_TAX', 'PT_CANCER_CESS', 'PT_FIRE_CESS', 'PT_ROUNDOFF',
        'PT_OWNER_EXEMPTION', 'PT_UNIT_USAGE_EXEMPTION',
    ],
    fill_value=0,
)
pivoted['latest_fy_taxamount'] = (
    latest_heads['PT_TAX'] +
    latest_heads['PT_CANCER_CESS'] +
    latest_heads['PT_FIRE_CESS'] +
    latest_heads['PT_ROUNDOFF'] -
    ( latest_heads['PT_OWNER_EXEMPTION'].abs() + latest_heads['PT_UNIT_USAGE_EXEMPTION'].abs() )
)

# Merge back into result. Any column left by a previous run of this cell is
# dropped first so that re-running does not create *_x / *_y duplicates.
result = result.drop(columns=['latest_fy_taxamount'], errors='ignore').merge(
    pivoted[['consumercode', 'latest_fy_taxamount']],
    on='consumercode',
    how='left'
)

print(result.head())


    consumercode earliest_fy latest_fy  latest_fy_taxamount
0  PT-808-003622     2018-19   2023-24               453.45
1  PT-808-004348     2014-15   2025-26              1003.60
2  PT-808-004360     2015-16   2025-26               211.74
3  PT-808-004439     2018-19   2025-26                77.61
4  PT-808-004457     2018-19   2025-26                97.58


In [6]:
# Tax amount (demand) of the current financial year, using the formula below.
target_fy = "2025-26"
current_fy_demand = joined_demand[joined_demand['fy'] == target_fy]

# One row per consumercode, one column per taxheadcode, summed taxamount.
pivoted_current = current_fy_demand.pivot_table(
    index='consumercode',
    columns='taxheadcode',
    values='taxamount',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Formula:
# PT_TAX + PT_CANCER_CESS + PT_FIRE_CESS + PT_ROUNDOFF
#   - (|PT_OWNER_EXEMPTION| + |PT_UNIT_USAGE_EXEMPTION|)
# reindex guarantees that every tax head exists (missing ones become 0).
# The previous `pivoted_current.get(head, 0).abs()` raised AttributeError
# whenever an exemption head was absent, because the default was a plain int.
current_heads = pivoted_current.reindex(
    columns=[
        'PT_TAX', 'PT_CANCER_CESS', 'PT_FIRE_CESS', 'PT_ROUNDOFF',
        'PT_OWNER_EXEMPTION', 'PT_UNIT_USAGE_EXEMPTION',
    ],
    fill_value=0,
)
pivoted_current['current_fy_taxamount'] = (
    current_heads['PT_TAX'] +
    current_heads['PT_CANCER_CESS'] +
    current_heads['PT_FIRE_CESS'] +
    current_heads['PT_ROUNDOFF'] -
    ( current_heads['PT_OWNER_EXEMPTION'].abs() + current_heads['PT_UNIT_USAGE_EXEMPTION'].abs() )
)

# Keep only required cols
pivoted_current = pivoted_current[['consumercode', 'current_fy_taxamount']]

# Ensure all consumercodes are present; those without a target-FY amount get 0.
all_consumercodes = pd.DataFrame(joined_demand['consumercode'].unique(), columns=['consumercode'])
final = all_consumercodes.merge(pivoted_current, on='consumercode', how='left')
final['current_fy_taxamount'] = final['current_fy_taxamount'].fillna(0)

# Merge into result. Any column left by a previous run of this cell is dropped
# first so that re-running does not create *_x / *_y duplicates.
result = result.drop(columns=['current_fy_taxamount'], errors='ignore').merge(
    final, on='consumercode', how='left'
)
result['current_fy_taxamount'] = result['current_fy_taxamount'].fillna(0)

print(result.head())


    consumercode earliest_fy latest_fy  latest_fy_taxamount  \
0  PT-808-003622     2018-19   2023-24               453.45   
1  PT-808-004348     2014-15   2025-26              1003.60   
2  PT-808-004360     2015-16   2025-26               211.74   
3  PT-808-004439     2018-19   2025-26                77.61   
4  PT-808-004457     2018-19   2025-26                97.58   

   current_fy_taxamount  
0                  0.00  
1               1003.60  
2                211.74  
3                 77.61  
4                 97.58  


In [7]:
# Attach the per-consumer FY summary to every active property
# (property.propertyid == result.consumercode); properties without a match are kept.
property_result_merged = pd.merge(
    property_df,
    result,
    how='left',
    left_on='propertyid',
    right_on='consumercode',
)

print(property_result_merged)

                                        id      propertyid  \
0     3255e541-747d-4867-a2ec-80fb51942f8f  PT-808-2068803   
1     65b4b946-099e-42c9-93de-8b810ea33a9c  PT-808-2068804   
2     9fc15c76-50c2-47e3-a3f2-3de0c81b1f56  PT-808-2068805   
3     acbad58a-265d-4195-987b-bcc51513d8f0  PT-808-2068808   
4     d757a948-a01a-486c-a272-282a3f2b5785  PT-808-2068812   
...                                    ...             ...   
1523  499d6a6d-88ba-4756-9d44-5c5b51dfea65  PT-808-1990386   
1524  d4fd73fd-1bef-41b4-b7b9-5234017a6224  PT-808-1990650   
1525  a6d5746c-4a1c-4c20-b64a-7391815392bf  PT-808-1990686   
1526  03c69f7b-123c-4504-8e24-69880a27b579  PT-808-1991771   
1527  3e8025ba-d978-4369-b0bb-439758c11258  PT-808-1992011   

                tenantid  status       ownershipcategory  \
0     pb.srihargobindpur  ACTIVE  INDIVIDUAL.SINGLEOWNER   
1     pb.srihargobindpur  ACTIVE  INDIVIDUAL.SINGLEOWNER   
2     pb.srihargobindpur  ACTIVE  INDIVIDUAL.SINGLEOWNER   
3     pb.srihar

In [8]:
# Write the merged property/demand summary to a CSV in the working directory,
# without the DataFrame index column.
property_result_merged.to_csv(
    path_or_buf='Punjab_Data_Analysis_srihargobindpur_final_2.csv',
    index=False,
)