In [8]:
#Step-1: Bring in raw data

import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

# Read the raw data
df = pd.read_csv("ACETP_Raw_Data.csv")


# Bring in Segment Field
df2 = pd.read_csv("Ind_Seg_Mapping.csv")   # has industry + segment

# Collapse identical (industry, segment) rows so a repeated mapping line cannot
# duplicate company rows in the left join. validate="many_to_one" then makes an
# industry that maps to two different segments raise a MergeError instead of
# silently multiplying rows.
segment_map = df2[["CD_Industry", "Segment"]].drop_duplicates()
df = df.merge(segment_map, on="CD_Industry", how="left", validate="many_to_one")

# Replace missing segments with 'Other' (industries absent from the mapping)
df["Segment"] = df["Segment"].fillna("Other")

# Fill NaN in numeric columns only.
# NOTE: a missing financial value becomes 0, so later ratio calculations treat
# "not reported" the same as a reported zero.
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(0)

# Drop identifier columns that are not needed, then discard unmapped companies
df = df.drop(columns=["Sr.No.", "Accord Code"])
df = df[df["Segment"] != "Other"]


df.shape

(1054, 308)

In [4]:
# Step-2: Create separate dataframes for FYs

# Identifier columns carried into every financial-year frame
common_cols = [
    "Company Name",
    "CD_CIN Number",
    "CD_Industry",
    "CD_Economic Activity(NIC)",
    "Segment",
]

# Bucket each column by its trailing character: "...1" -> FY2023, "...2" -> FY2022,
# anything else that is not an identifier -> FY2024.
# NOTE(review): presumably these are pandas' duplicate-header suffixes (".1", ".2");
# a base column whose own name ends in 1 or 2 would be mis-bucketed - confirm.
cols_2024, cols_2023, cols_2022 = [], [], []
for col_name in df.columns:
    if col_name.endswith("1"):
        cols_2023.append(col_name)
    elif col_name.endswith("2"):
        cols_2022.append(col_name)
    elif col_name not in common_cols:
        cols_2024.append(col_name)

# One frame per financial year: identifiers first, then that year's financials
df_2024, df_2023, df_2022 = (
    df[common_cols + fy_cols] for fy_cols in (cols_2024, cols_2023, cols_2022)
)

In [39]:
# Step-3: Function to create derived fields

def find_col(df, keyword):
    """Find the column that best matches ``keyword`` (case-insensitive).

    Candidates are taken from ``df.columns`` in order, and the best match wins:

    1. a column whose name equals the keyword;
    2. otherwise the first column whose name starts with the keyword;
    3. otherwise the first column whose name merely contains the keyword.

    Preferring exact/prefix matches keeps a lookup such as "Other Income" from
    resolving to "RPTS_Other Income" just because that column happens to come
    first. When only substring matches exist, the first one is returned.

    Parameters
    ----------
    df : object exposing a ``columns`` iterable (e.g. a pandas DataFrame)
    keyword : str
        Text to look for in the column names.

    Returns
    -------
    The matching column label (as stored in ``df.columns``), or ``None`` when
    no column contains the keyword.
    """
    needle = str(keyword).lower()  # lowered once, not once per column
    prefix_hit = None
    substring_hit = None

    for col in df.columns:
        # str() lets non-string labels (e.g. integer headers) be compared safely
        label = str(col).lower()
        if label == needle:
            return col
        if prefix_hit is None and label.startswith(needle):
            prefix_hit = col
        elif substring_hit is None and needle in label:
            substring_hit = col

    return prefix_hit if prefix_hit is not None else substring_hit  # None if not found



def create_derived_fields(df):
    """Return a copy of ``df`` with operating and percentage fields added.

    Source columns are located by keyword through ``find_col``.

    Columns added
    -------------
    OR               : the "Net Sales" column.
    OC               : "Total Expenditure" - "Interest Expenses" - "Donations".
    PLI1(OP/OC)(%)   : (OR - OC) / OC * 100.
    Service_Inc_%    : Net Sales / (Net Sales + Other Income) * 100.
    Export_Inc_%     : "Sales - Exports" / OR * 100.
    RPT-inc(%)       : sum of the RPTS income columns / OR * 100.
    RPT-pay(%)       : sum of the RPTS payment columns / OC * 100.
    RPT_%            : row-wise maximum of RPT-inc(%) and RPT-pay(%).
    Emp_Cost_%       : "Employee Cost" / OC * 100.

    Service_Inc_%, Export_Inc_%, RPT_% and Emp_Cost_% are clipped to [0, 100].

    A zero denominator yields NaN instead of +/-inf, so an undefined ratio is
    not written out as "inf" (PLI) or clipped into a misleading 100 %.

    Parameters
    ----------
    df : pandas.DataFrame
        Financials for one year; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame containing the original columns plus the derived ones.

    Raises
    ------
    KeyError
        If a required source column cannot be found.
    """

    def col(keyword):
        # Resolve a keyword to its Series, naming the keyword when it is missing
        # (a bare df[None] would only report "KeyError: None").
        name = find_col(df, keyword)
        if name is None:
            raise KeyError(f"no column matching {keyword!r} found in DataFrame")
        return df[name]

    def total(keywords):
        # Left-to-right sum of the columns matching each keyword.
        series = [col(keyword) for keyword in keywords]
        return sum(series[1:], series[0])

    def pct(numerator, denominator):
        # Percentage ratio; zero denominators become NaN so no +/-inf is produced.
        return numerator / denominator.replace(0, np.nan) * 100

    net_sales = col("Net Sales")
    operating_revenue = net_sales
    operating_cost = (
        col("Total Expenditure") - col("Interest Expenses") - col("Donations")
    )

    rpt_income = total([
        "RPTS_Sales",
        "RPTS_Discount Income",
        "RPTS_Commission Income",
        "RPTS_Claims Received",
        "RPTS_Other Income",
    ])
    rpt_payments = total([
        "RPTS_Purchases",
        "RPTS_Interest Expenses",
        "RPTS_Director Remuneration",
        "RPTS_Rent Expense",
        "RPTS_Expenses",
        "RPTS_Dividend Expense",
        "RPTS_Discount Expense",
        "RPTS_Commission Expense",
        "RPTS_Claims Paid",
    ])

    # Build all derived cols at once
    derived = {
        "OR": operating_revenue,
        "OC": operating_cost,
        "PLI1(OP/OC)(%)": pct(operating_revenue - operating_cost, operating_cost),
        "Service_Inc_%": pct(net_sales, net_sales + col("Other Income")),
        "Export_Inc_%": pct(col("Sales - Exports"), operating_revenue),
        "RPT-inc(%)": pct(rpt_income, operating_revenue),
        "RPT-pay(%)": pct(rpt_payments, operating_cost),
        "Emp_Cost_%": pct(col("Employee Cost"), operating_cost),
    }
    # Row-wise max; a NaN on one side is skipped, NaN on both sides stays NaN
    derived["RPT_%"] = pd.concat(
        [derived["RPT-inc(%)"], derived["RPT-pay(%)"]], axis=1
    ).max(axis=1)

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    result = df.assign(**derived)

    # Cap % columns between 0 and 100
    pct_cols = ["Service_Inc_%", "Export_Inc_%", "RPT_%", "Emp_Cost_%"]
    result[pct_cols] = result[pct_cols].clip(0, 100)

    return result




In [40]:
# Step-4: Function to rename & select fields

def select_and_rename(df):
    """Return the reporting subset of ``df`` under standardized column names.

    Keeps the identifier columns, the net-worth column (located by keyword via
    ``find_col``), and the derived metrics, in that order. The identifiers,
    net worth and PLI are renamed; "Segment", "OR" and the ``*_%`` columns
    keep their names.
    """
    net_worth_col = find_col(df, "Net Worth")

    # Identifier columns, in output order
    id_cols = [
        "Company Name",
        "CD_CIN Number",
        "CD_Industry",
        "CD_Economic Activity(NIC)",
        "Segment",
    ]

    # Financial / derived columns, in output order
    metric_cols = [
        net_worth_col,
        "OR",
        "PLI1(OP/OC)(%)",
        "Emp_Cost_%",
        "Service_Inc_%",
        "Export_Inc_%",
        "RPT_%",
    ]

    # Original name -> standard name
    new_names = {
        "Company Name": "CompanyName",
        "CD_CIN Number": "CIN",
        "CD_Industry": "Industry",
        "CD_Economic Activity(NIC)": "EconomicActivity",
        net_worth_col: "Networth",
        "PLI1(OP/OC)(%)": "PLI",
    }

    subset = df.loc[:, id_cols + metric_cols]
    return subset.rename(columns=new_names)




In [41]:
# Step-5: Combine all and save

# (financial-year label, source frame) pairs, newest first
fy_inputs = [
    ("2024-25", df_2024),
    ("2023-24", df_2023),
    ("2022-23", df_2022),
]

dfs = []
for fy, data in fy_inputs:
    # Work on a copy so the per-year source frames stay untouched
    df_proc = select_and_rename(create_derived_fields(data.copy()))
    df_proc["FY"] = fy  # tag every row with its financial year
    dfs.append(df_proc)

# Stack the three years into one long table with a fresh 0..n-1 index
final_df = pd.concat(dfs, ignore_index=True)

# Save to CSV
final_df.to_csv("ACETP_Processed_Data.csv", index=False)
