In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Project root is taken to be the parent of the notebook's working directory.
BASE_DIR = Path.cwd().parent
data_path = BASE_DIR.joinpath(
    "1. data", "raw", "prowess", "extracted", "Prowess_All_Data_Raw.csv"
)

# The raw export has a two-row header, so load just those two rows first.
df_header = pd.read_csv(data_path, header=None, nrows=2)

# Join the two header rows column by column as "<row 1>_<row 2>".
# Blank cells become '' so that stripping "_" leaves single-level names clean.
top_row = df_header.iloc[0].fillna('')
bottom_row = df_header.iloc[1].fillna('')
combined_header = (top_row + "_" + bottom_row).str.strip("_")

# Load the data rows (everything below the header) under the combined names.
df = pd.read_csv(data_path, names=combined_header, skiprows=2)

# Attach the segment of each industry group from the mapping file.
data_path2 = BASE_DIR.joinpath(
    "1. data", "raw", "prowess", "extracted", "Seg_Mapping.csv"
)
df2 = pd.read_csv(data_path2)   # has industry + segment
df = pd.merge(df, df2, how="left", on="Industry group")

# Industry groups absent from the mapping are labelled 'Other'.
df["Segment"] = df["Segment"].where(df["Segment"].notna(), "Other")

print(df.shape)
df.head()

(7954, 210)


Unnamed: 0,Company Name,Business Description,Industry group,CIN,Main product/service group,NIC name,NIC code,YE-Mar 2020_Short term trade receivables & bills receivable,YE-Mar 2020_Long term trade receivables,YE-Mar 2024_RPT-pay(%),...,YE-Mar 2022_Intangibles Filter,YE-Mar 2022_Receivables Filter,YE-Mar 2022_RPT Filter,YE-Mar 2022_Filters Sum,YE-Mar 2022_Overall Filter,YE-Mar 2022_Adj-PLI1(OP/OC),YE-Mar 2022_Adj-PLI2(OP/OR),All Years_Wt. PLI1 (%),All Years_Wt. PLI2 (%),Segment
0,Teamo Productions H Q Ltd.,Teamo Productions H Q Ltd. is a public limited...,Computer software,L74110DL2006PLC413221,Software services,Providing software support and maintenance to ...,62013.0,3.01,,1.26,...,1,0,1,0,1,133.33,57.14,2750.89,96.49,SDS
1,Shriram Value Services Ltd.,Shriram Value Services Ltd. is a public limite...,ITES,U63090TN1995PLC033513,Information technology enabled service/BPO,Other information service activities n.e.c.,63999.0,73.42,,14.89,...,1,1,0,0,1,4079.69,97.61,2638.39,96.35,ITES
2,Narayana Institute For Advanced Research Pvt. ...,Narayana Institute For Advanced Research Pvt. ...,Business services & consultancy,U85121KA2006PTC040989,Research & development services,Scientific research and Development,72.0,,,,...,1,1,0,0,1,1700.0,94.44,1700.0,94.44,ITES
3,Melstar Information Technologies Ltd.,Melstar Information Technologies Ltd. is a pub...,Computer software,L85493MH1986PLC040604,Software services,Providing software support and maintenance to ...,62013.0,7.65,,,...,1,1,1,0,1,-100.0,-999.0,1151.55,92.01,SDS
4,Yume India Pvt. Ltd.,Yume India Pvt. Ltd. is a private limited comp...,Computer software,U72200TN2008PTC069574,Software services,Providing software support and maintenance to ...,62013.0,16.6,,,...,1,0,1,0,1,900.0,90.0,900.0,90.0,SDS


In [3]:
# ----------------------------
# Function to process one FY
# ----------------------------
def process_year(df, year):
    """Build the per-company screening table for one year of the wide frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame holding the identifier columns ("CIN", "Company Name",
        "Industry group", "Segment") plus year-specific columns named
        "<prefix>_<metric>" whose name contains the year, e.g.
        "YE-Mar 2022_OR".
    year : str or int
        Year to extract. It is matched as a substring of the column names.

    Returns
    -------
    pandas.DataFrame
        One row per retained company with the columns cin, company_name,
        industry_sector, segment, net_worth, persistent_loss, or, pli,
        emp_cost_pct, rpt_pct, export_income_pct, service_income_pct and
        gross_intbl_pct (pass-through columns absent from ``df`` are simply
        omitted). Rows are dropped when PLI equals -999, when OR is missing
        or below 1, or when the segment is "Other". The index is reset.

    Raises
    ------
    KeyError
        If a metric needed by the filters or the derived ratios has no
        column for ``year``.
    """
    # Accept 2022 as well as "2022": the substring test below needs a str.
    year = str(year)

    id_cols = ["CIN", "Company Name", "Industry group", "Segment"]

    # ---------- Cell 1: Keep identifier columns + this year's columns ----------
    year_cols = [col for col in df.columns if col in id_cols or year in col]

    # strip prefix up to first underscore ("YE-Mar 2022_OR" -> "OR")
    def remove_prefix(colname):
        if "_" in colname and not colname.startswith(("Company Name", "Industry group")):
            return colname.split("_", 1)[1]
        return colname

    df_filtered = df[year_cols].rename(columns=remove_prefix)

    # ---------- Cell 2: Keep only required cols + filters ----------
    # Carried through to the output when present, but not used in any formula.
    passthrough_cols = ["CIN", "Company Name", "Industry group", "Net worth"]
    # Read by the filters / ratios below; fail early and by name if absent
    # instead of surfacing a bare KeyError halfway through the computation.
    needed_cols = [
        "Segment",
        "Compensation to employees",
        "Trading revenue and non-financial services",
        "Of which: Rent/Operating lease rent income",
        "Of which: Trading income",
        "RPT-inc(%)", "RPT-pay(%)",
        "Total forex earnings",
        "Persistent Loss Filter",
        "Gross intangible assets",
        "OR", "OP", "PLI1(OP/OC)(%)",
    ]
    missing = [col for col in needed_cols if col not in df_filtered.columns]
    if missing:
        raise KeyError(f"process_year: no {year} column(s) found for {missing}")

    present = [col for col in passthrough_cols + needed_cols
               if col in df_filtered.columns]
    df_filtered = df_filtered[present]

    # -999 in the PLI column is dropped (presumably a "not computable"
    # sentinel -- confirm against the data dictionary). OR must be positive
    # because it is the denominator of most ratios below.
    valid_rows = (df_filtered["PLI1(OP/OC)(%)"] != -999) & (df_filtered["OR"] > 0)

    # ---------- Cell 3: Derived % columns ----------
    # Missing values are treated as zero in every column.
    df_final = df_filtered[valid_rows].fillna(0)

    revenue = df_final["OR"]
    # OR - OP; assumed to be operating cost (the "OC" of PLI1(OP/OC)) -- confirm.
    cost_base = revenue - df_final["OP"]

    df_final["Service_Inc_%"] = (
        df_final["Trading revenue and non-financial services"]
        - df_final["Of which: Rent/Operating lease rent income"]
        - df_final["Of which: Trading income"]
    ) / revenue * 100
    df_final["Export_Inc_%"] = df_final["Total forex earnings"] / revenue * 100
    df_final["RPT_%"] = df_final[["RPT-inc(%)", "RPT-pay(%)"]].max(axis=1)
    # cost_base can be 0 when OR == OP. x/0 gives inf, which the cap below
    # turns into 100; 0/0 gives NaN, which would otherwise leak into the
    # output, so it is mapped to 0 in line with the fillna(0) convention.
    df_final["Emp_Cost_%"] = (
        df_final["Compensation to employees"] / cost_base * 100
    ).fillna(0)
    df_final["Gross_Int_%"] = df_final["Gross intangible assets"] / revenue * 100

    # Invert the flag (1 <-> 0). Missing flags were filled with 0 above, so
    # they come out as 1.
    df_final["Persistent Loss Filter"] = 1 - df_final["Persistent Loss Filter"]

    # cap percentages to the [0, 100] range
    pct_cols = ["Service_Inc_%", "Export_Inc_%", "RPT_%", "Emp_Cost_%", "Gross_Int_%"]
    df_final[pct_cols] = df_final[pct_cols].clip(lower=0, upper=100)

    # ---------- Cell 4: Final subset + renaming ----------
    rename_dict = {
        "CIN": "cin",
        "Company Name": "company_name",
        "Industry group": "industry_sector",
        "Segment": "segment",
        "Net worth": "net_worth",
        "Persistent Loss Filter": "persistent_loss",
        "OR": "or",
        "PLI1(OP/OC)(%)": "pli",
        "Emp_Cost_%": "emp_cost_pct",
        "RPT_%": "rpt_pct",
        "Export_Inc_%": "export_income_pct",
        "Service_Inc_%": "service_income_pct",
        "Gross_Int_%": "gross_intbl_pct",
    }
    # The dict order is the output column order.
    final_cols = [col for col in rename_dict if col in df_final.columns]
    df_final = df_final[final_cols].rename(columns=rename_dict)

    # ---------- Cell 5: Last filters ----------
    df_final = df_final[df_final["or"] >= 1]
    df_final = df_final[df_final["segment"] != "Other"].reset_index(drop=True)

    return df_final


# ----------------------------
# Master loop over 3 years
# ----------------------------
years = [2022, 2023, 2024]
dfs = []

for year in years:
    temp = process_year(df, str(year))

    # add FY column: year 2022 is labelled "2021-22"
    fy = f"{year-1}-{str(year)[-2:]}"
    temp["fy"] = fy

    dfs.append(temp)

# ----------------------------
# Combine & Save
# ----------------------------
# Stack the per-year tables into one long frame with a fresh 0..n-1 index.
df_final_all = pd.concat(dfs, ignore_index=True)

# build output path
out_path = BASE_DIR / "1. data" / "processed" / "Prowess_IT_Data_Processed.csv"
# to_csv does not create directories; make sure the target folder exists so a
# fresh checkout (without a "processed" folder) can still run end to end.
out_path.parent.mkdir(parents=True, exist_ok=True)
df_final_all.to_csv(out_path, index=False)

print(df_final_all.shape)
df_final_all.head()


(5787, 14)


Unnamed: 0,cin,company_name,industry_sector,segment,net_worth,persistent_loss,or,pli,emp_cost_pct,rpt_pct,export_income_pct,service_income_pct,gross_intbl_pct,fy
0,U63090TN1995PLC033513,Shriram Value Services Ltd.,ITES,ITES,604.88,0,296.34,4079.69,25.528914,97.45,0.0,100.0,0.0,2021-22
1,U72900PN2014PTC152592,Advanced Risk Analytics Pvt. Ltd.,ITES,ITES,34.42,0,14.26,883.45,84.137931,10.9,99.509116,99.509116,0.070126,2021-22
2,U72200KA2014PTC077244,Fonepaisa Payment Solutions Pvt. Ltd.,Computer software,SDS,18.69,0,38.91,782.31,46.485261,2.29,0.0,100.0,0.0,2021-22
3,U74210WB2006PTC110217,Agrawal & Agrawal Environmental Engg. Pvt. Ltd.,Business services & consultancy,ITES,8.87,0,3.98,552.46,14.754098,43.0,0.0,100.0,0.0,2021-22
4,U73100DL2004PLC131109,P I Life Science Research Ltd.,Business services & consultancy,ITES,34.25,0,5.66,332.06,33.587786,86.52,0.0,100.0,0.0,2021-22
