# In [11]:
import pandas as pd
import numpy as np

# === Configuration ===
file_path = 'NAVAll.txt'  # or 'your_file.csv' if applicable
output_path = "cleaned_mutual_fund_nav.csv"

# A line without ';' is a scheme-category heading when it contains one of
# these substrings; any other such line is taken as a fund-house name.
# NOTE(review): the source feed may contain further category headings
# (e.g. interval schemes) -- confirm against a real file and extend if so.
category_markers = ("Open Ended Schemes", "Close Ended Schemes")

# Final column layout (before prefixing).
column_order = [
    "date", "Scheme Code", "Scheme Name", "Net Asset Value",
    "Scheme Category", "Fund House",
    "ISIN Div Payout/ ISIN Growth", "ISIN Div Reinvestment",
]

# Prefix applied to every output column except 'date'.
prefix = "NAV__"


def parse_nav_lines(raw_lines, markers=category_markers):
    """Split raw file lines into the header and context-enriched data rows.

    The first line is the ';'-separated header. Every later line is either
    blank (ignored), a heading without ';' (scheme category if it contains
    one of *markers*, otherwise a fund-house name), or a ';'-separated data
    row. Each data row gets the most recently seen fund house and scheme
    category appended (``None`` if none has been seen yet).

    Args:
        raw_lines: Lines of the file, header first.
        markers: Substrings identifying a scheme-category heading.

    Returns:
        ``(column_headers, data_rows)`` where every row has
        ``len(column_headers) + 2`` entries.

    Raises:
        ValueError: If *raw_lines* is empty (no header to parse).
    """
    import warnings

    if not raw_lines:
        raise ValueError(
            "NAV input is empty: expected a ';'-separated header line"
        )

    # str.strip() does not remove a UTF-8 byte-order mark, and a BOM left on
    # the first header name would break later column lookups.
    column_headers = raw_lines[0].lstrip("\ufeff").strip().split(";")
    expected_fields = len(column_headers)

    data_rows = []
    current_fund_house = None
    current_scheme_category = None
    skipped = 0

    for line in raw_lines[1:]:
        line = line.strip()
        if not line:
            continue
        if ";" not in line:
            if any(marker in line for marker in markers):
                current_scheme_category = line
            else:
                current_fund_house = line
            continue

        fields = line.split(";")
        # A row with the wrong number of fields would either make the
        # DataFrame constructor fail or shift the appended context values
        # into the wrong columns, so it is dropped and reported instead.
        if len(fields) != expected_fields:
            skipped += 1
            continue
        data_rows.append(fields + [current_fund_house, current_scheme_category])

    if skipped:
        warnings.warn(
            f"Skipped {skipped} data row(s) whose field count did not match "
            f"the {expected_fields}-column header",
            stacklevel=2,
        )
    return column_headers, data_rows


def clean_nav_frame(column_headers, data_rows):
    """Build the cleaned, typed, sorted and prefixed NAV DataFrame.

    Args:
        column_headers: Header names parsed from the file.
        data_rows: Rows as produced by ``parse_nav_lines``.

    Returns:
        DataFrame with the columns of ``column_order``, sorted by ``date``
        (most recent first, unparseable dates last), with every column
        except ``date`` prefixed by ``prefix``.

    Raises:
        KeyError: If a column required by ``column_order`` is missing.
    """
    df = pd.DataFrame(
        data_rows,
        columns=list(column_headers) + ["Fund House", "Scheme Category"],
    )

    # === Clean and convert data types ===
    # Keep only rows whose NAV is a plain non-negative decimal ("12", "12.5",
    # "12.", ".5"). An explicit pattern avoids depending on the default of
    # `regex` in Series.str.replace, which differs between pandas versions.
    is_decimal = df["Net Asset Value"].str.fullmatch(
        r"(?:[0-9]+\.?[0-9]*|\.[0-9]+)", na=False
    )
    # .copy() so the assignments below modify an independent frame rather
    # than a slice of the unfiltered one.
    df = df[is_decimal].copy()

    df["Net Asset Value"] = df["Net Asset Value"].astype(float)
    df["date"] = pd.to_datetime(df["Date"], errors='coerce')
    df["Scheme Code"] = df["Scheme Code"].astype(str)

    # Drop the original 'Date' column (superseded by the parsed 'date')
    df = df.drop(columns=["Date"])

    # === Reorder columns ===
    df = df[column_order]

    # Replace dash-only values with proper NaNs
    df = df.replace("-", np.nan)

    # Sort by date (most recent first); a stable sort keeps rows that share
    # a date in file order so the output is reproducible.
    df = df.sort_values(by="date", ascending=False, kind="mergesort")
    df = df.reset_index(drop=True)

    # Prefix all except 'date'
    df.columns = [
        "date" if col == "date" else f"{prefix}{col}" for col in df.columns
    ]
    return df


# Runs when executed as a script or notebook cell; guarded so the helpers
# above can be imported without touching the filesystem.
if __name__ == "__main__":
    # === Step 1: Load raw lines from the file ===
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    # === Steps 2-3: Parse the header and the context-enriched rows ===
    column_headers, data_rows = parse_nav_lines(raw_lines)

    # === Steps 4-6: Build, clean, reorder and rename ===
    df = clean_nav_frame(column_headers, data_rows)

    # === Optional: Save cleaned CSV ===
    df.to_csv(output_path, index=False)