In [None]:
import pandas as pd

file_path = "DATASET.xlsx"

# Open the workbook once and close it deterministically. Every sheet is parsed
# from this single handle instead of re-opening the file for each read.
with pd.ExcelFile(file_path) as xls:
    print("✅ Available sheets:", xls.sheet_names)

    # Read all three sheets; the first row of each sheet is the header.
    sheet1_df = pd.read_excel(xls, sheet_name="Sheet1", header=0)
    sheet2_df = pd.read_excel(xls, sheet_name="Sheet2", header=0)
    yearly_rainfall_df = pd.read_excel(xls, sheet_name="yearly_rainfall", header=0)

# ---------- 2. Inspect raw data ----------
# Print shape, column names and the first rows of every loaded sheet.
for label, frame in (
    ("Sheet1", sheet1_df),
    ("Sheet2", sheet2_df),
    ("yearly_rainfall", yearly_rainfall_df),
):
    print(f"\n📊 {label} shape:", frame.shape)
    print(f"📋 {label} columns:", frame.columns.tolist())
    print(frame.head())


# ---------- 3. Define cleaning function ----------
def clean_df(df):
    """Fill missing values and clamp negatives in ``df``, column by column.

    Numeric columns:
        * missing values are replaced by the column mean,
        * every value that is not ``>= 0`` (negatives, and any NaN still
          left when the whole column was empty) is set to 0,
        * values are rounded to 2 decimal places.

    Non-numeric columns:
        * missing values are replaced by the most frequent value (the first
          mode). A column with no non-null values has no mode and is left
          unchanged instead of raising.

    The frame is modified in place and the same object is returned; pass a
    copy if the original must be preserved.
    """
    for col in df.columns:
        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            if series.isnull().any():
                series = series.fillna(series.mean())
            # Vectorised clamp: keep values where ``>= 0`` holds, else use 0.
            series = series.where(series >= 0, 0)
            df[col] = series.round(2)
        elif series.isnull().any():
            modes = series.mode()
            # mode() is empty when every value is null; nothing to fill with.
            if not modes.empty:
                df[col] = series.fillna(modes.iloc[0])
    return df

# ---------- 4. Clean the sheets ----------
# Sheet1 and Sheet2 are passed through clean_df on copies, so the raw frames
# stay untouched; yearly_rainfall is only copied, not cleaned.
sheet1_cleaned = clean_df(sheet1_df.copy())
sheet2_cleaned = clean_df(sheet2_df.copy())
yearly_rainfall_cleaned = yearly_rainfall_df.copy()


# ---------- 6. Saving cleaned data  ----------
output_file = "processed_data.xlsx"

# Output sheet name -> frame, in the order the sheets are written.
cleaned_sheets = {
    "Sheet1": sheet1_cleaned,
    "Sheet2": sheet2_cleaned,
    "yearly_rainfall": yearly_rainfall_cleaned,
}

with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for sheet_name, frame in cleaned_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)


print(f"\n✅ Processing complete! Cleaned data saved to '{output_file}'")

✅ Available sheets: ['Sheet1', 'Sheet2', 'yearly_rainfall']

📊 Sheet1 shape: (71, 21)
📋 Sheet1 columns: ['Time_Index', 'Year', 'Period', 'Months_Covered', 'Period_Length', 'District', 'No_of_HNS', 'Rise_<2m', 'Rise_<2m_%', 'Rise_2-4m', 'Rise_2-4m_%', 'Rise_>4m', 'Rise_>4m_%', 'Fall_<2m', 'Fall_<2m_%', 'Fall_2-4m', 'Fall_2-4m_%', 'Fall_>4m', 'Fall_>4m_%', 'Total_Rise', 'Total_Fall']
   Time_Index     Year       Period Months_Covered  Period_Length District  \
0           1  2023-24  Pre-Monsoon        May–Aug              4   Bokaro   
1           2  2023-24  Pre-Monsoon        May–Aug              4   Chatra   
2           3  2023-24  Pre-Monsoon        May–Aug              4  Devghar   
3           4  2023-24  Pre-Monsoon        May–Aug              4  Dhanbad   
4           5  2023-24  Pre-Monsoon        May–Aug              4    Dumka   

   No_of_HNS  Rise_<2m  Rise_<2m_%  Rise_2-4m  ...  Rise_>4m  Rise_>4m_%  \
0         14         4       28.57          4  ...         5       35.