In [1]:
# !pip install pandas

import pandas as pd
import numpy as np

In [2]:
# Load the CSV file
file_path = "HydroMet Data/FINAL/DhapDamRainfalFinal.csv"  # Replace with the actual file path
df = pd.read_csv(file_path)

print(df.head(10))
print(df.shape)
print("initial df shape : ",df.shape)

         Date  Hour Hourly rainfall value
0  2024-04-15     0                     0
1  2024-04-15     1                     0
2  2024-04-15     2                     0
3  2024-04-15     3                     0
4  2024-04-15     4                     0
5  2024-04-15     5                     0
6  2024-04-15     7                     0
7  2024-04-15     7                     0
8  2024-04-15     8                     0
9  2024-04-15     9                     0
(4901, 3)
initial df shape :  (4901, 3)


In [3]:
# Check for duplicates based on 'Full_date', 'Hour', and 'waterLevel'
duplicates = df[df.duplicated(subset=['Date', 'Hour'])]

# Get the number of duplicate rows
num_duplicates = len(duplicates)
print("DUPLICATES",num_duplicates)
df= df.drop_duplicates(subset=['Date', 'Hour'])

print("df shape after removing duplicates:", df.shape)
# print(df.head(20))

DUPLICATES 50
df shape after removing duplicates: (4851, 3)


In [4]:
# Find rows where Full_date contains the value "#VALUE!"
invalid_rows = df[df['Date'] == "#VALUE!"]

# Get the number of rows with the issue
num_invalid_rows = len(invalid_rows)
print(f"Number of rows with '#VALUE!' in Full_date: {num_invalid_rows}")

df = df[df['Date'] != "#VALUE!"]
print(df.shape)

Number of rows with '#VALUE!' in Full_date: 0
(4851, 3)


In [5]:
count_hyphen = (df['Hourly rainfall value'] == '-').sum()

# Print the count
print(f"Number of '-' values: {count_hyphen}")

df['Hourly rainfall value'] = df['Hourly rainfall value'].replace('-', 0)


Number of '-' values: 62


In [6]:
# sort data based on date, hour and minute

# Ensure the date column is in datetime format for correct sorting
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# Sort the data by Full_date, Hour, and Minute
df_sorted = df.sort_values(by=['Date', 'Hour'], ascending=[True, True])

# Reset index for a clean DataFrame
df = df_sorted.reset_index(drop=True)

print(df.tail(10))


           Date  Hour Hourly rainfall value
4841 2024-11-26    10                     0
4842 2024-11-26    11                     0
4843 2024-11-26    12                     0
4844 2024-11-26    13                     0
4845 2024-11-26    14                     0
4846 2024-11-26    15                     0
4847 2024-11-26    16                     0
4848 2024-11-26    17                     0
4849 2024-11-26    18                     0
4850 2024-11-26    19                     0


In [7]:
# Ensure Date is in datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Generate full range of dates (April to August)
full_dates = pd.date_range(start='2024-04-15', end='2024-11-25', freq='D')

# Generate all hours (0 to 23)
hours = pd.DataFrame({'Hour': range(24)})

# Create all combinations of dates and hours
full_date_hour = pd.MultiIndex.from_product([full_dates, hours['Hour']], names=['Date', 'Hour']).to_frame(index=False)

# Merge with existing data
df_full = pd.merge(full_date_hour, df, on=['Date', 'Hour'], how='left')

# Fill missing Hourly rainfall value with 0
df_full['Hourly rainfall value'] = df_full['Hourly rainfall value'].fillna(0)

# Print or save the final DataFrame
print(df_full)

           Date  Hour Hourly rainfall value
0    2024-04-15     0                     0
1    2024-04-15     1                     0
2    2024-04-15     2                     0
3    2024-04-15     3                     0
4    2024-04-15     4                     0
...         ...   ...                   ...
5395 2024-11-25    19                     0
5396 2024-11-25    20                     0
5397 2024-11-25    21                     0
5398 2024-11-25    22                     0
5399 2024-11-25    23                     0

[5400 rows x 3 columns]


In [8]:
# print(merged_df.head(25))
print(df_full.shape,"shape after interpolation")

df_full['DateTime'] = pd.to_datetime(df_full['Date'].dt.date.astype(str) + ' ' + df_full['Hour'].astype(str) + ':00')


(5400, 3) shape after interpolation


In [9]:
# Save the cleaned data back to a CSV file
output_file_path = "cleaned_rainfall_data_14_dec.csv"  # Replace with desired output file path
df_full.to_csv(output_file_path, index=False)