In [2]:
import pandas as pd
import numpy as np
import os 

In [10]:
def load_data(file_path, city_name):
    """Read one city's CSV file and tag every row with the city name.

    Parameters
    ----------
    file_path : str
        Location of the CSV file to read.
    city_name : str
        Value written into the new ``city`` column on every row.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with the extra ``city`` column, or ``None`` when the
        file does not exist (an error line is printed in that case).
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        # Missing files are reported and skipped rather than aborting the run
        print(f"Error: File not found at {file_path}")
        return None

    frame['city'] = city_name
    return frame

In [11]:
# Folder that holds one CSV file per city
data_directory = "CityDatasets"

# One record per city: the display name and the CSV file name inside data_directory
city_data = [
    dict(city=name, file=csv_name)
    for name, csv_name in (
        ("Bengaluru", "bengaluru-temp-rains.csv"),
        ("Mumbai", "mumbai-temp-rains.csv"),
        ("Hyderabad", "hyd-temp-rains.csv"),
        ("Chennai", "chennai-temp-rains.csv"),
        ("Kolkata", "kolkata-temp-rains.csv"),
        ("Pune", "pune-temp-rains.csv"),
        ("Ahmedabad", "amd-temp-rains.csv"),
        ("Delhi", "delhi-temp-rains.csv"),
    )
]

# Accumulator for the per-city dataframes
dfs = []

In [None]:
# Load every city's CSV into a list of dataframes.
# dfs is rebound to a fresh list here so that re-running this cell does not
# append the same cities a second time (which would duplicate rows in the
# combined output).
dfs = []
for city in city_data:
    file_name = city["file"]  # File name only; the folder is added below
    city_name = city["city"]

    # Full path of this city's CSV inside the data directory
    file_path = os.path.join(data_directory, file_name)

    # load_data returns None (after printing an error) when the file is missing
    df = load_data(file_path, city_name)

    # Keep only the cities whose file was actually read
    if df is not None:
        dfs.append(df)

In [17]:
# Stack the per-city frames into one table with a fresh 0..n-1 index,
# then write it to disk without the index column
combined_df = pd.concat(objs=dfs, ignore_index=True)
combined_df.to_csv(path_or_buf="combined_city_data.csv", index=False)

# Data Cleaning

In [3]:
# Start the cleaning stage from the combined CSV on disk
df = pd.read_csv("combined_city_data.csv")

def convert_mixed_date(date):
    """Parse a date given either as ``DD-MM-YYYY`` text or as a day-count serial.

    Parameters
    ----------
    date : str or number
        Either text in ``%d-%m-%Y`` form, or any value ``float()`` accepts.
        A numeric value is read as the number of days since 1899-12-30.
        NOTE(review): that epoch matches the spreadsheet serial-date
        convention, which is presumably where the numeric values come from;
        confirm against the raw files.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when neither interpretation works.
    """
    # Only conversion failures are treated as "not this format"; anything
    # else (e.g. KeyboardInterrupt) is allowed to propagate.
    conversion_errors = (ValueError, TypeError, OverflowError)

    try:
        return pd.to_datetime(date, format="%d-%m-%Y")
    except conversion_errors:
        pass

    try:
        # Fall back to a serial day count added to the 1899-12-30 epoch
        return pd.to_datetime("1899-12-30") + pd.to_timedelta(float(date), unit="D")
    except conversion_errors:
        return pd.NaT

In [4]:
# Parse each raw date value, then carry the previous parsed date forward
# over any entry that could not be parsed
parsed_dates = df['Date'].astype(str).apply(convert_mixed_date).ffill()
df['Date'] = parsed_dates

# Calendar parts derived from the parsed date
df['Year'] = parsed_dates.dt.year
df['Month'] = parsed_dates.dt.month

# 'Tr' entries in the rainfall column are swapped for the numeric value 0.05
df['Rain'] = df['Rain'].replace(to_replace='Tr', value=0.05)

In [5]:
# Placeholder strings that stand for "no reading"
invalid_values = ["----", "---", " ", "", "NaN", "nan"]
measurement_columns = ['Rain', 'Temp Max', 'Temp Min']

# Blank out the placeholders, then force the columns to numeric dtype;
# anything still unparseable becomes NaN
df[measurement_columns] = (
    df[measurement_columns]
    .replace(invalid_values, np.nan)
    .apply(pd.to_numeric, errors='coerce')
)

In [6]:
# Fill missing readings with the median of the same city and calendar month
for column in ['Rain', 'Temp Max', 'Temp Min']:
    df[column] = (
        df.groupby(['city', 'Month'])[column]
        .transform(lambda values: values.fillna(values.median()))
    )

In [7]:
# Remove the Year and Month columns before saving, then write the cleaned
# table to disk without the index column
df = df.drop(columns=['Year', 'Month'])
df.to_csv("cleaned_data.csv", index=False)

In [11]:
# Preview the first rows of the cleaned dataframe
df.head()

Unnamed: 0,Date,Rain,Temp Max,Temp Min,city
0,1951-01-01,0.0,26.969999,15.65,Bengaluru
1,1951-01-02,0.0,26.18,14.69,Bengaluru
2,1951-01-03,0.0,26.309999,14.95,Bengaluru
3,1951-01-04,0.0,26.73,14.84,Bengaluru
4,1951-01-05,0.0,26.93,14.53,Bengaluru
