In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('weather_data.csv')
print("Before Cleaning:\n", df.head())

Before Cleaning:
          date      city  temperature_celsius  humidity_percent  \
0  2023-01-01  New York                  5.0              60.0   
1  01/02/2023  New York                  NaN              65.0   
2  03-01-2023  New York                  7.0               NaN   
3         NaN    London                  8.0              70.0   
4  2023-01-02    London                  6.0              75.0   

   wind_speed_kph weather_condition  
0            10.0             Sunny  
1            12.0            Cloudy  
2             8.0             Rainy  
3            15.0           Unknown  
4            20.0             Snowy  


### data cleaning and transformation

In [6]:
# Replace missing temperature_celsius with avg temp for that city
df['temperature_celsius'] = df.groupby('city')['temperature_celsius']\
    .transform(lambda x: x.fillna(x.mean()))
# Drop rows where date is missing
df = df.dropna(subset=['date'])


### Standardize Dates (to YYYY-MM-DD)

In [7]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')  
df = df.dropna(subset=['date']) 
df['date'] = df['date'].dt.strftime('%Y-%m-%d')

### Add temperature_fahrenheit column

In [8]:
df['temperature_fahrenheit'] = df['temperature_celsius'] * 9/5 + 32

### Filter: Drop rows with Unknown or null weather_condition

In [9]:
df = df[~df['weather_condition'].isin(['Unknown', np.nan])]
df = df[df['weather_condition'].notna()]

# 🔎 Check after cleaning
print("\nAfter Cleaning:\n", df.head())



After Cleaning:
           date      city  temperature_celsius  humidity_percent  \
0   2023-01-01  New York             5.000000              60.0   
4   2023-01-02    London             6.000000              75.0   
6   2023-01-01     Tokyo            10.000000              50.0   
16  2023-01-12     Tokyo             7.164706              61.0   
23  2023-01-05    London             9.912500              60.0   

    wind_speed_kph weather_condition  temperature_fahrenheit  
0             10.0             Sunny               41.000000  
4             20.0             Snowy               42.800000  
6              5.0             Sunny               50.000000  
16             NaN             RAINY               44.896471  
23             NaN             Rainy               49.842500  
