In [161]:
import numpy as np
import pandas as pd
import requests


In [177]:
# 1. Fetch daily weather details from the Open-Meteo forecast endpoint and
#    save them as a CSV.
# NOTE(review): latitude/longitude 52.52, 13.41 are combined with the
# "America/New_York" timezone, and later cells store the result under an
# "nyc_weather" file name -- confirm the coordinates are the intended location.
url = "https://api.open-meteo.com/v1/forecast"
params = {
    "latitude": 52.52,
    "longitude": 13.41,
    "start_date": "2024-01-01",
    "end_date": "2025-01-31",
    "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum,weather_code",
    "timezone": "America/New_York",
}

# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, params=params, timeout=30)
# Fail loudly on a non-2xx answer instead of printing a message and then
# writing an undefined (or stale) `weather_data` to disk.
response.raise_for_status()

# The "daily" payload holds one list per variable, all indexed by day, so the
# frame can be built column-wise without a per-row loop.
daily = response.json()["daily"]
weather_data = pd.DataFrame({
    "date": daily["time"],
    "temperature_max": daily["temperature_2m_max"],
    "temperature_min": daily["temperature_2m_min"],
    "precipitation": daily["precipitation_sum"],
    "weather_code": daily["weather_code"],  # WMO weather code
})

weather_data.to_csv("./output/open_meteo_weather_data.csv", index=False)

In [None]:
# Load the raw weather export, record which date ranges have no temperature
# data, then drop the incomplete rows.
#
# Forward slashes are used so that no backslash in the path can be read as a
# string escape sequence; they are accepted on Windows as well.
# NOTE(review): this location differs from "./output/", where the fetch cell
# writes its CSV -- confirm the file is copied here before this cell runs.
WEATHER_DATA_FILE = "../weather data/open_meteo_weather_data.csv"
weather_data = pd.read_csv(WEATHER_DATA_FILE)

# Flag rows without a maximum temperature.
is_missing = weather_data["temperature_max"].isna()

# Give every contiguous run of equal flags its own id: the id increases each
# time the missing/present flag flips from one row to the next.
run_id = is_missing.astype(int).diff().ne(0).cumsum()

# (first_date, last_date) of each run of missing rows, in file order.
# A run that reaches the end of the file is handled like any other run.
nan_ranges = [
    (run["date"].iloc[0], run["date"].iloc[-1])
    for _, run in weather_data[is_missing].groupby(run_id[is_missing])
]

# start_date / end_date describe the LAST missing range only (both stay None
# when nothing is missing); the synthetic-data cell reads them.
start_date, end_date = nan_ranges[-1] if nan_ranges else (None, None)

## Drop NaN values
weather_data = weather_data.dropna()
weather_data.head()

Unnamed: 0,date,temperature_max,temperature_min,precipitation,weather_code
317,2024-11-13,7.7,3.2,0.0,3.0
318,2024-11-14,8.3,4.7,0.0,3.0
319,2024-11-15,9.3,5.0,0.0,3.0
320,2024-11-16,8.0,5.8,2.1,61.0
321,2024-11-17,7.2,4.4,4.8,80.0


In [189]:
# Feature engineering: add a readable description and a "season" label for
# each row's WMO weather code, then save the result.

# WMO weather code -> description.
# Codes that are not listed here end up as NaN in "weather_description".
weather_code_mapping = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Light drizzle",
    56: "Light freezing drizzle",
    61: "Light rain",
    63: "Rain",
    66: "Light freezing rain",
    71: "Slight snow fall",
    73: "moderate snow fall",
    75: "heavy snow fall",
    77: "Snow grains",
    80: "Light rain showers",
    81: "Moderate rain showers",
    82: "Heavy rain showers",
    85: "Light snow showers",
    86: "Heavy snow showers",
}

# Season label -> weather codes assigned to it.
# Codes 3, 45 and 51 are listed twice (under "Winter"/"Rainy" and again under
# "Autumn"). The lookup below is filled in dictionary order, so the later
# "Autumn" entry wins for those three codes.
season_mapping = {
    "Summer": [0, 1, 2],
    "Winter": [3, 45, 48, 56, 66, 71, 73, 75, 77, 85, 86],
    "Rainy": [51, 61, 80, 81, 82, 63],
    "Autumn": [3, 45, 51],
}

# Flatten to weather code -> season (last assignment wins on duplicates).
weather_code_to_season = {
    code: season
    for season, codes in season_mapping.items()
    for code in codes
}

# Both new columns are looked up from the weather code; unmapped codes give NaN.
weather_data = weather_data.assign(
    weather_description=weather_data["weather_code"].map(weather_code_mapping),
    season=weather_data["weather_code"].map(weather_code_to_season),
)
weather_data.to_csv("./output/mapped_weather_data.csv", index=False)
weather_data.head()


Unnamed: 0,date,temperature_max,temperature_min,precipitation,weather_code,weather_description,season
317,2024-11-13,7.7,3.2,0.0,3.0,Overcast,Autumn
318,2024-11-14,8.3,4.7,0.0,3.0,Overcast,Autumn
319,2024-11-15,9.3,5.0,0.0,3.0,Overcast,Autumn
320,2024-11-16,8.0,5.8,2.1,61.0,Light rain,Rainy
321,2024-11-17,7.2,4.4,4.8,80.0,Light rain showers,Rainy


In [190]:
## Generate synthetic data for the missing dates

# Fixed seed so that re-running the notebook reproduces the same synthetic rows.
SYNTHETIC_SEED = 42
rng = np.random.default_rng(SYNTHETIC_SEED)

# One entry per day between start_date and end_date, i.e. the missing range
# recorded by the NaN-detection cell.
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Calendar season -> inclusive ((month, day), (month, day)) bounds.
# "Winter_end" covers the last days of December and is reported as "Winter".
seasons = {
    "Winter": ((1, 1), (3, 15)),
    "Rainy": ((3, 16), (6, 15)),
    "Summer": ((6, 16), (9, 15)),
    "Autumn": ((9, 16), (12, 15)),
    "Winter_end": ((12, 16), (12, 31))
}

# Per-season sampling ranges (low, high) plus the weather codes that may be
# drawn; "codes" and "desc" are parallel lists sharing one index.
weather_patterns = {
    "Winter": {"temp_max": (-2, 6), "temp_min": (-10, 1), "precip": (0, 15),
               "codes": [3, 45, 48, 56, 66, 71, 73, 75, 77, 85, 86],
               "desc": ["Overcast", "Fog", "Depositing rime fog", "Light freezing drizzle", "Light freezing rain","Slight snow fall","moderate snow fall","heavy snow fall","Snow grains","Light snow showers","Heavy snow showers"]},
    "Rainy": {"temp_max": (8, 20), "temp_min": (2, 12), "precip": (0, 10),
              "codes": [51, 61, 80, 81, 82,63],
              "desc": ["Light drizzle","Light rain","Light rain showers","Moderate rain showers","Heavy rain showers","Rain"]},
    "Summer": {"temp_max": (22, 34), "temp_min": (14, 24), "precip": (0, 12),
               "codes": [0, 1, 2],
               "desc":  ["Clear sky", "Mainly clear", "Partly cloudy"]},
    "Autumn": {"temp_max": (10, 22), "temp_min": (4, 14), "precip": (0, 8),
               "codes": [3, 45, 51],
               "desc":  ["Overcast", "Fog", "Light drizzle"]}
}

# Generate one synthetic row per date.
synthetic_rows = []
for date in date_range:
    month_day = (date.month, date.day)

    # Determine the calendar season of this date.
    season = None
    for name, (first_day, last_day) in seasons.items():
        if first_day <= month_day <= last_day:
            season = "Winter" if name == "Winter_end" else name
            break

    # Draw the weather values from the season's ranges.
    pattern = weather_patterns[season]
    temp_max = round(rng.uniform(*pattern["temp_max"]), 1)

    # The temp_min and temp_max ranges overlap, so cap the upper bound of the
    # minimum at the drawn maximum; otherwise a day could get min > max.
    min_low, min_high = pattern["temp_min"]
    temp_min = round(rng.uniform(min_low, min(min_high, temp_max)), 1)

    precip = round(rng.uniform(*pattern["precip"]), 1)

    # A single index keeps the code and its description in sync.
    weather_idx = int(rng.integers(0, len(pattern["codes"])))
    weather_code = pattern["codes"][weather_idx]
    weather_description = pattern["desc"][weather_idx]

    synthetic_rows.append([date.strftime("%Y-%m-%d"), temp_max, temp_min, precip, weather_code, weather_description, season])


df_synthetic = pd.DataFrame(synthetic_rows, columns=["date", "temperature_max", "temperature_min", "precipitation", "weather_code", "weather_description", "season"])


df_synthetic.to_csv("./output/synthetic_data.csv", index=False)

In [191]:
# Combine the synthetic rows (missing dates) with the observed weather data.
# The synthetic frame comes first; ignore_index gives the result a fresh
# 0..n-1 index.
combined_df = pd.concat([df_synthetic, weather_data], ignore_index=True)

combined_df.to_csv("./output/nyc_weather_2024.csv", index=False)

# Preview goes last: only a cell's final expression is rendered.
combined_df.head(20)

In [None]:
# Get USA holiday details and turn one reference year into a 2024 calendar.

# Forward slashes are required here: written with backslashes, the path
# contains "\U", which Python parses as the start of a Unicode escape and
# rejects with a SyntaxError.
EVENT_DATA_FILE = "../weather data/data/USHolidayDates.csv"

# Year taken from the file, and the year its dates are re-stamped with.
SOURCE_YEAR = 2004
TARGET_YEAR = 2024

holidays_raw = pd.read_csv(EVENT_DATA_FILE)

## Data cleaning
# 1. Keep only the reference year.
# 2. Parse 'Date' and replace its year with TARGET_YEAR (month and day are
#    kept as they are), and update the 'Year' column to match.
# 3. Sort chronologically.
# NOTE(review): because month/day are kept, holidays that are defined by a
# weekday rule may not fall on their real TARGET_YEAR date -- confirm this
# approximation is acceptable.
filtered_df = (
    holidays_raw[holidays_raw["Year"] == SOURCE_YEAR]
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"]).apply(
            lambda ts: ts.replace(year=TARGET_YEAR)
        ),
        Year=TARGET_YEAR,
    )
    .sort_values(by="Date")
)

# Later cells read the holiday calendar from `event_data`.
event_data = filtered_df
event_data.to_csv("./output/filtered_US_holiday_2024.csv", index=False)

In [None]:
# Get location-based footfall, keep 2024/2025, and relabel sites/locations.

# Forward slashes avoid backslash escape sequences inside the path literal.
FOOTFALL_DATA_FILE = "../weather data/data/FootfallDaily.csv"
footfall_data = pd.read_csv(FOOTFALL_DATA_FILE)

# Columns that are not used by the rest of the notebook.
UNUSED_FOOTFALL_COLUMNS = [
    "LocationGroup", "BRCWeek", "BRCQuarter", "BRCMonth",
    "BusinessInCount", "BusinessOutCount", "BusinessTotalCount",
    "FactoredInCount", "FactoredOutCount", "FactoredTotalCount",
]

# Location relabelling applied below:
#   Micklegate        -> ATM Outlets
#   Parliament Street -> ShoppingMall
# NOTE(review): an earlier note in this cell described the opposite pairing;
# the mapping here matches what the code has been doing -- confirm it is the
# intended one.
LOCATION_NAME_ALIASES = {
    "Micklegate": "ATM Outlets",
    "Parliament Street": "ShoppingMall",
}

footfall_df = (
    # Filter 2024 and 2025 year data
    footfall_data[footfall_data["BRCYear"].isin([2024, 2025])]
    # Removing unwanted columns
    .drop(columns=UNUSED_FOOTFALL_COLUMNS)
    # Change site and location names
    .assign(
        SiteName=lambda frame: frame["SiteName"].replace("York", "NewYork"),
        LocationName=lambda frame: frame["LocationName"].replace(LOCATION_NAME_ALIASES),
    )
)

footfall_df.to_csv("./output/filtered_footfall_2024.csv", index=False)
footfall_df.head(10)



Unnamed: 0,Date,SiteName,LocationName,WeekDay,BRCYear,InCount,OutCount,TotalCount
16535,01/01/2024,NewYork,ATM Outlets,Monday,2024,2373.0,2838.0,5211.0
16536,01/01/2024,NewYork,ShoppingMall,Monday,2024,3901.0,4124.0,8025.0
16537,02/01/2024,NewYork,ATM Outlets,Tuesday,2024,2393.0,2361.0,4754.0
16538,02/01/2024,NewYork,ShoppingMall,Tuesday,2024,5685.0,6114.0,11799.0
16539,03/01/2024,NewYork,ATM Outlets,Wednesday,2024,3411.0,3306.0,6717.0
16540,03/01/2024,NewYork,ShoppingMall,Wednesday,2024,6525.0,6656.0,13181.0
16541,04/01/2024,NewYork,ATM Outlets,Thursday,2024,3128.0,3023.0,6151.0
16542,04/01/2024,NewYork,ShoppingMall,Thursday,2024,6451.0,6271.0,12722.0
16543,05/01/2024,NewYork,ATM Outlets,Friday,2024,3315.0,3144.0,6459.0
16544,05/01/2024,NewYork,ShoppingMall,Friday,2024,6818.0,6665.0,13483.0


In [None]:
# Flag holidays on the combined weather data and split the footfall data
# into one frame per location.

# Forward slashes are required here: written with backslashes, the "\n" in
# front of "yc_weather_2024.csv" is read as a newline character, so the path
# would never match the file.
# NOTE(review): the combine cell writes this file to "./output/" -- confirm it
# is available at this location.
WEATHER_DATA_FILE = "../weather data/nyc_weather_2024.csv"
weather_data = pd.read_csv(WEATHER_DATA_FILE)

# Map Holiday
# Convert both date columns to datetime so the membership test compares
# timestamps rather than differently formatted strings.
weather_data["date"] = pd.to_datetime(weather_data["date"])
event_data["Date"] = pd.to_datetime(event_data["Date"])

# holiday = 1 when the weather date appears in the holiday calendar, else 0.
weather_data["holiday"] = weather_data["date"].isin(event_data["Date"]).astype(int)

## Combine Weather and Footfall data
# One frame per location, with "Date" renamed to "date" to match the weather
# data's column name.
mall_df = (
    footfall_df[footfall_df["LocationName"] == "ShoppingMall"]
    .rename(columns={"Date": "date"})
)
atm_outlet_df = (
    footfall_df[footfall_df["LocationName"] == "ATM Outlets"]
    .rename(columns={"Date": "date"})
)

weather_data.head()


Unnamed: 0,date,temperature_max,temperature_min,precipitation,weather_code,weather_description,season,holiday
0,2024-01-01,2.7,-4.8,9.0,85.0,Light snow showers,Winter,1
1,2024-01-02,2.3,-0.1,14.4,85.0,Light snow showers,Winter,0
2,2024-01-03,5.7,-0.2,10.7,56.0,Light freezing drizzle,Winter,0
3,2024-01-04,1.7,-1.5,2.0,3.0,Overcast,Winter,0
4,2024-01-05,-1.7,-6.7,6.2,56.0,Light freezing drizzle,Winter,0


In [144]:
# Attach the daily mall and ATM-outlet footfall totals to the weather data.

# All three date columns must be datetime for the merge keys to match; the
# footfall dates are stored as day/month/year.
weather_data["date"] = pd.to_datetime(weather_data["date"])
mall_df["date"] = pd.to_datetime(mall_df["date"], format="%d/%m/%Y")
atm_outlet_df["date"] = pd.to_datetime(atm_outlet_df["date"], format="%d/%m/%Y")

# Location-specific names for the total count so the two sources stay
# distinguishable after merging.
mall_df["mall_footfall"] = mall_df["TotalCount"]
atm_outlet_df["atm_outlet_footfall"] = atm_outlet_df["TotalCount"]

weather_data = (
    weather_data
    # Drop footfall columns left by a previous run of this cell; otherwise a
    # re-run would add suffixed duplicates (mall_footfall_x / mall_footfall_y).
    .drop(columns=["mall_footfall", "atm_outlet_footfall"], errors="ignore")
    # Left joins keep every weather row; dates without footfall become NaN.
    .merge(mall_df[["date", "mall_footfall"]], on="date", how="left")
    .merge(atm_outlet_df[["date", "atm_outlet_footfall"]], on="date", how="left")
)

weather_data.tail(50)

Unnamed: 0,date,temperature_max,temperature_min,precipitation,weather_code,weather_description,season,holiday,mall_footfall,atm_outlet_footfall
347,2024-12-13,2.4,-1.4,0.0,3.0,Overcast,Autumn,0,10919.0,34771.0
348,2024-12-14,4.3,-1.3,0.8,61.0,Light rain,Autumn,0,12862.0,40235.0
349,2024-12-15,10.1,4.3,2.8,61.0,Light rain,Autumn,0,9738.0,17114.0
350,2024-12-16,10.9,8.7,0.0,61.0,Light rain,Autumn,0,8801.0,33875.0
351,2024-12-17,9.4,6.3,0.1,61.0,Light rain,Autumn,0,8884.0,27509.0
352,2024-12-18,11.6,5.9,0.6,61.0,Light rain,Autumn,0,8016.0,30691.0
353,2024-12-19,12.9,5.5,3.5,80.0,Light rain showers,Rainy,0,7967.0,26706.0
354,2024-12-20,6.1,2.1,0.0,3.0,Overcast,Autumn,0,9351.0,33238.0
355,2024-12-21,7.0,2.5,3.0,61.0,Light rain,Autumn,0,10969.0,47251.0
356,2024-12-22,7.2,3.5,2.2,80.0,Light rain showers,Rainy,0,8024.0,36708.0


In [146]:
## Remove rows where either footfall value is NaN

# Both columns are checked in one call. Two separate assignments from
# `weather_data` would let the second overwrite the first, leaving rows with a
# missing mall_footfall in the result.
df_cleaned = weather_data.dropna(subset=["mall_footfall", "atm_outlet_footfall"])

df_cleaned.tail(10)


Unnamed: 0,date,temperature_max,temperature_min,precipitation,weather_code,weather_description,season,holiday,mall_footfall,atm_outlet_footfall
382,2025-01-17,1.6,-1.8,0.0,48.0,Depositing rime fog,Winter,0,6461.0,12543.0
383,2025-01-18,-1.0,-2.7,0.0,48.0,Depositing rime fog,Winter,0,7961.0,20955.0
384,2025-01-19,3.9,-3.0,0.0,48.0,Depositing rime fog,Winter,0,5008.0,10992.0
385,2025-01-20,5.6,-1.0,0.0,48.0,Depositing rime fog,Winter,0,4747.0,10567.0
386,2025-01-21,-0.6,-2.7,0.0,48.0,Depositing rime fog,Winter,0,5412.0,10655.0
387,2025-01-22,2.0,-2.6,0.0,48.0,Depositing rime fog,Winter,0,5630.0,10975.0
388,2025-01-23,4.6,0.4,5.0,61.0,Light rain,Autumn,0,5132.0,10108.0
389,2025-01-24,7.4,2.6,2.7,61.0,Light rain,Autumn,0,6270.0,11244.0
390,2025-01-25,13.6,6.6,3.0,61.0,Light rain,Autumn,0,8768.0,19582.0
391,2025-01-26,9.9,3.3,1.4,80.0,Light rain showers,Rainy,0,4857.0,10752.0


In [147]:
# Write the cleaned weather + footfall table to disk (row index omitted).
FINAL_OUTPUT_FILE = "final_nyc_weather_2024.csv"
df_cleaned.to_csv(FINAL_OUTPUT_FILE, index=False)