In [19]:
import zipfile
import os

# Corrected paths (from inside notebooks/)
zip_paths = {
    'weather': '../data/raw/weather_data.zip',
    'traffic': '../data/raw/traffic_data.zip',
    'disaster': '../data/raw/disaster_messages.zip'
}

# Path to extract the data
extracted_folder = '../data/extracted/'

# Create the extracted folder if it doesn't exist
if not os.path.exists(extracted_folder):
    os.makedirs(extracted_folder)

# Function to extract a zip file with error handling
def extract_zip(zip_path, extract_to):
    try:
        if not os.path.exists(zip_path):
            raise FileNotFoundError(f"The zip file {zip_path} does not exist.")
        
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print(f" Extracted {zip_path} to {extract_to}")
    except FileNotFoundError as e:
        print(f" Error: {e}")
    except zipfile.BadZipFile:
        print(f" Error: The file {zip_path} is not a valid zip file.")
    except Exception as e:
        print(f" An unexpected error occurred: {e}")

# Extract weather, traffic, and disaster data
extract_zip(zip_paths['weather'], extracted_folder)
extract_zip(zip_paths['traffic'], extracted_folder)
extract_zip(zip_paths['disaster'], extracted_folder)


 Extracted ../data/raw/weather_data.zip to ../data/extracted/
 Extracted ../data/raw/traffic_data.zip to ../data/extracted/
 Extracted ../data/raw/disaster_messages.zip to ../data/extracted/


In [31]:
import os

# List the files in the extracted folder
extracted_files = os.listdir(extracted_folder)
print("Extracted files:", extracted_files)

Extracted files: ['.ipynb_checkpoints', 'disaster_messages', 'traffic_data.csv', 'weather_data.csv']


In [36]:

import pandas as pd
import os

# Paths to the disaster message folder
disaster_messages_folder = '../data/extracted/disaster_messages'

# List files in the disaster message folder
disaster_message_files = os.listdir(disaster_messages_folder)
print("\nDisaster Messages Folder Files:", disaster_message_files)

# Load the CSVs for train, test, and sample_submission
try:
    train_df = pd.read_csv(os.path.join(disaster_messages_folder, 'train.csv'))
    test_df = pd.read_csv(os.path.join(disaster_messages_folder, 'test.csv'))
    sample_submission_df = pd.read_csv(os.path.join(disaster_messages_folder, 'sample_submission.csv'))

    print("Successfully loaded disaster message datasets")
except Exception as e:
    print(f" Error while loading disaster message files: {e}")



Disaster Messages Folder Files: ['sample_submission.csv', 'test.csv', 'train.csv']
Successfully loaded disaster message datasets


In [42]:
import os
import pandas as pd

# Set the paths relative to the 'notebooks' folder
weather_data_path = r'D:\jupyter_nbk_project\disaster_relief_ai\data\extracted\weather_data.csv'
traffic_data_path = r'D:\jupyter_nbk_project\disaster_relief_ai\data\extracted\traffic_data.csv'
disaster_messages_folder = r'D:\jupyter_nbk_project\disaster_relief_ai\data\extracted\disaster_messages'

# Check if the files exist at the given paths
print("Weather Data Exists:", os.path.exists(weather_data_path))
print("Traffic Data Exists:", os.path.exists(traffic_data_path))
print("Disaster Messages Folder Exists:", os.path.exists(disaster_messages_folder))

# Load the data
weather_df = pd.read_csv(weather_data_path)
traffic_df = pd.read_csv(traffic_data_path)

# Load the disaster message data (train, test, sample_submission)
train_df = pd.read_csv(os.path.join(disaster_messages_folder, 'train.csv'))
test_df = pd.read_csv(os.path.join(disaster_messages_folder, 'test.csv'))
sample_submission_df = pd.read_csv(os.path.join(disaster_messages_folder, 'sample_submission.csv'))

# Print to verify
print(weather_df.head())
print(traffic_df.head())
print(train_df.head())


Weather Data Exists: True
Traffic Data Exists: True
Disaster Messages Folder Exists: True
                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (mi

In [41]:
import os
print(os.getcwd())  # Check the current working directory

D:\jupyter_nbk_project\disaster_relief_ai\notebooks


In [52]:
import pandas as pd

# Corrected paths assuming current folder is notebooks/
weather_path = '../data/extracted/weather_data.csv'
traffic_path = '../data/extracted/traffic_data.csv'

# Load datasets
weather_df = pd.read_csv(weather_path)
traffic_df = pd.read_csv(traffic_path)

# Display column names to check for datetime column
print(" Weather Data Columns:")
print(weather_df.columns)

print("\nTraffic Data Columns:")
print(traffic_df.columns)

 Weather Data Columns:
Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')

Traffic Data Columns:
Index(['traffic_volume', 'holiday', 'temp', 'rain_1h', 'snow_1h', 'clouds_all',
       'weather_main', 'weather_description', 'date_time'],
      dtype='object')


In [57]:
import pandas as pd

# Corrected paths assuming current folder is notebooks/
weather_path = '../data/extracted/weather_data.csv'
traffic_path = '../data/extracted/traffic_data.csv'

# Load datasets
weather_df = pd.read_csv(weather_path)
traffic_df = pd.read_csv(traffic_path)

# Convert 'Formatted Date' in weather_df to datetime with UTC handling
weather_df['datetime'] = pd.to_datetime(weather_df['Formatted Date'], errors='coerce', utc=True)

# Convert 'date_time' in traffic_df to datetime
traffic_df['datetime'] = pd.to_datetime(traffic_df['date_time'], errors='coerce')

# Drop rows with invalid datetime values
weather_df.dropna(subset=['datetime'], inplace=True)
traffic_df.dropna(subset=['datetime'], inplace=True)

# Preview the data
print(" Weather Data:")
print(weather_df.head())

print("\n Traffic Data:")
print(traffic_df.head())


 Weather Data:
                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   251.0          15.8263         0.0         

In [61]:
weather_df['datetime'] = weather_df['datetime'].dt.tz_localize(None)

In [63]:
merged_df = pd.merge(traffic_df, weather_df, on='datetime', how='inner')

In [64]:
merged_df.head()

Unnamed: 0,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,datetime,...,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,5545,,288.28,0.0,0.0,40,Clouds,scattered clouds,02-10-2012 09:00,2012-02-10 09:00:00,...,snow,-14.927778,-14.927778,0.9,4.3631,2.0,1.5939,0.0,1034.81,Foggy starting in the morning continuing until...
1,4516,,289.36,0.0,0.0,75,Clouds,broken clouds,02-10-2012 10:00,2012-02-10 10:00:00,...,snow,-13.888889,-19.05,0.83,8.05,330.0,1.932,0.0,1035.2,Foggy starting in the morning continuing until...
2,4767,,289.58,0.0,0.0,90,Clouds,overcast clouds,02-10-2012 11:00,2012-02-10 11:00:00,...,snow,-11.111111,-11.111111,0.73,4.83,20.0,2.093,0.0,1035.3,Foggy starting in the morning continuing until...
3,5026,,290.13,0.0,0.0,90,Clouds,overcast clouds,02-10-2012 12:00,2012-02-10 12:00:00,...,snow,-11.161111,-15.916667,0.7,8.211,352.0,2.0608,0.0,1035.06,Foggy starting in the morning continuing until...
4,4918,,291.14,0.0,0.0,75,Clouds,broken clouds,02-10-2012 13:00,2012-02-10 13:00:00,...,snow,-8.888889,-8.888889,0.56,1.61,320.0,2.576,0.0,1034.7,Foggy starting in the morning continuing until...


In [65]:
merged_df.to_csv('../data/merged/final_dataset.csv', index=False)
print(" Merged dataset saved to data/merged/final_dataset.csv")

 Merged dataset saved to data/merged/final_dataset.csv
