In [2]:
import pandas as pd
import os


In [3]:
# This points to the "metadata" folder on your Desktop
meta_folder = os.path.expanduser("~/Desktop/metadata")


In [4]:
# List the two metadata file paths
meta_files = [
    os.path.join(meta_folder, "d04_text_meta_2024_10_19.txt"),
    os.path.join(meta_folder, "d04_text_meta_2025_01_15.txt")
]

# Load both files into a list
meta_dfs = []
for file in meta_files:
    print(f"📄 Reading: {file}")
    df = pd.read_csv(file, delimiter="\t")  # Tab-separated values
    meta_dfs.append(df)

# Combine both files into one table
metadata = pd.concat(meta_dfs, ignore_index=True)

print("✅ Metadata files loaded. Rows:", metadata.shape[0])


📄 Reading: /Users/spartan/Desktop/metadata/d04_text_meta_2024_10_19.txt
📄 Reading: /Users/spartan/Desktop/metadata/d04_text_meta_2025_01_15.txt
✅ Metadata files loaded. Rows: 8307


In [5]:
print("📋 First 5 rows of the metadata file:")
print(metadata.head())


📋 First 5 rows of the metadata file:
       ID  Fwy Dir  District  County     City State_PM   Abs_PM   Latitude  \
0  400000  101   S         4      41  52582.0    19.23  459.835  38.081498   
1  400001  101   N         4      85  68000.0    38.26  387.897  37.364085   
2  400002  101   S         4      81  68252.0    14.67  416.893  37.584097   
3  400006  880   S         4       1  81204.0    13.54   23.767  37.605003   
4  400007  101   N         4      81  68252.0    15.25  417.437  37.586936   

    Longitude  Length Type  Lanes                      Name User_ID_1  \
0 -122.547963   0.415   ML      4         SR-37 conn-s-diag     DT5A2   
1 -121.901149   0.265   ML      5          NB 880 rm-n-loop     DT697   
2 -122.328465   0.305   ML      5          Peninsula Ave OC     DT243   
3 -122.065542   0.340   ML      4      Whipple Rd rm-s-diag     DTA20   
4 -122.337721   0.365   ML      5  2000' N of Pennisula Ave     DT275   

           User_ID_2 User_ID_3  User_ID_4  
0  L4-S-27-

In [6]:
# Rename columns for consistency
metadata.rename(columns={
    'ID': 'StationID',
    'Latitude': 'Lat',
    'Longitude': 'Lon'
}, inplace=True)

# Keep only the 3 needed columns
metadata = metadata[['StationID', 'Lat', 'Lon']]

# Drop duplicate rows (if any)
metadata.drop_duplicates(inplace=True)

print("✅ Cleaned metadata:")
print(metadata.head())


✅ Cleaned metadata:
   StationID        Lat         Lon
0     400000  38.081498 -122.547963
1     400001  37.364085 -121.901149
2     400002  37.584097 -122.328465
3     400006  37.605003 -122.065542
4     400007  37.586936 -122.337721


In [7]:
# Path to your cleaned traffic file on Desktop
traffic_path = os.path.expanduser("~/Desktop/pems_5min_cleaned.csv")
traffic = pd.read_csv(traffic_path)

print("✅ Cleaned traffic data loaded.")
print("🧾 Shape:", traffic.shape)


✅ Cleaned traffic data loaded.
🧾 Shape: (36399420, 8)


In [8]:
# Merge the two datasets
merged = pd.merge(traffic, metadata, on='StationID', how='left')

print("✅ Merging completed.")
print("📍 Rows with GPS info:", merged['Lat'].notnull().sum())
print("⚠️ Rows missing GPS:", merged['Lat'].isnull().sum())


✅ Merging completed.
📍 Rows with GPS info: 36428040
⚠️ Rows missing GPS: 0


In [9]:
print("🔍 First 5 rows of merged dataset:")
print(merged.head())


🔍 First 5 rows of merged dataset:
             Timestamp  StationID  Freeway Direction LaneType  TotalFlow  \
0  2025-01-01 00:00:00     401151       80         E       ML       95.0   
1  2025-01-01 00:00:00     403459       80         W       ML       38.0   
2  2025-01-01 00:00:00     403460       80         E       ML       52.0   
3  2025-01-01 00:00:00     403461       80         W       ML       57.0   
4  2025-01-01 00:00:00     403465        1         S       ML       46.0   

   AvgOccupancy  AvgSpeed        Lat         Lon  
0        0.0262      70.2  38.251746 -122.067825  
1        0.0069      67.2  38.033934 -122.251887  
2        0.0100      69.3  38.042156 -122.242272  
3        0.0125      65.9  38.042178 -122.242478  
4        0.0117      68.2  36.986035 -122.024092  


In [None]:
final_path = os.path.expanduser("~/Desktop/pems_5min_cleaned_with_location.csv")
merged.to_csv(final_path, index=False)

print("✅ Final file saved at:")
print(final_path)


✅ Final file saved at:
/Users/spartan/Desktop/pems_5min_cleaned_with_location.csv


In [12]:
import pandas as pd
import os

# Load the final merged file from Desktop
merged_path = os.path.expanduser("~/Desktop/pems_5min_cleaned_with_location.csv")
merged = pd.read_csv(merged_path)

print("✅ File loaded successfully.")


✅ File loaded successfully.


In [13]:
# Step 1: Extract only the columns you need
station_locations = merged[['StationID', 'Lat', 'Lon']]

# Step 2: Drop rows where lat/lon is missing (just in case)
station_locations = station_locations.dropna(subset=['Lat', 'Lon'])

# Step 3: Drop duplicate StationIDs (so each station appears only once)
station_locations = station_locations.drop_duplicates()

# Step 4: Print them all
print("📍 List of all stations with Latitude and Longitude:\n")
for index, row in station_locations.iterrows():
    print(f"StationID: {int(row['StationID'])} | Lat: {row['Lat']} | Lon: {row['Lon']}")


📍 List of all stations with Latitude and Longitude:

StationID: 401151 | Lat: 38.251746 | Lon: -122.067825
StationID: 403459 | Lat: 38.033934 | Lon: -122.251887
StationID: 403460 | Lat: 38.042156 | Lon: -122.242272
StationID: 403461 | Lat: 38.042178 | Lon: -122.242478
StationID: 403465 | Lat: 36.986035 | Lon: -122.024092
StationID: 403471 | Lat: 37.06113 | Lon: -122.004466
StationID: 403475 | Lat: 37.061194 | Lon: -122.004672
StationID: 403524 | Lat: 37.703392 | Lon: -121.508962
StationID: 403525 | Lat: 37.703668 | Lon: -121.508801
StationID: 403744 | Lat: 36.943913 | Lon: -121.502887
StationID: 403745 | Lat: 36.943913 | Lon: -121.502887
StationID: 403902 | Lat: 37.702855 | Lon: -122.472093
StationID: 403904 | Lat: 37.638284 | Lon: -122.440981
StationID: 403906 | Lat: 37.644736 | Lon: -122.448925
StationID: 403908 | Lat: 37.662373 | Lon: -122.465652
StationID: 403910 | Lat: 37.675831 | Lon: -122.470236
StationID: 403458 | Lat: 38.033961 | Lon: -122.251633
StationID: 403946 | Lat: 38.02