## Stations Map Parsing

Data was scraped from this source https://www.aviationweather.gov/docs/metar/stations.txt 

Logic to generate a CSV per location + date combo inside this text file

In [0]:
import json
import pandas 
import numpy
import os
import re
import csv
import pandas as pd

### Weather Station Map Guideline

`
!   CD = 2 letter state (province) abbreviation
!   STATION = 16 character station long name
!   ICAO = 4-character international id
!   IATA = 3-character (FAA) id
!   SYNOP = 5-digit international synoptic number
!   LAT = Latitude (degrees minutes)
!   LON = Longitude (degree minutes)
!   ELEV = Station elevation (meters)
!   M = METAR reporting station.   Also Z=obsolete? site
!   N = NEXRAD (WSR-88D) Radar site
!   V = Aviation-specific flag (V=AIRMET/SIGMET end point, A=ARTCC T=TAF U=T+V)
!   U = Upper air (rawinsonde=X) or Wind Profiler (W) site
!   A = Auto (A=ASOS, W=AWOS, M=Meso, H=Human, G=Augmented) (H/G not yet impl.)
!   C = Office type F=WFO/R=RFC/C=NCEP Center
!   Digit that follows is a priority for plotting (0=highest)
!   Country code (2-char) is last column`


**Total of 16 different fields**

In [0]:
!ls

### Just exploring here

In [0]:
# Using readlines() 
station_maps = open('station_maps/station_map.txt', 'r') 
lines = station_maps.readlines() 
  
count = 0

# Strips the newline character 
for line in lines:
    print("Line{}: {}".format(count, line.strip())) 

### Approach

Was gonna try something fancy, with regex, but every line has a strict number of bytes, so we can just use that to delimit. 

There are 4 different lengths of lines



| Line Typ | Length      |
|-----|---------------|
| Place + Date   | 29 |
| Header   | 79      |
| Data Row   | 84 |
| NewLineCharacter   | 1 |

In [0]:
print("Example 1")
print()
print(len(lines[0]))
print(lines[0])
print()
print(len(lines[1]))
print(lines[1])
print()
print(len(lines[2]))
print(lines[2])

print()
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print()

print("Example 2")
print()

print(len(lines[272])) #This is a new line character
print(lines[272])
print()
print(len(lines[273]))
print(lines[273])
print()
print(len(lines[274]))
print(lines[274])
print(len(lines[275]))
print(lines[275])

### These are all the uppper bound index slots for each field in a given line.

* WEATHER_REV - is treated a bit differently
* PRIORITY - Reverse Indexed
* COUNTRY - Reverse Indexed

In [0]:
#LEN 29
PLACE_i = 19
DATE_i = 29

#LEN 84
CD_i = 2
STATION_i = 20
ICAO_i = 26
IATA_i = 32
SYNOP_i = 39
LAT_i = 47
LONG_i = 57
ELEV_i = 62
WEATHER_REV = 84-ELEV_i

### The Parsing!

Function that reads in lines from .txt file that we've set and generates CSV's for each table,

Total **256** tables, therefore 256 CSV's get generated

In [0]:
#If you pass verbose => True, print debug as the loop runs
def parse_tables(lines, verbose=False):
    
    ####
    #Just for sanity, these values aren't used for anything
    tot_tables = 0
    tot_headers = 0  
    ####
    
    cur_table_name = ""
    cur_table_rows = []
    us_table_names = set()
    
    for cur_line_num, line in enumerate(lines, 1):
        
        #New Table Name
        if len(line) == 29:
            PLACE = re.sub("/","-", line[:PLACE_i].strip())
            PLACE = re.sub(" ", "_", PLACE)
            DATE = re.sub("-","_",line[PLACE_i:DATE_i].strip())
            cur_table_name = PLACE + "-" + DATE
            
            if verbose:
                print(cur_table_name)
                
            tot_tables+=1
            
        #Table Header
        elif len(line) == 79:
            headers = re.sub('\s{2,}',',',line.strip()).split(",")
            headers.extend(["PRIOR", "COUNTRY"])
            cur_table_rows.append(headers)
            tot_headers+=1
            
            if verbose:
                print(headers)
            
            
        #Row Value
        elif len(line) == 84:
            
            CD = line[0:CD_i].strip()
            STATION = re.sub("/","-",line[CD_i:STATION_i].strip())
            ICAO = line[STATION_i:ICAO_i].strip()
            IATA = line[ICAO_i:IATA_i].strip()
            SYNOP = line[IATA_i:SYNOP_i].strip()
            LAT = line[SYNOP_i:LAT_i].strip()
            LONG = line[LAT_i:LONG_i].strip()
            ELEV = line[LONG_i:ELEV_i].strip()
         
            WEATHER_STUFF = line[-WEATHER_REV:-6]
            M, N, V, U, A, C= [WEATHER_STUFF[i:i+3].strip() for i in range(0, len(WEATHER_STUFF), 3)]
               
            PRIOR = line[-5:-3].strip()          
            COUNTRY = line[-3:].strip()
            
            ROW = [CD,STATION,ICAO,IATA,SYNOP,LAT,LONG,ELEV,M,N,V,U,A,C,PRIOR,COUNTRY]

            if verbose:
                print(ROW)

            #Add Row to tracker
            cur_table_rows.append(ROW)
            
            #Not most efficient but works LOL
            if COUNTRY == "US":
                us_table_names.add(cur_table_name)
        
        #New Line character, indicates we are about to start on a new table
        else:

            if verbose:
                print()
                print()
                
            #Just ensuring that we have all the correct number of columns or else something is wrong!
            good = True
            for row in cur_table_rows:
                if len(row) != 16:
                    good = False
                    print(f"SOMETHING IS WRONG - {cur_table_name}")
                    break
            
            #If good then make the CSV!
            if good:
                print(f"GOOD TO MAKE CSV - {cur_table_name}")
                print()
                
                name = cur_table_name
                if cur_table_name in us_table_names:
                    name = "US_STATIONS/"+cur_table_name

                #Make this path if it doesn't exist
                if not os.path.exists('station_csvs'):
                    os.makedirs('station_csvs')
                    os.makedirs('station_csvs/US_STATIONS')                    
                    
                #Write all the rows of current table to a CSV file
                with open(f'station_csvs/{name}.csv', 'w', newline='') as file:
                    writer = csv.writer(file)
                    for liner in cur_table_rows:
                        writer.writerow(liner)
                
                cur_table_name = ""
                cur_table_rows = []             

                
    print("tot_headers",tot_headers)
    print("tot_tables",tot_tables)    

### Generate The CSV's

In [0]:
parse_tables(lines, True)

### One FAT U.S. Stations Table

Figured it might make it easier to join if we just have one huge US table, and filter then join as needed, so now that we have all the individual CSV's figured might as well build one joined table

In [0]:
path = "station_csvs/US_STATIONS"
us_station_filenames = os.listdir(path)
print(len(us_station_filenames))
us_station_filenames


In [0]:
fout=open("us_stations_all.csv","a")

# now the rest:    
for file_ind, file_name in enumerate(us_station_filenames):
    with open("station_csvs/US_STATIONS/"+file_name) as f:
        for ind,line in enumerate(f):
            if ind == 0 and file_ind !=   0:
                pass
            else:
                fout.write(line)
fout.close()

### Verify the number of states is covered by FAT table

In [0]:
us_all = pd.read_csv("us_stations_all.csv")
us_all.head()

Unnamed: 0,CD,STATION,ICAO,IATA,SYNOP,LAT,LONG,ELEV,M,N,V,U,A,C,PRIOR,COUNTRY
0,MT,ALPHA,K1AM,1AM,,47 17N,110 48W 12,28,X,,,,,,9,US
1,MT,ANACONDA,K3U3,,,46 09N,112 52W 15,34,X,,,,,,7,US
2,MT,BAKER,KBHK,,,46 21N,104 15W 9,4,X,,,,A,,7,US
3,MT,BIG TIMBER,K6S0,6S0,,45 48N,109 58W 13,64,X,,,,,,6,US
4,MT,BILLINGS,KBIL,BIL,72677.0,45 48N,108 33W 10,89,X,,U,,,,6,US


In [0]:
print(len(us_all["CD"].unique()))
us_all["CD"].unique()