In [5]:
# import
import pandas as pd
import numpy as np
from itertools import *
import os

In [6]:
# Read the raw state file, restricted to the columns the later cells use,
# and order the rows by aircraft id and then by timestamp.

data_directory = 'data/'
filename = 'states_2017-08-28-00.csv'
usecols = ['time', 'icao24', 'lat', 'lon', 'velocity', 'heading', 'baroaltitude']

raw_path = os.path.join(data_directory, filename)
month_df = pd.read_csv(raw_path, usecols=usecols).sort_values(by=['icao24', 'time'])

## Generates one big CSV file of NMAC data

## Old code that was used to generate CSVs of a flight path for each plane

In [7]:
import math
import geopy
import geopy.distance

def compute_distance(point_1, point_2):
    '''
    Surface distance between two positions, in kilometres.

    Parameters:
        - point_1 -> list in order of [lat,long, alt]
        - point_2 -> list in order of [lat,long, alt]

    Returns:
        Distance between two points in km using geopy. Only lat and long
        take part in the measurement; alt is accepted but not used.
    '''
    # Build the points from lat/long only. geopy 2.x raises ValueError when
    # asked to measure between points whose altitudes differ, and geopy 1.x
    # ignored the altitude anyway, so dropping it keeps the old result.
    p1 = geopy.point.Point(point_1[0], point_1[1])
    p2 = geopy.point.Point(point_2[0], point_2[1])

    # `vincenty` was removed in geopy 2.0; `geodesic` is its replacement.
    # Fall back to `vincenty` only on old geopy releases without `geodesic`.
    measure = getattr(geopy.distance, 'geodesic', None) or geopy.distance.vincenty
    return measure(p1, p2).km

In [8]:
# generating NMAC data
def gen_NMAC(states=None, output_path="NMAC.csv", max_distance_km=200):
    '''
    Find pairs of alerting aircraft that are close together at the same
    timestamp and write every matching pair of rows to one CSV file.

    Parameters:
        - states -> DataFrame of state rows. It must contain every column in
          `pair_fields` below (note that `month_df`, as loaded in this
          notebook, does not include alert, vertrate, onground,
          lastposupdate or lastcontact). Defaults to the notebook-level `df`.
        - output_path -> CSV file to write (default "NMAC.csv")
        - max_distance_km -> two rows match when compute_distance() is
          strictly below this value.
          NOTE(review): compute_distance returns kilometres, so the default
          means 200 km - confirm this is the intended threshold.

    Returns:
        None. One output row is written per matching (row, row2) pair; all
        rows of one ordered aircraft pair share an NMAC_id.
        NOTE(review): both orderings (A, B) and (B, A) are visited, so each
        encounter appears twice with different NMAC_ids.
    '''
    if states is None:
        # Fall back to the notebook-level frame the original code read from.
        states = df

    # Per-aircraft fields copied into the output, once with suffix _1 and
    # once with suffix _2.
    pair_fields = ['time', 'icao24', 'lat', 'lon', 'velocity', 'heading',
                   'vertrate', 'onground', 'alert', 'baroaltitude',
                   'lastposupdate', 'lastcontact']
    output_columns = (['NMAC_id']
                      + [field + '_1' for field in pair_fields]
                      + [field + '_2' for field in pair_fields])

    # get rows that were in an NMAC
    NMAC_icao = states[states['alert'] == True]
    # get rows that were in an NMAC and have readily available data (i.e. remove rows with NaN for lat long etc)
    cleaned_NMAC_icao = NMAC_icao.dropna()
    unique_NMAC_icao = cleaned_NMAC_icao.icao24.unique()

    print('There are {} unique potential NMAC icao planes'.format(len(unique_NMAC_icao)))

    # Slice the rows of every aircraft once instead of re-filtering the whole
    # frame inside the nested loop.
    rows_by_icao = {
        icao: plane_rows
        for icao, plane_rows in cleaned_NMAC_icao.groupby('icao24', sort=False)
    }

    # Matching pairs are collected as plain dicts and turned into a frame once
    # at the end; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    records = []

    NMAC_id = 0
    for count, name in enumerate(unique_NMAC_icao):
        # print stats
        if count % 100 == 0:
            print('Iterated through {} icao planes so far...'.format(count))
        current_icao = rows_by_icao[name]

        for second_name in unique_NMAC_icao:
            if name == second_name:
                continue
            second_icao = rows_by_icao[second_name]

            incremented_NMAC_id = False
            for index, row in current_icao.iterrows():
                # Only rows with the same timestamp can match, so filter on
                # time before paying for the geodesic distance.
                same_time = second_icao[second_icao['time'] == row['time']]
                for index2, row2 in same_time.iterrows():
                    point_1 = [row['lat'], row['lon'], row['baroaltitude']]
                    point_2 = [row2['lat'], row2['lon'], row2['baroaltitude']]

                    if compute_distance(point_1, point_2) < max_distance_km:
                        # A new id is handed out once per ordered aircraft pair.
                        if not incremented_NMAC_id:
                            NMAC_id += 1
                            incremented_NMAC_id = True

                        record = {'NMAC_id': NMAC_id}
                        for field in pair_fields:
                            record[field + '_1'] = row[field]   # first plane
                        for field in pair_fields:
                            record[field + '_2'] = row2[field]  # second plane
                        records.append(record)

    # Explicit columns keep the header stable even when nothing matched.
    df_final = pd.DataFrame(records, columns=output_columns)

    # write to csv file
    df_final.to_csv(output_path, index=False)
    print('Saved files')

# NOTE: with no argument this reads the notebook-level `df`, which must be a
# frame holding all the columns listed in the docstring.
gen_NMAC()

NameError: name 'df' is not defined

In [None]:
# get unique icao (unique plane IDs)
# `df` is not defined anywhere in the visible notebook; `month_df` is the frame
# that the load cell creates, so read the ids from it.
unique_icao = month_df.icao24.unique()
print(unique_icao, len(unique_icao))

In [None]:
# generating CSVs for flight path data for each plane (might not be used)
# Reads from `month_df` (the frame the load cell creates); the `df` used here
# before is not defined anywhere in the visible notebook.

# to_csv does not create missing folders, so make sure the target exists first
per_plane_dir = 'cleaned/'
if not os.path.exists(per_plane_dir):
    os.makedirs(per_plane_dir)

csv_count = 0
iteration = 0
# for each unique icao, create CSV for it
for name in unique_icao:
    # print stats
    if iteration % 500 == 0:
        print('Iterated through {} icao names'.format(iteration))
    iteration += 1

    # rows of this plane only; selected once and reused for the check and the write
    one_plane_df = month_df[month_df['icao24'] == name]

    # Make sure columns have data, otherwise skip
    if one_plane_df.isnull().values.any():
        continue

    # print stats
    if csv_count % 100 == 0:
        print('Wrote {} csvs files'.format(csv_count))
    csv_count += 1

    one_plane_df.to_csv(os.path.join(per_plane_dir, "{}.csv".format(name)), index=False)

print('Done')
print('Saved {} csv files'.format(csv_count))

## Clean month_df, add cols for displacement instead of lat and lon, save to csv for simple RNN network input

In [None]:
# get unique icao (unique plane IDs)
# Taken from `month_df`, the same frame the following cell filters by these ids.
unique_icao = month_df.icao24.unique()
print(unique_icao, len(unique_icao))

In [None]:
# make df and write to csv
#
# For every plane without missing values, replace lat/lon by the change in
# lon (x_displacement) and lat (y_displacement) since that plane's previous
# row; the first row of each plane gets 0 for both. The result is written to
# cleaned/<input file name>.

# column layout of the output file
output_columns = ['time', 'icao24', 'x_displacement', 'y_displacement',
                  'velocity', 'heading', 'baroaltitude']

# One finished frame per plane; they are concatenated once at the end because
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in pandas 2.x.
plane_frames = []

# groupby hands over each plane's rows directly (in month_df's row order),
# which avoids building a full boolean mask per plane. sort=False keeps the
# planes in order of first appearance.
for iteration, (name, one_plane_df) in enumerate(month_df.groupby('icao24', sort=False)):
    # print stats
    if iteration % 100 == 0:
        print('Iterated through {} icao names'.format(iteration))

    # Make sure columns have data, otherwise skip plane
    if one_plane_df.isnull().values.any():
        continue

    # diff() is "this row minus the previous row"; the first row has no
    # predecessor and yields NaN, which becomes the 0 displacement.
    plane_frames.append(
        one_plane_df.assign(
            x_displacement=one_plane_df['lon'].diff().fillna(0),
            y_displacement=one_plane_df['lat'].diff().fillna(0),
        )[output_columns]
    )

if plane_frames:
    df_final = pd.concat(plane_frames, ignore_index=True)
else:
    # no usable plane: still produce a frame with the expected header
    df_final = pd.DataFrame(columns=output_columns)

print('Shape of df_final: {}'.format(df_final.shape))


# write to file

clean_dir = 'cleaned/'
if not os.path.exists(clean_dir):
    os.makedirs(clean_dir)

df_final.to_csv(os.path.join(clean_dir, filename), index=False)

print('Saved file')

In [None]:
# Persist df_final under the cleaned/ folder, reusing the input file's name

clean_dir = 'cleaned/'
already_there = os.path.exists(clean_dir)
if not already_there:
    os.makedirs(clean_dir)

cleaned_path = clean_dir + '{}'.format(filename)
df_final.to_csv(cleaned_path, index=False)

print('Saved file')

## Padding data

After submitting our initial findings, we now need to collect the data and pad it so that every training example has the same sequence length (BATCH_SIZE time steps). We need to do this because PyTorch and TensorFlow require inputs of the same sequence length. All training examples are written to the same CSV file.

First, iterate through each icao and keep the data points where there was a nonzero displacement between two consecutive time steps.

In [9]:
# Number of rows (time steps) in one training example. Shorter runs are padded
# up to this length by pad_buffered_rows, and a run is flushed to the output
# as soon as it reaches this length.
BATCH_SIZE = 15 # length of training example

In [10]:
# Collect every distinct aircraft identifier present in month_df
unique_icao = month_df['icao24'].unique()
print(unique_icao, len(unique_icao))

['00741d' '008fff' '00fee0' ..., 'e8407d' 'e90d0b' 'e90d0f'] 6097


In [11]:

def pad_buffered_rows(buffered_rows, target_length=None):
    '''
    Left-pad a run of rows to a fixed length by repeating its first row.

    Parameters:
        - buffered_rows -> list of rows (any objects) in time order
        - target_length -> length to pad to; defaults to BATCH_SIZE

    Returns:
        New list: copies of buffered_rows[0] followed by all of buffered_rows.
        A run that already has target_length or more rows is returned
        unpadded, and an empty run gives an empty list (there is no row to
        repeat).
    '''
    if target_length is None:
        target_length = BATCH_SIZE

    if not buffered_rows:
        return []

    missing = max(target_length - len(buffered_rows), 0)
    return [buffered_rows[0]] * missing + list(buffered_rows)

# Build fixed-length training examples and write them to one CSV.
#
# For every plane without missing values, walk its rows in time order and
# compute the change in lon (x_displacement) and lat (y_displacement) since
# the previous row. Consecutive rows with non-zero displacement form a run:
#   - a run that reaches BATCH_SIZE rows is written out as is,
#   - a run cut short (by a zero-displacement row or the end of the plane's
#     data) is left-padded with pad_buffered_rows if it has more than 2 rows,
#     and dropped otherwise.

# column layout of the output file
output_columns = ['time', 'icao24', 'lat', 'lon', 'x_displacement',
                  'y_displacement', 'velocity', 'heading', 'baroaltitude']

# Output rows are collected as plain dicts and turned into a frame once at the
# end; growing a DataFrame row by row is quadratic and DataFrame.append no
# longer exists in pandas 2.x.
records = []

# groupby hands over each plane's rows directly (in month_df's row order),
# which avoids building a full boolean mask per plane. sort=False keeps the
# planes in order of first appearance.
for iteration, (name, one_plane_df) in enumerate(month_df.groupby('icao24', sort=False)):
    # print stats
    if iteration % 100 == 0:
        print('Iterated through {} icao names'.format(iteration))

    # Make sure columns have data, otherwise skip plane
    if one_plane_df.isnull().values.any():
        continue

    # rows of the run currently being collected
    buffered_rows = []

    # The first row is only the reference position for the first
    # displacement; it is not written to the output itself.
    prev_lat = one_plane_df['lat'].iloc[0]
    prev_lon = one_plane_df['lon'].iloc[0]

    for state in one_plane_df.iloc[1:].itertuples(index=False):
        x_disp = state.lon - prev_lon
        y_disp = state.lat - prev_lat

        # Always move the reference to the current row, including rows that
        # are skipped below. Otherwise the displacement of the row after a
        # skipped one would span two time steps.
        prev_lat = state.lat
        prev_lon = state.lon

        # NOTE(review): a row ends the run when EITHER component is zero, so a
        # plane moving exactly along one axis is treated as not moving.
        # Confirm whether `and` was intended.
        if x_disp == 0 or y_disp == 0:
            # if greater than 2 in buffer, pad and keep them
            if len(buffered_rows) > 2:
                records.extend(pad_buffered_rows(buffered_rows))
            buffered_rows = []
            continue

        buffered_rows.append({
            'time': state.time,
            'icao24': state.icao24,
            'lat': state.lat,
            'lon': state.lon,
            'x_displacement': x_disp,
            'y_displacement': y_disp,
            'velocity': state.velocity,
            'heading': state.heading,
            'baroaltitude': state.baroaltitude,
        })

        # a full-length run needs no padding
        if len(buffered_rows) >= BATCH_SIZE:
            records.extend(buffered_rows)
            buffered_rows = []

    # leftover run at the end of this plane's data
    if len(buffered_rows) > 2:
        records.extend(pad_buffered_rows(buffered_rows))

# Explicit columns keep the header stable even when no run qualified.
df_final = pd.DataFrame(records, columns=output_columns)

print('Shape of df_final: {}'.format(df_final.shape))

# write to file
# NOTE: this is the same path the displacement cells earlier in the notebook
# write to, so running this cell replaces that file.

clean_dir = 'cleaned/'
if not os.path.exists(clean_dir):
    os.makedirs(clean_dir)

df_final.to_csv(os.path.join(clean_dir, filename), index=False)

print('Saved file')

Iterated through 0 icao names
Iterated through 100 icao names
Iterated through 200 icao names
Iterated through 300 icao names
Iterated through 400 icao names
Iterated through 500 icao names
Iterated through 600 icao names
Iterated through 700 icao names
Iterated through 800 icao names
Iterated through 900 icao names
Iterated through 1000 icao names
Iterated through 1100 icao names
Iterated through 1200 icao names
Iterated through 1300 icao names
Iterated through 1400 icao names
Iterated through 1500 icao names
Iterated through 1600 icao names
Iterated through 1700 icao names
Iterated through 1800 icao names
Iterated through 1900 icao names
Iterated through 2000 icao names
Iterated through 2100 icao names
Iterated through 2200 icao names
Iterated through 2300 icao names
Iterated through 2400 icao names
Iterated through 2500 icao names
Iterated through 2600 icao names
Iterated through 2700 icao names
Iterated through 2800 icao names
Iterated through 2900 icao names
Iterated through 3000 