## Introduction

We'll be working with a data set that contains 77,282 aviation accidents that occurred in the U.S., and the metadata associated with them. The data in our AviationData.txt

In [1]:
aviation_list = list()
aviation_data = list()

with open('AviationData.txt', 'r') as f:
    for line in f:
        aviation_data.append(line)   
    for entry in aviation_data:
        words = entry.split('|')
        words_mod = [word.strip() for word in words]
        aviation_list.append(words_mod)

print(aviation_list[0], end='\n\n') # header
print(aviation_data[1])
print(aviation_list[1])

['Event Id', 'Investigation Type', 'Accident Number', 'Event Date', 'Location', 'Country', 'Latitude', 'Longitude', 'Airport Code', 'Airport Name', 'Injury Severity', 'Aircraft Damage', 'Aircraft Category', 'Registration Number', 'Make', 'Model', 'Amateur Built', 'Number of Engines', 'Engine Type', 'FAR Description', 'Schedule', 'Purpose of Flight', 'Air Carrier', 'Total Fatal Injuries', 'Total Serious Injuries', 'Total Minor Injuries', 'Total Uninjured', 'Weather Condition', 'Broad Phase of Flight', 'Report Status', 'Publication Date', '']

20150908X74637 | Accident | CEN15LA402 | 09/08/2015 | Freeport, IL | United States | 42.246111 | -89.581945 | KFEP | albertus Airport | Non-Fatal | Substantial | Unknown | N24TL | CLARKE REGINALD W | DRAGONFLY MK |  |  |  | Part 91: General Aviation |  | Personal |  |  | 1 |  |  | VMC | TAKEOFF | Preliminary | 09/09/2015 | 

['20150908X74637', 'Accident', 'CEN15LA402', '09/08/2015', 'Freeport, IL', 'United States', '42.246111', '-89.581945', 'KFEP'

## Linear Search

In [2]:
def linear_search(list_, code): #O(n^2)
    holder = list()
    for row in list_:
        for item in row:
            if item == code:
                holder.append(row)
                break
    return holder

lax_code = linear_search(aviation_list, "LAX94LA336")
print(lax_code)

[['20001218X45447', 'Accident', 'LAX94LA336', '07/19/1962', 'BRIDGEPORT, CA', 'United States', '', '', '', '', 'Fatal(4)', 'Destroyed', '', 'N5069P', 'PIPER', 'PA24-180', 'No', '1', 'Reciprocating', '', '', 'Personal', '', '4', '0', '0', '0', 'UNK', 'UNKNOWN', 'Probable Cause', '09/19/1996', '']]


We are able to search the data for a particular value with linear search, but return data is messy

## Binary Search

In [3]:
def to_dictionary(list_):
    names = list()
    data = list()
    
    for entry in list_:
        words = entry.split('|')
        words_mod = [word.strip() for word in words] 
        words_mod.pop()      # remove last ''
        data.append(words_mod)
    
    names = data[0]
    data = data[1:]
    
    dict_list = list()
    for row in data:
        dict_ = dict() # hold each row
        for k in range(len(names)):
            # row[k] is names[k] corresponding value
            dict_[names[k]] = row[k]
        dict_list.append(dict_)
    return dict_list

adl = to_dictionary(aviation_data)
adl[1]

{'Event Id': '20150906X32704',
 'Investigation Type': 'Accident',
 'Accident Number': 'ERA15LA339',
 'Event Date': '09/05/2015',
 'Location': 'Laconia, NH',
 'Country': 'United States',
 'Latitude': '43.606389',
 'Longitude': '-71.452778',
 'Airport Code': 'LCI',
 'Airport Name': 'Laconia Municipal Airport',
 'Injury Severity': 'Fatal(1)',
 'Aircraft Damage': 'Substantial',
 'Aircraft Category': 'Weight-Shift',
 'Registration Number': 'N2264X',
 'Make': 'EVOLUTION AIRCRAFT INC',
 'Model': 'REVO',
 'Amateur Built': 'No',
 'Number of Engines': '1',
 'Engine Type': 'Reciprocating',
 'FAR Description': 'Part 91: General Aviation',
 'Schedule': '',
 'Purpose of Flight': 'Personal',
 'Air Carrier': '',
 'Total Fatal Injuries': '1',
 'Total Serious Injuries': '',
 'Total Minor Injuries': '',
 'Total Uninjured': '',
 'Weather Condition': 'VMC',
 'Broad Phase of Flight': 'MANEUVERING',
 'Report Status': 'Preliminary',
 'Publication Date': '09/10/2015'}

In [4]:
aviation_dict_list = sorted(adl, key=lambda k: k['Accident Number']) # O(nlogn)
print(adl[:3]); print();
print(adl[-3:])

[{'Event Id': '20150908X74637', 'Investigation Type': 'Accident', 'Accident Number': 'CEN15LA402', 'Event Date': '09/08/2015', 'Location': 'Freeport, IL', 'Country': 'United States', 'Latitude': '42.246111', 'Longitude': '-89.581945', 'Airport Code': 'KFEP', 'Airport Name': 'albertus Airport', 'Injury Severity': 'Non-Fatal', 'Aircraft Damage': 'Substantial', 'Aircraft Category': 'Unknown', 'Registration Number': 'N24TL', 'Make': 'CLARKE REGINALD W', 'Model': 'DRAGONFLY MK', 'Amateur Built': '', 'Number of Engines': '', 'Engine Type': '', 'FAR Description': 'Part 91: General Aviation', 'Schedule': '', 'Purpose of Flight': 'Personal', 'Air Carrier': '', 'Total Fatal Injuries': '', 'Total Serious Injuries': '1', 'Total Minor Injuries': '', 'Total Uninjured': '', 'Weather Condition': 'VMC', 'Broad Phase of Flight': 'TAKEOFF', 'Report Status': 'Preliminary', 'Publication Date': '09/09/2015'}, {'Event Id': '20150906X32704', 'Investigation Type': 'Accident', 'Accident Number': 'ERA15LA339', '

In [5]:
def bin_search(dict_list, target):  #O(logn)
    length = len(dict_list) 
    
    ub = length - 1
    lb = 0
    
    index = (ub + lb) // 2
    guess = dict_list[index]['Accident Number']
    while target != guess:
        if lb >= ub:
            return None
        
        if target < guess:
            ub = index - 1
        else:
            lb = index + 1
        
        index = (ub + lb) // 2
        guess = dict_list[index]['Accident Number']
    return dict_list[index]

print(bin_search(aviation_dict_list, 'LAX94LA336'))

{'Event Id': '20001218X45447', 'Investigation Type': 'Accident', 'Accident Number': 'LAX94LA336', 'Event Date': '07/19/1962', 'Location': 'BRIDGEPORT, CA', 'Country': 'United States', 'Latitude': '', 'Longitude': '', 'Airport Code': '', 'Airport Name': '', 'Injury Severity': 'Fatal(4)', 'Aircraft Damage': 'Destroyed', 'Aircraft Category': '', 'Registration Number': 'N5069P', 'Make': 'PIPER', 'Model': 'PA24-180', 'Amateur Built': 'No', 'Number of Engines': '1', 'Engine Type': 'Reciprocating', 'FAR Description': '', 'Schedule': '', 'Purpose of Flight': 'Personal', 'Air Carrier': '', 'Total Fatal Injuries': '4', 'Total Serious Injuries': '0', 'Total Minor Injuries': '0', 'Total Uninjured': '0', 'Weather Condition': 'UNK', 'Broad Phase of Flight': 'UNKNOWN', 'Report Status': 'Probable Cause', 'Publication Date': '09/19/1996'}


## Exploring Data

### Find accidents that happened in USA

In [6]:
import numpy as np

check = []
for event in aviation_dict_list:
    check.append(event['Country'])
countries = pd.Series(list(set(check)))
us_name = ['usa', 'United States', 'USA', 'America', 'United States of America']
countries[countries.isin(us_name)]

NameError: name 'pd' is not defined

In [None]:
from collections import Counter

def count_usa(dl): # input: dict_list
    states = list()
    for event in dl:
        if event['Country'] == 'United States':
            states.append(event['Location'].split(',')[-1].strip())
    state_count = Counter(states)
    return state_count

usa_accidents = count_usa(aviation_dict_list)
usa_accidents.most_common(5)

### Count Injuries by Month and Year

In [None]:
# for i in range(5):
#     print(aviation_dict_list[i]['Event Date'].split('/'))
#     print(aviation_dict_list[i]['Event Id'][4:6])
#     print(aviation_dict_list[i]['Event Id'][0:4])

In [None]:
def count_month_injuries(dl):
    mapping = {"01":"January",
                "02":"February",
                "03":"March",
                "04":"April",
                "05":"May",
                "06":"June",
                "07":"July",
                "08":"August",
                "09":"September",
                "10":"October",
                "11":"November",
                "12":"December"}
    
    counter_by_month = {}
    
    for event in dl:
        injuries = 0
        if event['Event Date'] != '':     # skip '' date
            month = event['Event Date'].split('/')[0]
            year = event['Event Date'].split('/')[-1]           
            try:
                month = mapping[month]
            except KeyError:
                month = event['Event Id'][4:6]
                month = mapping[month]
                year = event['Event Id'][0:4]
            
            full_month = ' '.join((month, year))
            
            if event['Total Fatal Injuries'] == '':
                event['Total Fatal Injuries'] = '0'
            if event['Total Serious Injuries'] == '':
                event['Total Serious Injuries'] = '0'
                
            injuries += int(event['Total Fatal Injuries']) + int(event['Total Serious Injuries'])
            counter_by_month[full_month] = injuries
            counter_by_month = Counter(counter_by_month)

    return counter_by_month

month_injuries = count_month_injuries(aviation_dict_list)
month_injuries.most_common(5)            

## Count Accidents by Month and Year

In [None]:
def count_month_accidents(dl):
    mapping = {"01":"January",
                "02":"February",
                "03":"March",
                "04":"April",
                "05":"May",
                "06":"June",
                "07":"July",
                "08":"August",
                "09":"September",
                "10":"October",
                "11":"November",
                "12":"December"}
    
    months = list()
    
    for event in dl:
        injuries = 0
        if event['Event Date'] != '':     # skip '' date
            month = event['Event Date'].split('/')[0]
            year = event['Event Date'].split('/')[-1]           
            try:
                month = mapping[month]
            except KeyError:
                month = event['Event Id'][4:6]
                month = mapping[month]
                year = event['Event Id'][0:4]
            full_month = ' '.join((month, year))
            months.append(full_month)
    count_by_month = Counter(months)

    return count_by_month

month_accidents = count_month_accidents(aviation_dict_list)
month_accidents.most_common(5)        

Worst months for injuries and for accidents are not corresponding!

it appears we can study the data about:
- Map out accidents using the basemap library for matplotlib.
- Count the number of accidents by air carrier.
- Count the number of accidents by airplane make and model.
- Figure out what percentage of accidents occur under adverse weather conditions.
