# Load in Data

In [None]:
# Load in data
import pandas as pd
# Read the raw CSV (relative path) and parse the two date columns as datetimes up front
df = pd.read_csv('../Data/Aviation_Data.csv', parse_dates=['Event.Date', 'Publication.Date'])
# Preview the first rows
df.head()

# Data Description

In [None]:
# Frequency of each engine count in the raw data
df['Number.of.Engines'].value_counts()

In [None]:
# (rows, columns) of the raw data
df.shape

In [None]:
# Summary statistics, with datetime columns summarised like numeric ones.
# NOTE(review): `datetime_is_numeric` was removed from describe() in pandas 2.0 —
# confirm the pinned pandas version still accepts this keyword.
df.describe(datetime_is_numeric=True)

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of missing values per column
df.isna().sum()

In [None]:
# Frequency of each value in 'Air.carrier'
df['Air.carrier'].value_counts()

In [None]:
# Frequency of each value in 'Purpose.of.flight'
df['Purpose.of.flight'].value_counts()

In [None]:
# Frequency of each value in 'FAR.Description'
df['FAR.Description'].value_counts()

In [None]:
# Frequency of each value in 'Schedule'
df['Schedule'].value_counts()

In [None]:
# Frequency of each value in 'Report.Status'
df['Report.Status'].value_counts()

# Making Subset

In [None]:
df_subset = df[['Event.Date', 'Location', 'Country', 'Number.of.Engines',
                'Aircraft.damage', 'Aircraft.Category', 'Make', 'Model', 
                'Amateur.Built', 'Engine.Type', 'FAR.Description',
                'Total.Fatal.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured', 'Total.Serious.Injuries', 
                'Weather.Condition']]


In [None]:
df_subset = df_subset.rename(columns = lambda x: x.lower())

In [None]:
# Keeping only airplanes
df_subset = df_subset.loc[df_subset['aircraft.category'] == 'Airplane']

In [None]:
# Keeping only non-amateur built
df_subset = df_subset.loc[df_subset['amateur.built'] == 'No']

In [None]:
# Dropping amateur built and aircraft category
df_subset = df_subset[['event.date', 'location', 'country', 'number.of.engines',
                'aircraft.damage', 'make', 'model', 'total.serious.injuries',
                'engine.type', 'total.fatal.injuries', 'total.minor.injuries', 'total.uninjured', 
                'weather.condition']]

In [None]:
# Keeping only the year from event.date (taken from the datetime value, not by slicing a string)
# The column names were already lower-cased in an earlier cell, so this rename leaves them unchanged.
df_subset = df_subset.rename(columns = lambda x: x.lower())
# to_datetime guarantees a datetime dtype before the .dt accessor is used; .dt.year keeps the year only.
df_subset['event.date'] = pd.to_datetime(df_subset['event.date']).dt.year


In [None]:
# Keeping only the state abbreviations from 'location' column into a new column.
# The state is the piece after the first comma; strip it so the whitespace that
# follows the comma (e.g. "CITY, ST" -> " ST") does not end up in the value.
# Rows whose location has no comma (or is missing) stay NaN.
df_subset['state'] = df_subset['location'].str.split(",").str[1].str.strip()
df_subset = df_subset.drop(columns=['location'])

In [None]:
# Cleaning typos from weather.condition. Same value entered differently
# 'Unk' and 'UNK' are merged under the single label 'UNK'.
replace_dict = {'Unk':'UNK'}
df_subset['weather.condition'] = df_subset['weather.condition'].replace(replace_dict)
# Check the remaining distinct labels
df_subset['weather.condition'].value_counts()

In [None]:
# Cleaning typos from engine.type. Same value entered differently
# Inspect the labels first to spot duplicates
df_subset['engine.type'].value_counts()

In [None]:
# 'UNK' is folded into the 'Unknown' label
replace_dict3 = {'UNK':'Unknown'}
df_subset['engine.type'] = df_subset['engine.type'].replace(replace_dict3)
# Check the remaining distinct labels
df_subset['engine.type'].value_counts()

In [None]:
# (rows, columns) after filtering
df_subset.shape

In [None]:
# Missing values per column in the subset
df_subset.isna().sum()

In [None]:
# dtype of each column
df_subset.dtypes

In [None]:
# Summary statistics
df_subset.describe()

In [None]:
# Column dtypes and non-null counts
df_subset.info()

In [None]:
# Frequency of each engine count in the subset
df_subset['number.of.engines'].value_counts()

# Dealing with Nulls


In [None]:
# Filling the NaN's in the 4 injuries columns with 0's

# Assign the filled columns back instead of calling fillna(inplace=True) on a
# column selection: the in-place form is chained assignment, which warns on
# recent pandas and no longer updates the DataFrame under Copy-on-Write.
injury_columns = ['total.serious.injuries', 'total.fatal.injuries',
                  'total.minor.injuries', 'total.uninjured']
df_subset[injury_columns] = df_subset[injury_columns].fillna(0)

# Confirm the injury columns no longer contain nulls
df_subset.isna().sum()

In [None]:
# Dropping rows in columns that have very few nulls

df_subset2 = df_subset.dropna(subset=['country', 'make', 'model', 'state'])
df_subset2.isna().sum()


In [None]:
df_subset2.shape

In [None]:
# Replacing nulls in aircraft.damage column
# Look at the existing labels before choosing a placeholder
df_subset2['aircraft.damage'].value_counts()

In [None]:
# Missing damage values get the placeholder label 'N/A'.
# NOTE(review): pandas.read_csv treats the literal 'N/A' as missing by default, so
# these placeholders will presumably come back as NaN if the saved CSV is reloaded
# without keep_default_na=False — confirm against downstream readers.
df_subset2['aircraft.damage'] = df_subset2['aircraft.damage'].fillna('N/A')

In [None]:
# Combining 'unknown's in aircraft.damage with n/a
replace_dict2 = {'Unknown':'N/A'}
df_subset2['aircraft.damage'] = df_subset2['aircraft.damage'].replace(replace_dict2)
df_subset2['aircraft.damage'].value_counts()

In [None]:
# Remaining nulls per column
df_subset2.isna().sum()

In [None]:
# Replacing nulls in engine.type column
# Look at the existing labels first
df_subset2['engine.type'].value_counts()

In [None]:
# Missing engine types join the existing 'Unknown' label
df_subset2['engine.type'] = df_subset2['engine.type'].fillna('Unknown')

In [None]:
# Remaining nulls per column
df_subset2.isna().sum()

In [None]:
# Replacing nulls in weather.condition column
# Look at the existing labels first
df_subset2['weather.condition'].value_counts()


In [None]:
# Missing weather values get the placeholder 'N/A' (the existing 'UNK' label is kept separate)
df_subset2['weather.condition'] = df_subset2['weather.condition'].fillna('N/A')

In [None]:
# Remaining nulls per column
df_subset2.isna().sum()

In [None]:
# dealing with number.of.engines nulls
df_subset2['number.of.engines'].median()

In [None]:
df_subset2['number.of.engines'] = df_subset2['number.of.engines'].fillna(df_subset2['number.of.engines'].median())

In [None]:
df_subset2['number.of.engines'] = df_subset2['number.of.engines'].astype(int)


In [None]:
df_subset2['number.of.engines'].value_counts()

# Changing injuries str columns to int columns

In [None]:
df_subset2.dtypes

In [None]:
df_subset2['total.serious.injuries'] = df_subset2['total.serious.injuries'].astype(int)
df_subset2['total.fatal.injuries'] = df_subset2['total.fatal.injuries'].astype(int)
df_subset2['total.minor.injuries'] = df_subset2['total.minor.injuries'].astype(int)
df_subset2['total.uninjured'] = df_subset2['total.uninjured'].astype(int)


In [None]:
df_subset2.dtypes

# Combining serious and minor injuries columns into 'nonfatal' columns

In [None]:
df_subset2['total.nonfatal.injuries'] = df_subset2['total.minor.injuries'] + df_subset2['total.serious.injuries']

In [None]:
df_subset2 = df_subset2[['event.date', 'country', 'state', 'number.of.engines',
                'aircraft.damage', 'make', 'model', 'engine.type', 'total.fatal.injuries', 
                 'total.nonfatal.injuries',
                'total.uninjured', 
                'weather.condition']]
df_subset2.info()

# Calculating/creating new injuries percentage columns

In [None]:
# Turning injury columns into percentage of total passengers
# Everyone on board: fatal + non-fatal + uninjured
total_on_board = (df_subset2['total.fatal.injuries'] +
                  df_subset2['total.nonfatal.injuries'] +
                  df_subset2['total.uninjured'])

# Each count column becomes a percentage of the total, rounded to one decimal
for count_column, percent_column in (
    ('total.fatal.injuries', 'fatal.injuries.perc'),
    ('total.nonfatal.injuries', 'nonfatal.injuries.perc'),
    ('total.uninjured', 'uninjured.perc'),
):
    share = df_subset2[count_column] / total_on_board
    df_subset2[percent_column] = (share * 100).round(1)

In [None]:
# Preview the new percentage columns
df_subset2.head()

In [None]:
# (rows, columns)
df_subset2.shape

In [None]:
# Summary statistics
df_subset2.describe()

In [None]:
# Column dtypes and non-null counts
df_subset2.info()

In [None]:
# (repeated check of dtypes and non-null counts)
df_subset2.info()

In [None]:
# Nulls per column; presumably the percentage columns are NaN where all three
# counts are 0 (0 / 0) — confirm against the output
df_subset2.isna().sum()

In [None]:
df_subset2['fatal.injuries.perc'].fillna(0, inplace=True)
df_subset2['nonfatal.injuries.perc'].fillna(0, inplace=True)
df_subset2['uninjured.perc'].fillna(0, inplace=True)

In [None]:
df_subset2.isna().sum()

In [None]:
df_subset2.info()

In [None]:
df_subset2['fatal.injuries.perc'] = df_subset2['fatal.injuries.perc'].astype(int)
df_subset2['nonfatal.injuries.perc'] = df_subset2['nonfatal.injuries.perc'].astype(int)
df_subset2['uninjured.perc'] = df_subset2['uninjured.perc'].astype(int)

In [None]:
df_subset2.info()

In [None]:
# Keep the percentage columns in place of the raw injury counts
final_columns = [
    'event.date', 'country', 'state', 'number.of.engines',
    'aircraft.damage', 'make', 'model', 'engine.type',
    'fatal.injuries.perc', 'nonfatal.injuries.perc', 'uninjured.perc',
    'weather.condition',
]
df_subset2 = df_subset2.loc[:, final_columns]
df_subset2.info()

# Translating aircraft.damage into numerical scale

In [None]:
# Damage labels (and their frequencies) that the numerical scale has to cover
df_subset2['aircraft.damage'].value_counts()

In [None]:
# Severity rank for each recognised damage label; a higher number is worse.
_DAMAGE_SEVERITY = {
    "Destroyed": 3,
    "Substantial": 2,
    "Minor": 1,
}


def aircraft_damage_numbers(y):
    """Translate an aircraft.damage label into an ordinal severity score.

    The scale increases with the amount of damage, so that averages of the
    score are meaningful: "Minor" -> 1, "Substantial" -> 2, "Destroyed" -> 3.
    ("Destroyed" previously scored below "Substantial".)

    Parameters
    ----------
    y : str
        A value from the aircraft.damage column.

    Returns
    -------
    int
        1-3 for the recognised labels, 0 for anything else (for example the
        'N/A' placeholder).
    """
    return _DAMAGE_SEVERITY.get(y, 0)

In [None]:
# Score every row's damage label with aircraft_damage_numbers
damage_labels = df_subset2['aircraft.damage']
df_subset2['aircraft.damage.scale'] = damage_labels.apply(aircraft_damage_numbers)
# Distribution of the resulting scores
df_subset2['aircraft.damage.scale'].value_counts()

# Cleaning the 'make' column (spelling and naming variants)


In [None]:
# Lower-case every make so that differently-capitalised spellings compare equal
df_subset2['make'] = df_subset2['make'].astype(str).str.lower()
# Lift the display row limit for the rest of the session (this is a global pandas option)
pd.set_option('display.max_rows', None)
# The 50 most frequent makes, to spot spelling variants
df_subset2['make'].value_counts().head(50)

In [None]:
import re

# Ordered (pattern, canonical make) pairs; the first pattern found wins, in the
# same order as the original if/elif chain. Every pattern is a plain substring
# except "aviat", which must be a whole word so that makes merely containing
# "aviation" (e.g. "american aviation") are not relabelled as "aviat".
_MAKE_PATTERNS = tuple(
    (re.compile(pattern), canonical)
    for pattern, canonical in (
        (r"air tractor", "air tractor"),
        (r"cessna", "cessna"),
        (r"piper", "piper"),
        (r"beech", "beech"),
        (r"boeing", "boeing"),
        (r"mooney", "mooney"),
        (r"grumman", "grumman"),
        (r"airbus", "airbus"),
        (r"aeronca", "aeronca"),
        (r"cirrus", "cirrus"),
        (r"champion", "american champion"),
        (r"embraer", "embraer"),
        (r"havilland", "dehavilland"),
        (r"\baviat\b", "aviat"),
        (r"diamond", "diamond"),
        (r"ercoupe", "ercoupe"),
    )
)


def typos(x):
    """Collapse spelling variants of an aircraft make into one canonical name.

    Parameters
    ----------
    x : str
        A lower-cased make string.

    Returns
    -------
    str
        The canonical name of the first known manufacturer found in ``x``,
        or ``x`` unchanged when none is found.
    """
    for pattern, canonical in _MAKE_PATTERNS:
        if pattern.search(x):
            return canonical
    return x
    
# Normalise every make through typos()
df_subset2['make'] = df_subset2['make'].apply(typos)

# Saving df_subset2 to CSV

In [None]:
# Write the cleaned subset to disk. With the default to_csv settings the DataFrame
# index is written as an unnamed first column.
df_subset2.to_csv('../Data/subset2.csv')

# New dataset with just makes with over 100 rows

In [None]:
make_value_counts = df_subset2['make'].value_counts()

In [None]:
make_over_100 = make_value_counts.loc[make_value_counts > 100]

In [None]:
make_over_100 = list(make_over_100.index)

In [None]:
make_over_100

In [None]:
df_subset2_makes = df_subset2.loc[df_subset2['make'].isin(make_over_100)]
df_subset2_makes.head()

In [None]:
# Write the frequent-makes subset to disk (default settings: the index is
# written as an unnamed first column)
df_subset2_makes.to_csv('../Data/df_subset2_makes.csv')

In [None]:
# Column dtypes and non-null counts of the frequent-makes subset
df_subset2_makes.info()

# Groupby Experiments

In [None]:
df_subset2.groupby('event.date').mean()

In [None]:
df_subset2.groupby('country').mean()

In [None]:
df_subset2.groupby('state').mean()

In [None]:
df_subset2.groupby('number.of.engines').mean()

In [None]:
df_subset2['number.of.engines'].value_counts()

In [None]:
df_subset2_makes.groupby('make').mean()

In [None]:
df_subset2.groupby('engine.type').mean()

In [None]:
df_subset2.groupby('weather.condition').mean()

In [None]:
df_subset2['aircraft.damage'].value_counts()

In [None]:
df_subset2['number.of.engines'].value_counts()

# Exploratory Visuals: Bar Plots


In [None]:
# Default number of categories shown in each chart (also used in the titles below)
number_of_categories = 5

import seaborn as sns
import matplotlib.pyplot as plt

def plot_top_n_categories(df, column, title, ax, n=None):
    """Draw a bar chart of the most frequent values of ``column`` on ``ax``.

    Counts go on the x axis and the category labels on the y axis.

    Parameters
    ----------
    df : DataFrame holding ``column``.
    column : name of the column whose values are counted.
    title : title set on the axes.
    ax : matplotlib axes to draw on.
    n : how many of the most frequent values to show; when omitted, the
        module-level ``number_of_categories`` is used.
    """
    if n is None:
        n = number_of_categories
    top_categories = df[column].value_counts().head(n)
    sns.barplot(x=top_categories.values, y=top_categories.index, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Count')
    ax.set_ylabel(column)


# One chart per row; extra vertical spacing keeps titles clear of the chart above
fig, axes = plt.subplots(3, 1, figsize=(12, 18))
plt.subplots_adjust(hspace=0.5)

# Titles are derived from number_of_categories so they cannot drift from what is plotted
plot_top_n_categories(df_subset2_makes, 'make', f'Top {number_of_categories} Aircraft Makes in Incidents', axes[0])

plot_top_n_categories(df_subset2_makes, 'model', f'Top {number_of_categories} Aircraft Models in Incidents', axes[1])

plot_top_n_categories(df_subset2_makes, 'engine.type', f'Top {number_of_categories} Engine Types in Incidents', axes[2])

plt.show()