In [1]:
import json
import mysql.connector
import pandas as pd
from utils import sql_table_to_pandas
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
import numpy as np
import datetime

In [2]:
# Notebook-wide pandas display configuration, one option per call.
pd.set_option('display.max_rows', 100)          # show at most 100 rows
pd.set_option('display.max_columns', None)      # None = never truncate columns
pd.set_option('display.max_colwidth', 200)      # long text cells (e.g. narrative)
pd.set_option('display.width', 100)             # console width in characters
# Floats: thousands separator, two decimals, minimum field width of 4.
pd.set_option('display.float_format', '{:4,.2f}'.format)


## Dataset description

- *status*: investigation status
- *time*: the time of accident
- *day*: the day of accident
- *month*: the month of the accident
- *year*: the year of the accident
- *first_flight*: the year of the aircraft's first flight
- *total_airframe_hrs*: the total number of hours the aircraft had flown
- *aircraft_type*: the model of aircraft
- *operator*: the company, organisation or individual operating the aircraft at the time of the accident
- *country*: the country of accident
- *location*: more detailed location of accident
- *phase*: the phase of flight
- *nature*: the nature of the flight
- *engines*: number and type (model and mark) of engines
- *narrative*: the description of the occurrence
- *probable_cause*: the probable cause of the accident as established by the accident investigators
- *aircraft_damage*: describes the amount of damage to the airplane as a result of the occurrence
- *departure_airport*: the last airport of departure before the accident
- *destination_airport*: the scheduled destination airport
- *crew_occupants*: the exact number of flight and cabin crew members aboard the aircraft at the time of departure
- *crew_fatalities*: the number of crew members who were fatally injured as a direct result of the accident
- *passengers_occupants*: the number of passengers aboard the aircraft at the time of departure
- *passengers_fatalities*: the number of passengers who were fatally injured as a direct result of the accident
- *total_occupants*: crew_occupants + passengers_occupants
- *total_fatalities*: crew_fatalities + passengers_fatalities

## Dataset cleaning

In [3]:
# Load the 'accidents' table into `df` through the project helper from `utils`.
# NOTE(review): '../db_config.JSON' is presumably the DB connection config and
# the result a pandas DataFrame (later cells use it as one) -- confirm in utils.
df = sql_table_to_pandas('../db_config.JSON', 'accidents')

In [4]:
# Normalise missing values: both Python `None` and the literal string 'None'
# are replaced with NaN so that pandas null checks treat them uniformly.
df = df.replace({None: np.nan, 'None': np.nan})

In [5]:
df.shape

(22767, 27)

###### time

In [6]:
def correct_time(x) -> 'datetime.time | float':
    """Parse a raw accident time into a ``datetime.time``, or NaN if unusable.

    Approximation markers ("ca", "ca.", "c.") and all whitespace are removed
    first. What remains must be exactly ``H:MM``/``HH:MM`` or ``HH:MM:SS`` on
    a 24-hour clock. Anything else is discarded, which removes times that are
    not plain local times (for example a value with a trailing "UTC").

    Parameters
    ----------
    x : str or missing value
        Raw value from the ``time`` column.

    Returns
    -------
    datetime.time or float
        The parsed time, or ``np.nan`` when ``x`` is missing, is not a string,
        or does not look like a plain clock time.
    """
    # Missing values (None/NaN) and any other non-string input cannot be parsed.
    if not isinstance(x, str):
        return np.nan

    # "ca", "ca." and "c." abbreviate "circa" (approximately). "ca." is removed
    # before "ca" so that no stray "." is left in front of the digits.
    for marker in ('ca.', 'ca', 'c.'):
        x = x.replace(marker, '')
    # Drop every whitespace character (spaces, tabs, newlines).
    x = ''.join(x.split())

    # fullmatch requires the whole string to be a time; seconds are optional.
    match = re.fullmatch(
        r'(2[0-3]|[01]?[0-9]):([0-5]?[0-9])(?::([0-5]?[0-9]))?', x
    )
    if match is None:
        return np.nan

    hour, minute, second = match.groups()
    return datetime.time(int(hour), int(minute), int(second or 0))


def get_time_range(x) -> str or np.nan:
    """Assign a time of day to one of six four-hour buckets.

    Returns the bucket label (for example ``'08:00 - 11:59'``) for a
    ``datetime.time`` value, or ``np.nan`` when ``x`` is missing.
    """
    if pd.isnull(x):
        return np.nan

    # (exclusive upper-bound hour, label), checked in ascending order.
    buckets = (
        (4, '00:00 - 03:59'),
        (8, '04:00 - 07:59'),
        (12, '08:00 - 11:59'),
        (16, '12:00 - 15:59'),
        (20, '16:00 - 19:59'),
    )
    for upper_hour, label in buckets:
        if x < datetime.time(upper_hour, 0):
            return label

    # Everything from 20:00 onwards falls into the last bucket.
    return '20:00 - 23:59'

In [7]:
# Derive the four-hour bucket from the raw time column, then discard the raw
# column. This cell is not re-runnable: a second run fails because 'time' is gone.
df['time_range'] = (
    df['time']
    .apply(correct_time)     # raw value -> datetime.time or NaN
    .apply(get_time_range)   # datetime.time -> bucket label or NaN
)
del df['time']

In [8]:
pd.notna(df['time_range']).sum()

8144

In [9]:
df['time_range'].value_counts()

12:00 - 15:59    1926
08:00 - 11:59    1892
16:00 - 19:59    1749
20:00 - 23:59    1118
04:00 - 07:59     888
00:00 - 03:59     571
Name: time_range, dtype: int64

###### weekday

No correction

In [10]:
df['weekday'].value_counts()

Friday       3513
Thursday     3359
Wednesday    3300
Tuesday      3298
Monday       3151
Saturday     3030
Sunday       2606
Name: weekday, dtype: int64

###### day

No correction

In [11]:
df['day'].value_counts()

10.00    890
24.00    791
15.00    778
27.00    769
19.00    763
23.00    753
6.00     750
13.00    745
18.00    742
17.00    742
11.00    739
12.00    738
22.00    736
9.00     736
4.00     735
28.00    732
14.00    728
7.00     722
26.00    719
21.00    708
25.00    707
16.00    706
2.00     705
8.00     698
5.00     683
3.00     679
30.00    676
20.00    671
1.00     662
29.00    654
31.00    400
Name: day, dtype: int64

###### month

No correction

In [12]:
df['month'].value_counts()

12.00    2013
1.00     2011
3.00     2001
9.00     1988
5.00     1956
7.00     1884
8.00     1835
6.00     1833
11.00    1828
4.00     1789
10.00    1746
2.00     1639
Name: month, dtype: int64

###### year

No correction

In [13]:
df['year'].value_counts()

1944    1494
1945    1418
1943     751
1969     389
1946     370
        ... 
1922       3
1924       3
1927       2
1919       2
1925       2
Name: year, Length: 102, dtype: int64

###### first_flight

In [14]:
def get_age_range(age) -> str or np.nan:
    """Translate an aircraft age in years into a coarse age-bracket label.

    Returns ``np.nan`` for a missing age; otherwise the label of the first
    bracket whose exclusive upper bound is greater than ``age``, or
    ``'> 50'`` when the age is 50 or more.
    """
    if pd.isnull(age):
        return np.nan

    # (exclusive upper bound in years, label), checked in ascending order.
    brackets = (
        (1, '0 - 1'),
        (3, '1 - 3'),
        (5, '3 - 5'),
        (10, '5 - 10'),
        (20, '10 - 20'),
        (30, '20 - 30'),
        (50, '30 - 50'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label

    return '> 50'

In [15]:
# Aircraft age at the time of the accident = accident year - first-flight year.
# Negative differences are blanked to NaN; differences that are already NaN
# stay NaN because `NaN < 0` is False.
df['age'] = (df['year'] - df['first_flight']).mask(lambda diff: diff < 0)
df['age_range'] = df['age'].apply(get_age_range)
# The raw column is no longer needed once the age has been derived.
del df['first_flight']

In [16]:
df['age_range'].value_counts()

10 - 20    3781
1 - 3      3234
5 - 10     2778
20 - 30    2628
30 - 50    1825
0 - 1      1481
3 - 5      1375
> 50        225
Name: age_range, dtype: int64

###### total_airframe_hrs

In [17]:
# Keep only entries that are float instances (np.float64 subclasses float);
# every other value is replaced with NaN.
# NOTE(review): integer values would also be blanked by this check -- confirm
# the column never holds ints.
df['total_airframe_hrs'] = df['total_airframe_hrs'].apply(lambda x: x if isinstance(x, float) else np.nan)

In [18]:
df['aircraft_type'].value_counts()

Douglas C-47-DL (DC-3)                      461
Junkers Ju-52/3m                            431
Antonov An-2R                               364
Douglas Dakota III (DC-3)                   260
de Havilland Canada DHC-6 Twin Otter 300    246
                                           ... 
Short Sunderland Mk III                       1
Convair CV-990-30A-8 Coronado                 1
Douglas C-47A-30-DK                           1
Boeing 747-212BSF                             1
Hawker Siddeley HS-748-286 Srs. 2A LFD        1
Name: aircraft_type, Length: 3492, dtype: int64

In [None]:
df[df['age'] > 60]['probable_cause'].iloc[0]

In [None]:
df['age'].value_counts()

In [None]:
df.head()

In [None]:
pd.isna(df['weekday'].iloc[:5])

In [None]:
df.head()

In [None]:
a = 

In [None]:
df['time'].iloc[623]

In [None]:
pd.notna(a).sum()

In [None]:
a.iloc[1]

In [None]:
datetime.time(24, 0) < a.iloc[1]

In [None]:
(df['time'] != 'None').sum()

In [None]:
df

In [None]:
df['time'] = df['time'].astype('str')
df['time']

In [None]:
df['time'].iloc[0]

In [None]:
df[df['time'].str.contains('UTC')].shape

In [None]:
df['time'].dtype

In [None]:
df.head()

In [None]:
# Number of recorded accidents per year: count the non-null `id` values within
# each `year` group and expose the result as a single 'Count' column.
# NOTE(review): assumes `df` has an 'id' column -- confirm against the table.
temp = (
    df.groupby('year')[['id']]
    .count()
    .rename(columns={'id': 'Count'})
)

# Select the style before the figure is created so that it applies to the
# figure itself as well as to the axes.
plt.style.use('bmh')
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(temp.index, temp['Count'], color='blue', marker='.', linewidth=1)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Count', fontsize=10)
ax.set_title('Count of accidents by Year', loc='center', fontsize=14)
plt.show()