# Cleaning: Flight Delays

This data set comes from the featured Kaggle datasets, containing three tables that relate to commercial airline flights. The flight delay and cancellation data was collected and published by the DOT's Bureau of Transportation Statistics.

**There are three tables:**

* airlines : contains airline IATA codes with corresponding airline names
* airports : contains
    IATA_CODE:
    Location Identifier
    String

    AIRPORT:
    Airport's Name
    String

    CITY:
    City Name of the Airport
    String

    STATE:
    State Name of the Airport
    String

    COUNTRY:
    Country Name of the Airport
    String

    LATITUDE:
    Latitude of the Airport
    Numeric

    LONGITUDE:
    Longitude of the Airport
    Numeric
* flights : contains many features with descriptions that can be found [here](https://www.kaggle.com/usdot/flight-delays), by selecting flights.csv in the data and clicking on Edit descriptions above the data preview.

In [1]:
# Import modules
import numpy as np
import pandas as pd
from pandas import DataFrame as DF
from pandas import Series

from itertools import combinations
from datetime import datetime

Use pandas to read in airline data and preview.

In [2]:
airlines = pd.read_csv('flights/airlines.csv')
airlines.head(3)

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.


Use pandas to read in airport data and preview.

In [3]:
airports = pd.read_csv('flights/airports.csv')
airports.head(3)

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919


Use pandas to read in flight data and preview.

In [4]:
flights = pd.read_csv('flights/flights_sm_raw.csv')
flights.head(3)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,US,2013,N584UW,LAX,CLT,30,...,753.0,-10.0,0,0,,,,,,
1,2015,1,1,4,AA,1674,N853AA,LAS,MIA,35,...,753.0,-10.0,0,0,,,,,,
2,2015,1,1,4,DL,1560,N3743H,ANC,SEA,45,...,451.0,-24.0,0,0,,,,,,


## 1. Convert column names to lower case (it's a pain to keep typing upper case column names)

In [5]:
# Map lower case function to column names
airlines.columns = map(str.lower, airlines.columns)
airports.columns = map(str.lower, airports.columns)
flights.columns = map(str.lower, flights.columns)

# Print results
print("airlines: {}".format(list(airlines.columns)))
print("\nairports: {}".format(list(airports.columns)))
print("\nflights: {}".format(list(flights.columns)))

airlines: ['iata_code', 'airline']

airports: ['iata_code', 'airport', 'city', 'state', 'country', 'latitude', 'longitude']

flights: ['year', 'month', 'day', 'day_of_week', 'airline', 'flight_number', 'tail_number', 'origin_airport', 'destination_airport', 'scheduled_departure', 'departure_time', 'departure_delay', 'taxi_out', 'wheels_off', 'scheduled_time', 'elapsed_time', 'air_time', 'distance', 'wheels_on', 'taxi_in', 'scheduled_arrival', 'arrival_time', 'arrival_delay', 'diverted', 'cancelled', 'cancellation_reason', 'air_system_delay', 'security_delay', 'airline_delay', 'late_aircraft_delay', 'weather_delay']


## 2. Remove all rows that have null values for critical features
- critical features are those that are needed to compute missing values
- you need to determine the sets of features that are go together for computing values

In [6]:
# Split data into flights that were cancelled vs not cancelled
cancelled = flights[flights.cancelled == 1].copy()
not_cancelled = flights[flights.cancelled == 0].copy()

In [7]:
# Create lists of subsets needed for computing critical features
subset1 = ['departure_delay', 'taxi_out', 'wheels_off'] # guarantee departure time
subset2 = ['scheduled_departure', 'departure_delay'] # determine departure delay
subset3 = ['wheels_off', 'air_time', 'wheels_on'] # determine air time
subset4 = ['taxi_in', 'wheels_on', 'arrival_time'] # guarantee arrival time
subset5 = ['scheduled_arrival', 'arrival_delay'] # determine arrival delay

# Use itertools to create all size 2 combinations from subsets 1/3/4. Drop any cases where both are NA.
for subset in [subset1, subset3, subset4]:
    for c in combinations(subset, 2):
        not_cancelled.dropna(how = 'all', subset = list(c), inplace = True)
        
# Drop NA from subsets 2/5
for subset in [subset2, subset5]:
    not_cancelled.dropna(how = 'all', subset = subset, inplace = True)
    
# Combine cancelled with not_cancelled and randomize order using 'sample' method
flights = pd.concat([cancelled, not_cancelled]).sample(frac = 1).reset_index(drop = True)

## 3. Convert times from float values to full 24-hour time format strings

In [8]:
# Define list of columns containing numeric time values
time_vals = ['scheduled_departure', 'departure_time', 'wheels_off', 'wheels_on', 'scheduled_arrival', 'arrival_time']

Define function to convert time values from float to 24-hour time format strings

In [9]:
def time_to_str(time):
    if time != time:
        return time
    else:
        return '{:04}'.format(int(time))

Apply time_to_str function to each column in time_vals

In [10]:
for col in time_vals:
    flights.loc[:, col] = flights.loc[:, col].apply(lambda x: time_to_str(x))
flights[time_vals].head()

Unnamed: 0,scheduled_departure,departure_time,wheels_off,wheels_on,scheduled_arrival,arrival_time
0,625,626.0,657.0,1216.0,1208,1224.0
1,1216,1216.0,1254.0,1422.0,1424,1449.0
2,528,,,,619,
3,940,931.0,954.0,1140.0,1146,1144.0
4,910,906.0,918.0,1025.0,1040,1031.0


## 4. Convert new hour strings to datetime objects
- Test the function and determine the cause of (and fix) any errors

In [11]:
# Define function to convert hour strings to datetime objects
def strings_to_dates(row, col):
    # Year/month/day info in first 3 data columns
    year = row[1]
    month = row[2]
    day = row[3]
    
    # Determine hour/minute based on specified column
    if row[col] != row[col]:
        return np.nan
    else:
        # Format hour strings to datetimes
        FMT = '%H%M' 
            
        # Convert '2400' to '0000' for datetime compatibility
        if row[col] == '2400':
            time = datetime.strptime('0000', FMT)
        else:
            time = datetime.strptime(row[col], FMT)
            
        # Add year/month/day to datetime
        return time.replace(year = year, month = month, day = day)

In [12]:
# Loop through time_vals and apply function
for col in time_vals:
    # Find column number (accounting for blank column number at beginning)
    col_number = flights.columns.get_loc(col) + 1
    
    # Change all rows for that column
    flights[col] = [strings_to_dates(row, col_number) for row in flights.itertuples()]

# Preview results
flights.head()

Unnamed: 0,year,month,day,day_of_week,airline,flight_number,tail_number,origin_airport,destination_airport,scheduled_departure,...,arrival_time,arrival_delay,diverted,cancelled,cancellation_reason,air_system_delay,security_delay,airline_delay,late_aircraft_delay,weather_delay
0,2015,4,17,5,DL,815,N916DN,SAN,MSP,2015-04-17 06:25:00,...,2015-04-17 12:24:00,16.0,0,0,,16.0,0.0,0.0,0.0,0.0
1,2015,8,13,4,AA,129,N4YNAA,RIC,DFW,2015-08-13 12:16:00,...,2015-08-13 14:49:00,25.0,0,0,,25.0,0.0,0.0,0.0,0.0
2,2015,5,29,5,MQ,3432,N907MQ,TYR,DFW,2015-05-29 05:28:00,...,NaT,,0,1,B,,,,,
3,2015,6,13,6,OO,2885,N869AS,LAX,EUG,2015-06-13 09:40:00,...,2015-06-13 11:44:00,-2.0,0,0,,,,,,
4,2015,11,20,5,WN,1019,N555LV,BNA,MDW,2015-11-20 09:10:00,...,2015-11-20 10:31:00,-9.0,0,0,,,,,,


## 5. Fill in missing values that can be calculated from subsets (our critical features)

Determine which attributes need to be filled

In [13]:
subsets = [subset1, subset2, subset3, subset4, subset5]
to_fill = {}
for i, sub in enumerate(subsets):
    for attr in sub:
        temp = flights[flights[attr].isnull() & (flights.cancelled == 0)]
        if len(temp) > 0:
            k = 'subset{}'.format(i + 1)
            to_fill.setdefault(k, [])
            to_fill[k].append(attr)
to_fill

{'subset3': ['air_time'], 'subset5': ['arrival_delay']}

Fill in missing attributes

In [14]:
flights.air_time = [d.seconds/60 for d in (flights.wheels_on - flights.wheels_off)]
flights.arrival_delay = [d.seconds/60 for d in (flights.arrival_time - flights.scheduled_arrival)]

## 6. Filter out rows that contain numeric airport codes
- there is a mix of character IATA codes and numeric strings

In [15]:
# Use regular expressions to identify and remove numeric IATA codes
flights = flights[-((flights['origin_airport'].str.contains('[0-9]')) &
                    (flights['destination_airport'].str.contains('[0-9]')))]
flights.head()

Unnamed: 0,year,month,day,day_of_week,airline,flight_number,tail_number,origin_airport,destination_airport,scheduled_departure,...,arrival_time,arrival_delay,diverted,cancelled,cancellation_reason,air_system_delay,security_delay,airline_delay,late_aircraft_delay,weather_delay
0,2015,4,17,5,DL,815,N916DN,SAN,MSP,2015-04-17 06:25:00,...,2015-04-17 12:24:00,16.0,0,0,,16.0,0.0,0.0,0.0,0.0
1,2015,8,13,4,AA,129,N4YNAA,RIC,DFW,2015-08-13 12:16:00,...,2015-08-13 14:49:00,25.0,0,0,,25.0,0.0,0.0,0.0,0.0
2,2015,5,29,5,MQ,3432,N907MQ,TYR,DFW,2015-05-29 05:28:00,...,NaT,,0,1,B,,,,,
3,2015,6,13,6,OO,2885,N869AS,LAX,EUG,2015-06-13 09:40:00,...,2015-06-13 11:44:00,1438.0,0,0,,,,,,
4,2015,11,20,5,WN,1019,N555LV,BNA,MDW,2015-11-20 09:10:00,...,2015-11-20 10:31:00,1431.0,0,0,,,,,,


## 7. BONUS: Write a function and script to correct dates for arrivals that are overnight flights
- don't actually run this code (time consuming)

In [16]:
def correct_dates(departure, arrival):
    if arrival.hour < departure.hour:
        try:
            # Add another day to arrival
            arrival.replace(day = day + 1)
        except ValueError:
            # Error if day went into next month
            if arrival.month == 12:
                # New day goes into next year
                arrival.replace(month = 1, day = 1)
            else:
                # Any other month
                arrival.replace(month = month + 1, day = 1)
    # Return arrival whether altered or not
    return arrival      

In [17]:
# Would run code if uncommented
# s = flights.columns.get_loc['scheduled_departure'] + 1
# for r in flights.itertuples():
#     for col in time_vals:
#         c = flights.columns.get_loc(col) + 1
#         flights.loc[r[0], col] = correct_dates(r[s], r[c])

# Cleaning: Legos

colors : This file contains information on LEGO colors, including a unique ID for each color, its name, and approximate RGB value, and whether it's transparent

inventories : This table contains information on inventories, including a unique ID, it's version and the set number.

inventory_parts : This table contains information part inventories, including a unique ID number, the part number, the color of the part, how many are included and whether it's a spare.

inventory_sets : This file contains information on what inventory is included in which sets, including the inventory ID, the set number and the quantity of that inventory that are included.

part_categories : This dataset includes information on the part category (what type of part it is) and a unique ID for that part category.

parts : This dataset includes information on lego parts, including a unique ID number, the name of the part, and what part category it's from.

sets : This file contains information on LEGO sets, including a unique ID number, the name of the set, the year it was released, its theme and how many parts it includes.

themes : This file includes information on lego themes. Each theme is given a unique ID number, a name, and (if it's part of a bigger theme) which theme it's part of.

## Schema

!['lego data schema'](legos/images/downloads_schema.png)

## 1. Efficiently import all data files into a dictionary for easy access

In [18]:
# Create list of data files
files = ['colors', 'inventories', 'inventory_parts', 'inventory_sets', 'part_categories', 'parts', 'sets', 'themes']

Import data files into dictionary by looping through file list

In [19]:
legos = {}
for name in files:
    filename = 'legos/{}.csv'.format(name)
    file = pd.read_csv(filename)
    legos[name] = file

Examine dictionary keys

In [20]:
list(legos.keys())

['colors',
 'inventories',
 'inventory_parts',
 'inventory_sets',
 'part_categories',
 'parts',
 'sets',
 'themes']

View example dataframe from dictionary

In [21]:
legos['colors'].head()

Unnamed: 0,id,name,rgb,is_trans
0,-1,Unknown,0033B2,f
1,0,Black,05131D,f
2,1,Blue,0055BF,f
3,2,Green,237841,f
4,3,Dark Turquoise,008F9B,f


## 2. Create a table that provides data that can be used to analyze colors by category
* We do not want detailed part names, part numbers, or category ID
* We also do not want any other ID values
* Make sure the color name and category name columns are labeled clearly

In [22]:
lego_colors = legos['inventory_parts'].merge(legos['colors'], left_on = 'color_id', right_on = 'id')

lego_partid = lego_colors.merge(legos['parts'], on = 'part_num') \
    .drop(['part_num', 'name_y', 'inventory_id', 'color_id', 'id'], axis = 1)

lego_color_cat = lego_partid.merge(legos['part_categories'], left_on = 'part_cat_id', right_on = 'id') \
    .drop(['part_cat_id', 'id'], axis = 1)
    
lego_color_cat.rename(columns = {'name_x': 'color_name', 'name': 'cat_name'}, inplace = True)

In [23]:
lego_color_cat.head()

Unnamed: 0,quantity,is_spare,color_name,rgb,is_trans,cat_name
0,1,f,Dark Bluish Gray,6C6E68,f,Minifig Accessories
1,1,f,Dark Bluish Gray,6C6E68,f,Minifig Accessories
2,1,f,Black,05131D,f,Minifig Accessories
3,1,f,White,FFFFFF,f,Minifig Accessories
4,1,f,Blue,0055BF,f,Minifig Accessories


In [24]:
legos['inventory_parts'].shape, lego_color_cat.shape

((580251, 5), (580069, 6))

The loss of some rows between the original 'inventory parts' data frame in the 'legos' list and the 'lego_color_cat' data frame shows the lack of matching IDs within one or more merges.

## 3. Create a table that allows us to analyze set names and their themes
* Merge everything in one command
* Do not include data that will not help with this analysis (remove all of this in a second command)
    + do consider that we might later want to look at specific colors or other part details
* Rename columns where necessary

In [25]:
# Create needed merges in one command
set_themes = legos['sets'].merge(legos['themes'], left_on = 'theme_id', right_on = 'id') \
    .merge(legos['inventories'], on = 'set_num') \
    .merge(legos['inventory_parts'], left_on = 'id_y', right_on = 'inventory_id')

In [26]:
# Remove data that will not help in this analysis
set_themes.drop(['set_num', 'num_parts', 'theme_id', 'id_x', 'id_y', 'is_spare'], axis = 1, inplace = True)

In [27]:
# Rename columns where necessary
set_themes.rename(columns = {'name_x': 'set_name', 'name_y': 'theme_name'}, inplace = True)

In [28]:
# View results
set_themes.head()

Unnamed: 0,set_name,year,theme_name,parent_id,version,inventory_id,part_num,color_id,quantity
0,Weetabix Castle,1970,Castle,411.0,1,5574,29c01,4,8
1,Weetabix Castle,1970,Castle,411.0,1,5574,29c01,15,6
2,Weetabix Castle,1970,Castle,411.0,1,5574,3001a,15,25
3,Weetabix Castle,1970,Castle,411.0,1,5574,3001a,4,9
4,Weetabix Castle,1970,Castle,411.0,1,5574,3001a,1,4


## 4. Create a copy of the parts table and modify it to show what sets the parts belong to

In [29]:
# Copy parts table and rename columns
parts = legos['parts'].copy()
parts.rename(columns = {'name': 'part_name'}, inplace = True)

In [30]:
# Merge parts table with inventories, inventory parts and sets
inv_parts_sets = legos['inventory_parts'].merge(legos['inventories'], left_on = 'inventory_id', right_on = 'id') \
    .merge(legos['sets'], on = 'set_num')
inv_parts_sets.rename(columns = {'name': 'set_name'}, inplace = True)    
    
parts_sets = inv_parts_sets.merge(parts, on = 'part_num')

keep = list(parts.columns) + ['set_name']
parts_sets = parts_sets[keep]

In [31]:
parts_sets.shape, parts.shape

((580069, 4), (25993, 3))

In [32]:
parts_sets.head()

Unnamed: 0,part_num,part_name,part_cat_id,set_name
0,48379c01,Sports Promo Figure Base with Feet,27,McDonald's Sports Set Number 6 - Orange Vest S...
1,48379c01,Sports Promo Figure Base with Feet,27,McDonald's Sports Set Number 5 - Blue Hockey P...
2,48379c01,Sports Promo Figure Base with Feet,27,McDonald's Sports Set Number 4 - White Hockey ...
3,48379c01,Sports Promo Figure Base with Feet,27,McDonald's Sports Set Number 1 - White Soccer ...
4,48379c01,Sports Promo Figure Base with Feet,27,McDonald's Sports Set Number 8 - Green Basketb...


## 5. Create a copy of the set_themes table created earlier. Modify it to create a new table indicating how many transparent parts there are for each set name

In [33]:
set_themes_cpy = set_themes.copy()
set_themes_cpy.drop(['theme_name', 'parent_id'], axis = 1, inplace = True)

set_themes_cpy = set_themes_cpy.merge(legos['colors'][['id', 'is_trans']], left_on = 'color_id', right_on = 'id') \
    .drop('id', axis = 1)

set_themes_cpy['trans'] = [1 if it == 't' else 0 for it in set_themes_cpy.is_trans]
set_names_trans = DF(set_themes_cpy.groupby(['set_name']).trans.sum()).rename(columns = {'trans': 'n_trans'})

In [34]:
set_names_trans.tail()

Unnamed: 0_level_0,n_trans
set_name,Unnamed: 1_level_1
{Red Race Car Number 3},1
{Roadplates and Scenery},0
{Rock Saw Vehicle},0
{Town Vehicles},1
{Yellow Cab},5


In [35]:
set_names_trans.describe()

Unnamed: 0,n_trans
count,9457.0
mean,3.84033
std,6.64498
min,0.0
25%,0.0
50%,1.0
75%,5.0
max,171.0
