# Project 2

In [3]:
import pandas as pd   # to load and transform data
import numpy as np    # for math/stat calculations
import altair as alt
import json

In [4]:
# load flights data
flight_url = 'https://raw.githubusercontent.com/byuidatascience/data4missing/master/data-raw/flights_missing/flights_missing.json'
flights = pd.read_json(flight_url)
flights.head()

Unnamed: 0,airport_code,airport_name,month,year,num_of_flights_total,num_of_delays_carrier,num_of_delays_late_aircraft,num_of_delays_nas,num_of_delays_security,num_of_delays_weather,num_of_delays_total,minutes_delayed_carrier,minutes_delayed_late_aircraft,minutes_delayed_nas,minutes_delayed_security,minutes_delayed_weather,minutes_delayed_total
0,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",January,2005.0,35048,1500+,-999,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533
1,DEN,"Denver, CO: Denver International",January,2005.0,12687,1041,928,935,11,233,3153,53537.0,70301,36817.0,363,21779,182797
2,IAD,,January,2005.0,12381,414,1058,895,4,61,2430,,70919,35660.0,208,4497,134881
3,ORD,"Chicago, IL: Chicago O'Hare International",January,2005.0,28194,1197,2255,5415,5,306,9178,88691.0,160811,364382.0,151,24859,638894
4,SAN,"San Diego, CA: San Diego International",January,2005.0,7283,572,680,638,7,56,1952,27436.0,38445,21127.0,218,4326,91552


## Question 1

In [5]:
# average of minutes_delayed_total for each airport
q1_table = (flights
    .groupby('airport_code')
    .agg(total_flights = ('num_of_flights_total', sum),
        total_delays = ('num_of_delays_total', sum),
        total_delay_min = ('minutes_delayed_total', sum),
        )
    .assign(prop_delays = lambda x: x.total_delays / x.total_flights,
            avg_delay_hrs = lambda x: x.total_delay_min / x.total_delays / 60
            )
    .reset_index()
)

# print makes table more organized 
# index=False gets rid of index column 
print(q1_table.to_markdown(index=False))

| airport_code   |   total_flights |   total_delays |   total_delay_min |   prop_delays |   avg_delay_hrs |
|:---------------|----------------:|---------------:|------------------:|--------------:|----------------:|
| ATL            |         4430047 |         902443 |          53983926 |      0.20371  |        0.996996 |
| DEN            |         2513974 |         468519 |          25173381 |      0.186366 |        0.895495 |
| IAD            |          851571 |         168467 |          10283478 |      0.197831 |        1.01736  |
| ORD            |         3597588 |         830825 |          56356129 |      0.230939 |        1.13053  |
| SAN            |          917862 |         175132 |           8276248 |      0.190804 |        0.78762  |
| SFO            |         1630945 |         425604 |          26550493 |      0.260955 |        1.03972  |
| SLC            |         1403384 |         205160 |          10123371 |      0.146189 |        0.822396 |


In [6]:
# find max aver delay hrs for airport code
highest_delay_avg = max(q1_table.avg_delay_hrs)
print(highest_delay_avg)

1.130525461639535


In [7]:
# create new table so can edit data
delay_dif_table = q1_table
# replace highest value with nan
delay_dif_table.avg_delay_hrs.replace(highest_delay_avg, np.nan, inplace=True)
# print(delay_dif_table)
# find difference between ORD avg and all other airports
delay_dif = delay_dif_table.avg_delay_hrs - highest_delay_avg
avg_dif = delay_dif.mean()
print(avg_dif)


-0.20392816278753778


In [8]:
# worst weather delay
q1_weather = (flights
    .groupby('airport_code')
    .agg(total_delay_weather = ('num_of_delays_weather', sum)
        )
    .assign()
    .reset_index()
)

print(q1_weather.to_markdown(index=False))
print(max(q1_weather.total_delay_weather))

| airport_code   |   total_delay_weather |
|:---------------|----------------------:|
| ATL            |                 32375 |
| DEN            |                 13836 |
| IAD            |                  4794 |
| ORD            |                 20765 |
| SAN            |                  4320 |
| SFO            |                 10377 |
| SLC            |                  6831 |
32375


## Question 2

In [9]:
# fill in na months
# should've worked but didn't
# flights.month.replace('n/a', np.nan)
# flights.month.fillna(method='bfill')
# mdata = flights
# mdata.month.replace('n/a', np.nan).fillna(method='bfill', inplace=True)
mdata = flights.query('month != "n/a"')

In [10]:
mdata.month.value_counts(dropna=False)

April        77
July         77
October      77
Febuary      76
November     76
June         75
August       75
September    74
January      73
May          73
December     73
March        71
Name: month, dtype: int64

In [11]:
q2_table = (mdata
    .groupby('month')
    .agg(total_flights = ('num_of_flights_total', sum),
        total_delays = ('num_of_delays_total', sum),
        total_delay_min = ('minutes_delayed_total', sum),
        # av_delay_mins = ('minutes_delayed_total', 'mean')
        )
    .assign(prop_delays = lambda x: x.total_delays / x.total_flights,
            avg_delay_hrs = lambda x: x.total_delay_min / x.total_delays / 60
            )
    .reset_index()
)

# print makes table more organized 
# index=False gets rid of index column 
print(q2_table.to_markdown(index=False))

| month     |   total_flights |   total_delays |   total_delay_min |   prop_delays |   avg_delay_hrs |
|:----------|----------------:|---------------:|------------------:|--------------:|----------------:|
| April     |         1259723 |         231408 |          13667654 |      0.183698 |        0.984384 |
| August    |         1335158 |         279699 |          16906565 |      0.209488 |        1.00743  |
| December  |         1180278 |         303133 |          18821267 |      0.256832 |        1.03482  |
| Febuary   |         1115814 |         248033 |          14753955 |      0.222289 |        0.991397 |
| January   |         1193018 |         265001 |          16152667 |      0.222127 |        1.01589  |
| July      |         1371741 |         319960 |          20465456 |      0.233251 |        1.06604  |
| June      |         1305663 |         317895 |          20338750 |      0.243474 |        1.06632  |
| March     |         1213370 |         250142 |          14942262 |     

In [12]:
# chart code from q1
q2_chart = (alt.Chart(q2_table)
    .mark_bar(color='green', opacity=0.6)
    .encode(
        x = alt.X('total_delays', axis=alt.Axis(title = 'Total Delays')), 
        y = alt.Y('month', axis=alt.Axis(title='Month'), sort=['January', 'Febuary', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']))
    .properties(width=400,
                title= {'text': 'Total Delays', 'subtitle': 'Amount of delays by month'}
                )
    )
q2_chart

## Question 3

In [13]:
flights.num_of_delays_weather.value_counts(dropna=False)

27     17
20     15
15     14
37     14
34     13
       ..
243     1
173     1
712     1
233     1
152     1
Name: num_of_delays_weather, Length: 269, dtype: int64

In [14]:
# make copy of data so don't mess it up/change OG data
q3_data = flights

# replace -999 with NaN
q3_data.num_of_delays_late_aircraft.replace(-999, np.nan, inplace=True)

# find mean of data
late_delay_mean = q3_data.num_of_delays_late_aircraft.mean()

# replace NaN data with the mean
new_data = q3_data.num_of_delays_late_aircraft.fillna(value=late_delay_mean, inplace=True)

weather = q3_data.assign(
    severe = q3_data.num_of_delays_weather,
    mild_late = 0.3*q3_data.num_of_delays_late_aircraft,
    mild_nas = np.where(q3_data.month.isin(['April', 'May', 'June', 'July', 'August']), 
                0.4*q3_data.num_of_delays_nas,
                0.65*q3_data.num_of_delays_nas),
    total = lambda x: x.severe + x.mild_late + x.mild_nas
)

# average of minutes_delayed_total for each airport
q3_table = (weather
    .groupby('airport_code')
    .agg(total_weather_delays = ('total', sum),
        total_flights = ('num_of_flights_total', sum),
        severe = ('severe', sum),
        mild_late = ('mild_late', sum),
        mild_nas = ('mild_nas', sum)
        )
    .assign(percent_delay_weather = lambda x: x.total_weather_delays / x.total_flights * 100)
    .sort_values(by = 'percent_delay_weather', ascending = False)
    .reset_index()
)

# print makes table more organized 
# index=False gets rid of index column 
print(q3_table.to_markdown())

|    | airport_code   |   total_weather_delays |   total_flights |   severe |   mild_late |   mild_nas |   percent_delay_weather |
|---:|:---------------|-----------------------:|----------------:|---------:|------------:|-----------:|------------------------:|
|  0 | SFO            |               159594   |         1630945 |    10377 |     37251   |   111966   |                 9.78535 |
|  1 | ORD            |               309954   |         3597588 |    20765 |     81045.1 |   208144   |                 8.61561 |
|  2 | ATL            |               314801   |         4430047 |    32375 |     70680.5 |   211745   |                 7.10603 |
|  3 | IAD            |                50842.7 |          851571 |     4794 |     19451.1 |    26597.5 |                 5.97045 |
|  4 | DEN            |               149107   |         2513974 |    13836 |     54000.3 |    81270.7 |                 5.93113 |
|  5 | SAN            |                48920.6 |          917862 |     4320 |     2

## Question 4

In [15]:
(alt.Chart(q3_table)
    .mark_bar(color='#98AC5D', opacity=0.8)
    .encode(x = alt.X('total_weather_delays', axis=alt.Axis(title='Total Weather Delays')),
        y = alt.Y('airport_code', axis=alt.Axis(title='Airport Code'))
        )
    .properties(
        height = 250,
        width = 500,
        title = {'text': 'Total Weather Delays', 'subtitle': 'Mild to Severe'}
    )    
)

## Question 5

### Data That is Missing/Incorrect

In [16]:
# create new variable for data
updated_data = flights
updated_data.head(100)

Unnamed: 0,airport_code,airport_name,month,year,num_of_flights_total,num_of_delays_carrier,num_of_delays_late_aircraft,num_of_delays_nas,num_of_delays_security,num_of_delays_weather,num_of_delays_total,minutes_delayed_carrier,minutes_delayed_late_aircraft,minutes_delayed_nas,minutes_delayed_security,minutes_delayed_weather,minutes_delayed_total
0,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",January,2005.0,35048,1500+,1109.104072,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533
1,DEN,"Denver, CO: Denver International",January,2005.0,12687,1041,928.000000,935,11,233,3153,53537.0,70301,36817.0,363,21779,182797
2,IAD,,January,2005.0,12381,414,1058.000000,895,4,61,2430,,70919,35660.0,208,4497,134881
3,ORD,"Chicago, IL: Chicago O'Hare International",January,2005.0,28194,1197,2255.000000,5415,5,306,9178,88691.0,160811,364382.0,151,24859,638894
4,SAN,"San Diego, CA: San Diego International",January,2005.0,7283,572,680.000000,638,7,56,1952,27436.0,38445,21127.0,218,4326,91552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,SAN,"San Diego, CA: San Diego International",Febuary,2006.0,6962,520,584.000000,558,6,72,1740,22801.0,30032,22548.0,147,4974,80502
96,SFO,"San Francisco, CA: San Francisco International",Febuary,2006.0,9742,757,645.000000,1573,4,66,3048,,40924,,191,4435,161355
97,SLC,"Salt Lake City, UT: Salt Lake City International",Febuary,2006.0,10375,789,626.000000,675,16,96,2200,35886.0,31434,21827.0,587,5123,94857
98,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",March,2006.0,34988,1500+,1596.000000,4458,13,286,7870,92476.0,82823,194182.0,1024,15803,386308


In [17]:
# tried to fill in missing airport name with their airport code 
# for i in updated_data:
#     if updated_data.airport_name == updated_data.airport_name.empty:
#         updated_data.airport_name.replace(updated_data.airport_name.empty, updated_data.airport_code[i])
#     i += 1

In [18]:
# replace 1500+ string with number
updated_data.num_of_delays_carrier.replace('1500+', 1500)

0      1500
1      1041
2       414
3      1197
4       572
       ... 
919     182
920     923
921     480
922     757
923     483
Name: num_of_delays_carrier, Length: 924, dtype: object

In [19]:
# fill in missing months
updated_data.month.replace('n/a', np.nan).fillna(method='bfill')

0       January
1       January
2       January
3       January
4       January
         ...   
919    December
920    December
921    December
922    December
923    December
Name: month, Length: 924, dtype: object

In [20]:
# replace nan for num_of_delays_late_aircraft
# replace -999 with NaN
updated_data.num_of_delays_late_aircraft.replace(-999, np.nan, inplace=True)

# find mean of data
late_delay_mean = updated_data.num_of_delays_late_aircraft.mean()

# replace NaN data with the mean
new_data = updated_data.num_of_delays_late_aircraft.replace(np.nan, late_delay_mean, inplace=True)
updated_data.head()

Unnamed: 0,airport_code,airport_name,month,year,num_of_flights_total,num_of_delays_carrier,num_of_delays_late_aircraft,num_of_delays_nas,num_of_delays_security,num_of_delays_weather,num_of_delays_total,minutes_delayed_carrier,minutes_delayed_late_aircraft,minutes_delayed_nas,minutes_delayed_security,minutes_delayed_weather,minutes_delayed_total
0,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",January,2005.0,35048,1500+,1109.104072,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533
1,DEN,"Denver, CO: Denver International",January,2005.0,12687,1041,928.0,935,11,233,3153,53537.0,70301,36817.0,363,21779,182797
2,IAD,,January,2005.0,12381,414,1058.0,895,4,61,2430,,70919,35660.0,208,4497,134881
3,ORD,"Chicago, IL: Chicago O'Hare International",January,2005.0,28194,1197,2255.0,5415,5,306,9178,88691.0,160811,364382.0,151,24859,638894
4,SAN,"San Diego, CA: San Diego International",January,2005.0,7283,572,680.0,638,7,56,1952,27436.0,38445,21127.0,218,4326,91552


In [21]:
# replace nan for minutes_delayed_carrier
# find mean of data
delay_mean = updated_data.minutes_delayed_carrier.mean()

# replace NaN data with the mean
new_data = updated_data.minutes_delayed_carrier.replace(np.nan, delay_mean, inplace=True)
updated_data.head()

Unnamed: 0,airport_code,airport_name,month,year,num_of_flights_total,num_of_delays_carrier,num_of_delays_late_aircraft,num_of_delays_nas,num_of_delays_security,num_of_delays_weather,num_of_delays_total,minutes_delayed_carrier,minutes_delayed_late_aircraft,minutes_delayed_nas,minutes_delayed_security,minutes_delayed_weather,minutes_delayed_total
0,ATL,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern...",January,2005.0,35048,1500+,1109.104072,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533
1,DEN,"Denver, CO: Denver International",January,2005.0,12687,1041,928.0,935,11,233,3153,53537.0,70301,36817.0,363,21779,182797
2,IAD,,January,2005.0,12381,414,1058.0,895,4,61,2430,51902.25344,70919,35660.0,208,4497,134881
3,ORD,"Chicago, IL: Chicago O'Hare International",January,2005.0,28194,1197,2255.0,5415,5,306,9178,88691.0,160811,364382.0,151,24859,638894
4,SAN,"San Diego, CA: San Diego International",January,2005.0,7283,572,680.0,638,7,56,1952,27436.0,38445,21127.0,218,4326,91552


In [24]:
# json_data = updated_data.to_json()
# json_data

# by "record" or "row"
json_data = flights.to_json(orient="records")
json_object = json.loads(json_data)
json_formatted_str = json.dumps(json_object, indent = 4)
print(json_formatted_str)

[
    {
        "airport_code": "ATL",
        "airport_name": "Atlanta, GA: Hartsfield-Jackson Atlanta International",
        "month": "January",
        "year": 2005.0,
        "num_of_flights_total": 35048,
        "num_of_delays_carrier": "1500+",
        "num_of_delays_late_aircraft": 1109.1040723982,
        "num_of_delays_nas": 4598,
        "num_of_delays_security": 10,
        "num_of_delays_weather": 448,
        "num_of_delays_total": 8355,
        "minutes_delayed_carrier": 116423.0,
        "minutes_delayed_late_aircraft": 104415,
        "minutes_delayed_nas": 207467.0,
        "minutes_delayed_security": 297,
        "minutes_delayed_weather": 36931,
        "minutes_delayed_total": 465533
    },
    {
        "airport_code": "DEN",
        "airport_name": "Denver, CO: Denver International",
        "month": "January",
        "year": 2005.0,
        "num_of_flights_total": 12687,
        "num_of_delays_carrier": "1041",
        "num_of_delays_late_aircraft": 928.0,
   