In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Let's read in some data

In [127]:
# Load the flights table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='nycflights.csv')

We'll look at the head and some info about the dataset

In [128]:
# Preview the first rows. A bare last expression gets the notebook's rich table
# display instead of the column-wrapped plain text that print() produces.
df.head()

   year  month  day  dep_time  sched_dep_time  dep_delay  arr_time  \
0  2013      1    1     517.0             515        2.0     830.0   
1  2013      1    1     533.0             529        4.0     850.0   
2  2013      1    1     542.0             540        2.0     923.0   
3  2013      1    1     544.0             545       -1.0    1004.0   
4  2013      1    1     554.0             600       -6.0     812.0   

   sched_arr_time  arr_delay carrier  flight tailnum origin dest  air_time  \
0             819       11.0      UA    1545  N14228    EWR  IAH     227.0   
1             830       20.0      UA    1714  N24211    LGA  IAH     227.0   
2             850       33.0      AA    1141  N619AA    JFK  MIA     160.0   
3            1022      -18.0      B6     725  N804JB    JFK  BQN     183.0   
4             837      -25.0      DL     461  N668DN    LGA  ATL     116.0   

   distance  hour  minute      time_hour  
0      1400     5      15  1/1/2013 5:00  
1      1416     5      2

In [129]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage. Columns with fewer non-null entries than the row
# count contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 19 columns):
year              336776 non-null int64
month             336776 non-null int64
day               336776 non-null int64
dep_time          328521 non-null float64
sched_dep_time    336776 non-null int64
dep_delay         328521 non-null float64
arr_time          328063 non-null float64
sched_arr_time    336776 non-null int64
arr_delay         327346 non-null float64
carrier           336776 non-null object
flight            336776 non-null int64
tailnum           334264 non-null object
origin            336776 non-null object
dest              336776 non-null object
air_time          327346 non-null float64
distance          336776 non-null int64
hour              336776 non-null int64
minute            336776 non-null int64
time_hour         336776 non-null object
dtypes: float64(5), int64(9), object(5)
memory usage: 48.8+ MB


Let's try to fix up the time columns to be more time-like

In [150]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(336776, 19)

In [154]:
# List the distinct origin airports, in order of first appearance.
origins = df['origin'].unique()
print(origins)

['EWR' 'LGA' 'JFK']


In [130]:
# Build a "year-month-day" string per row (e.g. "2013-1-1") from the three
# integer columns; it is converted to a real datetime in a later cell.
df['date'] = df['year'].astype(str).str.cat(
    [df['month'].astype(str), df['day'].astype(str)],
    sep="-",
)

In [131]:
# Peek at the first ten date strings to check the format.
print(df['date'].head(10))

0    2013-1-1
1    2013-1-1
2    2013-1-1
3    2013-1-1
4    2013-1-1
5    2013-1-1
6    2013-1-1
7    2013-1-1
8    2013-1-1
9    2013-1-1
Name: date, dtype: object


In [132]:
# Parse the "year-month-day" strings into datetime values and store them
# back in the same column.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [133]:
# Re-check the summary: there should now be one extra column, `date`,
# with a datetime dtype instead of object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 20 columns):
year              336776 non-null int64
month             336776 non-null int64
day               336776 non-null int64
dep_time          328521 non-null float64
sched_dep_time    336776 non-null int64
dep_delay         328521 non-null float64
arr_time          328063 non-null float64
sched_arr_time    336776 non-null int64
arr_delay         327346 non-null float64
carrier           336776 non-null object
flight            336776 non-null int64
tailnum           334264 non-null object
origin            336776 non-null object
dest              336776 non-null object
air_time          327346 non-null float64
distance          336776 non-null int64
hour              336776 non-null int64
minute            336776 non-null int64
time_hour         336776 non-null object
date              336776 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(9), object(5)
memory

In [134]:
# Promote the parsed `date` column to the row index.
# Note: this removes `date` from the columns, so re-running this cell on the
# already-indexed frame raises KeyError.
df = df.set_index('date')

In [135]:
# Inspect the new index; expect a DatetimeIndex named 'date' with one
# entry per flight (dates repeat, since many flights share a day).
df.index

DatetimeIndex(['2013-01-01', '2013-01-01', '2013-01-01', '2013-01-01',
               '2013-01-01', '2013-01-01', '2013-01-01', '2013-01-01',
               '2013-01-01', '2013-01-01',
               ...
               '2013-09-30', '2013-09-30', '2013-09-30', '2013-09-30',
               '2013-09-30', '2013-09-30', '2013-09-30', '2013-09-30',
               '2013-09-30', '2013-09-30'],
              dtype='datetime64[ns]', name='date', length=336776, freq=None)

In [136]:
# Show the first rows again; `date` now appears as the index
# rather than as a regular column.
df.head()

Unnamed: 0_level_0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2013-01-01,2013,1,1,517.0,515,2.0,830.0,819,11.0,UA,1545,N14228,EWR,IAH,227.0,1400,5,15,1/1/2013 5:00
2013-01-01,2013,1,1,533.0,529,4.0,850.0,830,20.0,UA,1714,N24211,LGA,IAH,227.0,1416,5,29,1/1/2013 5:00
2013-01-01,2013,1,1,542.0,540,2.0,923.0,850,33.0,AA,1141,N619AA,JFK,MIA,160.0,1089,5,40,1/1/2013 5:00
2013-01-01,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,B6,725,N804JB,JFK,BQN,183.0,1576,5,45,1/1/2013 5:00
2013-01-01,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,461,N668DN,LGA,ATL,116.0,762,6,0,1/1/2013 6:00


In [139]:
# Grouping on its own only displays the DataFrameGroupBy object's repr;
# nothing is aggregated until a method is called on it.
df.groupby('carrier')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000022C59FC1400>

In [140]:
# Selecting one column from the grouped frame gives a SeriesGroupBy object,
# still without computing any result.
df.groupby('carrier')['dest']

<pandas.core.groupby.SeriesGroupBy object at 0x0000022C59FC10B8>

In [147]:
# For each carrier, the array of distinct destinations it flies to.
dest_by_carrier = df.groupby(by='carrier')['dest']
dest_by_carrier.unique()

carrier
9E    [MSP, IAD, BUF, SYR, ROC, BWI, ORD, IND, BNA, ...
AA    [MIA, ORD, DFW, SJU, MCO, FLL, LAX, SFO, BOS, ...
AS                                                [SEA]
B6    [BQN, FLL, MCO, PBI, TPA, BOS, RSW, SJU, BUF, ...
DL    [ATL, MSP, PBI, SLC, SFO, MIA, FLL, DTW, SEA, ...
EV    [IAD, MSP, JAX, CHS, MEM, CLE, MYR, RDU, DCA, ...
F9                                                [DEN]
FL                                      [MKE, ATL, CAK]
HA                                                [HNL]
MQ    [ATL, DTW, ORD, MSP, XNA, RDU, CMH, CLT, DCA, ...
OO                            [ORD, MSP, IAD, DTW, CLE]
UA    [IAH, ORD, LAX, SFO, LAS, MIA, PBI, FLL, DEN, ...
US                       [PHX, CLT, PHL, DCA, BOS, LGA]
VX                            [LAX, SFO, LAS, PSP, SJC]
WN    [BWI, DEN, MDW, MKE, STL, HOU, PHX, BNA, AUS, ...
YV                                      [IAD, CLT, PHL]
Name: dest, dtype: object

In [149]:
# For each carrier, the array of distinct origin airports it departs from.
origin_by_carrier = df.groupby(by='carrier')['origin']
origin_by_carrier.unique()

carrier
9E    [JFK, EWR, LGA]
AA    [JFK, LGA, EWR]
AS              [EWR]
B6    [JFK, EWR, LGA]
DL    [LGA, JFK, EWR]
EV    [LGA, EWR, JFK]
F9              [LGA]
FL              [LGA]
HA              [JFK]
MQ    [LGA, EWR, JFK]
OO         [LGA, EWR]
UA    [EWR, LGA, JFK]
US    [EWR, JFK, LGA]
VX         [JFK, EWR]
WN         [LGA, EWR]
YV              [LGA]
Name: origin, dtype: object

In [168]:
# Which origin airport is the busiest?
# count() only tallies non-null values, so the column chosen matters.
flights_by_origin = df.groupby('origin')
print(flights_by_origin['dep_time'].count())  # dep_time contains NaNs, which are left out of the count
print(flights_by_origin['dest'].count())  # dest has no NaNs, so every row is counted

origin
EWR    117596
JFK    109416
LGA    101509
Name: dep_time, dtype: int64
origin
EWR    120835
JFK    111279
LGA    104662
Name: dest, dtype: int64


In [159]:
# Keep only the flights whose destination is LAX.
to_lax = df['dest'].eq('LAX')
lax_df = df.loc[to_lax]

In [160]:
# Summarize the LAX-only subset: row count, per-column non-null counts
# and dtypes, and memory usage.
lax_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 16174 entries, 2013-01-01 to 2013-09-30
Data columns (total 19 columns):
year              16174 non-null int64
month             16174 non-null int64
day               16174 non-null int64
dep_time          16076 non-null float64
sched_dep_time    16174 non-null int64
dep_delay         16076 non-null float64
arr_time          16058 non-null float64
sched_arr_time    16174 non-null int64
arr_delay         16026 non-null float64
carrier           16174 non-null object
flight            16174 non-null int64
tailnum           16125 non-null object
origin            16174 non-null object
dest              16174 non-null object
air_time          16026 non-null float64
distance          16174 non-null int64
hour              16174 non-null int64
minute            16174 non-null int64
time_hour         16174 non-null object
dtypes: float64(5), int64(9), object(5)
memory usage: 2.5+ MB


In [162]:
# Count the flights to LAX as a cross-check on the row count of lax_df.
# Series.sum() is vectorized, whereas the builtin sum() walks the boolean
# Series one element at a time in Python. int() keeps the displayed value a
# plain Python int, as the builtin sum() returned.
int((df['dest'] == 'LAX').sum())

16174