In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Column -> dtype map. Its keys select the columns to read and its values
# downcast them on load to keep the frame small.
#
# The float columns use float32 rather than float16: float16 has an 11-bit
# significand, so integers above 2048 are not exactly representable. HHMM
# departure times such as 2359 and delays longer than 2048 minutes would be
# silently rounded on load.
# NOTE(review): pandas support for float16 is also limited (e.g. as an index
# dtype after groupby/value_counts) - confirm against the installed version.
dtype = {
    'DayOfWeek': np.uint8,
    'DayofMonth': np.uint8,
    'Month': np.uint8,
    'Cancelled': np.uint8,
    'Year': np.uint16,
    'FlightNum': np.uint16,
    'Distance': np.uint16,
    'UniqueCarrier': str,
    'CancellationCode': str,
    'Origin': str,
    'Dest': str,
    'ArrDelay': np.float32,
    'DepDelay': np.float32,
    'CarrierDelay': np.float32,
    'WeatherDelay': np.float32,
    'NASDelay': np.float32,
    'SecurityDelay': np.float32,
    'LateAircraftDelay': np.float32,
    'DepTime': np.float32,
}

In [None]:
%%time
# path = '../data/flights_2008.csv.bz2'
path = "../input/us-flights-data-2008/2008.csv"
flights_df = pd.read_csv(path, usecols=dtype.keys(), dtype=dtype)

In [None]:
# (rows, columns) of the loaded frame.
flights_df.shape

In [None]:
# First five rows.
flights_df.head()

In [None]:
# Names of the loaded columns.
print(flights_df.columns)

In [None]:
# Same five rows transposed, so every column is visible as a row.
flights_df.head().T

In [None]:
# Per-column dtype, non-null count and total memory footprint.
flights_df.info()

In [None]:
# Summary statistics of the numeric columns, one row per column.
flights_df.describe().T

In [None]:
# Number of distinct carrier codes.
flights_df['UniqueCarrier'].nunique()

In [None]:
# The distinct carrier codes themselves.
flights_df['UniqueCarrier'].unique()

In [None]:
flights_df.groupby('UniqueCarrier').size().plot(kind='bar')

In [None]:
flights_df.groupby('UniqueCarrier').size()

In [None]:
flights_df.groupby('UniqueCarrier')['Distance'].sum().sort_values(ascending=False)

In [None]:
flights_df.groupby('UniqueCarrier')['Distance'].sum().sort_values(ascending=False).iloc[:5]

In [None]:
flights_df.groupby('UniqueCarrier')['Distance'].sum().sort_values(ascending=False).iloc[:1]

In [None]:
pd.crosstab(flights_df.Month, flights_df.DayOfWeek)

In [None]:
plt.imshow(pd.crosstab(flights_df.Month, flights_df.DayOfWeek), cmap='seismic')

In [None]:
plt.imshow(pd.crosstab(flights_df.Month, flights_df.DayOfWeek), cmap='seismic', interpolation='none')

In [None]:
flights_df.hist('Distance', bins=20)

In [None]:
# Build a datetime column from the Year / Month / DayofMonth parts.
# DayofMonth is renamed to 'Day' before the frame is handed to pd.to_datetime.
flights_df['Date'] = pd.to_datetime(
    flights_df[['Year', 'Month', 'DayofMonth']].rename(columns={'DayofMonth': 'Day'})
)

In [None]:
num_flights_by_date = flights_df.groupby('Date').size()

In [None]:
num_flights_by_date.plot()

In [None]:
num_flights_by_date.rolling(window=7).mean().plot()

In [None]:
flights_df['DepTime']

In [None]:
flights_df['DepHour'] = flights_df['DepTime'] // 100

In [None]:
flights_df['DepHour'].replace(to_replace=24, value=0, inplace=True)

In [None]:
# Summary statistics of the derived DepHour column.
flights_df['DepHour'].describe()

In [None]:
# Column list, now including the derived columns added above.
flights_df.columns

**q1 How many unique carriers are there in our dataset?**

In [None]:
# Number of distinct carrier codes.
flights_df['UniqueCarrier'].nunique()

In [None]:
# count / unique / top / freq summary of the carrier column.
flights_df['UniqueCarrier'].describe()

In [None]:
# Distinct values taken by the Cancelled column.
flights_df['Cancelled'].unique()

In [None]:
# Raw flight-number column.
flights_df['FlightNum']

In [None]:
# First five rows.
flights_df.head()

In [None]:
# Number of rows for each value of Cancelled.
flights_df['Cancelled'].value_counts()

In [None]:
# Distinct flight numbers seen for each value of Cancelled.
flights_df.groupby('Cancelled')['FlightNum'].unique()

In [None]:
# Number of distinct flight numbers overall.
flights_df['FlightNum'].nunique()

**q2 We have both cancelled and completed flights in the dataset. Check if there are more completed or cancelled flights. What is the difference?**

In [None]:
# Rows with Cancelled == 0 minus rows with Cancelled == 1.
flights_df['Cancelled'].value_counts().pipe(lambda counts: counts[0] - counts[1])

In [None]:
# Total number of cancelled flights: sum the Cancelled flag per
# (carrier, flight number), then sum over all groups.
# The aggregation is named by string; passing the NumPy callable (np.sum) to
# agg is deprecated in recent pandas releases.
flights_df.groupby(['UniqueCarrier', 'FlightNum']).agg({'Cancelled': 'sum'}).sum()

In [None]:
# Displays only the lazy SeriesGroupBy object; nothing is aggregated here.
flights_df.groupby(['UniqueCarrier','FlightNum'])['Cancelled']

In [None]:
# Number of cancelled flights: sum of the Cancelled flag over rows where it equals 1.
flights_df[(flights_df['Cancelled'] == 1)]['Cancelled'].sum()

In [None]:
# Contingency table of Cancelled vs. FlightNum (one column per distinct flight number).
pd.crosstab(flights_df['Cancelled'], flights_df['FlightNum'])

In [None]:
# Number of (Cancelled, FlightNum) groups among rows with Cancelled == 0.
len(flights_df[flights_df['Cancelled']==0][['Cancelled','FlightNum']].groupby(['Cancelled','FlightNum']))

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Largest departure delay in the dataset.
flights_df['DepDelay'].max()

In [None]:
# Summary statistics of the arrival delay.
flights_df['ArrDelay'].describe()

In [None]:
# Five flights with the longest arrival delay, ties broken by departure delay.
# Both keys go into one sort_values call: chaining two calls re-sorts the
# whole frame on the second key with a non-stable sort, so the first
# ordering would be discarded.
flights_df.sort_values(['ArrDelay', 'DepDelay'], ascending=False).iloc[:5]

**q3 Find a flight with the longest departure delay and a flight with the longest arrival delay. Do they have the same destination airport, and if yes, what is its code?**

In [None]:
flights_df.groupby('Dest')['DepDelay'].max().sort_values(
    ascending=False)[0:5]

In [None]:
flights_df.groupby('Dest')['ArrDelay'].max().sort_values(
    ascending=False)[0:5]

**q4 Find the carrier that has the greatest number of cancelled flights.**

In [None]:
flights_df.groupby('UniqueCarrier')['Cancelled'].sum().sort_values(
    ascending=False)[0:5]

In [None]:
flights_df[flights_df['Cancelled'] == 1].groupby('UniqueCarrier')['Cancelled'].count().sort_values(
    ascending=False)[0:5]

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Summary statistics of the departure hour.
flights_df['DepHour'].describe()

In [None]:
# Nine smallest departure-hour values.
flights_df['DepHour'].sort_values().iloc[:9]

**q5 Let's examine departure time and consider distribution by hour (column DepHour that we've created earlier). Which hour has the highest percentage of flights?**

In [None]:
# Share of flights per departure hour, largest first. The question asks for a
# percentage, so the counts are normalised. value_counts already returns the
# result in descending order and skips rows whose DepHour is missing.
flights_df['DepHour'].value_counts(normalize=True)

**q6,q7
OK, now let's examine cancelled flight distribution by time. Which hour has the least percentage of cancelled flights?
Is there any hour that didn't have any cancelled flights at all? Check all that apply.**

In [None]:
flights_df.groupby('DepHour')['Cancelled'].sum().sort_values()

In [None]:
flights_df.groupby('DepHour')['Cancelled'].sum().sort_values()[:5]

In [None]:
flights_df[flights_df['Cancelled'] == 0].groupby('DepHour')['Cancelled'].count().sort_values()[:5]

In [None]:
flights_df[(flights_df['DepHour'] == 22) & (flights_df['Cancelled'] == 1)]['Cancelled'].count()

In [None]:
# Flights that were not cancelled (Cancelled == 0).
completed_flights_df = flights_df.loc[flights_df['Cancelled'].eq(0)]

**q8 Find the busiest hour, or in other words, the hour when the number of departed flights reaches its maximum.**

In [None]:
# Non-cancelled flights per departure hour, busiest hour first.
completed_flights_df['DepHour'].value_counts().sort_values(ascending=False)

In [None]:
# Size of the non-cancelled subset ...
completed_flights_df.shape

In [None]:
# ... compared with the full dataset.
flights_df.shape

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Summary statistics of departure delay for non-cancelled flights.
completed_flights_df['DepDelay'].describe()

In [None]:
# Mean departure delay over rows where DepDelay is negative.
completed_flights_df[completed_flights_df['DepDelay'] < 0]['DepDelay'].mean()

**q9 Since we know the departure hour, it might be interesting to examine the average delay for corresponding hour. Are there any cases, when the planes on average departed earlier than they should have done? And if yes, at what departure hours did it happen?**

In [None]:
# Mean departure delay per departure hour; a negative mean marks an hour
# whose flights left early on average.
completed_flights_df.groupby('DepHour')['DepDelay'].mean()

In [None]:
# First five rows of the non-cancelled subset.
completed_flights_df.head()

**q10 Considering only the completed flights by the carrier, that you have found in Question 4, find the distribution of these flights by hour. At what time does the greatest number of its planes depart?**

In [None]:
# q10 is about one carrier only: the one with the most cancelled flights (q4).
# The carrier is derived here instead of hard-coded so the cell follows the data.
top_cancelling_carrier = flights_df.groupby('UniqueCarrier')['Cancelled'].sum().idxmax()

# Departure-hour distribution of that carrier's completed flights, busiest
# hour first (value_counts sorts in descending order).
(
    completed_flights_df
    .loc[completed_flights_df['UniqueCarrier'] == top_cancelling_carrier, 'DepHour']
    .value_counts()
)

**q11 Find the top-10 carriers in terms of the number of completed flights (UniqueCarrier column).**

**Which of those listed below is not in your top-10 list?**

In [None]:
# Ten carriers with the most completed flights.
(
    completed_flights_df['UniqueCarrier']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

**q12 Plot distributions of flight cancellation reasons (_CancellationCode_). What is the most frequent reason for flight cancellation?**

Weather conditions

In [None]:
flights_df.groupby('CancellationCode').size().plot(kind='bar');

In [None]:
flights_df['CancellationCode'].value_counts().sort_values()

**q13 Which route is the most frequent, in terms of the number of flights?**

In [None]:
# Number of flights for every (Origin, Dest) pair, most frequent route first.
(
    flights_df
    .groupby('Origin')['Dest']
    .value_counts()
    .sort_values(ascending=False)
)

**q14 Find top-5 delayed routes (count how many times they were delayed on departure). From all flights on these 5 routes, count all flights with weather conditions contributing to a delay.**

In [None]:
flights_df[flights_df['DepDelay'] > 0].groupby(['Origin','Dest'])['DepDelay'].count().sort_values(ascending=False).iloc[:5]
        

In [None]:
sub_df = flights_df[flights_df['DepDelay'] > 0].groupby(['Origin','Dest'])['DepDelay'].count().sort_values(ascending=False).iloc[:5]

In [None]:
sub_df

In [None]:
flights_df[(flights_df['Origin'] == 'LAX') & 
           (flights_df['Dest'] == 'SFO') &
           (flights_df['WeatherDelay'] > 0)
          ].count()

In [None]:
flights_df[(flights_df['Origin'] == 'DAL') & 
           (flights_df['Dest'] == 'HOU') &
           (flights_df['WeatherDelay'] > 0)
          ].count()

In [None]:
flights_df[(flights_df['Origin'] == 'SFO') & 
           (flights_df['Dest'] == 'LAX') &
           (flights_df['WeatherDelay'] > 0)
          ].count()

In [None]:
flights_df[(flights_df['Origin'] == 'ORD') & 
           (flights_df['Dest'] == 'LGA') &
           (flights_df['WeatherDelay'] > 0)
          ].count()

In [None]:
flights_df[(flights_df['Origin'] == 'HOU') & 
           (flights_df['Dest'] == 'DAL') &
           (flights_df['WeatherDelay'] > 0)
          ].count()

In [None]:
# Route label of the form 'ORIGIN->DEST'.
flights_df['Route'] = flights_df['Origin'] + '->' + flights_df['Dest']
# Not the last expression of the cell, so this result is computed but not displayed.
flights_df['Route'].value_counts().head()
# Five routes with the most late departures (DepDelay > 0).
top5_delayed = (
    flights_df
    .loc[flights_df['DepDelay'].gt(0)]
    .groupby('Route')
    .size()
    .sort_values(ascending=False)
    .head(5)
)
top5_delayed

In [None]:
# All flights on the five most-delayed routes ...
flights_df_top5_delays = flights_df.loc[flights_df['Route'].isin(top5_delayed.index)]
# ... and how many of them have a positive weather delay.
flights_df_top5_delays['WeatherDelay'].gt(0).sum()

**q15  Examine the hourly distribution of departure times.**

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Share of flights per departure hour.
flights_df['DepHour'].value_counts(normalize=True)

In [None]:
# Flight counts per departure hour, quietest hour first.
flights_df['DepHour'].value_counts().sort_values()

In [None]:
# One bin per hour, with edges at -0.5, 0.5, ..., 23.5 so each integer hour
# 0..23 sits in the middle of its own bar. The default of 10 bins would merge
# the 24 hourly values unevenly and distort the shape of the distribution.
flights_df['DepHour'].hist(bins=np.arange(25) - 0.5, label='all')

Between midnight and 4 am there are considerably fewer flights than between 7 pm and 8 pm.

**q16 Show how the number of flights changes through time (on the daily/weekly/monthly basis) and interpret the findings.**

In [None]:
# Flights per day of week, quietest day first.
flights_df['DayOfWeek'].value_counts().sort_values()

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Flights per month, quietest month first.
flights_df['Month'].value_counts().sort_values()

In [None]:
flights_df[flights_df['Month'] == 1]['Month'].count() + flights_df[flights_df['Month'] == 2]['Month'].count() + flights_df[flights_df['Month'] == 12]['Month'].count()

In [None]:
flights_df[flights_df['Month'] == 6]['Month'].count() + flights_df[flights_df['Month'] == 7]['Month'].count() + flights_df[flights_df['Month'] == 8]['Month'].count()

There are fewer flights on weekends than on weekdays (working days).<br>
There are fewer flights during winter than during summer.

In [None]:
num_flights_by_month = flights_df[flights_df['Cancelled'] == 0].groupby('Month').size()
num_flights_by_month.plot(kind='bar');

In [None]:
num_flights_by_day_of_week = flights_df[flights_df['Cancelled'] == 0].groupby('DayOfWeek').size()
num_flights_by_day_of_week.plot(kind='bar');

In [None]:
num_flights_by_day_of_week.plot();

**q17  Examine the distribution of cancellation reasons with time. Make a bar plot of cancellation reasons aggregated by months.**

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Number of flights per cancellation code.
flights_df['CancellationCode'].value_counts()

In [None]:
# Cancellation-code counts within each month.
flights_df.groupby('Month')['CancellationCode'].value_counts()

In [None]:
# The same (Month, CancellationCode) counts as one flat bar chart.
flights_df.groupby('Month')['CancellationCode'].value_counts().plot(kind='bar', figsize=(20,8));

In [None]:
import calendar

# Month names in calendar order. Sorting matters: unique() returns values in
# order of first appearance in the data, whereas grouped results are ordered
# by month number. The name stays at notebook level because other cells may
# read it.
month_names = [calendar.month_name[int(month_idx)]
               for month_idx in sorted(flights_df['Month'].unique())]

# Rows: month; columns: cancellation code; values: number of flights.
cancellations_by_month = flights_df.groupby(['Month', 'CancellationCode']).size().unstack()
ax = cancellations_by_month.plot(kind='bar')

# Tick labels are derived from the plotted table's own Month index, so each
# label matches its bar group even if a month has no cancellations at all.
ax.set_xticklabels(
    [calendar.month_name[int(month_idx)] for month_idx in cancellations_by_month.index],
    rotation=90,
)
plt.show()

In April, the top cancellation reason is the carrier.

**q18 Which month has the greatest number of cancellations due to Carrier?**

In [None]:
# For each cancellation code, the number of flights per month, most frequent month first.
flights_df.groupby('CancellationCode')['Month'].value_counts()

April

In [None]:
import calendar

# Flights cancelled with code 'A', counted per month.
carrier_cancellations_by_month = (
    flights_df[flights_df['CancellationCode'] == 'A']
    .groupby('Month')['UniqueCarrier']
    .count()
)

ax = carrier_cancellations_by_month.plot(kind='bar', color='coral')
# Tick labels come from the plotted series' own Month index rather than from a
# list built in another cell, so the labels always line up with the bars and
# the cell does not depend on earlier notebook state.
ax.set_xticklabels(
    [calendar.month_name[int(month_idx)] for month_idx in carrier_cancellations_by_month.index],
    rotation=90,
)
plt.show()

**q19 Identify the carrier with the greatest number of cancellations due to carrier in the corresponding month from the previous question.**

In [None]:
# Code-'A' cancellations in month 4, counted per carrier, smallest count first.
(
    flights_df
    .loc[flights_df['CancellationCode'].eq('A') & flights_df['Month'].eq(4)]
    .groupby('UniqueCarrier')['CancellationCode']
    .count()
    .sort_values()
)

AA

In [None]:
import matplotlib.colors as colors
import random

# Daily number of code-'A' cancellations in month 4, one column per carrier.
cancelled_Carrier_April = flights_df[(flights_df['CancellationCode'] == 'A')
                                     & (flights_df['Month'] == 4)]\
                .groupby(['Date', 'UniqueCarrier']).size().unstack()

# One colour per plotted carrier, drawn from matplotlib's named colours.
# The public accessor replaces the private `colors._colors_full_map`, and a
# dedicated seeded generator makes the colours identical on every run.
COLOR_SEED = 42
colors_list = list(colors.get_named_colors_mapping().values())
selected_colors = random.Random(COLOR_SEED).sample(
    colors_list, cancelled_Carrier_April.shape[1])

# Peak coordinates for the annotation: the largest daily count, the carrier
# it belongs to, and the date on which that carrier reached it.
max_value = cancelled_Carrier_April.max().max()
carrier_of_max_value = cancelled_Carrier_April.max().idxmax()
date_of_max_value = cancelled_Carrier_April[carrier_of_max_value].idxmax()

fig, ax = plt.subplots()

cancelled_Carrier_April.plot(color=selected_colors, figsize=(10,8), ax=ax)
# Label the peak with the carrier code.
ax.annotate(carrier_of_max_value, (str(date_of_max_value), max_value))
fig.autofmt_xdate()

**q20 Examine median arrival and departure delays (in time) by carrier. Which carrier has the lowest median delay time for both arrivals and departures? Leave only non-negative values of delay times ('ArrDelay', 'DepDelay').**

In [None]:
# Column list, for reference.
flights_df.columns

In [None]:
# Summary statistics of arrival delay (all values, including negative ones).
flights_df['ArrDelay'].describe()

In [None]:
# Summary statistics of departure delay (all values, including negative ones).
flights_df['DepDelay'].describe()

In [None]:
# Median arrival delay per carrier, lowest first (unfiltered).
flights_df.groupby('UniqueCarrier')['ArrDelay'].median().sort_values()

In [None]:
# Median departure delay per carrier, lowest first (unfiltered).
flights_df.groupby('UniqueCarrier')['DepDelay'].median().sort_values()

In [None]:
# Arrival-delay box plot per carrier on the unfiltered data.
sns.boxplot(x='UniqueCarrier', y='ArrDelay', data=flights_df)

In [None]:
flights_df[(flights_df['ArrDelay'] > 0) & (flights_df['DepDelay'] > 0)].groupby('UniqueCarrier')['ArrDelay'].median().sort_values()

In [None]:
flights_df[(flights_df['ArrDelay'] > 0) & (flights_df['DepDelay'] > 0)].groupby('UniqueCarrier')['DepDelay'].median().sort_values()

In [None]:
flights_df[(flights_df['ArrDelay'] > 0) & (flights_df['DepDelay'] > 0)].groupby('UniqueCarrier')['ArrDelay'].max()

In [None]:
flights_df[(flights_df['ArrDelay'] > 0) & (flights_df['DepDelay'] > 0)]['ArrDelay'].describe()

In [None]:
flights_df[(flights_df['ArrDelay'] > 0) & (flights_df['DepDelay'] > 0)]['DepDelay'].describe()

In [None]:
sns.boxplot(flights_df["ArrDelay"])

In [None]:
sns.boxplot(x=flights_df["DepDelay"])

In [None]:
# Flights whose arrival and departure delays are both non-negative and under
# 60; the upper cut-off keeps the box plots readable. The lower bound is
# inclusive because q20 asks for non-negative delays, so a delay of exactly 0
# must be kept.
sub_flights_df = flights_df[(flights_df['ArrDelay'] >= 0) & (flights_df['ArrDelay'] < 60) &
                            (flights_df['DepDelay'] >= 0) & (flights_df['DepDelay'] < 60)]

In [None]:
# Arrival-delay distribution per carrier on the filtered subset.
sns.boxplot(x="UniqueCarrier", y="ArrDelay", data=sub_flights_df)

In [None]:
# Departure-delay distribution per carrier on the filtered subset.
sns.boxplot(x="UniqueCarrier", y="DepDelay", data=sub_flights_df)