# Loading Required packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [None]:
# Lift pandas' display limits so frames are rendered without truncating
# columns or rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Loading Data

In [None]:
# Read the three source tables (flights, airports, airlines) from
# ../input/flight-delays.
# low_memory=False makes pandas parse flights.csv in one pass rather than in
# internal chunks, which avoids mixed-dtype inference on its columns.
flights = pd.read_csv("../input/flight-delays/flights.csv", low_memory=False)
airports = pd.read_csv("../input/flight-delays/airports.csv")
airlines = pd.read_csv("../input/flight-delays/airlines.csv")

# Data Transformation

In [None]:
# Normalise every column label to lower case so all three tables follow one
# naming convention.
for frame in (flights, airports, airlines):
    frame.columns = frame.columns.str.lower()

# Build a 'day-month-year' text column from the separate day, month and year columns.
day_text = flights['day'].astype(str)
month_text = flights['month'].astype(str)
year_text = flights['year'].astype(str)
flights['date'] = day_text + '-' + month_text + '-' + year_text

In [None]:
# Spot-check ten random rows to confirm 'date' agrees with its source columns.
date_parts = ['date', 'day', 'month', 'year']
flights[date_parts].sample(n=10)

In [None]:
# gathering data to dictionary
airlines_name = airlines.set_index('iata_code')['airline'].to_dict()

airports_city = airports.set_index('iata_code')['city'].to_dict()

airports_name = airports.set_index('iata_code')['airport'].to_dict()

airports_state = airports.set_index('iata_code')['state'].to_dict()

airports_country = airports.set_index('iata_code')['country'].to_dict()

airports_latitude = airports.set_index('iata_code')['latitude'].to_dict()

airports_longitude = airports.set_index('iata_code')['longitude'].to_dict()

## Adding required columns from the other data frames to a single data frame

In [None]:
# Each entry is (new column, code column in flights, lookup dict). The new
# column is the code column translated through the lookup and stored as a
# categorical. Entries are listed in the order the columns must be appended.
categorical_lookups = [
    ('flight_name', 'airline', airlines_name),
    ('dest_city', 'destination_airport', airports_city),
    ('orig_city', 'origin_airport', airports_city),
    ('dest_name', 'destination_airport', airports_name),
    ('origin_name', 'origin_airport', airports_name),
    ('orig_state', 'origin_airport', airports_state),
    ('dest_state', 'destination_airport', airports_state),
    ('orig_country', 'origin_airport', airports_country),
    ('dest_country', 'destination_airport', airports_country),
]
for new_column, code_column, lookup in categorical_lookups:
    flights[new_column] = flights[code_column].map(lookup).astype('category')

# Coordinates are translated the same way but keep the dtype produced by map().
coordinate_lookups = [
    ('dest_lat', 'destination_airport', airports_latitude),
    ('dest_lon', 'destination_airport', airports_longitude),
    ('orig_lat', 'origin_airport', airports_latitude),
    ('orig_lon', 'origin_airport', airports_longitude),
]
for new_column, code_column, lookup in coordinate_lookups:
    flights[new_column] = flights[code_column].map(lookup)



## Data Frame sample

In [None]:
# Display the dtype of every column of the enriched flights frame.
flights.dtypes

In [None]:
# Write the enriched frame to flights_transformed.csv, leaving out the row index.
flights.to_csv('flights_transformed.csv',index=False)

In [None]:
!tar -zcvf flights_transformed.tar.gz /kaggle/working 

# Data Cleaning and Analysis

To answer different questions, we need to clean the data in different ways. So we clean the data, where required, separately for each question.

## 1) On weekends, which cities experience the most arrivals?

In [None]:
# Destination city of every flight whose day_of_week code is 5, 6 or 7
# (the notebook's narrative treats these as Friday, Saturday and Sunday).
is_weekend = flights['day_of_week'].isin([5, 6, 7])
week_end = flights.loc[is_weekend, 'dest_city']

# Plot the ten most frequent destination cities.
top_weekend_cities = week_end.value_counts().head(10)
top_weekend_cities.plot.bar(xlabel="City", ylabel="Count", title='Top 10 flight arrival cities on Weekends')

On weekends (Friday, Saturday and Sunday), the most flights arrive in Chicago and Atlanta.

## 2) Do flights from the same airports report more delays?

### Arrival Delay

In [None]:
# Destination airport of every flight that arrived at least one minute late.
# One boolean selection replaces the helper 'delay' column that used to be
# written onto a filtered slice (which raises SettingWithCopyWarning). Missing
# delays compare False against `>= 1`, so no separate null filter on the delay
# is needed. Flights without a resolved destination name are still excluded.
arrived_late = flights['arrival_delay'] >= 1
has_dest_name = flights['dest_name'].notnull()
delay_air_ports = flights.loc[arrived_late & has_dest_name, 'dest_name']

# Plot the ten airports with the most delayed arrivals.
delay_air_ports.value_counts()[:10].plot.bar(xlabel="Airport Name", ylabel="Count",title = 'Top 10 Airports having more Arrival delay')

Flights arriving at Hartsfield-Jackson Atlanta International Airport and Chicago O'Hare International Airport have the most arrival delays.

### Departure Delay

In [None]:
# Origin airport of every flight that departed at least one minute late.
# One boolean selection replaces the helper 'delay' column that used to be
# written onto a filtered slice (which raises SettingWithCopyWarning). Missing
# delays compare False against `>= 1`, so no separate null filter on the delay
# is needed. Flights without a resolved origin name are still excluded.
departed_late = flights['departure_delay'] >= 1
has_origin_name = flights['origin_name'].notnull()
delay_air_ports = flights.loc[departed_late & has_origin_name, 'origin_name']

# Plot the ten airports with the most delayed departures.
delay_air_ports.value_counts()[:10].plot.bar(xlabel="Airport Name", ylabel="Count",title = 'Top 10 Airports having more Departure delay')

Flights departing from Hartsfield-Jackson Atlanta International Airport and Chicago O'Hare International Airport have the most departure delays. Both arrival and departure delays occur most often at Hartsfield-Jackson Atlanta International Airport, Chicago O'Hare International Airport and Dallas/Fort Worth International Airport.


## 3) Is there a month in which delays are more common?

In [None]:
# Month of every flight that arrived at least one minute late.
# A single boolean selection replaces the helper 'delay' column that used to
# be written onto a filtered slice (which raises SettingWithCopyWarning).
# Missing delays compare False against `>= 1`, so they are excluded without a
# separate null filter.
delay_month = flights.loc[flights['arrival_delay'] >= 1, 'month']

# Bars are ordered by count (largest first), not by calendar month.
delay_month.value_counts().plot.bar(xlabel="Month", ylabel="Count",title = 'Number of Arrival Delays in different months')

June and July have the most arrival delays, while September and October have comparatively few.

In [None]:
# Month of every flight that departed at least one minute late.
# A single boolean selection replaces the helper 'delay' column that used to
# be written onto a filtered slice (which raises SettingWithCopyWarning).
# Missing delays compare False against `>= 1`, so they are excluded without a
# separate null filter.
delay_month = flights.loc[flights['departure_delay'] >= 1, 'month']

# Bars are ordered by count (largest first), not by calendar month.
delay_month.value_counts().plot.bar(xlabel="Month", ylabel="Count",title = 'Number of Departure Delays in different months')

June and July also have the most departure delays, while September and October have comparatively few.

## 4) On which day of the week do people travel the most?

In [None]:
# Count flights per day_of_week code and draw the counts as bars, largest first.
day = flights['day_of_week']
flights_per_day = day.value_counts()
flights_per_day.plot(
    kind='bar',
    title='Flight count on different days of the week',
    xlabel="Day of the week",
    ylabel="Count",
)

In the US, people tend to travel about equally on all days, except on Saturday.

Note: This inference is based on the flight count, not the passenger count.

## 5) Do delays happen more in a particular season?

In [None]:
# Month of every flight that arrived at least one minute late.
# A single boolean selection replaces the helper 'delay' column that used to
# be written onto a filtered slice (which raises SettingWithCopyWarning).
# Missing delays compare False against `>= 1`, so they are excluded without a
# separate null filter.
delay_month = flights.loc[flights['arrival_delay'] >= 1, 'month']

# Bars are ordered by count (largest first), not by calendar month.
delay_month.value_counts().plot.bar(xlabel="Month", ylabel="Count",title = 'Number of Arrival Delays in different months')

In the US, arrival delays happen more often in the summer season.

In [None]:
# Month of every flight that departed at least one minute late.
# A single boolean selection replaces the helper 'delay' column that used to
# be written onto a filtered slice (which raises SettingWithCopyWarning).
# Missing delays compare False against `>= 1`, so they are excluded without a
# separate null filter.
delay_month = flights.loc[flights['departure_delay'] >= 1, 'month']

# Bars are ordered by count (largest first), not by calendar month.
delay_month.value_counts().plot.bar(xlabel="Month", ylabel="Count",title = 'Number of Departure Delays in different months')

In the US, departure delays also happen more often in the summer season.

## 6) Does more air time result in more delay?

In [None]:
# Scatter of arrival delay against air time on an 8x8 inch figure.
# Semi-transparent dot markers keep dense regions readable.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(flights['air_time'], flights['arrival_delay'], alpha=0.3, marker='.')
ax.set_xlabel('Air Time(minutes)')
ax.set_ylabel('Arrival Delay(minutes)')
ax.set_title('Relationship between Air Time and Arrival Delay')
plt.show()

From this scatter plot we can say that the chance of an arrival delay is lower when the air time is longer.

## 7) Which airports have the longest taxi-in and taxi-out times?

### Taxi Out

In [None]:
# Keep only flights with a known origin airport and a recorded taxi-out time.
# Both masks are built from `flights` and combined before indexing, so the
# boolean key always shares the index of the frame it filters. The earlier
# version applied a mask built from `flights` to the already-filtered frame.
has_taxi_out = flights['origin_name'].notnull() & flights['taxi_out'].notnull()
taxi_out = flights.loc[has_taxi_out, ['origin_name', 'taxi_out']]

# Median taxi-out time per origin airport. origin_name is categorical, so
# observed=True restricts the result to airports that actually have rows.
# Without it, airports with no remaining rows get a NaN median, which the old
# na_position='first' sort placed ahead of every real value in the top 10.
taxi_out = taxi_out.groupby(['origin_name'], observed=True)['taxi_out'].median().reset_index()
taxi_out = taxi_out.sort_values(by='taxi_out', ascending=False).head(10)


plt.figure(figsize=(10,6))
sns.barplot(x='taxi_out', 
            y="origin_name", 
            data=taxi_out, 
            # taxi_out is already sorted from the largest median downwards
            order=taxi_out['origin_name'],
           palette="Blues_d")
plt.ylabel('Airport Name')
plt.xlabel('Median Taxi Out Time(Minutes)')
plt.title('Top 10 Airports having more Taxi out time')
plt.show()

At New York International Airport, flights take a median of about 24 minutes to taxi out.

### Taxi In

In [None]:
# Keep only flights with a known destination airport and a recorded taxi-in
# time. Both masks are built from `flights` and combined before indexing, so
# the boolean key always shares the index of the frame it filters. The earlier
# version applied a mask built from `flights` to the already-filtered frame.
has_taxi_in = flights['dest_name'].notnull() & flights['taxi_in'].notnull()
taxi_in = flights.loc[has_taxi_in, ['dest_name', 'taxi_in']]

# Median taxi-in time per destination airport. dest_name is categorical, so
# observed=True restricts the result to airports that actually have rows.
# Without it, airports with no remaining rows get a NaN median, which the old
# na_position='first' sort placed ahead of every real value in the top 10.
taxi_in = taxi_in.groupby(['dest_name'], observed=True)['taxi_in'].median().reset_index()
taxi_in = taxi_in.sort_values(by='taxi_in', ascending=False).head(10)

plt.figure(figsize=(10,6))
sns.barplot(x='taxi_in', 
            y="dest_name", 
            data=taxi_in, 
            # taxi_in is already sorted from the largest median downwards
            order=taxi_in['dest_name'],
           palette="Blues_d")
plt.ylabel('Airport Name')
# The label and title previously said "Taxi Out" although this chart shows taxi-in times.
plt.xlabel('Median Taxi In Time(Minutes)')
plt.title('Top 10 Airports having more Taxi in time')
plt.show()

At Chicago O'Hare International Airport, flights take a median of about 11 minutes to taxi in.

From both plots we can say that Taxi Out time is longer than Taxi In time (about 24 minutes versus about 11 minutes at the top-ranked airports).


## 8) What percentage of flights are delayed by weather?

In [None]:
# Share of all flights that have a positive weather_delay value.
total_length = flights.shape[0]

# Missing weather_delay values compare False against `> 0`, so the earlier
# notna() filter (whose result was immediately overwritten) is not needed.
weather_length = int((flights['weather_delay'] > 0).sum())

weather_prop = (weather_length/total_length)*100
# The sentence previously read "% of were delayed"; the subject "flights" and
# the split word "un expected" are fixed.
print("In the year 2015, {0:.2f}% of flights were delayed due to unexpected Weather".format(weather_prop))

## 9) Do flights from the same airlines experience more delays?

### Departure Delay

In [None]:
# Median departure delay per airline, computed over delayed departures only
# (departure_delay > 0), keeping the ten largest medians.
late_departures = flights.loc[flights['departure_delay'] > 0, ['flight_name', 'departure_delay']]
airline_dept = (
    late_departures
    .groupby(['flight_name'])['departure_delay']
    .median()
    .reset_index()
    .sort_values(by='departure_delay', ascending=False, na_position='first')
    .head(10)
)

# Horizontal bars listed from the largest median downwards.
departure_bar_order = airline_dept.sort_values('departure_delay', ascending=False).flight_name
plt.figure(figsize=(10, 6))
sns.barplot(
    data=airline_dept,
    x='departure_delay',
    y="flight_name",
    order=departure_bar_order,
    palette="Blues_d",
)
plt.xlabel('Median Departure Delay(Minutes)')
plt.ylabel('Airline Name')
plt.title('Top 10 Airlines having Departure Delay')
plt.show()

American Eagle Airlines Inc. has a median departure delay of about 20 minutes among its delayed flights.

### Arrival Delay

In [None]:
# Median arrival delay per airline, computed over delayed arrivals only
# (arrival_delay > 0), keeping the ten largest medians.
late_arrivals = flights.loc[flights['arrival_delay'] > 0, ['flight_name', 'arrival_delay']]
airline_arr = (
    late_arrivals
    .groupby(['flight_name'])['arrival_delay']
    .median()
    .reset_index()
    .sort_values(by='arrival_delay', ascending=False, na_position='first')
    .head(10)
)

# Horizontal bars listed from the largest median downwards.
arrival_bar_order = airline_arr.sort_values('arrival_delay', ascending=False).flight_name
plt.figure(figsize=(10, 6))
sns.barplot(
    data=airline_arr,
    x='arrival_delay',
    y="flight_name",
    order=arrival_bar_order,
    palette="Blues_d",
)
plt.xlabel('Median Arrival Delay(Minutes)')
plt.ylabel('Airline Name')
plt.title('Top 10 Airlines having Arrival Delay')
plt.show()

Spirit Airlines has a median arrival delay of about 20 minutes among its delayed flights.