In [30]:
#Importing the required libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('./data/Airlines.csv')

# Display sample data
df.head()


Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [31]:
#Overall percentage of delayed flights
total_flights = len(df)
delayed_flights = df[df['Delay'] == 1]
percentage_delayed = (len(delayed_flights) / total_flights) * 100

print('Overall percentage of delayed flights:', percentage_delayed)

Overall percentage of delayed flights: 44.544229239705366


In [32]:
#Airport with the highest number of delays (AirportFrom)
airport_delay_counts = delayed_flights['AirportFrom'].value_counts().idxmax()
print('Airport with the highest number of delays:', airport_delay_counts)

Airport with the highest number of delays: ATL


In [33]:
#Airline most likely to experience delays
airline_delay_ratio = delayed_flights['Airline'].value_counts().idxmax()

print('Airline most likely to experience delays:', airline_delay_ratio)

Airline most likely to experience delays: WN


In [34]:
#Delays by day of the week
delay_by_day = delayed_flights['DayOfWeek'].value_counts().sort_index()
print('Delays by day of the week:', delay_by_day)

Delays by day of the week: DayOfWeek
1    34030
2    31913
3    42254
4    41244
5    35515
6    23615
7    31693
Name: count, dtype: int64


In [35]:
#Delay distribution by time of day (morning vs afternoon vs evening)
# We will divide the time of day into morning (00:00-12:00), afternoon (12:00-18:00), evening (18:00-00:00)
def categorize_time_of_day(time):
    if time <= 720:  # 12:00 PM
        return 'Morning'
    elif time <= 1080:  # 6:00 PM
        return 'Afternoon'
    else:
        return 'Evening'

df['TimeOfDay'] = df['Time'].apply(categorize_time_of_day)
delay_by_time_of_day = df['TimeOfDay'].value_counts()
print('Delay distribution by time of day:', delay_by_time_of_day)

Delay distribution by time of day: TimeOfDay
Morning      227676
Afternoon    206704
Evening      105003
Name: count, dtype: int64


In [36]:
#Routes (AirportFrom -> AirportTo) most prone to delays
df['Route'] = df['AirportFrom'] + ' -> ' + df['AirportTo']
most_delayed_routes = df['Route'].value_counts().idxmax()
print('Routes (AirportFrom -> AirportTo) most prone to delays:', most_delayed_routes)

Routes (AirportFrom -> AirportTo) most prone to delays: LAX -> SFO


In [37]:
#Impact of flight duration on delays
average_length_delayed = delayed_flights['Length'].mean()
average_length_not_delayed = df[df['Delay'] == 0]['Length'].mean()

# Output the results for these questions
print('Impact of flight duration on non-delays', average_length_not_delayed)
print('Impact of flight duration on delays', average_length_delayed)

Impact of flight duration on non-delays 129.6575944690909
Impact of flight duration on delays 135.3696974994173
