In [179]:
#%% Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#%% Importing Data
# Load both datasets. The paths are relative, so the notebook has to be run
# from the directory that contains flights.csv and weather.csv.
# (A mid-cell flights_data.head(10) used to sit here; only the last expression
# of a cell is displayed, so it showed nothing and has been dropped.)
flights_data = pd.read_csv('flights.csv')
weather_data_pd = pd.read_csv('weather.csv')

# Array form of the weather table for the NumPy questions further down.
# It is created here, before Question 8 replaces NaNs in weather_data_pd.
# NOTE(review): presumably this array therefore still holds any NaNs present
# in weather.csv - confirm whether the NumPy answers should use filled data.
weather_data_np = weather_data_pd.to_numpy()

In [180]:
#%% Pandas Data Filtering/Sorting Question Answering
#use flights_data
#Question 1 How many flights were there from JFK to SLC? Int

# One boolean mask per condition; a flight is counted only when both are True.
departs_JFK = flights_data['origin'].eq('JFK')
arrives_SLC = flights_data['dest'].eq('SLC')
q_1 = int((departs_JFK & arrives_SLC).sum())  # each True adds 1; int() keeps q_1 a plain Python int
print(q_1)

2113


In [181]:
#Question 2 How many airlines fly to SLC? Should be int

# Keep only the rows whose destination is SLC, then count the distinct carrier codes among them.
flights_to_SLC = flights_data[flights_data['dest'].eq('SLC')]
q_2 = flights_to_SLC['carrier'].nunique()  # nunique() leaves missing carriers out of the count by default
print(q_2)

2


In [182]:
#Question 3 What is the average arrival delay for flights to RDU? float

# Restrict the table to RDU arrivals, then average the arr_delay column of that subset.
flights_to_RDU = flights_data[flights_data['dest'].eq('RDU')]
q_3 = flights_to_RDU['arr_delay'].mean()  # Series.mean() skips NaN delays by default
print(q_3)

10.052380952380952


In [183]:
#Question 4 What proportion of flights to SEA come from the two NYC airports (LGA and JFK)?  float

# Denominator: every flight whose destination is SEA.
flights_to_SEA = flights_data[flights_data['dest'].eq('SEA')]
# Numerator: the SEA flights whose origin is one of the two airports named in the question.
SEA_from_NYC = flights_to_SEA[flights_to_SEA['origin'].isin(['LGA', 'JFK'])]
# Row-count ratio; this division raises ZeroDivisionError if there are no SEA flights at all.
q_4 = len(SEA_from_NYC) / len(flights_to_SEA)
print(q_4)

0.5332653581442773


In [184]:
#Question 5 Which date has the largest average departure delay? Pd slice with date and float
#please make date a column. Preferred format is 2013/1/1 (y/m/d)

# Create the 'date' column by joining the year, month and day values (as text) with '/'.
# No zero padding is applied, so the labels look like 2013/1/1.
flights_data['date'] = flights_data['year'].astype(str).str.cat(
    [flights_data['month'].astype(str), flights_data['day'].astype(str)],
    sep='/',
)

# Mean departure delay per date label, then the label and value of the largest mean.
avg_delay = flights_data['dep_delay'].groupby(flights_data['date']).mean()
max_delay_date = avg_delay.idxmax()
max_delay_time = avg_delay.max()
max_delay_time_str = str(max_delay_time)

# NOTE(review): the prompt asks for a pandas slice with date and float, while q_5 is
# a sentence string - confirm which answer format is expected.
q_5 = ' '.join(['The largest average departure delay was', max_delay_time_str, 'on', max_delay_date])
print(q_5)

The largest average departure delay was 83.5369211514393 on 2013/3/8


In [185]:
#Question 6 Which date has the largest average arrival delay? pd slice with date and float

# Relies on the 'date' column that the Question 5 cell adds to flights_data, so that cell must run first.
# Mean arrival delay per date label, then the label and value of the largest mean.
avg_arr_delay = flights_data['arr_delay'].groupby(flights_data['date']).mean()
max_arr_delay_date = avg_arr_delay.idxmax()
max_arr_delay_time = avg_arr_delay.max()
max_arr_delay_time_str = str(max_arr_delay_time)

# NOTE(review): the prompt asks for a pandas slice with date and float, while q_6 is
# a sentence string - confirm which answer format is expected.
q_6 = ' '.join(['The largest average arrival delay was', max_arr_delay_time_str, 'on', max_arr_delay_date])
print(q_6)

The largest average arrival delay was 85.86215538847118 on 2013/3/8


In [186]:
#Question 7 Which flight departing LGA or JFK in 2013 flew the fastest? pd slice with tailnumber and speed
#speed = distance/airtime

# Add a 'speed' column: distance divided by air time, in whatever units those two columns use.
flights_data['speed'] = flights_data['distance'].div(flights_data['air_time'])

# Only flights whose origin is LGA or JFK are candidates.
# NOTE(review): no filter on year is applied - presumably flights.csv only holds 2013 flights; confirm.
departing_LGA_JFK = flights_data[flights_data['origin'].isin(['LGA', 'JFK'])]

# Row label of the highest speed, the tail number on that row, and the speed value itself.
fastest_flight_index = departing_LGA_JFK['speed'].idxmax()
fastest_flight = departing_LGA_JFK['tailnum'].loc[fastest_flight_index]
fastest_speed = departing_LGA_JFK['speed'].max()
fastest_speed_str = str(fastest_speed)

q_7 = ' '.join(['The fastest flight was tailnumber', fastest_flight, 'that flew at speed', fastest_speed_str])
print(q_7)

The fastest flight was tailnumber N666DN that flew at speed 11.723076923076922


In [187]:
#Question 8 Replace all nans in the weather pd dataframe with 0s. Pd with no nans

# Fill in place so the existing weather_data_pd name keeps referring to the cleaned table.
weather_data_pd.fillna(0, inplace = True)
# Store the answer under the same q_<n> naming that every other question uses.
q_8 = weather_data_pd

# Sanity check: isna() flags each missing cell; the first any() reduces per column,
# the second reduces those per-column flags to a single True/False.
if q_8.isna().any().any():
    print('there are nans in the dataframe')
else:
    print('there are no nans in the dataframe')

there are no nans in the dataframe


In [188]:
#%% Numpy Data Filtering/Sorting Question Answering
#Use weather_data_np
#Question 9 How many observations were made in February? Int

# Rows whose value in column 3 equals 2 are treated as February observations.
# NOTE(review): column 3 is assumed to be the month field of weather.csv - confirm against the file's header.
is_february = weather_data_np[:, 3] == 2
february_obs = weather_data_np[is_february]
q_9 = february_obs.shape[0]  # number of rows kept by the mask
print(q_9)

671


In [189]:
#Question 10 What was the mean for humidity in February? Float

# Take column 8 of the February rows and cast it to float before averaging.
# NOTE(review): column 8 is assumed to be the humidity field of weather.csv - confirm against the file's header.
feb_humidity = february_obs[:, 8].astype(float)
q_10 = np.mean(feb_humidity)
print(q_10)

62.91815201192251


In [190]:
#Question 11 What was the std for humidity in February? Float

# Uses the feb_humidity array built in the Question 10 cell.
# NumPy's default is ddof=0 (population standard deviation).
# NOTE(review): confirm whether the sample version (ddof=1) was intended instead.
q_11 = np.std(feb_humidity)
print(q_11)

20.33690087674334
