In [1]:
import pandas as pd

# Read the raw flight records from disk and preview the first few rows
file_path = "initial_submission_actual.csv"
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,DATE,DAY,FLIGHT NUMBER,ORIGIN,DEPARTURE TIME,ARRIVAL TIME,ARRIVAL STATUS,ARRIVAL STATUS_Prev_flight_early,ARRIVAL STATUS_Prev_flight_ontime,ARRIVAL STATUS_Prev_flight_late,ACTUAL ARRIVAL TIME
0,4/10/24,WEDNESDAY,UA 1400,ORD,6:52 PM,9:47 PM,early,,,,9:35 PM
1,4/10/24,WEDNESDAY,AA 3402,ORD,7:59 PM,10:52 PM,,early,early,early,10:33 PM
2,4/10/24,WEDNESDAY,B6 116,JFK,1:33 PM,2:50 PM,late,,,,2:24 PM
3,4/10/24,WEDNESDAY,DL 5182,JFK,2:55 PM,4:21 PM,,early,early,early,3:57 PM
4,4/10/24,WEDNESDAY,WN 5285,MCO,11:05 AM,1:45 PM,late,,,,1:29 PM


In [2]:
# Define a function to classify the flight status
def classify_flight_status(scheduled_time, actual_time):
    """Classify an arrival as 'early', 'on-time' or 'late'.

    Parameters
    ----------
    scheduled_time : str
        Scheduled arrival as a 12-hour clock string, e.g. '9:47 PM'.
    actual_time : str
        Actual arrival in the same format.

    Returns
    -------
    str or None
        'early'   if the flight arrived 5 or more minutes ahead of schedule,
        'on-time' if it arrived within 5 minutes of schedule (an arrival
                  exactly 5 minutes ahead counts as 'early'),
        'late'    if it arrived more than 5 minutes behind schedule,
        None      if either time is missing (None / NaN / NaT).

    Raises
    ------
    ValueError
        If a non-missing time does not match the '%I:%M %p' format.
    """
    # A missing time would otherwise parse to NaT, fail every comparison
    # below and be silently reported as 'late'.
    if pd.isna(scheduled_time) or pd.isna(actual_time):
        return None

    # Convert times to pandas datetime objects (the date part is a dummy)
    scheduled = pd.to_datetime(scheduled_time, format='%I:%M %p')
    actual = pd.to_datetime(actual_time, format='%I:%M %p')

    # Calculate the time difference in minutes
    time_difference = (actual - scheduled).total_seconds() / 60

    # Only clock times are known, so an arrival that crosses midnight shows up
    # as a ~24 h jump. Fold the difference into [-720, 720) minutes so that
    # e.g. 11:50 PM -> 12:10 AM reads as +20 minutes rather than -1420.
    time_difference = (time_difference + 720) % 1440 - 720

    if time_difference <= -5:
        return 'early'
    if time_difference <= 5:
        return 'on-time'
    return 'late'

# Classify every row by comparing its scheduled and actual arrival times
flight_statuses = data.apply(
    lambda flight: classify_flight_status(
        flight['ARRIVAL TIME'],
        flight['ACTUAL ARRIVAL TIME'],
    ),
    axis=1,
)
data['FLIGHT STATUS'] = flight_statuses

# Persist the augmented table (without the index) for the following steps
output_file_path = 'path_to_your_output_csv_file.csv'  # Replace with your desired output CSV file path
data.to_csv(output_file_path, index=False)

# Show scheduled vs. actual arrival next to the derived status
status_columns = ['ARRIVAL TIME', 'ACTUAL ARRIVAL TIME', 'FLIGHT STATUS']
print(data[status_columns])


   ARRIVAL TIME ACTUAL ARRIVAL TIME FLIGHT STATUS
0       9:47 PM             9:35 PM         early
1      10:52 PM            10:33 PM         early
2       2:50 PM             2:24 PM         early
3       4:21 PM             3:57 PM         early
4       1:45 PM             1:29 PM         early
5       4:25 PM             4:26 PM       on-time
6       9:47 PM             9:57 PM          late
7      10:52 PM            10:50 PM       on-time
8       2:50 PM             2:59 PM          late
9       4:21 PM             4:51 PM          late
10      2:20 PM             3:17 PM          late
11      4:25 PM             4:02 PM         early
12      9:47 PM             9:43 PM       on-time
13     10:52 PM            10:59 PM          late
14      2:50 PM             2:40 PM         early
15      4:21 PM             5:40 PM          late
16      2:20 PM             2:40 PM          late
17      4:25 PM             3:54 PM         early
18      9:47 PM             9:38 PM         early


In [3]:
import pandas as pd

# Reload the table written to 'path_to_your_output_csv_file.csv' by the previous cell.
# NOTE(review): despite the name, nothing is sorted here -- rows keep whatever
# order the CSV has. The former/latter labeling below is positional within each
# (DATE, ORIGIN) group, so confirm the file order is the intended one.
data_sorted = pd.read_csv('path_to_your_output_csv_file.csv')

# Function to label flights, considering the possibility of an unpaired former flight
def label_flights_adjusted(group):
    """Label the rows of one group alternately as 'former' and 'latter'.

    Labels are assigned purely by row position: rows 0, 2, 4, ... become
    'former' and rows 1, 3, 5, ... become 'latter'. With an odd number of
    rows the last one is therefore an unpaired 'former'.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single group (e.g. as passed by ``groupby(...).apply``).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to ``group`` plus a 'Flight Type' column. The input
        is left untouched: pandas does not support user functions that mutate
        the object handed to them by ``groupby.apply``.
    """
    labels = [('former', 'latter')[position % 2] for position in range(len(group))]
    # 'Flight Type' contains a space, so it has to be passed via a dict.
    return group.assign(**{'Flight Type': labels})

# Label flights within each (DATE, ORIGIN) group, then flatten back to a plain 0..n-1 index
flights_by_date_and_origin = data_sorted.groupby(['DATE', 'ORIGIN'])
labeled_groups = flights_by_date_and_origin.apply(label_flights_adjusted)
categorized_flights_adjusted = labeled_groups.reset_index(drop=True)

# Preview the first 40 rows together with the new 'Flight Type' column
preview_columns = ['DATE', 'ORIGIN', 'FLIGHT NUMBER', 'DEPARTURE TIME', 'Flight Type']
print(categorized_flights_adjusted[preview_columns].head(40))


       DATE ORIGIN FLIGHT NUMBER DEPARTURE TIME Flight Type
0   4/10/24    JFK        B6 116        1:33 PM      former
1   4/10/24    JFK       DL 5182        2:55 PM      latter
2   4/10/24    MCO       WN 5285       11:05 AM      former
3   4/10/24    MCO        B6 656        1:35 PM      latter
4   4/10/24    ORD       UA 1400        6:52 PM      former
5   4/10/24    ORD       AA 3402        7:59 PM      latter
6   4/11/24    JFK        B6 116        1:33 PM      former
7   4/11/24    JFK       DL 5182        2:55 PM      latter
8   4/11/24    MCO       WN 5285       11:35 AM      former
9   4/11/24    MCO        B6 656        1:35 PM      latter
10  4/11/24    ORD       UA 1400        6:52 PM      former
11  4/11/24    ORD       AA 3402        7:59 PM      latter
12  4/12/24    JFK        B6 116        1:33 PM      former
13  4/12/24    JFK       DL 5182        2:55 PM      latter
14  4/12/24    MCO       WN 5285       11:35 AM      former
15  4/12/24    MCO        B6 656        

In [4]:
import pandas as pd

# Example loading and processing logic (load your data, apply grouping and flight type labeling)
# Assuming 'categorized_flights_adjusted' is already defined and properly structured

# Count how often the predicted arrival status agrees with the computed
# 'FLIGHT STATUS'.
#   * 'former' flights are checked against their 'ARRIVAL STATUS' column.
#   * 'latter' flights are checked against one of the three
#     'ARRIVAL STATUS_Prev_flight_*' columns, chosen by the computed
#     FLIGHT STATUS of the most recent 'former' flight seen before them.

# Prediction column to use for a 'latter' flight, keyed by the preceding
# former flight's FLIGHT STATUS.
PREV_STATUS_COLUMN = {
    'early': 'ARRIVAL STATUS_Prev_flight_early',
    'on-time': 'ARRIVAL STATUS_Prev_flight_ontime',
    'late': 'ARRIVAL STATUS_Prev_flight_late',
}

match_details = []  # (flight number, 'former match' | 'latter match') per match
matched_rows = []   # positional indices of matched rows

# Start from a known state: without this, a 'latter' row seen before any
# 'former' row would raise NameError on a fresh kernel, or silently reuse the
# value left over from a previous run of this cell.
former_status = None

# Iterate over the flights in row order
for i, (_, current_flight) in enumerate(categorized_flights_adjusted.iterrows()):
    flight_type = current_flight['Flight Type']

    if flight_type == 'former':
        predicted_status = current_flight['ARRIVAL STATUS']
        # Remember the former flight's status for the latter flight comparison
        former_status = current_flight['FLIGHT STATUS']
    elif flight_type == 'latter':
        status_to_check = PREV_STATUS_COLUMN.get(former_status)
        if status_to_check is None:
            # Former status missing or not one of the expected values: skip
            continue
        predicted_status = current_flight[status_to_check]
    else:
        continue

    # Missing predictions are NaN, which never compare equal, so they do not match
    if predicted_status == current_flight['FLIGHT STATUS']:
        match_details.append((current_flight['FLIGHT NUMBER'], f'{flight_type} match'))
        matched_rows.append(i)

match_count = len(matched_rows)

# Extract the DataFrame of matched rows
matched_flights_df = categorized_flights_adjusted.iloc[matched_rows]

# Print the total number of matches and details of each match
print(f"Total number of matches: {match_count}")
print("Match details:")
for flight_number, match_type in match_details:
    print(f"Flight {flight_number}: {match_type}")

# Print the DataFrame rows corresponding to matched flights, nicely formatted
print("\nMatched Flights Details:")
print(matched_flights_df[['DATE', 'ORIGIN', 'FLIGHT NUMBER', 'DEPARTURE TIME', 'FLIGHT STATUS', 'ARRIVAL STATUS', 'Flight Type']])


Total number of matches: 16
Match details:
Flight DL 5182: latter match
Flight UA 1400: former match
Flight AA 3402: latter match
Flight B6 116: former match
Flight WN 5285: former match
Flight B6 656: latter match
Flight UA 1400: former match
Flight AA 3402: latter match
Flight DL 5182: latter match
Flight UA 1400: former match
Flight DL 5182: latter match
Flight WN 5285: former match
Flight UA 1400: former match
Flight AA 3402: latter match
Flight B6 116 : former match
Flight DL 5182: latter match

Matched Flights Details:
       DATE ORIGIN FLIGHT NUMBER DEPARTURE TIME FLIGHT STATUS ARRIVAL STATUS  \
1   4/10/24    JFK       DL 5182        2:55 PM         early            NaN   
4   4/10/24    ORD       UA 1400        6:52 PM         early          early   
5   4/10/24    ORD       AA 3402        7:59 PM         early            NaN   
6   4/11/24    JFK        B6 116        1:33 PM          late           late   
8   4/11/24    MCO       WN 5285       11:35 AM          late        