<a href="https://colab.research.google.com/github/saketpram/nyc-subway/blob/main/Subway.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Saket Ram
# Collaboration: None
# Sources: Used ChatGPT for debugging and plot help, matplotlib documentation,
# pandas documentation, numpy documentation, images and data from MTA website

# Relevant imports
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from google.colab import drive
from google.colab.patches import cv2_imshow
import cv2
drive.mount('/content/drive')
import zipfile
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import colormaps
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from scipy.stats import shapiro, ttest_ind, f_oneway, mannwhitneyu
import seaborn as sns
pd.options.mode.chained_assignment = None

# Google Drive paths of the two data files this notebook reads (Drive is
# mounted at /content/drive above)
# stop_times.txt: read into the global dataframe df further down
realTimeData = "/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data/stop_times.txt"
# stops.txt: read into the global dataframe stopsDf further down
stops = "/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data/stops.txt"
# Zip archive and destination folder, left commented out; presumably the
# arguments once passed to extract() to unpack the data -- TODO confirm
# zipFile = '/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data/gtfs.zip'
# endLoc = '/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data'

def readData(link):
  """Load a CSV file into a dataframe.

  Args:
    link: Path (or file-like object) handed straight to pandas.read_csv.

  Returns:
    Dataframe whose column names come from the first line of the file and
    whose row index is the file's first column.
  """
  return pd.read_csv(link, header=0, index_col=0)

# Read both files once at module level and keep the results as globals so the
# dataframes do not have to be re-read from CSV on every call (the stop-times
# file is large). readData uses the first column of each file as the row index.
df = readData(realTimeData)
stopsDf = readData(stops)

def extract(link, endLoc):
  """Unpack every member of a zip archive into a folder.

  Args:
    link: Location of the zip file, opened read-only.
    endLoc: Folder that receives the extracted files.

  Returns:
    None.
  """
  with zipfile.ZipFile(link, mode='r') as archive:
    archive.extractall(path=endLoc)

def subwayLineQuery(df, line, direction):
  """Select the stop-time rows that belong to one subway line and direction.

  Rows are recognised purely by substrings of the trip_id index:
  - a service tag: "Weekday" for most lines, "L0S1" for the 7, E, F, M, R
    and J (the 7X is matched without any service tag)
  - a route code of the form "_<route>..<direction>", optionally followed by
    a suffix that singles out a variant (02/08 for the 6X, N97X/S98X for the
    7X, N72/S72 under route N for the W)

  Args:
    df: Stop-time dataframe indexed by trip_id.
    line: Subway line, e.g. '6', '6X', '7X', 'W' or 'A'.
    direction: Direction letter used in the trip_id ('N' or 'S').

  Returns:
    Dataframe holding only the rows whose trip_id matches the line and
    direction.
  """

  def withoutTrips(frame, routeCodes):
    """Drop, one code at a time, rows whose trip_id contains "_<code>"."""
    for code in routeCodes:
      frame = frame[~frame.index.str.contains(f"_{code}")]
    return frame

  def taggedTrips(frame, serviceTag, routeCode):
    """Keep rows whose trip_id contains both the service tag and "_<routeCode>"."""
    return frame.query(f'trip_id.str.contains("{serviceTag}") and trip_id.str.contains("_{routeCode}")', engine='python')

  # Route code shared by every line that has no special-case identifier
  defaultCode = f"{line}..{direction}"

  # 6X: weekday trips whose code ends in 02 or 08 under route 6
  if line == '6X':
    weekdayDf = df.query('trip_id.str.contains("Weekday")', engine='python')
    firstCode = f"{line[0]}..{direction}02"
    secondCode = f"{line[0]}..{direction}08"
    return weekdayDf.query(f'trip_id.str.contains("_{firstCode}") or trip_id.str.contains("_{secondCode}")', engine='python')

  # 6: strip the 6X trips first, then keep weekday trips of route 6
  if line == '6':
    localTrips = withoutTrips(df, [f"{line}..{direction}02", f"{line[0]}..{direction}08"])
    return taggedTrips(localTrips, "Weekday", defaultCode)

  # 7: strip the 7X trips first, then keep L0S1 trips of route 7
  if line == '7':
    localTrips = withoutTrips(df, [f"{line}..N97X", f"{line}..S98X"])
    return taggedTrips(localTrips, "L0S1", defaultCode)

  # 7X: northbound trips end in 97X, every other direction ends in 98X
  if line == '7X':
    suffix = "97X" if direction == 'N' else "98X"
    expressCode = f"{line[0]}..{direction}{suffix}"
    return df.query(f'trip_id.str.contains("_{expressCode}")', engine='python')

  # Queens Blvd lines and Nassau line use the L0S1 tag
  if line in ('E', 'F', 'M', 'R', 'J'):
    return taggedTrips(df, "L0S1", defaultCode)

  # W: filed under route N with the 72 suffix
  if line == 'W':
    wCode = "N..N72" if direction == 'N' else "N..S72"
    return taggedTrips(df, "Weekday", wCode)

  # N: strip the W trips first, then keep weekday trips of route N
  if line == 'N':
    return taggedTrips(withoutTrips(df, ["N..N72", "N..S72"]), "Weekday", defaultCode)

  # Every other line: weekday trips carrying the plain route code
  return taggedTrips(df, "Weekday", defaultCode)

def getSec(time):
  """Convert a clock string of the form HH:MM:SS into a total number of seconds.

  Args:
    time: String with exactly three colon-separated integer fields.

  Returns:
    hours * 3600 + minutes * 60 + seconds as an int.
  """
  hours, minutes, seconds = (int(field) for field in time.split(':'))
  return hours * 3600 + minutes * 60 + seconds

def addSeconds(df, colList):
  """Add a seconds-valued companion column for each HH:MM:SS column.

  Args:
    df: Dataframe to extend; the new columns are written onto it directly.
    colList: Names of the columns that hold HH:MM:SS strings.

  Returns:
    The same dataframe, now carrying a "<column>_sec" column for every
    entry of colList.
  """
  for timeCol in colList:
    secondsCol = f"{timeCol}_sec"
    df[secondsCol] = df[timeCol].apply(getSec)
  return df

def subwayTimeQuery(df, col1, col2, time1, time2):
  """Keep the rows that fall inside a time window.

  Args:
    df: Dataframe to filter.
    col1: Column that must be greater than or equal to time1.
    col2: Column that must be strictly less than time2.
    time1: Lower bound of the window (inclusive).
    time2: Upper bound of the window (exclusive).

  Returns:
    Dataframe containing only the rows that satisfy both conditions.
  """
  window = f'{col1}>={time1} and {col2}<{time2}'
  return df.query(window, engine = 'python')

def readBullet(line):
  """Load the bullet image of a train line as an RGB array for matplotlib.

  Args:
    line: Train line whose "<line>_Train.png" file is read from the Bullets
      folder on Google Drive.

  Returns:
    The image as an array with channels in RGB order. For every line except
    N, Q, R and W, pure-black pixels are replaced with white first.

  Raises:
    FileNotFoundError: If OpenCV cannot read the image file.
  """

  # Access image file
  bullet = f'/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data/Bullets/{line}_Train.png'
  imageColor = cv2.imread(bullet, cv2.IMREAD_COLOR)

  # cv2.imread reports a missing or unreadable file by returning None instead
  # of raising, so fail here with a message that names the file
  if imageColor is None:
    raise FileNotFoundError(f"Could not read bullet image: {bullet}")

  # Each image has a transparent background, which is loaded as black, so
  # pure-black pixels are painted white. N, Q, R and W are left untouched
  # (presumably because black is part of those bullets themselves -- confirm)
  if line not in ('N', 'Q', 'R', 'W'):
    blackMask = np.all(imageColor == 0, axis=2)
    imageColor[blackMask] = [255, 255, 255]

  # OpenCV loads BGR while matplotlib expects RGB, so swap the R and B channels
  return cv2.cvtColor(imageColor, cv2.COLOR_BGR2RGB)

def _mappedRouteStops(line, direction):
  """Return the hand-mapped stop IDs of a line, each suffixed with direction.

  The list is in the order used for southbound trains. Raises ValueError for
  a line that has no mapping here.
  """

  def numbered(first, stop):
    """Stop IDs str(first) .. str(stop - 1), each suffixed with direction."""
    return [str(i) + direction for i in range(first, stop)]

  def suffixed(baseIDs):
    """The given stop IDs, each suffixed with direction."""
    return [base + direction for base in baseIDs]

  # 2 train
  if line == '2':
    return (suffixed(['201']) + numbered(204, 223) + numbered(224, 228)
            + suffixed(['120', '123', '127', '128', '132', '137'])
            + numbered(228, 240) + numbered(241, 248))

  # 3 train
  if line == '3':
    return (suffixed(['301', '302']) + numbered(224, 228)
            + suffixed(['120', '123', '127', '128', '132', '137'])
            + numbered(228, 240) + numbered(248, 258))

  # 4 train
  if line == '4':
    return (suffixed(['401', '402']) + numbered(405, 417)
            + suffixed(['621', '626', '629', '631', '635', '640'])
            + numbered(418, 421)
            + suffixed(['423', '234', '235', '238', '239', '250']))

  # 5 train
  # NOTE: Nereid Av pattern not shown to avoid confusion. 5 trains that terminate
  # at 180 St in the graphs produced are assumed to go to Nereid Av
  if line == '5':
    return (numbered(501, 506) + numbered(213, 223)
            + suffixed(['416', '621', '626', '629', '631', '635', '640'])
            + numbered(418, 421)
            + suffixed(['423', '234', '235', '238', '239'])
            + numbered(241, 248))

  # 6X train
  if line == '6X':
    return (numbered(601, 605) + numbered(606, 609)
            + suffixed(['613', '619']) + numbered(621, 641))

  # 7X train
  # In 2023, the train is making all stops between Queensboro Plaza and
  # 74 St-Rawson, so these stops are included
  if line == '7X':
    return (suffixed(['701', '702', '707']) + numbered(710, 717)
            + numbered(718, 722) + numbered(723, 727))

  # A train
  # Route to Lefferts and to Rockaway Park 116 St ignored to avoid confusion
  # A trains terminating at Rockaway Blvd are assumed to go to Lefferts and
  # A trains terminating at Broad Channel are assumed to go to 116 St
  # 135 St included in graphs for both directions because some A trains went
  # local and stopped at 135 St
  if line == 'A':
    bases = ['A02', 'A03', 'A05', 'A06', 'A07', 'A09', 'A12', 'A14', 'A15',
             'A24', 'A27', 'A28', 'A31', 'A32', 'A34', 'A36', 'A38',
             'A40', 'A41', 'A42', 'A46', 'A48', 'A51', 'A55', 'A57',
             'A59', 'A60', 'A61']

    # Northbound A trains stop at Aqueduct Racetrack, southbound does not, so
    # H01 is present for every direction except 'S'
    if direction != 'S':
      bases.append('H01')
    bases += ['H02', 'H03', 'H04', 'H06', 'H07', 'H08', 'H09', 'H10', 'H11']
    return suffixed(bases)

  # E train (Briarwood and 75 Av stops added as some E trains stop at Briarwood)
  if line == 'E':
    return suffixed(['G05', 'G06',
                     'G07', 'F05', 'F06', 'F07', 'G08', 'G14', 'G21', 'F09', 'F11', 'F12', 'D14',
                     'A25', 'A27', 'A28', 'A30', 'A31', 'A32', 'A33',
                     'A34', 'A36', 'A38', 'E01'])

  # B train
  if line == 'B':
    return suffixed(['D0' + str(i) for i in range(3, 10)]
                    + ['D' + str(i) for i in range(10, 14)]
                    + ['A' + str(i) for i in range(14, 23)]
                    + ['A24']
                    + ['D14', 'D15', 'D16', 'D17', 'D20', 'D21', 'D22', 'R30',
                       'D24', 'D25', 'D26', 'D28', 'D31', 'D35', 'D39', 'D40'])

  # D train (DeKalb added even though D train does not regularly stop at
  # DeKalb; some trains do)
  if line == 'D':
    return suffixed(['D01']
                    + ['D0' + str(i) for i in range(3, 10)]
                    + ['D' + str(i) for i in range(10, 14)]
                    + ['A15', 'A24']
                    + ['D14', 'D15', 'D16', 'D17', 'D20', 'D21', 'D22', 'R30',
                       'R31', 'R32', 'R33', 'R34',
                       'R35', 'R36', 'B12', 'B13', 'B14', 'B15', 'B16',
                       'B17', 'B18', 'B19', 'B20', 'B21', 'B22', 'B23',
                       'D43'])

  # F train
  # NOTE: SERVICE CHANGE DUE TO CONSTRUCTION ON 63 ST LINE
  if line == 'F':
    return suffixed(['F0' + str(i) for i in range(1, 8)]
                    + ['G08', 'G14', 'G21', 'F09', 'F11', 'F12', 'D15',
                       'D16', 'D17', 'D18', 'D19', 'D20', 'D21', 'F14',
                       'F15', 'F16', 'F18', 'A41', 'F20', 'F21', 'F22', 'F23',
                       'F24', 'F25', 'F26', 'F27', 'F29', 'F30', 'F31',
                       'F32', 'F33', 'F34', 'F35', 'F36', 'F38', 'F39',
                       'D42', 'D43'])

  # M train
  # NOTE: SERVICE CHANGE DUE TO CONSTRUCTION ON 63 ST LINE
  if line == 'M':
    stops = suffixed(['B10', 'D15',
                      'D16', 'D17', 'D18', 'D19', 'D20', 'D21',
                      'M18', 'M16', 'M14', 'M13', 'M12', 'M11', 'M10',
                      'M09', 'M08', 'M06', 'M05', 'M04', 'M01'])

    # This list is written in the opposite order from the other lines, so it
    # is flipped once here before the direction handling in the caller
    stops.reverse()
    return stops

  # G train
  if line == 'G':
    return suffixed(['G22', 'G24', 'G26', 'G28', 'G29', 'G30',
                     'G31', 'G32', 'G33', 'G34', 'G35', 'G36',
                     'A42', 'F20', 'F21', 'F22', 'F23', 'F24',
                     'F25', 'F26', 'F27'])

  # Q train
  if line == 'Q':
    return suffixed(['Q05', 'Q04', 'Q03', 'B08', 'R14',
                     'R16', 'R17', 'R20', 'Q01', 'R30',
                     'D24', 'D25', 'D26', 'D27', 'D28',
                     'D29', 'D30', 'D31', 'D32', 'D33',
                     'D34', 'D35', 'D37', 'D38', 'D39',
                     'D40', 'D41', 'D42', 'D43'])

  # N train
  if line == 'N':
    return suffixed(['R01', 'R03', 'R04', 'R05', 'R06',
                     'R08', 'R09', 'R11', 'R13', 'R14',
                     'R15', 'R16', 'R17', 'R18', 'R19', 'R20', 'R21',
                     'R22', 'R23', 'Q01', 'R24',
                     'R25', 'R26', 'R27', 'R28', 'R29', 'R30',
                     'R31', 'R32', 'R33', 'R34',
                     'R35', 'R36', 'R39', 'R40', 'R41', 'N02', 'N03',
                     'N04', 'N05', 'N06', 'N07', 'N08',
                     'N09', 'N10', 'D43'])

  # R train
  if line == 'R':
    return suffixed(['G08', 'G09', 'G10', 'G11', 'G12', 'G13', 'G14',
                     'G15', 'G16', 'G18', 'G19', 'G20', 'G21',
                     'R11', 'R13', 'R14', 'R15', 'R16',
                     'R17', 'R18', 'R19', 'R20', 'R21', 'R22',
                     'R23', 'R24', 'R25', 'R26', 'R27', 'R28',
                     'R29', 'R30', 'R31', 'R32', 'R33', 'R34',
                     'R35', 'R36', 'R39', 'R40', 'R41', 'R42',
                     'R43', 'R44', 'R45'])

  raise ValueError(f"No stop sequence is defined for line {line!r}")


def stopExtractor(secondsDf, line, direction):
  """Number the stops of a line in travel order and tag each row with its number.

  For the lines in stdLines the stop order is simply the sorted set of stop
  IDs found in secondsDf['stop_id']. Every other supported line uses a
  hand-mapped list of stop IDs. In both cases the list is reversed when
  direction is 'N', and each stop ID is then numbered by its position.

  Args:
    secondsDf: Dataframe with a 'stop_id' column; a 'stop_val' column is
      written onto it.
    line: Subway line, e.g. '1', '2', '6X', 'A'.
    direction: Direction suffix of the stop IDs ('N' or 'S').

  Returns:
    Tuple (secondsDf, numDict) where numDict maps each stop ID to its
    sequence number and secondsDf['stop_val'] holds that number for each row
    (rows whose stop ID is not in numDict get a missing value).

  Raises:
    ValueError: If line is neither in stdLines nor hand-mapped.
  """

  # Standard lines for which sorting the stop IDs already yields the route order
  stdLines = ['1', '6',  '7', 'C', 'L', 'J', 'W']

  if line in stdLines:
    stopsList = sorted(set(secondsDf['stop_id']))
  else:
    stopsList = _mappedRouteStops(line, direction)

  # If the train is northbound, reverse order of stops. This happens exactly
  # once for every line, so all northbound lines are numbered the same way
  if direction == 'N':
    stopsList.reverse()

  # Assign each stop ID its position in the (possibly reversed) list
  numDict = {stopID: position for position, stopID in enumerate(stopsList)}

  # Add the sequence number of each row's stop to the dataframe
  secondsDf['stop_val'] = secondsDf['stop_id'].map(numDict)

  # Return dataframe and the dictionary that corresponds each stop ID to
  # sequence number
  return secondsDf, numDict


def stopIDtoName(stopsDf, stopIDlist):
  """Translate stop IDs into stop names.

  Args:
    stopsDf: Dataframe indexed by stop ID with a 'stop_name' column.
    stopIDlist: Stop IDs to translate; every one must be present in the
      index of stopsDf.

  Returns:
    Dictionary with the stop IDs as keys (in the order given) and stop names
    as values. The Q01 and R23 stops are relabelled 'Canal St (Exp)' and
    'Canal St (Lcl)' respectively.

  Raises:
    KeyError: If a stop ID is missing from stopsDf.
  """

  # Stops that get a fixed label instead of the name stored in stopsDf
  canalLabels = {
      'Q01S': 'Canal St (Exp)',
      'Q01N': 'Canal St (Exp)',
      'R23S': 'Canal St (Lcl)',
      'R23N': 'Canal St (Lcl)',
  }

  # Lookup table from every stop ID in stopsDf to its name
  officialNames = dict(zip(stopsDf.index, stopsDf['stop_name']))

  stopNames = {}
  for stopID in stopIDlist:
    # Look the ID up first so an unknown ID raises even for the Canal St stops
    officialName = officialNames[stopID]
    stopNames[stopID] = canalLabels.get(stopID, officialName)

  return stopNames

def partOne(line, direction, rushHourType):
  """Gather everything needed to plot one line/direction/rush-hour panel.

  Reads the module-level dataframes df (stop times) and stopsDf (stop names).

  Args:
    line: Subway line to plot.
    direction: Direction of travel.
    rushHourType: 'evening' selects the evening rush; any other value
      selects the morning rush.

  Returns:
    Tuple of
    - stopDict, a dictionary corresponding each stop ID to a sequence number
    - stopNames, a dictionary corresponding each stop ID to its street name
    - timeList, the x-axis tick labels for the selected rush period
    - time1, the start of the rush period in seconds
    - time2, the end of the rush period in seconds
    - filteredSecondsDf, the rows for this line and direction inside the rush
      period, with stop times in both HH:MM:SS and seconds plus a 'stop_val'
      column
  """

  # Rush periods as (start in seconds, end in seconds, tick labels every 15 min):
  # evening runs 15:30-20:00, morning runs 6:30-9:30
  eveningRush = (55800, 72000,
                 ['15:30', '15:45', '16:00', '16:15', '16:30', '16:45', '17:00',
                  '17:15', '17:30', '17:45', '18:00', '18:15', '18:30', '18:45', '19:00',
                  '19:15', '19:30', '19:45', '20:00'])
  morningRush = (23400, 34200,
                 ['6:30', '6:45', '7:00', '7:15', '7:30',
                  '7:45', '8:00', '8:15', '8:30', '8:45',
                  '9:00', '9:15', '9:30'])
  time1, time2, timeList = eveningRush if rushHourType == 'evening' else morningRush

  # Narrow the global stop-time dataframe to this line and direction, then
  # add seconds versions of the arrival and departure times
  lineDf = addSeconds(subwayLineQuery(df, line, direction),
                      ['arrival_time', 'departure_time'])

  # Keep only the stops made inside the rush period
  rushDf = subwayTimeQuery(lineDf, 'arrival_time_sec', 'departure_time_sec', time1, time2)

  # Number the stops along the route and tag every row with its stop number
  filteredSecondsDf, stopDict = stopExtractor(rushDf, line, direction)

  # Look up the street name of every stop on the route in the global stopsDf
  stopNames = stopIDtoName(stopsDf, list(stopDict.keys()))

  return stopDict, stopNames, timeList, time1, time2, filteredSecondsDf

def trainPlot(lines):
  """Create plot of all the trains that run in both directions during morning
  and evening rush. Return nothing."""

  # Set title weight and axes weight to bold
  plt.rcParams["font.weight"] = "bold"
  plt.rcParams["axes.labelweight"] = "bold"
  plt.style.use('default')

  # Specify list of directions and times
  directions = ['S', 'N']
  times = ['morning', 'evening']

  # Check if there is a rush hour express train variant (only applies to 6X and 7X)
  express = False
  for line in lines:
    if 'X' in line:
      express = True
      break

  # If there is, create a figure that has len(lines)*2-1 rows, 2 columns, with a
  # pre-determined figure size (to ensure that all of the train graphs have the same
  # figure size)
  if express == True:
    fig, axs = plt.subplots(len(lines)*2-1, len(directions), figsize=(30, 60*(len(lines)*2-1)/7))
  else:
    fig, axs = plt.subplots(len(lines)*2, len(directions), figsize=(30, 60*len(lines)*2/7))

  # Title graph based on which set of lines is submitted into the function
  # (some of the lines just have spaces because these are easier to work with
  # when adding the actual train bullets)
  # Also specify the color with which plots will be plotted (this is dependent
  # on the bullet color and was manually selected)
  if lines == ['1', '2', '3']:
    fig.suptitle('IRT 7 Av Line (1) (2) (3) Realtime Data', size=40, y=1.001,weight='bold')
    linColor = 'red'
  elif lines == ['4', '5', '6', '6X']:
    fig.suptitle('IRT Lexington Av Line (4) (5) (6) (X) Realtime Data', size=40, y=1.001,weight='bold')
    linColor = 'green'
  elif lines == ['7', '7X']:
    fig.suptitle('IRT Flushing Line (7) (X) Realtime Data', size=40, y=1.001,weight='bold')
    linColor = 'mediumorchid'
  elif lines == ['A', 'C', 'E']:
    fig.suptitle('IND 8 Av Line (A) (C) (E) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'darkblue'
  elif lines == ['B', 'D', 'F', 'M']:
    fig.suptitle('IND 6 Av Line (                        ) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'darkorange'
  elif lines == ['L']:
    fig.suptitle('BMT Canarsie Line (L) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'darkgrey'
  elif lines == ['G']:
    fig.suptitle('IND Crosstown Line (G) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'limegreen'
  elif lines == ['J']:
    fig.suptitle('BMT Nassau Line ( J) (Z) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'brown'
  elif lines == ['N', 'Q', 'R', 'W']:
    fig.suptitle('BMT Broadway Line (                   ) Realtime Data', size=40, y=1.001, weight='bold')
    linColor = 'gold'

  # Iterate through each line, direction, time triplet
  for i, line in enumerate(lines):
    for k, dir in enumerate(directions):
      for j, time in enumerate(times):

          # Extract several pieces of information from partOne function
          stopDict, stopNames, timeList, time1, time2, filteredSecondsDf = partOne(line, dir, time)

          # For each train trip in the filteredSecondsDf, extract the station times and the stop vals
          for trip in list(filteredSecondsDf.index):
            tripDf = filteredSecondsDf.loc[trip]
            stationTimes = tripDf['arrival_time_sec']
            stopVal = tripDf['stop_val']

            # If the train is express variant of 6 or 7, plot it on the row before, else print it
            # as normal
            if dir == 'N' and 'X' in line:
              axs[i*2+j-1,k].plot(stationTimes, stopVal, color = linColor)
            else:
              axs[i*2+j,k].plot(stationTimes, stopVal, color = linColor)

          # Train labels; if train is 7 or 7X, 7X runs "South" in the morning so this will be
          # included and the south will be changed to west, which more accurately reflects how the
          # way the 7 train travels. The J is also trained to westbound. Else the train is just listed as
          # southbound. Final destination of train extracted from stopNames dictionary keys
          if dir == 'S':
            if time == 'morning':
              if line == '7' or line == '7X':
                axs[i*2+j, k].set_title(f'AM Rush Hour Westbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
              elif line == 'J':
                axs[i*2+j, k].set_title(f'AM Rush Hour Westbound ( J) and (Z) Trains to {list(stopNames.values())[-1]}', size=20, wrap=True)
              else:
                axs[i*2+j, k].set_title(f'AM Rush Hour Southbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)

            # Simiar procedure for westbound evening
            else:
              if 'X' in line:
                pass
              else:
                if line == '7':
                  axs[i*2+j, k].set_title(f'PM Rush Hour Westbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
                elif line == 'J':
                  axs[i*2+j, k].set_title(f'AM Rush Hour Westbound (J) Train to {list(stopNames.values())[-1]}', size=20, wrap=True)
                else:
                  axs[i*2+j, k].set_title(f'PM Rush Hour Southbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)

          # Similar procedure for eastbound morning
          else:
            if time == 'morning':
              if 'X' in line:
                pass
              else:
                if line == '7':
                  axs[i*2+j, k].set_title(f'AM Rush Hour Eastbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
                elif line == 'J':
                  axs[i*2+j, k].set_title(f'AM Rush Hour Eastbound (J) Train to {list(stopNames.values())[-1]}', size=20, wrap=True)
                else:
                  axs[i*2+j, k].set_title(f'AM Rush Hour Northbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)

            # Similar procedure for eastbound evening
            else:
              if 'X' in line:
                if line == '7X':
                  axs[i*2+j-1, k].set_title(f'PM Rush Hour Eastbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
                else:
                  axs[i*2+j-1, k].set_title(f'PM Rush Hour Northbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
              else:
                if line == '7':
                  axs[i*2+j, k].set_title(f'PM Rush Hour Eastbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)
                elif line == 'J':
                  axs[i*2+j, k].set_title(f'AM Rush Hour Eastbound (J) and (Z) Trains to {list(stopNames.values())[-1]}', size=20, wrap=True)
                else:
                  axs[i*2+j, k].set_title(f'PM Rush Hour Northbound ({line}) Train to {list(stopNames.values())[-1]}', size=20)

          # If X lines are being plotted, shift rows by 1 (only two express trains as they are
          # peak direction express compared to four permutations). Add time and axis station labels
          # and plot using stopDict.values() and stopNames.values()
          if 'X' in line and dir == 'N' and time == 'evening':
            axs[i*2+j-1, k].grid()
            axs[i*2+j-1, k].set_xlabel('Time')
            axs[i*2+j-1, k].set_xticks(np.arange(time1, time2+1, 900), timeList)
            axs[i*2+j-1, k].set_ylabel('Stations')
            axs[i*2+j-1, k].set_yticks(list(stopDict.values()), stopNames.values())

            # Invert the y axis if it is northbound so that the lines in the graph travel differently
            if k == 0:
              axs[i*2+j-1,k].invert_yaxis()
            axs[i*2+j-1,k].margins(x=0)
            axs[i*2+j-1,k].margins(y=0)

          # These are not possibilities as they are not peak direction so they are not plotted
          elif 'X' in line and dir == 'N' and time == 'morning':
            pass
          elif 'X' in line and dir == 'S' and time == 'evening':
            pass

          # Perform similar procedure for X in line and direction is S and time is morning
          else:
            axs[i*2+j, k].grid()
            axs[i*2+j, k].set_xlabel('Time')
            axs[i*2+j, k].set_xticks(np.arange(time1, time2+1, 900), timeList)
            axs[i*2+j, k].set_ylabel('Stations')
            axs[i*2+j, k].set_yticks(list(stopDict.values()), stopNames.values())
            if k == 0:
              axs[i*2+j,k].invert_yaxis()
            axs[i*2+j,k].margins(x=0)
            axs[i*2+j,k].margins(y=0)

  # For each set of lines, add their image to the title of the graph using readBullet
  # and positioning (guess and check)
  if lines == ['1', '2', '3']:
    imagebox = OffsetImage(readBullet('1'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.455, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('2'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.49065, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('3'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5269, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for Lex Av lines
  elif lines == ['4', '5', '6', '6X']:
    imagebox = OffsetImage(readBullet('4'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.48245, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('5'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5196, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('6'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.55495, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('6X'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5912, .996), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for Flushing lines
  elif lines == ['7', '7X']:
    imagebox = OffsetImage(readBullet('7'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.49445, .99), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('7X'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5307, .99), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for 8 Av lines
  elif lines == ['A', 'C', 'E']:
    imagebox = OffsetImage(readBullet('A'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.458, .995), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('C'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.49365, .995), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('E'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5299, .995), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for 6 Av lines
  elif lines == ['B', 'D', 'F', 'M']:
    imagebox = OffsetImage(readBullet('B'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.42145, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('D'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.4586, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('F'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.49395, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('FX'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5302, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('M'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.56645, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for G, L, and J trains
  elif lines == ['G']:
    imagebox = OffsetImage(readBullet('G'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5262, .985), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  elif lines == ['L']:
    imagebox = OffsetImage(readBullet('L'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5212, .985), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  elif lines == ['J']:
    imagebox = OffsetImage(readBullet('J'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.49495, .985), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('Z'), zoom=0.05)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5312, .985), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Do same for Broadway lines
  elif lines == ['N', 'Q', 'R', 'W']:
    imagebox = OffsetImage(readBullet('N'), zoom=0.0725)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.47245, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('Q'), zoom=0.0725)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5096, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('R'), zoom=0.0725)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.54495, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

    imagebox = OffsetImage(readBullet('W'), zoom=0.0725)  # Adjust zoom as needed for size
    ab = AnnotationBbox(imagebox, (0.5812, .997), xycoords='figure fraction', frameon=False, pad=0, zorder=200)  # pad=0 to remove padding
    fig.add_artist(ab)

  # Use tight layout and plot the graphs
  plt.tight_layout()
  plt.show()

def distributionFinder(line, direction, rushHourType):
  """Gathers the rush-hour stop times needed to build a line's interval
  distribution.

  Reads the module-level dataframes df (stop times) and stopsDf (stop names).
  The stop times are narrowed to the requested line and direction, converted
  to seconds, and restricted to the selected rush-hour window.

  rushHourType of 'evening' selects the 55800-72000 second window (15:30 to
  20:00); any other value selects the 23400-34200 second window (06:30 to
  09:30).

  Returns the tuple (stopDict, filteredSecondsDf, stopNames); the interval
  distribution itself is computed by the caller."""

  # Rush hour bounds in seconds after midnight
  if rushHourType == 'evening':
    windowStart, windowEnd = 55800, 72000
  else:
    windowStart, windowEnd = 23400, 34200

  # Trips for this line and direction, express and local variants together
  lineTrips = noExpressDifferentiatedQuery(df, line, direction)

  # Arrival and departure times are HH:MM:SS strings; add columns in seconds
  tripsWithSeconds = addSeconds(lineTrips, ['arrival_time', 'departure_time'])

  # Keep only the rows that fall inside the rush-hour window
  rushHourTrips = subwayTimeQuery(tripsWithSeconds, 'arrival_time_sec',
                                  'departure_time_sec', windowStart, windowEnd)

  # Extract the stops served, then translate the stop IDs into names
  filteredSecondsDf, stopDict = stopExtractor(rushHourTrips, line, direction)
  stopNames = stopIDtoName(stopsDf, list(stopDict))
  return stopDict, filteredSecondsDf, stopNames

def noExpressDifferentiatedQuery(df, line, direction):
  """Filters realtime subway data down to one line and one direction.

  df is a dataframe indexed by trip_id, line is the subway line as a string,
  and direction is the direction letter that follows the two dots in the
  trip_id (for example 'N' or 'S'). Express and local variants of a line are
  kept together. Returns the matching rows as a dataframe."""

  def tripsMatching(frame, serviceTag, routePattern):
    # Rows whose trip_id contains both the service tag and "_<routePattern>"
    return frame.query(f'trip_id.str.contains("{serviceTag}") and trip_id.str.contains("_{routePattern}")', engine='python')

  # These lines are tagged "L0S1" in their trip_id rather than "Weekday"
  if line in ('7', 'E', 'F', 'M', 'R', 'J'):
    return tripsMatching(df, 'L0S1', f"{line}..{direction}")

  # W trains are stored under the N route with a "72" suffix after the direction
  if line == 'W':
    wPattern = "N..N72" if direction == 'N' else "N..S72"
    return tripsMatching(df, 'Weekday', wPattern)

  # N trains: drop the W trips hiding under the N route before filtering
  if line == 'N':
    isWTrip = df.index.str.contains("_N..N72") | df.index.str.contains("_N..S72")
    return tripsMatching(df[~isWTrip], 'Weekday', f"{line}..{direction}")

  # Every other line is a plain weekday filter
  return tripsMatching(df, 'Weekday', f"{line}..{direction}")

def pairwiseDifferences(df, columnName):
  """Returns the gaps between consecutive values of one column as a numpy array.

  The column named columnName is sorted in ascending order and each value is
  subtracted from the next one, so only neighbouring values are compared.
  Gaps of 7000 or more are discarded."""

  # Sort the column so that neighbouring entries are consecutive in value
  orderedValues = np.sort(np.asarray(df[columnName]))

  # Gap between each value and the one before it
  gaps = np.abs(np.diff(orderedValues))

  # Drop the excessive gaps created by the 4 train skipping 138 St
  return gaps[gaps < 7000]

def lineDistribution(line, direction, rushHourType):
  """Builds the distribution of times between trains for one line.

  line, direction and rushHourType are passed to distributionFinder, which
  supplies the stops and the rush-hour stop times. For every stop that is not
  skipped below, the gaps between consecutive arrivals are computed with
  pairwiseDifferences and all gaps are joined into one array.

  Returns (dist, stopNames). dist is always a 1D float numpy array and is
  empty when no stop contributes any gaps."""

  # Runs distributionFinder to get stopDict, filteredSecondsDf, and stopNames
  stopDict, filteredSecondsDf, stopNames = distributionFinder(line, direction, rushHourType)

  # Stop prefixes skipped for the N train, which sometimes runs local
  nSkippedPrefixes = {'R40', 'R39', 'R32', 'R33', 'R34', 'R35', 'R30', 'R29',
                      'R28', 'R27', 'R26', 'R25', 'R24', 'R23', 'R22', 'R21',
                      'R19', 'R18'}

  # One array of gaps per station, joined once after the loop
  stationGaps = []

  for stop in stopDict:

    # Stops included in the graph that are not part of the normal service pattern are skipped
    if line == 'E' and stop in ('F05'+direction, 'F07'+direction):
      continue
    if line == 'A' and stop == 'A14'+direction:
      continue
    if line == 'D' and stop == 'R30'+direction:
      continue
    if line in ('D', 'N') and 'R' in stop and 32 <= int(stop[1:3]) <= 35:
      continue
    if line == 'N' and stop[0:3] in nSkippedPrefixes:
      continue

    # Arrivals at this stop; engine='python' matches the file's other
    # string-method queries
    stationArrivals = filteredSecondsDf.query(f'stop_id.str.contains("{stop}")', engine='python')

    # Gaps between consecutive arrivals at this stop
    stationGaps.append(pairwiseDifferences(stationArrivals, 'arrival_time_sec'))

  # The leading empty list keeps the result a float array even with no stops
  dist = np.hstack(([], *stationGaps))

  # Returning distribution and stopNames
  return dist, stopNames

def _intervalTitle(line, direction, rushHourType, terminus):
  """Builds the title of one interval histogram: rush hour period, heading,
  the train(s) shown, and the terminus the trains run to."""

  period = 'AM' if rushHourType == 'morning' else 'PM'
  southbound = direction == 'S'

  # 7 and J trains are labelled west/east to better reflect their directionality
  if line in ('7', 'J'):
    heading = 'Westbound' if southbound else 'Eastbound'
  else:
    heading = 'Southbound' if southbound else 'Northbound'

  # The companion service is named only for southbound mornings and northbound evenings
  companions = {'7': '7X', 'J': 'Z', '6': '6X'}
  peakDirection = southbound == (rushHourType == 'morning')
  if peakDirection and line in companions:
    trains = f'({line}) and ({companions[line]}) Trains'
  else:
    trains = f'({line}) Train'

  return f'{period} Rush Hour {heading} {trains}\nto {terminus}'

def trainIntervals(system):
  """Plots a grid of histograms of the time between trains, one row per line
  and one column per combination of direction and rush hour.

  system of 'IRT' plots the numbered lines (1-7). Any other value plots the
  lettered lines under the title 'IND and BMT Train Interval Data'.
  Returns nothing."""

  # Sets title and axes to bold, initializes possible directions and times
  # NOTE(review): plt.style.use('default') runs after these two assignments and
  # appears to reset them to the defaults - confirm the intended order
  plt.rcParams["font.weight"] = "bold"
  plt.rcParams["axes.labelweight"] = "bold"
  plt.style.use('default')
  directions = ['S', 'N']
  times = ['morning', 'evening']

  # Associates each system with its lines and figure title
  if system == 'IRT':
    lines = ['1', '2', '3', '4', '5', '6', '7']
    figureTitle = 'IRT Train Interval Data'
  else:
    lines = ['A', 'C', 'E', 'B', 'D', 'F', 'M', 'G', 'L', 'J', 'N', 'Q',
             'R', 'W']
    figureTitle = 'IND and BMT Train Interval Data'

  # Histogram color for each line, based on the color of the train line
  lineColors = {
    '1': 'red', '2': 'red', '3': 'red',
    '4': 'green', '5': 'green', '6': 'green',
    '7': 'mediumorchid',
    'A': 'darkblue', 'C': 'darkblue', 'E': 'darkblue',
    'B': 'darkorange', 'D': 'darkorange', 'F': 'darkorange', 'M': 'darkorange',
    'N': 'gold', 'Q': 'gold', 'R': 'gold', 'W': 'gold',
    'L': 'darkgray',
    'J': 'brown',
    'G': 'limegreen',
  }

  # One row per line, one column per (direction, time) combination
  fig, axs = plt.subplots(len(lines), len(directions)*len(times), figsize=(30, 75*len(lines)/14))
  fig.suptitle(figureTitle, size=40, y=1.001, weight='bold')

  # Iterates over line, direction, and time
  for i, line in enumerate(lines):
    for k, direction in enumerate(directions):
      for j, rushHourType in enumerate(times):

        # Get the distribution and stopNames for this line, direction, and time
        dist, stopNames = lineDistribution(line, direction, rushHourType)

        # Columns are grouped by direction, with morning before evening
        ax = axs[i, j + len(times)*k]

        # The last stop name is used as the terminus in the title
        terminus = list(stopNames.values())[-1]
        ax.set_title(_intervalTitle(line, direction, rushHourType, terminus), size=20, wrap=True)

        # 33 equal bins from 0 to 1100 seconds, about 33.3 seconds wide each
        ax.hist(dist, bins=np.linspace(0,1100,34,endpoint=True), color=lineColors[line])
        ax.set_xlabel('Times Between Trains (s)')
        ax.set_ylabel('Frequency')
        ax.grid(True)
        ax.margins(x=0)
        ax.margins(y=0)

        # Fixed limits so every histogram shares the same scale
        ax.set_xlim([0,1100])
        ax.set_ylim([0,900])

  # Use tight layout and plot the plot
  plt.tight_layout()
  plt.show()
  return

def shapirowilk(dataval):
  """Runs the Shapiro-Wilk normality test on a list or 1D array of data values
  and returns only the p-value of the test."""
  return shapiro(dataval).pvalue

def anovatest(distDict):
  """Runs a one-way ANOVA across the distributions stored as the values of
  distDict (a dictionary associating lines with their distributions) and
  returns the p-value of the test."""

  # Each dictionary value is passed to the test as its own group
  groups = distDict.values()
  return f_oneway(*groups).pvalue

def mannwhitneypairwise(type1, type2):
  """Runs a Mann-Whitney U test on two lists/1D arrays (type1 and type2) and
  returns the test statistic and the p-value as a tuple (stat, p)."""
  outcome = mannwhitneyu(type1, type2)
  return outcome.statistic, outcome.pvalue

def twosamplet(type1, type2):
  """Runs an independent two-sample t-test on two lists or arrays of data and
  returns the test statistic and the p-value as a tuple (stat, p)."""
  outcome = ttest_ind(type1, type2)
  return outcome.statistic, outcome.pvalue

def trainComparison(lines, direction, rushHourType):
  """Compares the train interval distributions of a set of lines for a common
  direction and rushHourType.

  For each line the distribution is built with lineDistribution and tested for
  normality with the Shapiro-Wilk test. The data are treated as normal only if
  no line has a Shapiro-Wilk p-value below 0.05. An ANOVA test is run across
  all lines, followed by pairwise tests between every two lines: t-tests if
  the data are treated as normal, Mann-Whitney U tests otherwise.

  Returns (shapWilkDict, anovapval, normal, statDf). shapWilkDict maps each
  line to its Shapiro-Wilk p-value, anovapval is the ANOVA p-value, normal is
  a boolean, and statDf is a square dataframe of pairwise p-values with the
  lines as both rows and columns."""

  # Distribution and Shapiro-Wilk p-value for each line
  distDict = {}
  shapWilkDict = {}
  for line in lines:
    dist, _ = lineDistribution(line, direction, rushHourType)
    distDict[line] = dist
    shapWilkDict[line] = shapirowilk(dist)

  # If the p val of any of the lines is less than 0.05, we reject the null
  # hypothesis that the dataset is normal and conclude it is not normal
  normal = not any(shapWilkDict[line] < 0.05 for line in lines)

  # Perform ANOVA test
  anovapval = anovatest(distDict)

  # Start from the identity matrix: a line compared with itself is set to 1
  statDf = pd.DataFrame(np.eye(len(lines), dtype=float), index=list(lines), columns=list(lines))

  # Choose the pairwise test once, based on normality
  pairTest = twosamplet if normal else mannwhitneypairwise

  # Both tests are two-sided with their default arguments, so the p-value does
  # not depend on argument order; each pair is tested once and mirrored
  for i, first in enumerate(lines):
    for j in range(i + 1, len(lines)):
      _, p = pairTest(distDict[first], distDict[lines[j]])
      statDf.iloc[i, j] = float(p)
      statDf.iloc[j, i] = float(p)

  # Return the shapWilkDict, anovapval, the normality of the distributions as
  # a Boolean, and the statDf dataframe
  return shapWilkDict, anovapval, normal, statDf

def makeHeatmap(statDf, normal, direction, rushHourType):
    """Draws a heatmap of the base-10 logarithm of the pairwise p-values in statDf.

    statDf is the square dataframe of p-values from pairwise t-tests/MWU tests.
    normal selects the test name shown in the title (t-tests if True,
    Mann-Whitney U tests otherwise). direction of 'N' is labelled Northbound
    and anything else Southbound; rushHourType of 'morning' is labelled AM and
    anything else PM. p-values of exactly 0 have no logarithm and are drawn as
    black cells without a number. statDf itself is not modified."""

    # Replace 0 values with NaN on a copy, then log-scale with base 10
    logStatDf = np.log10(statDf.mask(statDf == 0))

    # 'spring' colormap from the file-level "from matplotlib import colormaps";
    # the registry returns a copy, so set_bad does not alter the shared colormap
    cmap_colors = colormaps['spring']
    cmap_colors.set_bad('black')  # NaN cells are drawn in black

    # Set figure size to make the plot square
    plt.figure(figsize=(20, 20))

    # Plot heatmap with the customized colormap
    plt.imshow(logStatDf, cmap=cmap_colors, interpolation='nearest', aspect='auto')

    # Set tick labels for x-axis (columns) and y-axis (indices) based on DataFrame
    plt.xticks(ticks=np.arange(len(statDf.columns)), labels=statDf.columns, ha='right')
    plt.yticks(ticks=np.arange(len(statDf.index)), labels=statDf.index)

    # Labels used in the title
    dir_label = 'Northbound' if direction == 'N' else 'Southbound'
    time_label = 'AM' if rushHourType == 'morning' else 'PM'
    test_type = 'Pairwise t-Tests' if normal else 'Pairwise Mann-Whitney U Tests'

    # Annotate non-NaN cells with their value (x is the column, y is the row)
    for (i, j), value in np.ndenumerate(logStatDf.to_numpy()):
        if not np.isnan(value):
            plt.text(j, i, f'{value:.2f}', ha='center', va='center', color='black', weight='bold', fontsize=9)

    plt.title(f'Logarithmic Heat Map (Base 10) of {test_type}\nfor {time_label} {dir_label} Trains', fontsize=25)

    # Show colorbar to indicate scale
    plt.colorbar(label='Logarithmic Value')

    # Show plot
    plt.show()

def combineDistributions(lines):
  """Merges, for every line, its four interval distributions into one array so
  that lines can be compared with each other.

  The four distributions are joined in the order northbound morning,
  northbound evening, southbound morning, southbound evening. Returns a
  dictionary associating each line with its combined distribution."""

  # Every combination of direction and rush hour, in the order they are joined
  scenarios = [('N', 'morning'), ('N', 'evening'), ('S', 'morning'), ('S', 'evening')]

  # lineDistribution returns (distribution, stopNames); only the first is kept
  return {
    line: np.hstack([lineDistribution(line, direction, rushHourType)[0]
                     for direction, rushHourType in scenarios])
    for line in lines
  }

def boxAndWhisker(lines):
    """Draws one box and whisker plot per line from its combined interval
    distribution, with outliers hidden.

    The boxes are ordered from lowest median to highest median, so the lines
    whose trains come most frequently appear first and those whose trains come
    least frequently appear last."""

    # Combined distribution per line, and the lines ordered by their median
    distDict = combineDistributions(lines)
    sortedLines = sorted(distDict, key=lambda route: np.median(distDict[route]))

    # Create the figure for the plot
    plt.figure(figsize=(12, 8))

    # Box color for each group of lines that shares a color
    groupColors = [
        ('123', '#EE352E'),    # red
        ('456', '#00933C'),    # green
        ('7', '#B933AD'),      # purple
        ('ACE', '#0039A6'),    # blue
        ('BDFM', '#FF6319'),   # orange
        ('G', '#6CBE45'),      # lime green
        ('JZ', '#996633'),     # brown
        ('L', '#A7A9AC'),      # gray
        ('NQRW', '#FCCC0A'),   # yellow
    ]
    categoryColors = {route: shade for routes, shade in groupColors for route in routes}

    # One box per line, placed left to right in median order
    for position, line in enumerate(sortedLines, start=1):

        # Lines without a known color fall back to blue
        boxColor = categoryColors.get(line, 'blue')

        # The median is black on these five lines and white on all others
        medianColor = 'black' if line in ['N', 'Q', 'R', 'W', '7'] else 'white'

        # Plot box and whisker plot
        bp = plt.boxplot(distDict[line], positions=[position], widths=0.6, patch_artist=True, medianprops=dict(color=medianColor), showfliers=False)

        # Fill the box with the line's color
        for box in bp['boxes']:
            box.set_facecolor(boxColor)

    # Label the x-axis with the line names, showing J as J/Z
    tickLabels = ['J/Z' if route == 'J' else route for route in sortedLines]
    plt.xticks(range(1, len(sortedLines) + 1), tickLabels)
    plt.xlabel('Lines', fontsize=12)
    plt.ylabel('Time (s)', fontsize=12)
    plt.title('Distribution of Time Between Trains for Different Lines', fontsize=14)

    # Show grid and adjust layout
    plt.grid(True, axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()

    # Display the plot
    plt.show()

def averageWaitTime(lines):
  """Given a list of lines, returns a dictionary associating each line with the
  mean of its combined interval distribution, converted from seconds to
  minutes."""

  # Combined distribution (all directions and rush hours) for every line
  distDict = combineDistributions(lines)

  # Mean interval of each line, divided by 60 to go from seconds to minutes
  return {line: np.mean(distDict[line])/60 for line in lines}

def main():
    """Runs the whole analysis: the train plots for each trunk line, the
    interval histograms, the statistical comparisons with their printed
    results and heatmaps, the average wait times, and the box and whisker
    plot."""

    # Creates list of all lines
    lines = ['1','2','3','4','5','6','7',
             'A','C','E','B','D','F','M',
             'G','L','J','N','Q','R','W']

    # Create trainPlots for each of the trunk lines
    trunkLines = [
        ['1','2','3'],
        ['4','5','6','6X'],
        ['7','7X'],
        ['A','C','E'],
        ['B','D','F','M'],
        ['G'],
        ['L'],
        ['J'],
        ['N','Q','R','W'],
    ]
    for trunk in trunkLines:
        trainPlot(trunk)

    # Plot train intervals for IRT trains, then for the lettered lines
    # (any value other than 'IRT' selects the lettered lines)
    trainIntervals('IRT')
    trainIntervals('BMT')

    # Direction, rush hour, and the wording used for each in the printed messages
    scenarios = [
        ('N', 'morning', 'northbound in morning rush hour'),
        ('N', 'evening', 'northbound in evening rush hour'),
        ('S', 'morning', 'southbound in morning rush hour'),
        ('S', 'evening', 'southbound in evening rush hour'),
    ]

    # Normality tests, ANOVA p-value, whether the distributions are normal, and
    # the pairwise t test/Mann-Whitney U matrix, computed before anything is printed
    results = [trainComparison(lines, direction, rushHourType)
               for direction, rushHourType, _ in scenarios]

    # Print Shapiro-Wilk dictionaries
    for (_, _, label), (shapWilkDict, _, _, _) in zip(scenarios, results):
        print(f"Shapiro-Wilk Test p values for each line's train interval distribution going {label}: {shapWilkDict}")

    # Print whether normal or not
    for (_, _, label), (_, _, normal, _) in zip(scenarios, results):
        print(f"Based on the Shapiro-Wilk p values, we assume that it is {normal} that all the train interval distributions are normally distributed for all lines going {label}.")

    # Print ANOVA p-val
    for (_, _, label), (_, anovapval, _, _) in zip(scenarios, results):
        print(f"ANOVA Test p value for all lines going {label}: {anovapval}")

    # Make heatmaps
    for (direction, rushHourType, _), (_, _, normal, statDf) in zip(scenarios, results):
        makeHeatmap(statDf, normal, direction, rushHourType)

    # Print average wait time
    print(f"Average wait time dictionary (in minutes): {averageWaitTime(lines)}")

    # Box and whisker plot of line intervals
    boxAndWhisker(lines)

# Run the full analysis only when this file is executed directly, not when it is imported
if __name__ == "__main__":
  main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import unittest

# readData is not unit tested because it has been used in past homeworks
# extract is not unit tested because it has no outputs

class TestSubwayLineQuery(unittest.TestCase):
  """Unit tests for subwayLineQuery.

  Both tests run against the same small dataframe of rows copied manually
  from the given CSV file. The dataframe is rebuilt in setUp before every
  test, so the sample cannot drift between tests and no test can affect
  another one.
  """

  # Column names for the sample rows (trip_id is used as the row index)
  COLUMNS = ['stop_id', 'arrival_time', 'departure_time', 'stop_sequence']

  # Sample rows keyed by trip_id. This should have enough to test various
  # cases for this function
  SAMPLE_ROWS = {
      'AFA23GEN-1038-Sunday-00_008600_1..S03R': ['104S', '01:29:00', '01:29:00', '3'],
      'AFA23GEN-1092-Weekday-00_057850_1..N13R': ['112N', '10:20:30', '10:20:30', '29'],
      'AFA23GEN-1092-Weekday-00_057950_1..S03R': ['113S', '09:54:30', '09:54:30', '11'],
      'AFA23GEN-6089-Weekday-00_075000_6..N02R': ['640N', '12:30:00', '12:30:00', '1'],
      'AFA23GEN-6089-Weekday-00_067800_6..N01R': ['619N', '11:50:00', '11:50:00', '21'],
      'AFA23GEN-6089-Weekday-00_067000_6..S03R': ['625S', '11:35:00', '11:35:00', '17'],
      'AFA23GEN-6089-Weekday-00_048500_6..S08R': ['608S', '08:15:00', '08:17:00', '7'],
      'L0S1-7-1064-S02_038300_7..S98X001': ['712S', '06:37:00', '06:37:00', '6'],
      'L0S1-7-1064-S02_038000_7..N97R': ['702N', '06:52:00', '06:52:00', '21'],
      'L0S1-7-1064-S02_096850_7..N97X005': ['725N', '16:11:00', '16:11:00', '2'],
      'L0S1-7-1064-S02_096800_7..S97R': ['724S', '16:41:00', '16:41:00', '20'],
      'L0S1-F-1077-S02_113250_F..S53X011': ['F35S', '20:16:30', '20:20:00', '40'],
      'L0S1-R-1093-S02_129650_R..N78R': ['R44N', '21:38:30', '21:38:30', '2'],
      'BFA23GEN-N098-Weekday-00_103100_N..S72R': ['R08S', '17:18:30', '17:18:30', '6'],
      'BFA23GEN-N098-Weekday-00_104000_N..S72R': ['R23S', '17:55:30', '17:55:30', '19'],
      'BFA23GEN-N098-Weekday-00_104150_N..N33R': ['N08N', '17:28:30', '17:28:30', '4'],
      'BFA23GEN-N098-Weekday-00_103600_N..S34R': ['R16S', '17:39:30', '17:39:30', '12'],
  }

  # Lines and directions queried by every test
  LINES_TO_TEST = ['1', '6', '6X', '7', '7X', 'N', 'W', 'F', 'R']
  DIRECTIONS = ['N', 'S']

  # trip_ids that subwayLineQuery is expected to return for each
  # (line, direction) pair. An empty list means a blank dataframe is expected.
  EXPECTED_TRIPS = {
      ('1', 'N'): ['AFA23GEN-1092-Weekday-00_057850_1..N13R'],
      ('1', 'S'): ['AFA23GEN-1092-Weekday-00_057950_1..S03R'],
      ('6', 'N'): ['AFA23GEN-6089-Weekday-00_067800_6..N01R'],
      ('6', 'S'): ['AFA23GEN-6089-Weekday-00_067000_6..S03R'],
      ('6X', 'N'): ['AFA23GEN-6089-Weekday-00_075000_6..N02R'],
      ('6X', 'S'): ['AFA23GEN-6089-Weekday-00_048500_6..S08R'],
      ('7', 'N'): ['L0S1-7-1064-S02_038000_7..N97R'],
      ('7', 'S'): ['L0S1-7-1064-S02_096800_7..S97R'],
      ('7X', 'N'): ['L0S1-7-1064-S02_096850_7..N97X005'],
      ('7X', 'S'): ['L0S1-7-1064-S02_038300_7..S98X001'],
      ('N', 'N'): ['BFA23GEN-N098-Weekday-00_104150_N..N33R'],
      ('N', 'S'): ['BFA23GEN-N098-Weekday-00_103600_N..S34R'],
      ('W', 'N'): [],
      ('W', 'S'): ['BFA23GEN-N098-Weekday-00_103100_N..S72R',
                   'BFA23GEN-N098-Weekday-00_104000_N..S72R'],
      ('F', 'N'): [],
      ('F', 'S'): ['L0S1-F-1077-S02_113250_F..S53X011'],
      ('R', 'N'): ['L0S1-R-1093-S02_129650_R..N78R'],
      ('R', 'S'): [],
  }

  def setUp(self):
    """Create the sample dataframe (indexed by trip_id) used by each test."""
    sampleDf = pd.DataFrame(self.SAMPLE_ROWS).T
    sampleDf.columns = self.COLUMNS
    self.importedDf = sampleDf.rename_axis(index='trip_id')

  def test_subwayLineQuery_OutputType(self):
    """Test output type of subwayLineQuery function."""

    # Iterate through all of them, checking if a dataframe is always returned
    for line in self.LINES_TO_TEST:
      for direction in self.DIRECTIONS:
        # subTest reports which (line, direction) pair failed and lets the
        # remaining pairs still run
        with self.subTest(line=line, direction=direction):
          subwayDf = subwayLineQuery(self.importedDf, line, direction)
          self.assertIsInstance(subwayDf, pd.DataFrame, "Returned object is not a DataFrame")

  def test_subwayLineQuery_OutputValue(self):
    """Test output value of subwayLineQuery function."""

    # Iterate through all of them, checking if correct dataframe is always returned
    for line in self.LINES_TO_TEST:
      for direction in self.DIRECTIONS:
        with self.subTest(line=line, direction=direction):
          subwayDf = subwayLineQuery(self.importedDf, line, direction)

          # A pair missing from the table raises KeyError here instead of
          # silently reusing the expectation of the previous pair
          expectedTrips = self.EXPECTED_TRIPS[(line, direction)]

          # Boolean mask keeps the rows in their original order
          actualDf = self.importedDf.loc[self.importedDf.index.isin(expectedTrips)]

          # Run test for each case
          self.assertTrue(actualDf.equals(subwayDf),
                          "subwayLineQuery does not return the expected value.")

class TestGetSec(unittest.TestCase):
  """Unit tests for getSec, called here with times in the form of HH:MM:SS."""

  # Times in the form of HH:MM:SS mapped to the number of seconds getSec is
  # expected to return. The last case uses an hour value above 24.
  EXPECTED_SECONDS = {
      '07:32:00': 27120,
      '13:59:59': 50399,
      '25:12:20': 90740,
  }

  def test_getSec_OutputType(self):
    """Test output type of getSec function."""

    for timeString in self.EXPECTED_SECONDS:
      # subTest names the failing input and lets the other inputs still run
      with self.subTest(time=timeString):
        self.assertIsInstance(getSec(timeString), int,
                              "getSec needs to return an integer.")

  def test_getSec_OutputValue(self):
    """Test output value of getSec function."""

    for timeString, expectedSeconds in self.EXPECTED_SECONDS.items():
      with self.subTest(time=timeString):
        # assertEqual shows both values in the failure report
        self.assertEqual(getSec(timeString), expectedSeconds,
                         "getSec does not return the expected value.")

class TestAddSeconds(unittest.TestCase):
  """Unit tests for addSeconds.

  The sample rows were extracted manually from the given CSV file, and the
  last two values of every row are the arrival/departure times already
  converted to seconds by hand.
  """

  # Columns of the raw rows and the two columns holding the times in seconds
  RAW_COLUMNS = ['stop_id', 'arrival_time', 'departure_time', 'stop_sequence']
  SECONDS_COLUMNS = ['arrival_time_sec', 'departure_time_sec']

  # trip_id -> raw row followed by the manually converted seconds
  ROWS_WITH_SECONDS = {
      'AFA23GEN-1038-Sunday-00_008600_1..S03R': ['104S', '01:29:00', '01:29:00', '3', 5340, 5340],
      'AFA23GEN-1092-Weekday-00_057850_1..N13R': ['112N', '10:20:30', '10:20:30', '29', 37230, 37230],
      'AFA23GEN-1092-Weekday-00_057950_1..S03R': ['113S', '09:54:30', '09:54:30', '11', 35670, 35670],
      'AFA23GEN-6089-Weekday-00_075000_6..N02R': ['640N', '12:30:00', '12:30:00', '1', 45000, 45000],
      'AFA23GEN-6089-Weekday-00_067800_6..N01R': ['619N', '11:50:00', '11:50:00', '21', 42600, 42600],
      'AFA23GEN-6089-Weekday-00_067000_6..S03R': ['625S', '11:35:00', '11:35:00', '17', 41700, 41700],
      'AFA23GEN-6089-Weekday-00_048500_6..S08R': ['608S', '08:15:00', '08:17:00', '7', 29700, 29820],
      'L0S1-7-1064-S02_038300_7..S98X001': ['712S', '06:37:00', '06:37:00', '6', 23820, 23820],
      'L0S1-7-1064-S02_038000_7..N97R': ['702N', '06:52:00', '06:52:00', '21', 24720, 24720],
      'L0S1-7-1064-S02_096850_7..N97X005': ['725N', '16:11:00', '16:11:00', '2', 58260, 58260],
      'L0S1-7-1064-S02_096800_7..S97R': ['724S', '16:41:00', '16:41:00', '20', 60060, 60060],
      'L0S1-F-1077-S02_113250_F..S53X011': ['F35S', '20:16:30', '20:20:00', '40', 72990, 73200],
      'L0S1-R-1093-S02_129650_R..N78R': ['R44N', '21:38:30', '21:38:30', '2', 77910, 77910],
      'BFA23GEN-N098-Weekday-00_103100_N..S72R': ['R08S', '17:18:30', '17:18:30', '6', 62310, 62310],
      'BFA23GEN-N098-Weekday-00_104000_N..S72R': ['R23S', '17:55:30', '17:55:30', '19', 64530, 64530],
      'BFA23GEN-N098-Weekday-00_104150_N..N33R': ['N08N', '17:28:30', '17:28:30', '4', 62910, 62910],
      'BFA23GEN-N098-Weekday-00_103600_N..S34R': ['R16S', '17:39:30', '17:39:30', '12', 63570, 63570],
  }

  @staticmethod
  def _toFrame(rows, columns):
    """Turn a {trip_id: row} dictionary into a dataframe indexed by trip_id."""
    frame = pd.DataFrame(rows).T
    frame.columns = columns
    return frame.rename_axis(index='trip_id')

  def setUp(self):
    """Build the input dataframe, which holds only the four raw columns."""
    rawRows = {tripId: row[:4] for tripId, row in self.ROWS_WITH_SECONDS.items()}
    self.importedDf = self._toFrame(rawRows, self.RAW_COLUMNS)

  def test_addSeconds_OutputType(self):
    """Test output type of addSeconds function."""

    resultDf = addSeconds(self.importedDf, ['arrival_time', 'departure_time'])
    self.assertIsInstance(resultDf, pd.DataFrame, "Returned object is not a DataFrame")

  def test_addSeconds_OutputValue(self):
    """Test output value of addSeconds function."""

    # Dataframe holding the manually converted seconds
    expectedDf = self._toFrame(self.ROWS_WITH_SECONDS,
                               self.RAW_COLUMNS + self.SECONDS_COLUMNS)

    # Run function on the raw dataframe
    resultDf = addSeconds(self.importedDf, ['arrival_time', 'departure_time'])

    # Pull out the seconds columns of both dataframes as integers
    expectedSeries = [expectedDf[column].astype(int) for column in self.SECONDS_COLUMNS]
    resultSeries = [resultDf[column].astype(int) for column in self.SECONDS_COLUMNS]

    # Arrival is compared first, then departure
    for expected, result in zip(expectedSeries, resultSeries):
      self.assertTrue(expected.equals(result),
                      "addSeconds does not return the expected value.")

class TestSubwayTimeQuery(unittest.TestCase):
  """Unit tests for subwayTimeQuery.

  The sample rows were extracted manually from the given CSV file and already
  carry the arrival/departure times in seconds. The rows expected for each
  queried time range were picked out by hand.
  """

  # Column names for the sample rows (trip_id is used as the row index)
  COLUMNS = ['stop_id', 'arrival_time', 'departure_time', 'stop_sequence',
             'arrival_time_sec', 'departure_time_sec']
  SECONDS_COLUMNS = ['arrival_time_sec', 'departure_time_sec']

  # Sample rows keyed by trip_id
  SAMPLE_ROWS = {
      'AFA23GEN-1038-Sunday-00_008600_1..S03R': ['104S', '01:29:00', '01:29:00', '3', 5340, 5340],
      'AFA23GEN-1092-Weekday-00_057850_1..N13R': ['112N', '10:20:30', '10:20:30', '29', 37230, 37230],
      'AFA23GEN-1092-Weekday-00_057950_1..S03R': ['113S', '09:54:30', '09:54:30', '11', 35670, 35670],
      'AFA23GEN-6089-Weekday-00_075000_6..N02R': ['640N', '12:30:00', '12:30:00', '1', 45000, 45000],
      'AFA23GEN-6089-Weekday-00_067800_6..N01R': ['619N', '11:50:00', '11:50:00', '21', 42600, 42600],
      'AFA23GEN-6089-Weekday-00_067000_6..S03R': ['625S', '11:35:00', '11:35:00', '17', 41700, 41700],
      'AFA23GEN-6089-Weekday-00_048500_6..S08R': ['608S', '08:15:00', '08:17:00', '7', 29700, 29820],
      'L0S1-7-1064-S02_038300_7..S98X001': ['712S', '06:37:00', '06:37:00', '6', 23820, 23820],
      'L0S1-7-1064-S02_038000_7..N97R': ['702N', '06:52:00', '06:52:00', '21', 24720, 24720],
      'L0S1-7-1064-S02_096850_7..N97X005': ['725N', '16:11:00', '16:11:00', '2', 58260, 58260],
      'L0S1-7-1064-S02_096800_7..S97R': ['724S', '16:41:00', '16:41:00', '20', 60060, 60060],
      'L0S1-F-1077-S02_113250_F..S53X011': ['F35S', '20:16:30', '20:20:00', '40', 72990, 73200],
      'L0S1-R-1093-S02_129650_R..N78R': ['R44N', '21:38:30', '21:38:30', '2', 77910, 77910],
      'BFA23GEN-N098-Weekday-00_103100_N..S72R': ['R08S', '17:18:30', '17:18:30', '6', 62310, 62310],
      'BFA23GEN-N098-Weekday-00_104000_N..S72R': ['R23S', '17:55:30', '17:55:30', '19', 64530, 64530],
      'BFA23GEN-N098-Weekday-00_104150_N..N33R': ['N08N', '17:28:30', '17:28:30', '4', 62910, 62910],
      'BFA23GEN-N098-Weekday-00_103600_N..S34R': ['R16S', '17:39:30', '17:39:30', '12', 63570, 63570],
  }

  # (start, end) range passed to subwayTimeQuery -> trip_ids of the rows that
  # were filtered manually for that range, listed in sample order
  EXPECTED_TRIPS_BY_RANGE = {
      (30000, 60000): [
          'AFA23GEN-1092-Weekday-00_057850_1..N13R',
          'AFA23GEN-1092-Weekday-00_057950_1..S03R',
          'AFA23GEN-6089-Weekday-00_075000_6..N02R',
          'AFA23GEN-6089-Weekday-00_067800_6..N01R',
          'AFA23GEN-6089-Weekday-00_067000_6..S03R',
          'L0S1-7-1064-S02_096850_7..N97X005',
      ],
      (60060, 64530): [
          'L0S1-7-1064-S02_096800_7..S97R',
          'BFA23GEN-N098-Weekday-00_103100_N..S72R',
          'BFA23GEN-N098-Weekday-00_104150_N..N33R',
          'BFA23GEN-N098-Weekday-00_103600_N..S34R',
      ],
  }

  def _toFrame(self, rows):
    """Turn a {trip_id: row} dictionary into a dataframe indexed by trip_id."""
    frame = pd.DataFrame(rows).T
    frame.columns = self.COLUMNS
    return frame.rename_axis(index='trip_id')

  def setUp(self):
    """Build the full sample dataframe that every query runs against."""
    self.importedDf = self._toFrame(self.SAMPLE_ROWS)

  def test_subwayTimeQuery_OutputType(self):
    """Test output type of subwayTimeQuery function."""

    # Query each range in turn and check that a dataframe comes back
    for start, end in self.EXPECTED_TRIPS_BY_RANGE:
      queriedDf = subwayTimeQuery(self.importedDf, 'arrival_time_sec',
                                  'departure_time_sec', start, end)
      self.assertIsInstance(queriedDf, pd.DataFrame, "Returned object is not a DataFrame")

  def test_subwayTimeQuery_OutputValue(self):
    """Test output value of subwayTimeQuery function."""

    # Dataframes holding only the manually filtered rows of each range
    expectedFrames = [
        self._toFrame({tripId: self.SAMPLE_ROWS[tripId] for tripId in tripIds})
        for tripIds in self.EXPECTED_TRIPS_BY_RANGE.values()
    ]

    # Run function once per range
    queriedFrames = [
        subwayTimeQuery(self.importedDf, 'arrival_time_sec', 'departure_time_sec', start, end)
        for start, end in self.EXPECTED_TRIPS_BY_RANGE
    ]

    # Extract time in seconds columns as integers (arrival, then departure,
    # for the first range and then the second)
    queriedSeries = [frame[column].astype(int)
                     for frame in queriedFrames for column in self.SECONDS_COLUMNS]
    expectedSeries = [frame[column].astype(int)
                      for frame in expectedFrames for column in self.SECONDS_COLUMNS]

    # Test if the columns are equal
    for expected, queried in zip(expectedSeries, queriedSeries):
      self.assertTrue(expected.equals(queried),
                      "subwayTimeQuery does not return the expected value.")

class TestReadBullet(unittest.TestCase):
  """Unit tests for readBullet, called with the test bullets 'Black' and 'Blue'."""

  # Value every pixel of the returned image is expected to equal, per test
  # bullet: all 255 for 'Black' because the returned image should be white,
  # and [0,0,255] for 'Blue' because the returned image should be RGB
  EXPECTED_PIXELS = {
      'Black': [255, 255, 255],
      'Blue': [0, 0, 255],
  }

  def test_readBullet_OutputType(self):
    """Test output type of readBullet function."""

    for bulletName in self.EXPECTED_PIXELS:
      # subTest names the failing bullet and lets the other one still run
      with self.subTest(bullet=bulletName):
        self.assertIsInstance(readBullet(bulletName), np.ndarray,
                              "readBullet needs to return an ndarray.")

  def test_readBullet_OutputValue(self):
    """Test output value of readBullet function."""

    for bulletName, expectedPixel in self.EXPECTED_PIXELS.items():
      with self.subTest(bullet=bulletName):
        bulletImage = readBullet(bulletName)

        # The expected pixel is compared against every pixel of the image
        self.assertTrue(np.all(bulletImage == np.array(expectedPixel)),
                        "readBullet does not return the expected value.")

class TestNoExpressDifferentiatedQuery(unittest.TestCase):
  """Unit tests for noExpressDifferentiatedQuery.

  Both tests run against the same small dataframe of rows copied manually
  from the given CSV file. The dataframe is rebuilt in setUp before every
  test, so the sample cannot drift between tests and no test can affect
  another one.
  """

  # Column names for the sample rows (trip_id is used as the row index)
  COLUMNS = ['stop_id', 'arrival_time', 'departure_time', 'stop_sequence']

  # Sample rows keyed by trip_id. This should have enough to test various
  # cases for this function
  SAMPLE_ROWS = {
      'AFA23GEN-1038-Sunday-00_008600_1..S03R': ['104S', '01:29:00', '01:29:00', '3'],
      'AFA23GEN-1092-Weekday-00_057850_1..N13R': ['112N', '10:20:30', '10:20:30', '29'],
      'AFA23GEN-1092-Weekday-00_057950_1..S03R': ['113S', '09:54:30', '09:54:30', '11'],
      'AFA23GEN-6089-Weekday-00_075000_6..N02R': ['640N', '12:30:00', '12:30:00', '1'],
      'AFA23GEN-6089-Weekday-00_067800_6..N01R': ['619N', '11:50:00', '11:50:00', '21'],
      'AFA23GEN-6089-Weekday-00_067000_6..S03R': ['625S', '11:35:00', '11:35:00', '17'],
      'AFA23GEN-6089-Weekday-00_048500_6..S08R': ['608S', '08:15:00', '08:17:00', '7'],
      'L0S1-7-1064-S02_038300_7..S98X001': ['712S', '06:37:00', '06:37:00', '6'],
      'L0S1-7-1064-S02_038000_7..N97R': ['702N', '06:52:00', '06:52:00', '21'],
      'L0S1-7-1064-S02_096850_7..N97X005': ['725N', '16:11:00', '16:11:00', '2'],
      'L0S1-7-1064-S02_096800_7..S97R': ['724S', '16:41:00', '16:41:00', '20'],
      'L0S1-F-1077-S02_113250_F..S53X011': ['F35S', '20:16:30', '20:20:00', '40'],
      'L0S1-R-1093-S02_129650_R..N78R': ['R44N', '21:38:30', '21:38:30', '2'],
      'BFA23GEN-N098-Weekday-00_103100_N..S72R': ['R08S', '17:18:30', '17:18:30', '6'],
      'BFA23GEN-N098-Weekday-00_104000_N..S72R': ['R23S', '17:55:30', '17:55:30', '19'],
      'BFA23GEN-N098-Weekday-00_104150_N..N33R': ['N08N', '17:28:30', '17:28:30', '4'],
      'BFA23GEN-N098-Weekday-00_103600_N..S34R': ['R16S', '17:39:30', '17:39:30', '12'],
  }

  # Lines and directions queried by every test
  LINES_TO_TEST = ['1', '6', '7', 'N', 'W', 'F', 'R']
  DIRECTIONS = ['N', 'S']

  # trip_ids that noExpressDifferentiatedQuery is expected to return for each
  # (line, direction) pair. An empty list means a blank dataframe is expected.
  EXPECTED_TRIPS = {
      ('1', 'N'): ['AFA23GEN-1092-Weekday-00_057850_1..N13R'],
      ('1', 'S'): ['AFA23GEN-1092-Weekday-00_057950_1..S03R'],
      ('6', 'N'): ['AFA23GEN-6089-Weekday-00_067800_6..N01R',
                   'AFA23GEN-6089-Weekday-00_075000_6..N02R'],
      ('6', 'S'): ['AFA23GEN-6089-Weekday-00_067000_6..S03R',
                   'AFA23GEN-6089-Weekday-00_048500_6..S08R'],
      ('7', 'N'): ['L0S1-7-1064-S02_038000_7..N97R',
                   'L0S1-7-1064-S02_096850_7..N97X005'],
      ('7', 'S'): ['L0S1-7-1064-S02_096800_7..S97R',
                   'L0S1-7-1064-S02_038300_7..S98X001'],
      ('N', 'N'): ['BFA23GEN-N098-Weekday-00_104150_N..N33R'],
      ('N', 'S'): ['BFA23GEN-N098-Weekday-00_103600_N..S34R'],
      ('W', 'N'): [],
      ('W', 'S'): ['BFA23GEN-N098-Weekday-00_103100_N..S72R',
                   'BFA23GEN-N098-Weekday-00_104000_N..S72R'],
      ('F', 'N'): [],
      ('F', 'S'): ['L0S1-F-1077-S02_113250_F..S53X011'],
      ('R', 'N'): ['L0S1-R-1093-S02_129650_R..N78R'],
      ('R', 'S'): [],
  }

  def setUp(self):
    """Create the sample dataframe (indexed by trip_id) used by each test."""
    sampleDf = pd.DataFrame(self.SAMPLE_ROWS).T
    sampleDf.columns = self.COLUMNS
    self.importedDf = sampleDf.rename_axis(index='trip_id')

  def test_noExpressDifferentiatedQuery_OutputType(self):
    """Test output type of noExpressDifferentiatedQuery function."""

    # Iterate through all of them, checking if a dataframe is always returned
    for line in self.LINES_TO_TEST:
      for direction in self.DIRECTIONS:
        # subTest reports which (line, direction) pair failed and lets the
        # remaining pairs still run
        with self.subTest(line=line, direction=direction):
          noExpressDf = noExpressDifferentiatedQuery(self.importedDf, line, direction)
          self.assertIsInstance(noExpressDf, pd.DataFrame, "Returned object is not a DataFrame")

  def test_noExpressDifferentiatedQuery_OutputValue(self):
    """Test output value of noExpressDifferentiatedQuery function."""

    # Iterate through all of them, checking if correct dataframe is always returned
    for line in self.LINES_TO_TEST:
      for direction in self.DIRECTIONS:
        with self.subTest(line=line, direction=direction):
          noExpressDf = noExpressDifferentiatedQuery(self.importedDf, line, direction)

          # A pair missing from the table raises KeyError here instead of
          # silently reusing the expectation of the previous pair
          expectedTrips = self.EXPECTED_TRIPS[(line, direction)]

          # Boolean mask keeps the rows in their original order
          actualDf = self.importedDf.loc[self.importedDf.index.isin(expectedTrips)]

          # Run test for each case
          self.assertTrue(actualDf.equals(noExpressDf),
                          "noExpressDifferentiatedQuery does not return the expected value.")

class TestPairwiseDifferences(unittest.TestCase):
  """Unit tests for the pairwiseDifferences function."""

  @staticmethod
  def _buildStopTimes():
    """Return a small stop-times dataframe indexed by trip_id.

    The rows were copied by hand from the stop-times CSV. Every trip ID is
    unique: a repeated key in the dict literal would silently overwrite the
    earlier row and shrink the fixture."""

    # Each value follows the column order given in `columns` below
    rows = {
    'L0S1-7-1064-S02_096800_7..S97R': ['724S', '16:41:00', '16:41:00', '20', 60060, 60060],
    'BFA23GEN-N098-Weekday-00_103100_N..S72R': ['R08S', '17:18:30', '17:18:30', '6', 62310, 62310],
    'BFA23GEN-N098-Weekday-00_104150_N..N33R': ['N08N', '17:28:30', '17:28:30', '4', 62910, 62910],
    'BFA23GEN-N098-Weekday-00_103600_N..S34R': ['R16S', '17:39:30', '17:39:30', '12', 63570, 63570],
    'BFA23GEN-N098-Weekday-00_104000_N..S72R': ['R23S', '17:55:30', '17:55:30', '19', 64530, 64530],
    'BFA23GEN-N098-Weekday-00_104001_N..S72R': ['R23S', '17:55:30', '17:55:30', '19', 84530, 84530],
    }

    # Specify column names
    columns = ['stop_id', 'arrival_time', 'departure_time', 'stop_sequence',  'arrival_time_sec', 'departure_time_sec']

    # Transpose so that each trip becomes a row, then label columns and index
    importedDf = pd.DataFrame(rows).T
    importedDf.columns = columns
    return importedDf.rename_axis(index='trip_id')

  def test_pairwiseDifferences_OutputType(self):
    """Test output type of pairwiseDifferences function."""

    # Run function and test if type is correct
    result = pairwiseDifferences(self._buildStopTimes(), 'arrival_time_sec')
    self.assertIsInstance(result, np.ndarray, "Returned object is not a numpy array")

  def test_pairwiseDifferences_OutputValue(self):
    """Test output value of pairwiseDifferences function."""

    # Expected gaps between consecutive arrivals in seconds (note last
    # difference thrown out because 20000 > 7000)
    differences = np.array([2250, 600, 660, 960])

    # Test if the arrays are equal
    result = pairwiseDifferences(self._buildStopTimes(), 'arrival_time_sec')
    self.assertTrue(np.array_equal(result, differences),
                    "pairwiseDifferences does not return the expected value.")

# Only the output types are tested for this function: it compiles the results of
# statistical tests, so the exact values are hard to know beforehand
class TestTrainComparison(unittest.TestCase):
  """Unit tests for the trainComparison function (return types only)."""

  def test_trainComparison_OutputType(self):
    """Test output type of trainComparison function."""

    # Set inputs
    lines = ['1', '2', '3']
    direction = 'N'
    rushHourType = 'morning'

    # Run function and unpack tuple
    shapWilkDict, anovapval, normal, statDf = trainComparison(lines, direction, rushHourType)

    # Check each returned element against its expected type, with a failure
    # message that names the type actually being checked
    self.assertIsInstance(shapWilkDict, dict, "Returned object is not a dict")
    self.assertIsInstance(anovapval, np.float64, "Returned object is not a float")
    self.assertIsInstance(normal, bool, "Returned object is not a bool")
    self.assertIsInstance(statDf, pd.DataFrame, "Returned object is not a DataFrame")

class TestStopExtractor(unittest.TestCase):
  """Unit tests for the stopExtractor function."""

  # Trip ID shared by every fixture row
  TRIP_ID = 'AFA23GEN-1038-Sunday-00_000600_1..S03R'

  # Column names of the fixture rows
  COLUMNS = ['trip_id', 'stop_id', 'arrival_time', 'departure_time', 'stop_sequence']

  @classmethod
  def _stopRows(cls):
    """Return nine consecutive stop rows for one trip as a list of lists.

    Rows are kept in a list rather than a dict keyed by trip_id: every row
    shares the same trip ID, so a dict literal would keep only the last one."""

    # (stop_id, time) pairs; arrival and departure times are identical
    stopTimes = [('101S', '00:06:00'), ('103S', '00:07:30'), ('104S', '00:09:00'),
                 ('106S', '00:10:30'), ('107S', '00:12:00'), ('108S', '00:13:00'),
                 ('109S', '00:14:30'), ('110S', '00:16:00'), ('111S', '00:17:30')]

    # stop_sequence is stored as a string counting up from '1'
    return [[cls.TRIP_ID, stopId, time, time, str(seq)]
            for seq, (stopId, time) in enumerate(stopTimes, start=1)]

  def test_stopExtractor_OutputType(self):
    """Test output type of stopExtractor function."""

    # Create dataframe indexed by trip_id
    importedDf = pd.DataFrame(self._stopRows(), columns=self.COLUMNS).set_index('trip_id')

    # Add seconds
    secondsDf = addSeconds(importedDf, ['arrival_time', 'departure_time'])

    # Run function and unpack tuple
    secondsDf, numDict = stopExtractor(secondsDf, '1', 'S')

    # Test if types are correct
    self.assertIsInstance(numDict, dict, "Returned object is not a dict")
    self.assertIsInstance(secondsDf, pd.DataFrame, "Returned object is not a DataFrame")

  def test_stopExtractor_OutputValue(self):
    """Test output value of stopExtractor function."""

    # Create dataframe with trip_id kept as a regular column (default integer index)
    importedDf = pd.DataFrame(self._stopRows(), columns=self.COLUMNS)

    # Add seconds
    secondsDf = addSeconds(importedDf, ['arrival_time', 'departure_time'])

    # Run function and unpack tuple
    secondsDf, numDict = stopExtractor(secondsDf, '1', 'S')

    # Expected dictionary, kept under its own name so the returned dictionary
    # is compared against it rather than against itself
    # NOTE(review): these values are 1-based while the expected stop_val column
    # below is 0-based - confirm which numbering stopExtractor uses for the dict
    expectedDict = {'101S':1, '103S':2, '104S':3, '106S':4, '107S':5, '108S':6, '109S':7,
                    '110S':8, '111S':9}

    # Expected stop_val column: 0 through 8 on the default integer index
    expectedStopVal = pd.Series(list(range(9)), name='stop_val')

    # Test if the values are correct
    self.assertEqual(numDict, expectedDict, "stopExtractor does not return the expected value")
    self.assertTrue(secondsDf['stop_val'].equals(expectedStopVal), "stopExtractor does not return the expected value")

class TestStopIDtoName(unittest.TestCase):
  """Unit tests for the stopIDtoName function.

  The class has its own name so that it does not replace another TestCase
  with the same name; unittest only runs the last class bound to a name."""

  # (stop_id, stop_name, stop_lat, stop_lon) for each parent station
  STATIONS = [('L01', '8 Av', 40.739777, -74.002578),
              ('L02', '6 Av', 40.737335, -73.996786),
              ('L03', '14 St-Union Sq', 40.734789, -73.99073),
              ('L05', '3 Av', 40.732849, -73.986122),
              ('L06', '1 Av', 40.730953, -73.981628),
              ('L08', 'Bedford Av', 40.717304, -73.956872),
              ('L10', 'Lorimer St', 40.714063, -73.950275)]

  @classmethod
  def _buildStops(cls):
    """Return a stops dataframe indexed by stop_id.

    Each station contributes three rows: the parent stop (location_type 1)
    followed by its 'N' and 'S' platform stops (location_type '')."""

    rows = []
    for stopId, name, lat, lon in cls.STATIONS:
      rows.append((stopId, name, lat, lon, 1))
      rows.append((stopId + 'N', name, lat, lon, ''))
      rows.append((stopId + 'S', name, lat, lon, ''))

    # Specify column names
    columns = ['stop_id','stop_name','stop_lat','stop_lon','location_type']

    # Create dataframe with appropriate columns and index
    importedDf = pd.DataFrame(rows)
    importedDf.columns = columns
    return importedDf.set_index('stop_id')

  @classmethod
  def _northboundIDs(cls):
    """Return the northbound platform stop ID of every fixture station."""
    return [stopId + 'N' for stopId, _, _, _ in cls.STATIONS]

  def test_stopIDtoName_OutputType(self):
    """Test output type of stopIDtoName function."""

    # Run function and test if type is correct
    result = stopIDtoName(self._buildStops(), self._northboundIDs())
    self.assertIsInstance(result, dict, "Returned object is not a dict")

  def test_stopIDtoName_OutputValue(self):
    """Test output value of stopIDtoName function."""

    # Expected dict
    stopDict = {'L01N':'8 Av', 'L02N':'6 Av', 'L03N':'14 St-Union Sq',
                'L05N':'3 Av', 'L06N':'1 Av', 'L08N':'Bedford Av',
                'L10N':'Lorimer St'}

    # Run function and test if value is correct
    result = stopIDtoName(self._buildStops(), self._northboundIDs())
    self.assertEqual(result, stopDict, "stopDict does not return the expected value")

class TestPartOne(unittest.TestCase):
  """Unit tests for the partOne function (return types only)."""

  # Only the output types are tested because this function combines other
  # functions that are already tested on their own
  def test_partOne_OutputType(self):
    """Test output type of partOne function."""

    # Run function on one test case
    stopDict, stopNames, timeList, time1, time2, filteredSecondsDf = partOne('1', 'N', 'morning')

    # Test the type of each returned element
    self.assertIsInstance(stopDict, dict, "Returned object is not a dict")
    self.assertIsInstance(stopNames, dict, "Returned object is not a dict")
    self.assertIsInstance(timeList, list, "Returned object is not a list")
    self.assertIsInstance(time1, int, "Returned object is not an int")
    self.assertIsInstance(time2, int, "Returned object is not an int")
    self.assertIsInstance(filteredSecondsDf, pd.DataFrame, "Returned object is not a DataFrame")

# shapirowilk, anovatest, mannwhitneypairwise, and twosamplet are functions already written for past homeworks
# trainIntervals, makeHeatmap, and boxAndWhisker do not return values, so they were not tested

# Run every TestCase defined above from inside the notebook.
# argv=[''] keeps unittest from parsing the notebook kernel's own command-line
# arguments, and exit=False stops it from calling sys.exit() once the run ends.
unittest.main(argv=[''], verbosity=2, exit=False)

test_addSeconds_OutputType (__main__.TestAddSeconds)
Test output type of addSeconds function. ... ok
test_addSeconds_OutputValue (__main__.TestAddSeconds)
Test output value of addSeconds function. ... ok
test_getSec_OutputType (__main__.TestGetSec)
Test output type of getSec function. ... ok
test_getSec_OutputValue (__main__.TestGetSec)
Test output value of getSec function. ... ok
test_noExpressDifferentiatedQuery_OutputType (__main__.TestNoExpressDifferentiatedQuery)
Test output type of noExpressDifferentiatedQuery function. ... ok
test_noExpressDifferentiatedQuery_OutputValue (__main__.TestNoExpressDifferentiatedQuery)
Test output value of noExpressDifferentiatedQuery function. ... ok
test_pairwiseDifferences_OutputType (__main__.TestPairwiseDifferences)
Test output type of pairwiseDifferences function. ... ok
test_pairwiseDifferences_OutputValue (__main__.TestPairwiseDifferences)
Test output value of pairwiseDifferences function. ... ok
test_stopIDtoName_OutputType (__main__.TestPar

<unittest.main.TestProgram at 0x7fdb78edebc0>

In [None]:
def partOne(line, direction, rushHourType):
  """Collects the rush-hour stop data for one subway line and direction.

  rushHourType selects the window: 'evening' gives the evening rush, and any
  other value gives the morning rush. Reads the module-level dataframes df
  (stop times) and stopsDf (stop information).

  Returns a tuple of
  - stopDict, the dictionary returned by stopExtractor (each stop ID mapped to
  a sequence number)
  - stopNames, the dictionary returned by stopIDtoName (each stop ID mapped to
  its street name)
  - timeList, the list of quarter-hour labels spanning the rush window, used
  as the x-axis of the graph
  - time1, the time in seconds at which the rush window begins
  - time2, the time in seconds at which the rush window ends
  - filteredSecondsDf, the dataframe returned by stopExtractor for this line,
  direction, and rush window"""

  # Rush window bounds in seconds since midnight:
  # evening is 15:30 to 20:00, morning is 6:30 to 9:30
  if rushHourType != 'evening':
    time1, time2 = 23400, 34200
  else:
    time1, time2 = 55800, 72000

  # One 'H:MM' label every 15 minutes (900 seconds), both endpoints included
  timeList = [f"{sec // 3600}:{sec % 3600 // 60:02d}"
              for sec in range(time1, time2 + 1, 900)]

  # Filter the full dataframe by line and direction, then add the stop times
  # in seconds alongside the HH:MM:SS values
  lineDf = subwayLineQuery(df, line, direction)
  lineDfWithSec = addSeconds(lineDf, ['arrival_time', 'departure_time'])

  # Keep only the stop times that fall within the rush window
  rushDf = subwayTimeQuery(lineDfWithSec, 'arrival_time_sec', 'departure_time_sec', time1, time2)

  # Get the stop dictionary along with the same dataframe carrying an
  # additional column for the stop number in the line's sequence
  filteredSecondsDf, stopDict = stopExtractor(rushDf, line, direction)

  # Get dictionary associating each stop ID to its street name
  stopNames = stopIDtoName(stopsDf, list(stopDict.keys()))

  # Return values as specified
  return stopDict, stopNames, timeList, time1, time2, filteredSecondsDf



In [None]:
df

Unnamed: 0_level_0,stop_id,arrival_time,departure_time,stop_sequence
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFA23GEN-1038-Sunday-00_000600_1..S03R,101S,00:06:00,00:06:00,1
AFA23GEN-1038-Sunday-00_000600_1..S03R,103S,00:07:30,00:07:30,2
AFA23GEN-1038-Sunday-00_000600_1..S03R,104S,00:09:00,00:09:00,3
AFA23GEN-1038-Sunday-00_000600_1..S03R,106S,00:10:30,00:10:30,4
AFA23GEN-1038-Sunday-00_000600_1..S03R,107S,00:12:00,00:12:00,5
...,...,...,...,...
SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S27N,25:03:00,25:03:00,17
SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S28N,25:06:00,25:06:00,18
SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S29N,25:08:00,25:08:00,19
SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S30N,25:10:00,25:10:00,20


In [None]:
# - lineDistribution
# Write for stopExtractor, stopIDtoName, and partOne

In [None]:
# Scratch check: compare the unique pixel values of the 'Blue' bullet image
# element-wise against [0, 255]
np.unique(readBullet('Blue')) == [0,255]

array([ True,  True])

In [None]:
def readBullet(line):
  """Given the specified train line, loads the image of its train bullet with
  OpenCV and returns it as an array with the R and B color channels switched
  (OpenCV's BGR order converted to RGB).

  For every line other than N, Q, R, and W, pixels that are pure black are
  first recolored to white.

  Raises FileNotFoundError if the bullet image cannot be read."""

  # Access image file
  bullet = f'/content/drive/My Drive/ENGR 1050 Final Project/02 19 Data/Bullets/{line}_Train.png'
  imageColor = cv2.imread(bullet, cv2.IMREAD_COLOR)

  # cv2.imread reports a missing or unreadable file by returning None rather
  # than raising, so fail here with a message that names the file
  if imageColor is None:
    raise FileNotFoundError(f"Could not read bullet image: {bullet}")

  # NOTE(review): N, Q, R, and W are skipped, presumably because those bullets
  # contain black pixels that must be kept - confirm against the image files
  if line not in ('N', 'Q', 'R', 'W'):
    # Find all black pixels (each image has a transparent background, which is
    # loaded as black): a pixel is black when all three channels are 0
    blackMask = np.all(imageColor == 0, axis=2)

    # Set black pixels to white
    imageColor[blackMask] = [255, 255, 255]

  # Flip R and B channels (matplotlib expects RGB instead of BGR) and return
  # the flipped image
  return cv2.cvtColor(imageColor, cv2.COLOR_BGR2RGB)


In [None]:
pd.DataFrame(df).T

Unnamed: 0,0,1,2,3
AFA23GEN-1038-Sunday-00_008600_1..S03R,104S,01:29:00,01:29:00,3
AFA23GEN-1092-Weekday-00_057850_1..N13R,112N,10:20:30,10:20:30,29
AFA23GEN-1092-Weekday-00_057950_1..S03R,113S,09:54:30,09:54:30,11
AFA23GEN-6089-Weekday-00_075000_6..N02R,640N,12:30:00,12:30:00,1
AFA23GEN-6089-Weekday-00_067800_6..N01R,619N,11:50:00,11:50:00,21
AFA23GEN-6089-Weekday-00_067000_6..S03R,625S,11:35:00,11:35:00,17
AFA23GEN-6089-Weekday-00_048500_6..S08R,608S,08:15:00,08:17:00,7
L0S1-7-1064-S02_038300_7..S98X001,712S,06:37:00,06:37:00,6
L0S1-7-1064-S02_038000_7..N97R,702N,06:52:00,06:52:00,21
L0S1-7-1064-S02_096850_7..N97X005,725N,16:11:00,16:11:00,2
