In [1802]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import plotly.graph_objects as go
from datetime import datetime

In [1803]:
# Load data
df = pd.read_excel('ML_LC_DEST_Refined_V1.xlsx')

# Extract date from TRAIN_ID
df['DATE'] = pd.to_datetime(df['TD'].str.extract(r'(\d{4}-\d{2}-\d{2})')[0])

# Split into train and test
split_date = pd.to_datetime('2024-01-01')
train_df = df[df['DATE'] < split_date].copy()
test_df = df[df['DATE'] >= split_date].copy()

print(f"Original dataset: {df['TRAIN_ID'].nunique()} trains")
print(f"Train set (before 2024): {train_df['TRAIN_ID'].nunique()} trains")
print(f"Test set (2024+): {test_df['TRAIN_ID'].nunique()} trains")

Original dataset: 693 trains
Train set (before 2024): 543 trains
Test set (2024+): 150 trains


In [1804]:
# Define bin ranges for DWELL_TIME to prepare for outlier detection
# Create bins with varying sizes: 0-10 (0.5 intervals), 10-20 (1.0 intervals), 20+ (5.0 intervals)1

# Define bins
bins = np.concatenate([
    np.arange(0, 10.5, 0.5),    # 0-10: bins of 0.5
    np.arange(11, 21, 1),       # 10-20: bins of 1.0  
    np.arange(25, 145, 5)       # 20-140: bins of 5.0
])

# Create DataFrame with interval objects
bins_df = pd.DataFrame({
    'BIN_RANGE': pd.IntervalIndex.from_breaks(bins, closed='right')
})

print(f"Number of bins created: {len(bins_df)}")
bins_df.head(10)

Number of bins created: 54


Unnamed: 0,BIN_RANGE
0,"(0.0, 0.5]"
1,"(0.5, 1.0]"
2,"(1.0, 1.5]"
3,"(1.5, 2.0]"
4,"(2.0, 2.5]"
5,"(2.5, 3.0]"
6,"(3.0, 3.5]"
7,"(3.5, 4.0]"
8,"(4.0, 4.5]"
9,"(4.5, 5.0]"


In [1805]:
# Map each train ID and its dwell time to the appropriate bin range
# This will add a new column showing which bin each record belongs to in both the train and test datasets

# Assign bins to train and test datasets
train_df['DWELL_TIME_BIN'] = pd.cut(train_df['DWELL_TIME'], bins=bins)
test_df['DWELL_TIME_BIN'] = pd.cut(test_df['DWELL_TIME'], bins=bins)
df['DWELL_TIME_BIN'] = pd.cut(df['DWELL_TIME'], bins=bins)


# Show sample from training data
print("Sample from training data:")
print(train_df[['TRAIN_ID', 'DWELL_TIME', 'DWELL_TIME_BIN']].head(10))

Sample from training data:
                 TRAIN_ID  DWELL_TIME DWELL_TIME_BIN
0   XBERBSB928H2021-05-28       0.300     (0.0, 0.5]
2   XCROBSB921A2023-09-21       0.933     (0.5, 1.0]
3   XCROEGE922A2023-09-22       1.017     (1.0, 1.5]
4   XCROHBG919A2023-09-19       1.400     (1.0, 1.5]
5   XPOWBNY925A2021-09-25       0.983     (0.5, 1.0]
6   XPOWBRE920A2021-09-20       2.100     (2.0, 2.5]
7   XPOWBSB901A2021-10-01       2.150     (2.0, 2.5]
8   XPOWEGE902A2021-10-02       0.850     (0.5, 1.0]
9   XPOWGWN903A2021-10-03       4.433     (4.0, 4.5]
10  XPOWCSN921A2021-09-21       0.967     (0.5, 1.0]


In [1806]:
# Count unique trains per bin using TRAINING data and add to bins_df
train_counts_by_bin = train_df.groupby('DWELL_TIME_BIN', observed=False)['TRAIN_ID'].nunique()
bins_df['NB_OF_TRAIN_TRAINS'] = bins_df['BIN_RANGE'].map(train_counts_by_bin).fillna(0).astype(int)

print(f"Total trains counted from TRAINING set: {bins_df['NB_OF_TRAIN_TRAINS'].sum()}")
print(bins_df.sort_values(by='NB_OF_TRAIN_TRAINS', ascending=False))

Total trains counted from TRAINING set: 543
         BIN_RANGE  NB_OF_TRAIN_TRAINS
1       (0.5, 1.0]                 109
2       (1.0, 1.5]                  77
3       (1.5, 2.0]                  48
4       (2.0, 2.5]                  47
5       (2.5, 3.0]                  37
6       (3.0, 3.5]                  27
7       (3.5, 4.0]                  24
8       (4.0, 4.5]                  23
10      (5.0, 5.5]                  19
9       (4.5, 5.0]                  15
20    (10.0, 11.0]                  13
13      (6.5, 7.0]                  11
12      (6.0, 6.5]                   9
11      (5.5, 6.0]                   9
19     (9.5, 10.0]                   9
14      (7.0, 7.5]                   8
15      (7.5, 8.0]                   8
16      (8.0, 8.5]                   7
0       (0.0, 0.5]                   6
23    (13.0, 14.0]                   6
24    (14.0, 15.0]                   5
17      (8.5, 9.0]                   4
21    (11.0, 12.0]                   4
18      (9.0, 9.5]  

In [1807]:
# Filter bins_df to keep only bins with more than 0 trains
bins_df = bins_df[bins_df['NB_OF_TRAIN_TRAINS'] > 0]  # CORRECT column name
bins_df

Unnamed: 0,BIN_RANGE,NB_OF_TRAIN_TRAINS
0,"(0.0, 0.5]",6
1,"(0.5, 1.0]",109
2,"(1.0, 1.5]",77
3,"(1.5, 2.0]",48
4,"(2.0, 2.5]",47
5,"(2.5, 3.0]",37
6,"(3.0, 3.5]",27
7,"(3.5, 4.0]",24
8,"(4.0, 4.5]",23
9,"(4.5, 5.0]",15


In [1808]:
# Find bins containing at least 5% of trains
# IMPORTANT: Calculate 5% based on TRAINING data only
threshold = int(np.ceil(df['TRAIN_ID'].nunique() * 0.05))
bins_above_threshold = bins_df[bins_df['NB_OF_TRAIN_TRAINS'] >= threshold]

print(f"5% threshold: {threshold} trains")
print(f"Bins meeting threshold: {bins_above_threshold.shape[0]}")
bins_above_threshold

5% threshold: 35 trains
Bins meeting threshold: 5


Unnamed: 0,BIN_RANGE,NB_OF_TRAIN_TRAINS
1,"(0.5, 1.0]",109
2,"(1.0, 1.5]",77
3,"(1.5, 2.0]",48
4,"(2.0, 2.5]",47
5,"(2.5, 3.0]",37


In [1809]:
# Get bounds from first and last bin above threshold
lower_bound = bins_above_threshold.iloc[0]['BIN_RANGE'].left
upper_bound = bins_above_threshold.iloc[-1]['BIN_RANGE'].right

# Create summary
summary_df = pd.DataFrame({
    'STN_333': ['MINOT'],
    'Overall_Count': [df['TRAIN_ID'].nunique()],  # Total unique trains in dataset
    'Lower_Bound': [lower_bound],
    'Upper_Bound': [upper_bound]
})

summary_df

Unnamed: 0,STN_333,Overall_Count,Lower_Bound,Upper_Bound
0,MINOT,693,0.5,3.0


In [1810]:
# Get outlier bounds from the bins that meet threshold
lower_bound = bins_above_threshold.iloc[0]['BIN_RANGE'].left
upper_bound = bins_above_threshold.iloc[-1]['BIN_RANGE'].right

print(f"Outlier bounds: {lower_bound} to {upper_bound}")

# Add outlier flag to all datasets
train_df['IS_OUTLIER'] = (train_df['DWELL_TIME'] < lower_bound) | (train_df['DWELL_TIME'] > upper_bound)
test_df['IS_OUTLIER'] = (test_df['DWELL_TIME'] < lower_bound) | (test_df['DWELL_TIME'] > upper_bound)
df['IS_OUTLIER'] = (df['DWELL_TIME'] < lower_bound) | (df['DWELL_TIME'] > upper_bound)

# Quick summary
print(f"\nTraining outliers: {train_df['IS_OUTLIER'].sum()} out of {len(train_df)} records")
print(f"Test outliers: {test_df['IS_OUTLIER'].sum()} out of {len(test_df)} records")

# Show sample
print("\nSample:")
df[['TRAIN_ID', 'DWELL_TIME','IS_OUTLIER']].head(5)

Outlier bounds: 0.5 to 3.0

Training outliers: 225 out of 543 records
Test outliers: 65 out of 150 records

Sample:


Unnamed: 0,TRAIN_ID,DWELL_TIME,IS_OUTLIER
0,XBERBSB928H2021-05-28,0.3,True
1,XKAHKND911H2024-08-11,3.417,True
2,XCROBSB921A2023-09-21,0.933,False
3,XCROEGE922A2023-09-22,1.017,False
4,XCROHBG919A2023-09-19,1.4,False


In [1811]:
processed_df = pd.concat([train_df, test_df])
processed_df.shape

(693, 24)

In [1812]:
# Get date range for TA
min_date = pd.to_datetime(processed_df['TA'], utc=True).min()
max_date = pd.to_datetime(processed_df['TA'], utc=True).max()
print(f"Date range: {min_date.date()} to {max_date.date()}")

Date range: 2020-05-05 to 2024-08-20


In [1813]:
# US Federal Holidays (using 2025 dates for all years)
US_HOLIDAYS = [
    (1, 1),    # New Year's Day
    (1, 20),   # Martin Luther King Jr. Day
    (2, 17),   # Presidents Day
    (5, 26),   # Memorial Day
    (6, 19),   # Juneteenth
    (7, 4),    # Independence Day
    (9, 1),    # Labor Day
    (10, 13),  # Columbus Day
    (11, 11),  # Veterans Day
    (11, 27),  # Thanksgiving
    (12, 25),  # Christmas Day
]

In [1814]:
# Convert TA to datetime
processed_df['TA'] = pd.to_datetime(processed_df['TA'], utc= True)

In [1815]:
# Create IS_HOLIDAY column
def check_holiday(date):
    for month, day in US_HOLIDAYS:
        if date.month == month and date.day == day:
            return 1
    return 0

processed_df['IS_HOLIDAY'] = processed_df['TA'].apply(check_holiday)

In [1816]:
processed_df.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,STN_SEQ_NBR_DEST,DISTANCE,LAST_CREW_STATION,SECOND_LAST_CREW_STATION,TRAVEL_TIME,DWELL_TIME,DATE,DWELL_TIME_BIN,IS_OUTLIER,IS_HOLIDAY
0,XBERBSB928H2021-05-28,MINOT,ND,C,40,22.7,1,E,N,General,...,210,126.7,MINOT,GLASGOW,5.75,0.3,2021-05-30,"(0.0, 0.5]",True,0
2,XCROBSB921A2023-09-21,MINOT,ND,C,90,63.6,1,E,N,General,...,260,126.7,MINOT,GLASGOW,5.75,0.933,2023-09-21,"(0.5, 1.0]",False,0
3,XCROEGE922A2023-09-22,MINOT,ND,C,90,63.6,1,E,N,General,...,370,217.4,MINOT,GLASGOW,5.5,1.017,2023-09-22,"(1.0, 1.5]",False,0
4,XCROHBG919A2023-09-19,MINOT,ND,C,90,63.6,1,E,N,General,...,210,91.3,MINOT,GLASGOW,1.75,1.4,2023-09-19,"(1.0, 1.5]",False,0
5,XPOWBNY925A2021-09-25,MINOT,ND,C,110,80.0,1,E,N,General,...,390,217.4,MINOT,GLASGOW,8.917,0.983,2021-09-25,"(0.5, 1.0]",False,0


In [1817]:
processed_df['TA'].head()

0   2021-05-30 10:04:00+00:00
2   2023-09-22 00:53:00+00:00
3   2023-09-22 23:36:00+00:00
4   2023-09-19 21:21:00+00:00
5   2021-09-25 20:49:00+00:00
Name: TA, dtype: datetime64[ns, UTC]

In [1818]:
processed_df[processed_df['IS_HOLIDAY'] == 1]['TA']#.sort_values().unique().#sorted()

30    2020-09-01 06:09:00+00:00
69    2021-10-13 16:16:00+00:00
155   2023-09-01 00:20:00+00:00
161   2022-01-20 06:10:00+00:00
162   2022-01-20 03:03:00+00:00
175   2023-10-13 06:03:00+00:00
194   2020-11-11 05:53:00+00:00
271   2021-11-27 18:37:00+00:00
273   2020-06-19 04:03:00+00:00
387   2023-10-13 18:02:00+00:00
438   2023-12-25 08:30:00+00:00
470   2023-09-01 13:20:00+00:00
631   2020-11-11 13:46:00+00:00
662   2020-12-25 06:48:00+00:00
50    2024-07-04 19:19:00+00:00
608   2024-06-19 00:36:00+00:00
Name: TA, dtype: datetime64[ns, UTC]

In [1819]:
processed_df.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY'],
      dtype='object')

In [1820]:
# Analyzing how dwell time distributions differ between holidays and non-holidays

# Remove outliers
df_clean = processed_df[processed_df['IS_OUTLIER'] == False].copy()

# Create figure
fig = go.Figure()

# Non-holiday
fig.add_trace(go.Histogram(
    x=df_clean[df_clean['IS_HOLIDAY'] == 0]['DWELL_TIME'],
    name='Non-Holiday'
))

# Holiday
fig.add_trace(go.Histogram(
    x=df_clean[df_clean['IS_HOLIDAY'] == 1]['DWELL_TIME'],
    name='Holiday'
))

# Labels
fig.update_layout(
    title='Dwell Time Distribution',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1821]:
# Create box plots for holiday vs non-holiday (excluding outliers)

# Create figure
fig = go.Figure()

# Non-holiday
fig.add_trace(go.Box(
   y=df_clean[df_clean['IS_HOLIDAY'] == 0]['DWELL_TIME'],
   name='Non-Holiday'
))

# Holiday
fig.add_trace(go.Box(
   y=df_clean[df_clean['IS_HOLIDAY'] == 1]['DWELL_TIME'],
   name='Holiday'
))

# Labels
fig.update_layout(
   title='Dwell Time Box Plot (Excluding Outliers)',
   yaxis_title='Dwell Time (hours)'
)

fig.show()

In [1822]:
# Calculate statistics for holiday vs non-holiday
holiday_trains = df_clean[df_clean['IS_HOLIDAY'] == 1]['DWELL_TIME']
non_holiday_trains = df_clean[df_clean['IS_HOLIDAY'] == 0]['DWELL_TIME']

print("DWELL TIME STATISTICS (excluding outliers)")
print(f"\nNon-Holiday Trains: {len(non_holiday_trains)}")
print(f"  Mean: {non_holiday_trains.mean():.3f} hours")
print(f"  Median: {non_holiday_trains.median():.3f} hours")
print(f"  Std Dev: {non_holiday_trains.std():.3f} hours")
print(f"  Min: {non_holiday_trains.min():.3f} hours")
print(f"  Max: {non_holiday_trains.max():.3f} hours")

print(f"\nHoliday Trains: {len(holiday_trains)}")
print(f"  Mean: {holiday_trains.mean():.3f} hours")
print(f"  Median: {holiday_trains.median():.3f} hours")
print(f"  Std Dev: {holiday_trains.std():.3f} hours")
print(f"  Min: {holiday_trains.min():.3f} hours")
print(f"  Max: {holiday_trains.max():.3f} hours")

DWELL TIME STATISTICS (excluding outliers)

Non-Holiday Trains: 393
  Mean: 1.481 hours
  Median: 1.350 hours
  Std Dev: 0.674 hours
  Min: 0.517 hours
  Max: 3.000 hours

Holiday Trains: 10
  Mean: 1.322 hours
  Median: 1.050 hours
  Std Dev: 0.673 hours
  Min: 0.533 hours
  Max: 2.300 hours


In [1823]:
# Create a binary inspection flag column: 'N' remains 'N', all other codes become 'Y'
df_clean['INSPECTION_REQUIRED'] = df_clean['REQ_INSP'].apply(lambda x: 'N' if x == 'N' else 'Y')


In [1824]:
df_clean.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,DISTANCE,LAST_CREW_STATION,SECOND_LAST_CREW_STATION,TRAVEL_TIME,DWELL_TIME,DATE,DWELL_TIME_BIN,IS_OUTLIER,IS_HOLIDAY,INSPECTION_REQUIRED
2,XCROBSB921A2023-09-21,MINOT,ND,C,90,63.6,1,E,N,General,...,126.7,MINOT,GLASGOW,5.75,0.933,2023-09-21,"(0.5, 1.0]",False,0,N
3,XCROEGE922A2023-09-22,MINOT,ND,C,90,63.6,1,E,N,General,...,217.4,MINOT,GLASGOW,5.5,1.017,2023-09-22,"(1.0, 1.5]",False,0,N
4,XCROHBG919A2023-09-19,MINOT,ND,C,90,63.6,1,E,N,General,...,91.3,MINOT,GLASGOW,1.75,1.4,2023-09-19,"(1.0, 1.5]",False,0,N
5,XPOWBNY925A2021-09-25,MINOT,ND,C,110,80.0,1,E,N,General,...,217.4,MINOT,GLASGOW,8.917,0.983,2021-09-25,"(0.5, 1.0]",False,0,N
6,XPOWBRE920A2021-09-20,MINOT,ND,C,110,80.0,1,E,N,General,...,217.4,MINOT,GLASGOW,6.417,2.1,2021-09-20,"(2.0, 2.5]",False,0,N


In [1825]:
# Analyzing how dwell time distributions differ between inspection required vs not required
# Using cleaned data (excluding outliers)

# Create figure for histogram
fig = go.Figure()

# No inspection required
fig.add_trace(go.Histogram(
   x=df_clean[df_clean['INSPECTION_REQUIRED'] == 'N']['DWELL_TIME'],
   name='No Inspection'
))

# Inspection required
fig.add_trace(go.Histogram(
   x=df_clean[df_clean['INSPECTION_REQUIRED'] == 'Y']['DWELL_TIME'],
   name='Inspection Required'
))

# Labels
fig.update_layout(
   title='Dwell Time Distribution by Inspection Requirement (Excluding Outliers)',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1826]:
# Create box plots to analyze how dwell time varies according to INSPECTION_REQUIRED

fig2 = go.Figure()

# No inspection required
fig2.add_trace(go.Box(
   y=df_clean[df_clean['INSPECTION_REQUIRED'] == 'N']['DWELL_TIME'],
   name='No Inspection'
))

# Inspection required
fig2.add_trace(go.Box(
   y=df_clean[df_clean['INSPECTION_REQUIRED'] == 'Y']['DWELL_TIME'],
   name='Inspection Required'
))

# Labels
fig2.update_layout(
   title='Dwell Time Box Plot by Inspection Requirement (Excluding Outliers)',
   yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1827]:
# Calculate statistics
no_inspection = df_clean[df_clean['INSPECTION_REQUIRED'] == 'N']['DWELL_TIME']
inspection_required = df_clean[df_clean['INSPECTION_REQUIRED'] == 'Y']['DWELL_TIME']

print("DWELL TIME STATISTICS BY INSPECTION REQUIREMENT (excluding outliers)")
print(f"\nNo Inspection Required: {len(no_inspection)}")
print(f"  Mean: {no_inspection.mean():.3f} hours")
print(f"  Median: {no_inspection.median():.3f} hours")
print(f"  Std Dev: {no_inspection.std():.3f} hours")
print(f"  Min: {no_inspection.min():.3f} hours")
print(f"  Max: {no_inspection.max():.3f} hours")

print(f"\nInspection Required: {len(inspection_required)}")
print(f"  Mean: {inspection_required.mean():.3f} hours")
print(f"  Median: {inspection_required.median():.3f} hours")
print(f"  Std Dev: {inspection_required.std():.3f} hours")
print(f"  Min: {inspection_required.min():.3f} hours")
print(f"  Max: {inspection_required.max():.3f} hours")

DWELL TIME STATISTICS BY INSPECTION REQUIREMENT (excluding outliers)

No Inspection Required: 393
  Mean: 1.454 hours
  Median: 1.317 hours
  Std Dev: 0.664 hours
  Min: 0.517 hours
  Max: 3.000 hours

Inspection Required: 10
  Mean: 2.410 hours
  Median: 2.367 hours
  Std Dev: 0.263 hours
  Min: 2.067 hours
  Max: 2.917 hours


In [1828]:
# Use processed_df (with outliers) for feature creation
df_clean['TA_DAY'] = processed_df['TA'].dt.day_name()
df_clean['TA_MONTH'] = processed_df['TA'].dt.month_name()
df_clean['TA_HOUR'] = processed_df['TA'].dt.hour

In [1829]:
df_clean[['TA', 'TA_DAY', 'TA_MONTH', 'TA_HOUR']].head()

Unnamed: 0,TA,TA_DAY,TA_MONTH,TA_HOUR
2,2023-09-22 00:53:00+00:00,Friday,September,0
3,2023-09-22 23:36:00+00:00,Friday,September,23
4,2023-09-19 21:21:00+00:00,Tuesday,September,21
5,2021-09-25 20:49:00+00:00,Saturday,September,20
6,2021-09-21 01:41:00+00:00,Tuesday,September,1


In [1830]:
# Create histogram showing dwell time distribution for each day of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

fig = go.Figure()

for day in day_order:
   day_data = df_clean[df_clean['TA_DAY'] == day]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=day_data, name=day, opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Day of Week',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1831]:
# Create box plot showing dwell time distribution for each day of week

fig = go.Figure()

for day in day_order:
   day_data = df_clean[df_clean['TA_DAY'] == day]['DWELL_TIME']
   fig.add_trace(go.Box(y=day_data, name=day))

fig.update_layout(
   title='Dwell Time Distribution by Day of Week',
   yaxis_title='Dwell Time (hours)',
   xaxis_title='Day of Week'
)

fig.show()

In [1832]:
# Calculate statistics for each day of week
("DWELL TIME STATISTICS BY DAY OF WEEK (excluding outliers)")

for day in day_order:
   day_data = df_clean[df_clean['TA_DAY'] == day]['DWELL_TIME']
   if len(day_data) > 0:
       print(f"\n{day}: {len(day_data)} trains")
       print(f"  Mean: {day_data.mean():.3f} hours")
       print(f"  Median: {day_data.median():.3f} hours")
       print(f"  Std Dev: {day_data.std():.3f} hours")
       print(f"  Min: {day_data.min():.3f} hours")
       print(f"  Max: {day_data.max():.3f} hours")



Monday: 55 trains
  Mean: 1.523 hours
  Median: 1.350 hours
  Std Dev: 0.710 hours
  Min: 0.517 hours
  Max: 2.817 hours

Tuesday: 56 trains
  Mean: 1.456 hours
  Median: 1.367 hours
  Std Dev: 0.648 hours
  Min: 0.567 hours
  Max: 2.900 hours

Wednesday: 69 trains
  Mean: 1.394 hours
  Median: 1.250 hours
  Std Dev: 0.641 hours
  Min: 0.533 hours
  Max: 3.000 hours

Thursday: 70 trains
  Mean: 1.486 hours
  Median: 1.359 hours
  Std Dev: 0.656 hours
  Min: 0.583 hours
  Max: 2.883 hours

Friday: 49 trains
  Mean: 1.557 hours
  Median: 1.383 hours
  Std Dev: 0.657 hours
  Min: 0.683 hours
  Max: 2.900 hours

Saturday: 61 trains
  Mean: 1.517 hours
  Median: 1.417 hours
  Std Dev: 0.698 hours
  Min: 0.517 hours
  Max: 3.000 hours

Sunday: 43 trains
  Mean: 1.420 hours
  Median: 1.083 hours
  Std Dev: 0.743 hours
  Min: 0.583 hours
  Max: 2.917 hours


In [1833]:
# Create weekend/weekday grouping
df_clean['IS_WEEKEND'] = df_clean['TA_DAY'].isin(['Saturday', 'Sunday']).astype(int)
df_clean['DAY_TYPE'] = df_clean['IS_WEEKEND'].map({0: 'Weekday', 1: 'Weekend'})

# Histogram for weekday vs weekend
fig = go.Figure()

weekday_data = df_clean[df_clean['DAY_TYPE'] == 'Weekday']['DWELL_TIME']
weekend_data = df_clean[df_clean['DAY_TYPE'] == 'Weekend']['DWELL_TIME']

fig.add_trace(go.Histogram(x=weekday_data, name='Weekday', opacity=0.7))
fig.add_trace(go.Histogram(x=weekend_data, name='Weekend', opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution: Weekday vs Weekend',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1834]:
# Box plot for weekday vs weekend
fig = go.Figure()

fig.add_trace(go.Box(y=weekday_data, name='Weekday'))
fig.add_trace(go.Box(y=weekend_data, name='Weekend'))

fig.update_layout(
    title='Dwell Time Distribution: Weekday vs Weekend',
    yaxis_title='Dwell Time (hours)'
)

fig.show()

In [1835]:
# Statistics for weekday vs weekend
print("DWELL TIME STATISTICS: WEEKDAY VS WEEKEND (excluding outliers)")

print(f"\nWeekday: {len(weekday_data)} trains")
print(f"  Mean: {weekday_data.mean():.3f} hours")
print(f"  Median: {weekday_data.median():.3f} hours")
print(f"  Std Dev: {weekday_data.std():.3f} hours")
print(f"  Min: {weekday_data.min():.3f} hours")
print(f"  Max: {weekday_data.max():.3f} hours")

print(f"\nWeekend: {len(weekend_data)} trains")
print(f"  Mean: {weekend_data.mean():.3f} hours")
print(f"  Median: {weekend_data.median():.3f} hours")
print(f"  Std Dev: {weekend_data.std():.3f} hours")
print(f"  Min: {weekend_data.min():.3f} hours")
print(f"  Max: {weekend_data.max():.3f} hours")

DWELL TIME STATISTICS: WEEKDAY VS WEEKEND (excluding outliers)

Weekday: 299 trains
  Mean: 1.477 hours
  Median: 1.350 hours
  Std Dev: 0.659 hours
  Min: 0.517 hours
  Max: 3.000 hours

Weekend: 104 trains
  Mean: 1.477 hours
  Median: 1.292 hours
  Std Dev: 0.715 hours
  Min: 0.517 hours
  Max: 3.000 hours


In [1836]:
# Create histogram showing dwell time distribution for each month
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
              'July', 'August', 'September', 'October', 'November', 'December']

fig = go.Figure()

for month in month_order:
   month_data = df_clean[df_clean['TA_MONTH'] == month]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=month_data, name=month, opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Month',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1837]:
# Create box plot showing dwell time distribution for each month
fig = go.Figure()

for month in month_order:
    month_data = df_clean[df_clean['TA_MONTH'] == month]['DWELL_TIME']
    fig.add_trace(go.Box(y=month_data, name=month))

fig.update_layout(
    title='Dwell Time Distribution by Month',
    yaxis_title='Dwell Time (hours)',
    xaxis_title='Month'
)

fig.show()

In [1838]:
# Calculate statistics for each month
print("DWELL TIME STATISTICS BY MONTH (excluding outliers)")

for month in month_order:
    month_data = df_clean[df_clean['TA_MONTH'] == month]['DWELL_TIME']
    if len(month_data) > 0:
        print(f"\n{month}: {len(month_data)} trains")
        print(f"  Mean: {month_data.mean():.3f} hours")
        print(f"  Median: {month_data.median():.3f} hours")
        print(f"  Std Dev: {month_data.std():.3f} hours")
        print(f"  Min: {month_data.min():.3f} hours")
        print(f"  Max: {month_data.max():.3f} hours")


DWELL TIME STATISTICS BY MONTH (excluding outliers)

January: 35 trains
  Mean: 1.535 hours
  Median: 1.317 hours
  Std Dev: 0.745 hours
  Min: 0.750 hours
  Max: 2.900 hours

February: 32 trains
  Mean: 1.676 hours
  Median: 1.367 hours
  Std Dev: 0.741 hours
  Min: 0.783 hours
  Max: 3.000 hours

March: 38 trains
  Mean: 1.420 hours
  Median: 1.409 hours
  Std Dev: 0.569 hours
  Min: 0.517 hours
  Max: 2.750 hours

April: 32 trains
  Mean: 1.390 hours
  Median: 1.192 hours
  Std Dev: 0.580 hours
  Min: 0.617 hours
  Max: 2.583 hours

May: 30 trains
  Mean: 1.377 hours
  Median: 1.100 hours
  Std Dev: 0.685 hours
  Min: 0.583 hours
  Max: 2.900 hours

June: 19 trains
  Mean: 1.824 hours
  Median: 1.917 hours
  Std Dev: 0.754 hours
  Min: 0.667 hours
  Max: 2.883 hours

July: 23 trains
  Mean: 1.575 hours
  Median: 1.433 hours
  Std Dev: 0.644 hours
  Min: 0.683 hours
  Max: 2.733 hours

August: 10 trains
  Mean: 1.567 hours
  Median: 1.350 hours
  Std Dev: 0.678 hours
  Min: 0.800 hou

In [1839]:
# Create histogram showing dwell time distribution for each hour
fig = go.Figure()

for hour in range(24):
   hour_data = df_clean[df_clean['TA_HOUR'] == hour]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=hour_data, name=f'{hour}:00', opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Hour of Day',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1840]:
# Create box plot showing dwell time distribution for each hour
fig = go.Figure()

for hour in range(24):
    hour_data = df_clean[df_clean['TA_HOUR'] == hour]['DWELL_TIME']
    fig.add_trace(go.Box(y=hour_data, name=f'{hour}:00'))

fig.update_layout(
    title='Dwell Time Distribution by Hour of Day',
    yaxis_title='Dwell Time (hours)',
    xaxis_title='Hour of Day'
)

fig.show()

In [1841]:
# Average dwell time by hour
hour_avg = df_clean.groupby('TA_HOUR')['DWELL_TIME'].mean().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=hour_avg.index,
    y=hour_avg.values,
    mode='lines+markers',
    line=dict(width=3),
    marker=dict(size=8)
))

fig.update_layout(
    title='Average Dwell Time by Hour of Day',
    xaxis_title='Hour of Day',
    yaxis_title='Average Dwell Time (hours)',
    xaxis=dict(tickmode='linear', tick0=0, dtick=1)
)
fig.show()

In [1842]:
# Calculate statistics for each hour
print("DWELL TIME STATISTICS BY HOUR OF DAY (excluding outliers)")

for hour in range(24):
    hour_data = df_clean[df_clean['TA_HOUR'] == hour]['DWELL_TIME']
    if len(hour_data) > 0:
        print(f"\n{hour}:00 - {hour+1}:00: {len(hour_data)} trains")
        print(f"  Mean: {hour_data.mean():.3f} hours")
        print(f"  Median: {hour_data.median():.3f} hours")
        print(f"  Std Dev: {hour_data.std():.3f} hours")
        print(f"  Min: {hour_data.min():.3f} hours")
        print(f"  Max: {hour_data.max():.3f} hours")


DWELL TIME STATISTICS BY HOUR OF DAY (excluding outliers)

0:00 - 1:00: 19 trains
  Mean: 1.456 hours
  Median: 1.350 hours
  Std Dev: 0.601 hours
  Min: 0.817 hours
  Max: 2.650 hours

1:00 - 2:00: 19 trains
  Mean: 1.355 hours
  Median: 1.350 hours
  Std Dev: 0.600 hours
  Min: 0.633 hours
  Max: 2.883 hours

2:00 - 3:00: 7 trains
  Mean: 1.600 hours
  Median: 1.550 hours
  Std Dev: 0.913 hours
  Min: 0.733 hours
  Max: 3.000 hours

3:00 - 4:00: 14 trains
  Mean: 1.389 hours
  Median: 1.400 hours
  Std Dev: 0.662 hours
  Min: 0.617 hours
  Max: 2.917 hours

4:00 - 5:00: 22 trains
  Mean: 1.446 hours
  Median: 1.325 hours
  Std Dev: 0.721 hours
  Min: 0.517 hours
  Max: 2.883 hours

5:00 - 6:00: 19 trains
  Mean: 1.467 hours
  Median: 1.433 hours
  Std Dev: 0.632 hours
  Min: 0.583 hours
  Max: 2.883 hours

6:00 - 7:00: 21 trains
  Mean: 1.712 hours
  Median: 1.800 hours
  Std Dev: 0.711 hours
  Min: 0.683 hours
  Max: 2.900 hours

7:00 - 8:00: 23 trains
  Mean: 1.306 hours
  Median: 

In [1843]:
# Convert TD to datetime
df_clean['TD'] = pd.to_datetime(df_clean['TD'],utc=True)

# Create day of week column (Monday, Tuesday, etc.)
df_clean['TD_DAY'] = df_clean['TD'].dt.day_name()

# Create month column (January, February, etc.)
df_clean['TD_MONTH'] = df_clean['TD'].dt.month_name()

# Create hour column (0-23)
df_clean['TD_HOUR'] = df_clean['TD'].dt.hour

In [1844]:
df_clean[['TD', 'TD_DAY', 'TD_MONTH', 'TD_HOUR']].head()

Unnamed: 0,TD,TD_DAY,TD_MONTH,TD_HOUR
2,2023-09-22 01:49:00+00:00,Friday,September,1
3,2023-09-23 00:37:00+00:00,Saturday,September,0
4,2023-09-19 22:45:00+00:00,Tuesday,September,22
5,2021-09-25 21:48:00+00:00,Saturday,September,21
6,2021-09-21 03:47:00+00:00,Tuesday,September,3


In [1845]:
fig = go.Figure()

for day in day_order:
   day_data = df_clean[df_clean['TD_DAY'] == day]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=day_data, name=day, opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Departure Day of Week (TD)',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1846]:
# Create box plot showing dwell time distribution for each day of week (TD)
fig = go.Figure()

for day in day_order:
    day_data = df_clean[df_clean['TD_DAY'] == day]['DWELL_TIME']
    fig.add_trace(go.Box(y=day_data, name=day))

fig.update_layout(
    title='Dwell Time Distribution by Departure Day of Week (TD)',
    yaxis_title='Dwell Time (hours)',
    xaxis_title='Day of Week'
)

fig.show()

In [1847]:
# Calculate statistics for each day of week (TD)
print("DWELL TIME STATISTICS BY DEPARTURE DAY OF WEEK (TD) (excluding outliers)")

for day in day_order:
    day_data = df_clean[df_clean['TD_DAY'] == day]['DWELL_TIME']
    if len(day_data) > 0:
        print(f"\n{day}: {len(day_data)} trains")
        print(f"  Mean: {day_data.mean():.3f} hours")
        print(f"  Median: {day_data.median():.3f} hours")
        print(f"  Std Dev: {day_data.std():.3f} hours")
        print(f"  Min: {day_data.min():.3f} hours")
        print(f"  Max: {day_data.max():.3f} hours")


DWELL TIME STATISTICS BY DEPARTURE DAY OF WEEK (TD) (excluding outliers)

Monday: 49 trains
  Mean: 1.434 hours
  Median: 1.267 hours
  Std Dev: 0.678 hours
  Min: 0.517 hours
  Max: 2.800 hours

Tuesday: 60 trains
  Mean: 1.520 hours
  Median: 1.391 hours
  Std Dev: 0.667 hours
  Min: 0.567 hours
  Max: 2.900 hours

Wednesday: 70 trains
  Mean: 1.373 hours
  Median: 1.150 hours
  Std Dev: 0.632 hours
  Min: 0.533 hours
  Max: 2.917 hours

Thursday: 68 trains
  Mean: 1.498 hours
  Median: 1.383 hours
  Std Dev: 0.673 hours
  Min: 0.583 hours
  Max: 3.000 hours

Friday: 51 trains
  Mean: 1.533 hours
  Median: 1.367 hours
  Std Dev: 0.632 hours
  Min: 0.683 hours
  Max: 2.900 hours

Saturday: 60 trains
  Mean: 1.520 hours
  Median: 1.391 hours
  Std Dev: 0.727 hours
  Min: 0.517 hours
  Max: 3.000 hours

Sunday: 45 trains
  Mean: 1.478 hours
  Median: 1.133 hours
  Std Dev: 0.731 hours
  Min: 0.583 hours
  Max: 2.917 hours


In [1848]:
# Create histogram showing dwell time distribution for each month (TD)
fig = go.Figure()

for month in month_order:
   month_data = df_clean[df_clean['TD_MONTH'] == month]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=month_data, name=month, opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Departure Month (TD)',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1849]:
# Create box plot showing dwell time distribution for each month (TD)
fig = go.Figure()

for month in month_order:
    month_data = df_clean[df_clean['TD_MONTH'] == month]['DWELL_TIME']
    fig.add_trace(go.Box(y=month_data, name=month))

fig.update_layout(
    title='Dwell Time Distribution by Departure Month (TD)',
    yaxis_title='Dwell Time (hours)',
    xaxis_title='Month'
)

fig.show()

In [1850]:
# Calculate statistics for each month (TD)
print("DWELL TIME STATISTICS BY DEPARTURE MONTH (TD) (excluding outliers)")

for month in month_order:
    month_data = df_clean[df_clean['TD_MONTH'] == month]['DWELL_TIME']
    if len(month_data) > 0:
        print(f"\n{month}: {len(month_data)} trains")
        print(f"  Mean: {month_data.mean():.3f} hours")
        print(f"  Median: {month_data.median():.3f} hours")
        print(f"  Std Dev: {month_data.std():.3f} hours")
        print(f"  Min: {month_data.min():.3f} hours")
        print(f"  Max: {month_data.max():.3f} hours")


DWELL TIME STATISTICS BY DEPARTURE MONTH (TD) (excluding outliers)

January: 34 trains
  Mean: 1.541 hours
  Median: 1.300 hours
  Std Dev: 0.756 hours
  Min: 0.750 hours
  Max: 2.900 hours

February: 32 trains
  Mean: 1.664 hours
  Median: 1.350 hours
  Std Dev: 0.744 hours
  Min: 0.783 hours
  Max: 3.000 hours

March: 39 trains
  Mean: 1.427 hours
  Median: 1.417 hours
  Std Dev: 0.564 hours
  Min: 0.517 hours
  Max: 2.750 hours

April: 32 trains
  Mean: 1.390 hours
  Median: 1.192 hours
  Std Dev: 0.580 hours
  Min: 0.617 hours
  Max: 2.583 hours

May: 30 trains
  Mean: 1.377 hours
  Median: 1.100 hours
  Std Dev: 0.685 hours
  Min: 0.583 hours
  Max: 2.900 hours

June: 19 trains
  Mean: 1.824 hours
  Median: 1.917 hours
  Std Dev: 0.754 hours
  Min: 0.667 hours
  Max: 2.883 hours

July: 23 trains
  Mean: 1.575 hours
  Median: 1.433 hours
  Std Dev: 0.644 hours
  Min: 0.683 hours
  Max: 2.733 hours

August: 10 trains
  Mean: 1.567 hours
  Median: 1.350 hours
  Std Dev: 0.678 hours
 

In [1851]:
# Create histogram showing dwell time distribution for each hour (TD)
fig = go.Figure()

for hour in range(24):
   hour_data = df_clean[df_clean['TD_HOUR'] == hour]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=hour_data, name=f'{hour}:00', opacity=0.7))

fig.update_layout(
   title='Dwell Time Distribution by Departure Hour (TD)',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1852]:
# Create box plot showing dwell time distribution for each hour (TD)
fig = go.Figure()

for hour in range(24):
    hour_data = df_clean[df_clean['TD_HOUR'] == hour]['DWELL_TIME']
    fig.add_trace(go.Box(y=hour_data, name=f'{hour}:00'))

fig.update_layout(
    title='Dwell Time Distribution by Departure Hour (TD)',
    yaxis_title='Dwell Time (hours)',
    xaxis_title='Hour of Day'
)

fig.show()

In [1853]:
# Average dwell time by hour (TD)
hour_avg = df_clean.groupby('TD_HOUR')['DWELL_TIME'].mean().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=hour_avg.index,
    y=hour_avg.values,
    mode='lines+markers',
    line=dict(width=3),
    marker=dict(size=8)
))

fig.update_layout(
    title='Average Dwell Time by Departure Hour of Day (TD)',
    xaxis_title='Hour of Day',
    yaxis_title='Average Dwell Time (hours)',
    xaxis=dict(tickmode='linear', tick0=0, dtick=1)
)
fig.show()

In [1854]:
# Calculate statistics for each hour (TD)
print("DWELL TIME STATISTICS BY DEPARTURE HOUR (TD) (excluding outliers)")
for hour in range(24):
    hour_data = df_clean[df_clean['TD_HOUR'] == hour]['DWELL_TIME']
    if len(hour_data) > 0:
        print(f"\n{hour}:00 - {hour+1}:00: {len(hour_data)} trains")
        print(f"  Mean: {hour_data.mean():.3f} hours")
        print(f"  Median: {hour_data.median():.3f} hours")
        print(f"  Std Dev: {hour_data.std():.3f} hours")
        print(f"  Min: {hour_data.min():.3f} hours")
        print(f"  Max: {hour_data.max():.3f} hours")


DWELL TIME STATISTICS BY DEPARTURE HOUR (TD) (excluding outliers)

0:00 - 1:00: 20 trains
  Mean: 1.473 hours
  Median: 1.308 hours
  Std Dev: 0.594 hours
  Min: 0.767 hours
  Max: 2.850 hours

1:00 - 2:00: 21 trains
  Mean: 1.345 hours
  Median: 1.017 hours
  Std Dev: 0.675 hours
  Min: 0.633 hours
  Max: 2.817 hours

2:00 - 3:00: 19 trains
  Mean: 1.635 hours
  Median: 1.450 hours
  Std Dev: 0.697 hours
  Min: 0.783 hours
  Max: 3.000 hours

3:00 - 4:00: 11 trains
  Mean: 1.300 hours
  Median: 1.417 hours
  Std Dev: 0.659 hours
  Min: 0.617 hours
  Max: 2.650 hours

4:00 - 5:00: 9 trains
  Mean: 1.485 hours
  Median: 1.550 hours
  Std Dev: 0.787 hours
  Min: 0.517 hours
  Max: 2.883 hours

5:00 - 6:00: 22 trains
  Mean: 1.380 hours
  Median: 1.325 hours
  Std Dev: 0.632 hours
  Min: 0.517 hours
  Max: 3.000 hours

6:00 - 7:00: 17 trains
  Mean: 1.361 hours
  Median: 1.150 hours
  Std Dev: 0.633 hours
  Min: 0.583 hours
  Max: 2.917 hours

7:00 - 8:00: 19 trains
  Mean: 1.582 hours
  

In [1855]:
All_minot_dwell = pd.read_csv('final_df.csv')
All_minot_dwell['EVT_DTTM_TA'] = pd.to_datetime(All_minot_dwell['EVT_DTTM_TA'], utc=True)
All_minot_dwell['EVT_DTTM_TD'] = pd.to_datetime(All_minot_dwell['EVT_DTTM_TD'], utc=True)

In [1856]:
# Check column names
print(All_minot_dwell.columns.tolist())

# Check if these specific columns exist
print('DT_TM_TA' in All_minot_dwell.columns)  # Will print False
print('DT_TM_TD' in All_minot_dwell.columns)  # Will print False

['TRN_TYPE', 'TRN_SYM', 'TRN_SECT', 'TRN_PRTY', 'TRN_DAY', 'EVT_DT_TA', 'EVT_TM_TA', 'EVT_CD_TA', 'STN_333', 'STN_ST', 'TRN_SCH_DPT_DT', 'STN_TYPE_CD', 'STN_SEQ_NBR', 'AZ_LOAD_TS', 'PROC_DTTM_TA', 'SDIV_NUMB', 'TRAIN_ID', 'REQ_INSP', 'CREW_CHG', 'DPT_DIR_TA', 'EVT_DTTM_TA', 'EVT_DT_TD', 'EVT_TM_TD', 'EVT_CD_TD', 'PROC_DTTM_TD', 'DPT_DIR_TD', 'EVT_DTTM_TD']
False
False


In [1857]:
# Creating the dwell features..
# Creating the dwell features for existing input data.

dwell_data_x = processed_df.copy()
dwell_data_x[['AllTrains','MainlineTrains','YardTrains','PriorityTrains', 'PriorityMainline', 'PriorityYard']] = np.nan

for i in range(len(dwell_data_x)):
   trn_id = dwell_data_x["TRAIN_ID"].iloc[i]
   TA = dwell_data_x["TA"].iloc[i]
   
   AllTrains = All_minot_dwell[(All_minot_dwell["TRAIN_ID"]!=trn_id)&\
                              (((All_minot_dwell['EVT_DTTM_TA']>=TA) & (All_minot_dwell['EVT_DTTM_TD'].isna()))|\
                              ((All_minot_dwell['EVT_DTTM_TA']<=TA)&(All_minot_dwell['EVT_DTTM_TD']>=TA)))]
   
   inspection = ['Q', 'Z', 'C', 'X', 'V', 'U', 'W', 'K', 'L']
   YardTrains = All_minot_dwell[(All_minot_dwell["TRAIN_ID"]!=trn_id)&\
                               (All_minot_dwell['REQ_INSP'].isin(inspection))&\
                              (((All_minot_dwell['EVT_DTTM_TA']>=TA) & (All_minot_dwell['EVT_DTTM_TD'].isna()))|\
                              ((All_minot_dwell['EVT_DTTM_TA']<=TA)&(All_minot_dwell['EVT_DTTM_TD']>=TA)))]
   
   HighTrain = ['A', 'B', 'G', 'H', 'Q', 'Z', 'S', 'V']
   PriorityTrains = All_minot_dwell[(All_minot_dwell["TRAIN_ID"]!=trn_id)&\
                                   (All_minot_dwell['TRN_TYPE'].isin(HighTrain))&\
                                   (((All_minot_dwell['EVT_DTTM_TA']>=TA) & (All_minot_dwell['EVT_DTTM_TD'].isna()))|\
                                   ((All_minot_dwell['EVT_DTTM_TA']<=TA)&(All_minot_dwell['EVT_DTTM_TD']>=TA)))]
   
   PriorityYard= All_minot_dwell[(All_minot_dwell["TRAIN_ID"]!=trn_id)&\
                                 (All_minot_dwell['TRN_TYPE'].isin(HighTrain))&\
                                 (All_minot_dwell['REQ_INSP'].isin(inspection))&\
                                 (((All_minot_dwell['EVT_DTTM_TA']>=TA) & (All_minot_dwell['EVT_DTTM_TD'].isna()))|\
                                 ((All_minot_dwell['EVT_DTTM_TA']<=TA)&(All_minot_dwell['EVT_DTTM_TD']>=TA)))]

   dwell_data_x.loc[i, 'AllTrains'] = len(AllTrains)
   dwell_data_x.loc[i, 'YardTrains'] = len(YardTrains)
   dwell_data_x.loc[i, 'MainlineTrains'] = len(AllTrains)-len(YardTrains)
   dwell_data_x.loc[i, 'PriorityTrains'] = len(PriorityTrains)
   dwell_data_x.loc[i, 'PriorityYard'] = len(PriorityYard)
   dwell_data_x.loc[i, 'PriorityMainline'] = len(PriorityTrains)-len(PriorityYard)

In [1858]:
dwell_data_x.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,DATE,DWELL_TIME_BIN,IS_OUTLIER,IS_HOLIDAY,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard
0,XBERBSB928H2021-05-28,MINOT,ND,C,40,22.7,1,E,N,General,...,2021-05-30,"(0.0, 0.5]",True,0,0.0,0.0,0.0,0.0,0.0,0.0
2,XCROBSB921A2023-09-21,MINOT,ND,C,90,63.6,1,E,N,General,...,2023-09-21,"(0.5, 1.0]",False,0,3.0,0.0,3.0,1.0,0.0,1.0
3,XCROEGE922A2023-09-22,MINOT,ND,C,90,63.6,1,E,N,General,...,2023-09-22,"(1.0, 1.5]",False,0,1.0,1.0,0.0,1.0,1.0,0.0
4,XCROHBG919A2023-09-19,MINOT,ND,C,90,63.6,1,E,N,General,...,2023-09-19,"(1.0, 1.5]",False,0,6.0,4.0,2.0,5.0,3.0,2.0
5,XPOWBNY925A2021-09-25,MINOT,ND,C,110,80.0,1,E,N,General,...,2021-09-25,"(0.5, 1.0]",False,0,2.0,1.0,1.0,2.0,1.0,1.0


In [1859]:
# Create dataframe without outliers
dwell_data_x_no_outliers = dwell_data_x[dwell_data_x['IS_OUTLIER'] == False].copy()

In [1860]:
# Check AllTrains values
print(sorted(dwell_data_x['AllTrains'].unique()))
print(dwell_data_x['AllTrains'].value_counts().sort_index())

[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0)]
AllTrains
0.0      22
1.0      52
2.0     145
3.0     135
4.0     138
5.0      98
6.0      59
7.0      30
8.0      11
9.0       2
10.0      1
Name: count, dtype: int64


In [1861]:
# Create histogram showing dwell time distribution for each AllTrains value (with outliers)
fig = go.Figure()

for i in range(10):
   alltrains_data = dwell_data_x[dwell_data_x['AllTrains'] == i]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=alltrains_data, name=f'AllTrains={i}'))

fig.update_layout(
   title='Dwell Time Distribution by AllTrains Count',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1862]:
# Create histogram showing dwell time distribution for each AllTrains value (without outliers)
fig = go.Figure()

for i in range(10):
   alltrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['AllTrains'] == i]['DWELL_TIME']
   fig.add_trace(go.Histogram(x=alltrains_data, name=f'AllTrains={i}'))

fig.update_layout(
   title='Dwell Time Distribution by AllTrains Count (Outliers Removed)',
   xaxis_title='Dwell Time (hours)',
   yaxis_title='Count',
   barmode='group'
)

fig.show()

In [1863]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(10):
   alltrains_data = dwell_data_x[dwell_data_x['AllTrains'] == i]['DWELL_TIME']
   fig1.add_trace(go.Box(y=alltrains_data, name=f'{i}'))

fig1.update_layout(
   title='Dwell Time Distribution by AllTrains Count (With Outliers)',
   xaxis_title='AllTrains Count',
   yaxis_title='Dwell Time (hours)'
)

fig1.show()


In [1864]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(10):
   alltrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['AllTrains'] == i]['DWELL_TIME']
   fig2.add_trace(go.Box(y=alltrains_data, name=f'{i}'))

fig2.update_layout(
   title='Dwell Time Distribution by AllTrains Count (Without Outliers)',
   xaxis_title='AllTrains Count',
   yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1865]:
stats_with_outliers = []

for i in range(10):
   subset_data = dwell_data_x[dwell_data_x['AllTrains'] == i]['DWELL_TIME']
   if len(subset_data) > 0:
       stats_with_outliers.append({
           'AllTrains': i,
           'Count': len(subset_data),
           'Mean': subset_data.mean(),
           'Median': subset_data.median(),
           'Std Dev': subset_data.std(),
           'Min': subset_data.min(),
           'Max': subset_data.max()
       })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY ALLTRAINS COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY ALLTRAINS COUNT (WITH OUTLIERS)
 AllTrains  Count  Mean  Median  Std Dev  Min    Max
         0     22  3.81    1.73     4.21 0.30  13.62
         1     52  3.13    1.93     2.99 0.58  14.38
         2    145  4.82    2.28    11.97 0.43 137.00
         3    135  4.55    2.62     5.59 0.45  39.43
         4    138  3.92    2.52     4.21 0.42  34.65
         5     98  4.58    2.28    13.34 0.47 131.28
         6     59  2.81    1.85     2.38 0.52  10.65
         7     30  5.44    3.38     6.55 0.53  28.63
         8     11  2.33    0.98     2.52 0.82   8.10
         9      2  6.82    6.82     7.31 1.65  11.98


In [1866]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(10):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['AllTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'AllTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY ALLTRAINS COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY ALLTRAINS COUNT (WITHOUT OUTLIERS)
 AllTrains  Count  Mean  Median  Std Dev  Min  Max
         0     13  1.36    1.13     0.58 0.58 2.43
         1     32  1.27    1.02     0.63 0.58 2.85
         2     85  1.54    1.35     0.69 0.52 2.92
         3     73  1.52    1.43     0.65 0.60 2.90
         4     75  1.47    1.28     0.69 0.52 3.00
         5     62  1.56    1.41     0.75 0.52 3.00
         6     39  1.44    1.37     0.58 0.52 2.88
         7     14  1.39    1.26     0.69 0.53 2.78
         8      9  1.24    0.95     0.62 0.82 2.70
         9      1  1.65    1.65      NaN 1.65 1.65


In [1867]:
# Check MainlineTrains values
print("Unique values:", sorted(dwell_data_x['MainlineTrains'].unique()))
print("\nValue counts:")
print(dwell_data_x['MainlineTrains'].value_counts().sort_index())

Unique values: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0)]

Value counts:
MainlineTrains
0.0    250
1.0    237
2.0    141
3.0     47
4.0     18
Name: count, dtype: int64


In [1868]:
# Create histogram showing dwell time distribution for each MainlineTrains value (with outliers)
fig = go.Figure()

for i in range(5):
    mainlinetrains_data = dwell_data_x[dwell_data_x['MainlineTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=mainlinetrains_data, name=f'MainlineTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by MainlineTrains Count',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1869]:
# Create histogram showing dwell time distribution for each MainlineTrains value (without outliers)
fig = go.Figure()

for i in range(5):
    mainlinetrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['MainlineTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=mainlinetrains_data, name=f'MainlineTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by MainlineTrains Count (Outliers Removed)',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1870]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(5):
    mainlinetrains_data = dwell_data_x[dwell_data_x['MainlineTrains'] == i]['DWELL_TIME']
    fig1.add_trace(go.Box(y=mainlinetrains_data, name=f'{i}'))

fig1.update_layout(
    title='Dwell Time Distribution by MainlineTrains Count (With Outliers)',
    xaxis_title='MainlineTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig1.show()

In [1871]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(5):
    mainlinetrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['MainlineTrains'] == i]['DWELL_TIME']
    fig2.add_trace(go.Box(y=mainlinetrains_data, name=f'{i}'))

fig2.update_layout(
    title='Dwell Time Distribution by MainlineTrains Count (Without Outliers)',
    xaxis_title='MainlineTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1872]:
# Statistics table for dwell_data_x (with outliers)
stats_with_outliers = []

for i in range(5):
    subset_data = dwell_data_x[dwell_data_x['MainlineTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_with_outliers.append({
            'MainlineTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY MAINLINETRAINS COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY MAINLINETRAINS COUNT (WITH OUTLIERS)
 MainlineTrains  Count  Mean  Median  Std Dev  Min    Max
              0    250  4.19    2.26     9.50 0.30 137.00
              1    237  3.74    2.18     4.02 0.42  26.63
              2    141  4.38    2.67     5.21 0.45  39.43
              3     47  7.00    2.78    19.14 0.52 131.28
              4     18  2.96    2.10     2.45 0.72   8.97


In [1873]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(5):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['MainlineTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'MainlineTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY MAINLINETRAINS COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY MAINLINETRAINS COUNT (WITHOUT OUTLIERS)
 MainlineTrains  Count  Mean  Median  Std Dev  Min  Max
              0    146  1.47    1.34     0.66 0.52 2.92
              1    144  1.45    1.27     0.69 0.52 3.00
              2     77  1.54    1.40     0.67 0.52 2.92
              3     24  1.45    1.27     0.68 0.52 2.78
              4     12  1.57    1.49     0.73 0.72 2.82


In [1874]:
# Check YardTrains values
print("Unique values:", sorted(dwell_data_x['YardTrains'].unique()))
print("\nValue counts:")
print(dwell_data_x['YardTrains'].value_counts().sort_index())

Unique values: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0)]

Value counts:
YardTrains
0.0     62
1.0    133
2.0    171
3.0    152
4.0    102
5.0     49
6.0     16
7.0      7
8.0      1
Name: count, dtype: int64


In [1875]:
# Create histogram showing dwell time distribution for each YardTrains value (with outliers)
fig = go.Figure()

for i in range(8):
    yardtrains_data = dwell_data_x[dwell_data_x['YardTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=yardtrains_data, name=f'YardTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by YardTrains Count',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1876]:
# Create histogram showing dwell time distribution for each YardTrains value (without outliers)
fig = go.Figure()

for i in range(8):
    yardtrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['YardTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=yardtrains_data, name=f'YardTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by YardTrains Count (Outliers Removed)',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1877]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(8):
    yardtrains_data = dwell_data_x[dwell_data_x['YardTrains'] == i]['DWELL_TIME']
    fig1.add_trace(go.Box(y=yardtrains_data, name=f'{i}'))

fig1.update_layout(
    title='Dwell Time Distribution by YardTrains Count (With Outliers)',
    xaxis_title='YardTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig1.show()

In [1878]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(8):
    yardtrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['YardTrains'] == i]['DWELL_TIME']
    fig2.add_trace(go.Box(y=yardtrains_data, name=f'{i}'))

fig2.update_layout(
    title='Dwell Time Distribution by YardTrains Count (Without Outliers)',
    xaxis_title='YardTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1879]:
# Statistics table for dwell_data_x (with outliers)
stats_with_outliers = []

for i in range(8):
    subset_data = dwell_data_x[dwell_data_x['YardTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_with_outliers.append({
            'YardTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY YARDTRAINS COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY YARDTRAINS COUNT (WITH OUTLIERS)
 YardTrains  Count  Mean  Median  Std Dev  Min    Max
          0     62  3.76    2.32     3.80 0.30  16.85
          1    133  4.14    2.47     5.22 0.45  39.43
          2    171  5.43    2.25    14.64 0.43 137.00
          3    152  3.75    2.42     4.35 0.42  36.00
          4    102  3.79    2.43     4.67 0.67  34.65
          5     49  3.56    2.27     4.11 0.58  22.82
          6     16  3.27    1.81     3.77 0.53  11.98
          7      7  4.88    2.22     5.16 0.82  14.92


In [1880]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(8):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['YardTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'YardTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY YARDTRAINS COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY YARDTRAINS COUNT (WITHOUT OUTLIERS)
 YardTrains  Count  Mean  Median  Std Dev  Min  Max
          0     36  1.50    1.38     0.68 0.58 2.90
          1     77  1.46    1.15     0.70 0.58 2.88
          2     96  1.53    1.42     0.63 0.52 2.92
          3     89  1.43    1.35     0.70 0.52 3.00
          4     58  1.46    1.18     0.68 0.67 3.00
          5     30  1.54    1.43     0.67 0.58 2.88
          6     12  1.39    1.10     0.72 0.53 2.70
          7      4  1.43    1.35     0.58 0.82 2.22


In [1881]:
# Check PriorityTrains values
print(sorted(dwell_data_x['PriorityTrains'].unique()))
print(dwell_data_x['PriorityTrains'].value_counts().sort_index())

[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0)]
PriorityTrains
0.0     47
1.0    137
2.0    187
3.0    136
4.0    112
5.0     47
6.0     20
7.0      5
8.0      1
9.0      1
Name: count, dtype: int64


In [1882]:
# Create histogram showing dwell time distribution for each PriorityTrains value (with outliers)
fig = go.Figure()

for i in range(8):
    prioritytrains_data = dwell_data_x[dwell_data_x['PriorityTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=prioritytrains_data, name=f'PriorityTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityTrains Count',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1883]:
# Create histogram showing dwell time distribution for each PriorityTrains value (without outliers)
fig = go.Figure()

for i in range(8):
    prioritytrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityTrains'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=prioritytrains_data, name=f'PriorityTrains={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityTrains Count (Outliers Removed)',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1884]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(8):
    prioritytrains_data = dwell_data_x[dwell_data_x['PriorityTrains'] == i]['DWELL_TIME']
    fig1.add_trace(go.Box(y=prioritytrains_data, name=f'{i}'))

fig1.update_layout(
    title='Dwell Time Distribution by PriorityTrains Count (With Outliers)',
    xaxis_title='PriorityTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig1.show()

In [1885]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(8):
    prioritytrains_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityTrains'] == i]['DWELL_TIME']
    fig2.add_trace(go.Box(y=prioritytrains_data, name=f'{i}'))

fig2.update_layout(
    title='Dwell Time Distribution by PriorityTrains Count (Without Outliers)',
    xaxis_title='PriorityTrains Count',
    yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1886]:
# Statistics table for dwell_data_x (with outliers)
stats_with_outliers = []

for i in range(8):
    subset_data = dwell_data_x[dwell_data_x['PriorityTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_with_outliers.append({
            'PriorityTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY PRIORITYTRAINS COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYTRAINS COUNT (WITH OUTLIERS)
 PriorityTrains  Count  Mean  Median  Std Dev  Min    Max
              0     47  3.71    1.80     3.68 0.30  13.62
              1    137  3.62    2.20     4.73 0.43  36.00
              2    187  4.98    2.58    11.12 0.47 137.00
              3    136  3.79    2.59     3.39 0.45  15.07
              4    112  5.13    2.56    12.70 0.42 131.28
              5     47  3.11    1.43     3.94 0.63  22.82
              6     20  2.90    2.23     2.26 0.87   8.45
              7      5  4.17    3.38     4.66 0.53  11.98


In [1887]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(8):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityTrains'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'PriorityTrains': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY PRIORITYTRAINS COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYTRAINS COUNT (WITHOUT OUTLIERS)
 PriorityTrains  Count  Mean  Median  Std Dev  Min  Max
              0     26  1.22    1.13     0.48 0.52 2.43
              1     85  1.49    1.30     0.70 0.52 3.00
              2    106  1.51    1.40     0.69 0.57 3.00
              3     75  1.51    1.42     0.70 0.52 2.92
              4     60  1.55    1.38     0.68 0.52 2.85
              5     34  1.30    1.18     0.53 0.63 2.88
              6     14  1.70    1.70     0.74 0.87 2.88
              7      2  0.62    0.62     0.13 0.53 0.72


In [1888]:
# Check PriorityMainline values
print(sorted(dwell_data_x['PriorityMainline'].unique()))
print(dwell_data_x['PriorityMainline'].value_counts().sort_index())

[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0)]
PriorityMainline
0.0    345
1.0    236
2.0     94
3.0     15
4.0      3
Name: count, dtype: int64


In [1889]:
# Create histogram showing dwell time distribution for each PriorityMainline value (with outliers)
fig = go.Figure()

for i in range(5):
    prioritymainline_data = dwell_data_x[dwell_data_x['PriorityMainline'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=prioritymainline_data, name=f'PriorityMainline={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityMainline Count',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1890]:
# Create histogram showing dwell time distribution for each PriorityMainline value (without outliers)
fig = go.Figure()

for i in range(5):
    prioritymainline_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityMainline'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=prioritymainline_data, name=f'PriorityMainline={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityMainline Count (Outliers Removed)',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1891]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(5):
    prioritymainline_data = dwell_data_x[dwell_data_x['PriorityMainline'] == i]['DWELL_TIME']
    fig1.add_trace(go.Box(y=prioritymainline_data, name=f'{i}'))

fig1.update_layout(
    title='Dwell Time Distribution by PriorityMainline Count (With Outliers)',
    xaxis_title='PriorityMainline Count',
    yaxis_title='Dwell Time (hours)'
)

fig1.show()

In [1892]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(5):
    prioritymainline_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityMainline'] == i]['DWELL_TIME']
    fig2.add_trace(go.Box(y=prioritymainline_data, name=f'{i}'))

fig2.update_layout(
    title='Dwell Time Distribution by PriorityMainline Count (Without Outliers)',
    xaxis_title='PriorityMainline Count',
    yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1893]:
# Statistics table for dwell_data_x (with outliers)
stats_with_outliers = []

for i in range(5):
    subset_data = dwell_data_x[dwell_data_x['PriorityMainline'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_with_outliers.append({
            'PriorityMainline': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY PRIORITYMAINLINE COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYMAINLINE COUNT (WITH OUTLIERS)
 PriorityMainline  Count  Mean  Median  Std Dev  Min    Max
                0    345  4.10    2.23     8.46 0.30 137.00
                1    236  4.07    2.29     4.79 0.42  39.43
                2     94  5.15    2.57    13.59 0.45 131.28
                3     15  4.45    3.62     4.00 0.87  14.92
                4      3  2.31    2.82     1.40 0.72   3.38


In [1894]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(5):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityMainline'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'PriorityMainline': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY PRIORITYMAINLINE COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYMAINLINE COUNT (WITHOUT OUTLIERS)
 PriorityMainline  Count  Mean  Median  Std Dev  Min  Max
                0    205  1.46    1.33     0.67 0.52 3.00
                1    137  1.48    1.35     0.66 0.52 2.92
                2     52  1.52    1.33     0.70 0.52 2.90
                3      7  1.49    1.28     0.71 0.87 2.58
                4      2  1.77    1.77     1.48 0.72 2.82


In [1895]:
# Check PriorityYard values
print(sorted(dwell_data_x['PriorityYard'].unique()))
print(dwell_data_x['PriorityYard'].value_counts().sort_index())

[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(8.0)]
PriorityYard
0.0    107
1.0    193
2.0    196
3.0    112
4.0     60
5.0     20
6.0      4
8.0      1
Name: count, dtype: int64


In [1896]:
# Create histogram showing dwell time distribution for each PriorityYard value (with outliers)
fig = go.Figure()

for i in range(7):
    priorityyard_data = dwell_data_x[dwell_data_x['PriorityYard'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=priorityyard_data, name=f'PriorityYard={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityYard Count',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1897]:
# Create histogram showing dwell time distribution for each PriorityYard value (without outliers)
fig = go.Figure()

for i in range(7):
    priorityyard_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityYard'] == i]['DWELL_TIME']
    fig.add_trace(go.Histogram(x=priorityyard_data, name=f'PriorityYard={i}'))

fig.update_layout(
    title='Dwell Time Distribution by PriorityYard Count (Outliers Removed)',
    xaxis_title='Dwell Time (hours)',
    yaxis_title='Count',
    barmode='group'
)

fig.show()

In [1898]:
# Box plot for dwell_data_x (with outliers)
fig1 = go.Figure()

for i in range(7):
    priorityyard_data = dwell_data_x[dwell_data_x['PriorityYard'] == i]['DWELL_TIME']
    fig1.add_trace(go.Box(y=priorityyard_data, name=f'{i}'))

fig1.update_layout(
    title='Dwell Time Distribution by PriorityYard Count (With Outliers)',
    xaxis_title='PriorityYard Count',
    yaxis_title='Dwell Time (hours)'
)

fig1.show()

In [1899]:
# Box plot for dwell_data_x_no_outliers (without outliers)
fig2 = go.Figure()

for i in range(7):
    priorityyard_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityYard'] == i]['DWELL_TIME']
    fig2.add_trace(go.Box(y=priorityyard_data, name=f'{i}'))

fig2.update_layout(
    title='Dwell Time Distribution by PriorityYard Count (Without Outliers)',
    xaxis_title='PriorityYard Count',
    yaxis_title='Dwell Time (hours)'
)

fig2.show()

In [1900]:
# Statistics table for dwell_data_x (with outliers)
stats_with_outliers = []

for i in range(7):
    subset_data = dwell_data_x[dwell_data_x['PriorityYard'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_with_outliers.append({
            'PriorityYard': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_with_outliers = pd.DataFrame(stats_with_outliers)
print("DWELL TIME STATISTICS BY PRIORITYYARD COUNT (WITH OUTLIERS)")
print("="*55)
print(df_stats_with_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYYARD COUNT (WITH OUTLIERS)
 PriorityYard  Count  Mean  Median  Std Dev  Min    Max
            0    107  3.76    2.33     3.67 0.30  16.85
            1    193  4.04    2.37     5.46 0.43  39.43
            2    196  5.26    2.40    13.79 0.47 137.00
            3    112  3.74    2.55     3.33 0.42  14.93
            4     60  3.46    1.74     4.84 0.58  28.63
            5     20  3.41    1.88     3.25 0.78  11.98
            6      4  5.54    3.35     6.43 0.53  14.92


In [1901]:
# Statistics table for dwell_data_x_no_outliers (without outliers)
stats_no_outliers = []

for i in range(7):
    subset_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['PriorityYard'] == i]['DWELL_TIME']
    if len(subset_data) > 0:
        stats_no_outliers.append({
            'PriorityYard': i,
            'Count': len(subset_data),
            'Mean': subset_data.mean(),
            'Median': subset_data.median(),
            'Std Dev': subset_data.std(),
            'Min': subset_data.min(),
            'Max': subset_data.max()
        })

df_stats_no_outliers = pd.DataFrame(stats_no_outliers)
print("DWELL TIME STATISTICS BY PRIORITYYARD COUNT (WITHOUT OUTLIERS)")
print("="*55)
print(df_stats_no_outliers.round(2).to_string(index=False))

DWELL TIME STATISTICS BY PRIORITYYARD COUNT (WITHOUT OUTLIERS)
 PriorityYard  Count  Mean  Median  Std Dev  Min  Max
            0     61  1.43    1.22     0.67 0.52 2.90
            1    114  1.52    1.32     0.70 0.58 3.00
            2    109  1.45    1.37     0.65 0.52 3.00
            3     61  1.50    1.42     0.70 0.58 2.90
            4     42  1.44    1.22     0.65 0.58 2.88
            5     13  1.53    1.43     0.62 0.78 2.88
            6      2  1.50    1.50     1.37 0.53 2.47


In [1902]:
dwell_data_x_no_outliers.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard'],
      dtype='object')

In [1903]:
dwell_data_x_no_outliers['TRAIN_ID'].head()

2    XCROBSB921A2023-09-21
3    XCROEGE922A2023-09-22
4    XCROHBG919A2023-09-19
5    XPOWBNY925A2021-09-25
6    XPOWBRE920A2021-09-20
Name: TRAIN_ID, dtype: object

In [1904]:
# Check the length of each TRAIN_ID
dwell_data_x_no_outliers['TRAIN_ID'].str.len().value_counts()

TRAIN_ID
21    403
Name: count, dtype: int64

In [1905]:
# Create TRN_PRTY column by extracting priority code from TRAIN_ID
# Priority is the single letter at position 11 (e.g., 'H' in XBERBSB928H2021-05-28)
dwell_data_x_no_outliers['TRN_PRTY'] = dwell_data_x_no_outliers['TRAIN_ID'].str[10]

In [1906]:
dwell_data_x_no_outliers[['TRAIN_ID', 'TRN_PRTY']].head()

Unnamed: 0,TRAIN_ID,TRN_PRTY
2,XCROBSB921A2023-09-21,A
3,XCROEGE922A2023-09-22,A
4,XCROHBG919A2023-09-19,A
5,XPOWBNY925A2021-09-25,A
6,XPOWBRE920A2021-09-20,A


In [1907]:
dwell_data_x_no_outliers['TRN_PRTY'].unique()

array(['A', 'H', 'M', 'E'], dtype=object)

In [1908]:
# Box plot for dwell time distribution by train priority
fig = go.Figure()
for priority in ['A', 'H', 'M', 'E']:
    priority_data = dwell_data_x_no_outliers[dwell_data_x_no_outliers['TRN_PRTY'] == priority]['DWELL_TIME']
    fig.add_trace(go.Box(y=priority_data, name=f'Priority {priority}'))

fig.update_layout(
    title='Dwell Time Distribution by Train Priority',
    xaxis_title='Train Priority',
    yaxis_title='Dwell Time (hours)'
)
fig.show()

In [1909]:
dwell_data_x['TRAIN_ID'].str.len().value_counts()

TRAIN_ID
21    693
Name: count, dtype: int64

In [1910]:
# Create TRN_PRTY column by extracting priority code from TRAIN_ID
# Priority is the single letter at position 11 (e.g., 'H' in XBERBSB928H2021-05-28)
dwell_data_x['Train Priority'] = dwell_data_x['TRAIN_ID'].str[10]

In [1911]:
dwell_data_x[['TRAIN_ID', 'Train Priority']].head()

Unnamed: 0,TRAIN_ID,Train Priority
0,XBERBSB928H2021-05-28,H
2,XCROBSB921A2023-09-21,A
3,XCROEGE922A2023-09-22,A
4,XCROHBG919A2023-09-19,A
5,XPOWBNY925A2021-09-25,A


In [1912]:
#Load Lineup Data for last feature
Lineup_df = pd.read_csv('Lineup Data.csv')

In [1913]:
Lineup_df.head()

Unnamed: 0,TRN_SYM,TRN_DAY,TRN_TYPE,TRN_SECT,STN_333,STN_ST,EVT_CD,EVST_CD,PROC_DTTM,STN_SEQ_NBR,...,TRN_PRTY,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO
0,TACBSB,5,X,9,MINOT,ND,TP,P,2022-10-07T16:06:48.593-05:00,2100.0,...,H,BNSF,Pln Arr Time applied,,,ZDADMIN,2022-10-07T16:06:51.630-05:00,2022-10-10T04:01:02.715-05:00,2022,10
1,INBHBG,5,X,9,MINOT,ND,TP,P,2022-10-07T07:39:16.168-05:00,1730.0,...,H,BNSF,Send MP type D (ETD: 07 2158),,,ZDADMIN,2022-10-07T07:39:28.627-05:00,2022-10-10T04:01:02.715-05:00,2022,10
2,INBOTV,15,X,9,MINOT,ND,AI,04,2022-10-19T19:16:21.680-05:00,1970.0,...,H,BNSF,BNSF8759 10EF+-BNSF8012 8 EF+-BNSF610310 EF+ ...,,...,ZDADMIN,2022-10-19T19:16:31.083-05:00,2022-10-22T04:01:14.970-05:00,2022,10
3,TACBSB,5,X,9,MINOT,ND,TP,P,2022-10-07T10:07:36.713-05:00,2100.0,...,H,BNSF,Trk 6398 planned,,,ZDADMIN,2022-10-07T10:07:43.717-05:00,2022-10-10T04:01:02.715-05:00,2022,10
4,VAWVEA,24,X,9,MINOT,ND,TP,P,2022-10-28T05:49:47.390-05:00,2130.0,...,H,BNSF,Send MP type A (ETA: 28 1016),,,ZDADMIN,2022-10-28T05:49:52.821-05:00,2022-10-31T04:00:59.455-05:00,2022,10


In [1914]:
Lineup_df.columns

Index(['TRN_SYM', 'TRN_DAY', 'TRN_TYPE', 'TRN_SECT', 'STN_333', 'STN_ST',
       'EVT_CD', 'EVST_CD', 'PROC_DTTM', 'STN_SEQ_NBR', 'EVT_DT', 'EVT_TM',
       'CANC_USER_ID', 'LOG_USER', 'LOG_ACTION', 'PGM_NME', 'LOG_TID',
       'TRK_NUMB', 'FILL_CD', 'TRN_PRTY', 'CO_ABBR', 'COMMENTS', 'HAZ_TRN_FLG',
       'IBND_CMNT', 'LST_MAINT_ID', 'LST_MAINT_TS', 'AZ_LOAD_TS', 'PARTTN_YR',
       'PARTTN_MO'],
      dtype='object')

In [1915]:
Lineup_df['TRN_DAY'].unique()

array([ 5, 15, 24,  4, 29, 12,  3, 11,  6, 23, 10, 16, 18, 19,  9, 20, 25,
       14, 21, 22, 26, 27, 28, 31, 30, 13, 17,  1,  8,  7,  2])

In [1916]:
# Add leading zero to single-digit values in TRN_DAY column
Lineup_df['TRN_DAY'] = Lineup_df['TRN_DAY'].astype(str).str.zfill(2)

In [1917]:
Lineup_df['TRN_DAY'].unique()

array(['05', '15', '24', '04', '29', '12', '03', '11', '06', '23', '10',
       '16', '18', '19', '09', '20', '25', '14', '21', '22', '26', '27',
       '28', '31', '30', '13', '17', '01', '08', '07', '02'], dtype=object)

In [1918]:
# Create TRAIN_ID_NO by concatenating train components
Lineup_df['TRAIN_ID_NO'] = (Lineup_df['TRN_TYPE'].astype(str) + 
                            Lineup_df['TRN_SYM'].astype(str) + 
                            Lineup_df['TRN_SECT'].astype(str) + 
                            Lineup_df['TRN_DAY'].astype(str) + 
                            Lineup_df['TRN_PRTY'].astype(str))

In [1919]:
Lineup_df.head()

Unnamed: 0,TRN_SYM,TRN_DAY,TRN_TYPE,TRN_SECT,STN_333,STN_ST,EVT_CD,EVST_CD,PROC_DTTM,STN_SEQ_NBR,...,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO,TRAIN_ID_NO
0,TACBSB,5,X,9,MINOT,ND,TP,P,2022-10-07T16:06:48.593-05:00,2100.0,...,BNSF,Pln Arr Time applied,,,ZDADMIN,2022-10-07T16:06:51.630-05:00,2022-10-10T04:01:02.715-05:00,2022,10,XTACBSB905H
1,INBHBG,5,X,9,MINOT,ND,TP,P,2022-10-07T07:39:16.168-05:00,1730.0,...,BNSF,Send MP type D (ETD: 07 2158),,,ZDADMIN,2022-10-07T07:39:28.627-05:00,2022-10-10T04:01:02.715-05:00,2022,10,XINBHBG905H
2,INBOTV,15,X,9,MINOT,ND,AI,04,2022-10-19T19:16:21.680-05:00,1970.0,...,BNSF,BNSF8759 10EF+-BNSF8012 8 EF+-BNSF610310 EF+ ...,,...,ZDADMIN,2022-10-19T19:16:31.083-05:00,2022-10-22T04:01:14.970-05:00,2022,10,XINBOTV915H
3,TACBSB,5,X,9,MINOT,ND,TP,P,2022-10-07T10:07:36.713-05:00,2100.0,...,BNSF,Trk 6398 planned,,,ZDADMIN,2022-10-07T10:07:43.717-05:00,2022-10-10T04:01:02.715-05:00,2022,10,XTACBSB905H
4,VAWVEA,24,X,9,MINOT,ND,TP,P,2022-10-28T05:49:47.390-05:00,2130.0,...,BNSF,Send MP type A (ETA: 28 1016),,,ZDADMIN,2022-10-28T05:49:52.821-05:00,2022-10-31T04:00:59.455-05:00,2022,10,XVAWVEA924H


In [1920]:
Lineup_df['LOG_USER'].unique()

array(['B167962 ', 'BMP     ', 'B031547 ', 'B015614 ', 'ZC3PNC09',
       'B010832 ', 'B002703 ', 'ZC1PNC18', 'ZC1PNC68', 'ZC1PNC02',
       'ZC3PNC39', 'B009156 ', 'ZC1PNC95', 'ZC2PNC07', 'ZC3PNC02',
       'ZC1PNC66', 'ZC3PNC66', 'B013431 ', 'ZC3PNC95', 'B022717 ',
       'ZC2PNC12', 'ZC3PNC80', 'ZC1PNC12', 'ZC1PNC82', 'ZC3PNC16',
       'ZC3PNT28', 'B175497 ', 'ZC1PNC21', 'ZC2PNC02', 'ZC1PNC38',
       'ZC2PNC37', 'ZC3PNC14', 'ZC1PNC85', 'ZC1PNC28', 'ZC1PNC26',
       'ZC2PNC15', 'ZC1PNC39', 'ZC3PNC11', 'ZC1PNC27', 'B114901 ',
       'ZC2PNC23', 'ZC3PNC03', 'ZC3PNC19', 'ZC2PNC34', 'ZC2PNC04',
       'ZC3PNC65', 'ZC3PNC34', 'ZC3PNC22', 'ZC2PNC84', 'ZC3PNC32',
       'CAD LRK ', 'ZC2PNC18', 'ZC1PNC03', 'CAD AE  ', 'ZC1PNC07',
       'ZC3PNC24', 'ZC1PNC19', 'ZC2PNC24', 'ZC2PNC11', 'ZC2PNC13',
       'ZC2PNC27', 'ZC2PNC60', 'ZC3PNC86', 'ZC1PNC24', 'ZC2PNC94',
       'ZC1PNC84', 'ZC2PNC33', 'ZC2PNC42', 'TRNWOPR4', 'B166985 ',
       'ZC3PNC31', 'ZC3PNC30', 'ZC3PNC13', 'ZC2PNC06', 'ZC3PNT

In [1921]:
# Strip whitespace from LOG_USER column because they are being counted as a character
Lineup_df['LOG_USER'] = Lineup_df['LOG_USER'].astype(str).str.strip()

In [1922]:
# Kepp rows with 7 char in log user 
print(f"Rows before: {len(Lineup_df)}")
print(f"Rows with 7-char LOG_USER: {len(Lineup_df[Lineup_df['LOG_USER'].astype(str).str.len() == 7])}")

Rows before: 87498
Rows with 7-char LOG_USER: 50344


In [1923]:
# Then filter
Lineup_df = Lineup_df[Lineup_df['LOG_USER'].astype(str).str.len() == 7]
print(f"Rows after: {len(Lineup_df)}")

Rows after: 50344


In [1924]:
# Show the three columns
Lineup_df[['TRAIN_ID_NO', 'EVT_DT', 'TRK_NUMB']]

Unnamed: 0,TRAIN_ID_NO,EVT_DT,TRK_NUMB
0,XTACBSB905H,2022-10-08,
1,XINBHBG905H,2022-10-07,
3,XTACBSB905H,2022-10-07,
4,XVAWVEA924H,2022-10-28,
5,XINBVEA904M,2022-10-08,
...,...,...,...
87490,XKALHBG905H,2021-02-08,
87491,XLVWVEA906H,2021-12-11,
87492,XKAHOTV902A,2021-02-05,
87494,XKALBSB907A,2021-02-11,


In [1925]:
Lineup_df['TRK_NUMB'].unique()

array(['    ', '6398', '4201', '6399', '6298', '6299', '4206', '6301',
       '6300', '4203', '5685', '6302', '6294', '6303', '1781', '9116',
       '6305', '4923', '4204', '9535', '3237', '5808', '3385', '2983',
       '4205', '8218', '9239', '2179', '7771', '3377', '4842', '0162',
       '3378', '0166', '7765', '4578', '6401', '3485', '9258', '3784',
       '6362', '5184', '3975', '0406', '0818', '1240', '9071', '5896',
       '3431', '7436', '7935', '7098', '8942', '3178', '2807', '7092',
       '2441', '9238', '5086', '2440', '0408', '0887', '6304', '7609',
       '2474', '2649', '9233', '2868', '4886', '5192', '0922', '2986',
       '1727', '5329', '9198', '1624', '7018', '8372', '9192', '6185',
       '0210', '6306', '6758', '5705', '6293', '9237', '4202'],
      dtype=object)

In [1926]:
# Check the data type and see some non-empty values
print(Lineup_df['TRK_NUMB'].dtype)
print(Lineup_df[Lineup_df['TRK_NUMB'] != '']['TRK_NUMB'].head(10))

object
0         
1         
3         
4         
5         
7         
8         
9     6398
10        
11        
Name: TRK_NUMB, dtype: object


In [1927]:
# keep only the rows where TRK_NUMB (after removing spaces) is not empty
# show just the TRAIN_ID_NO, EVT_DT, and TRK_NUMB columns for those rows
Lineup_df[Lineup_df['TRK_NUMB'].str.strip() != ''][['TRAIN_ID_NO', 'EVT_DT', 'TRK_NUMB']]

Unnamed: 0,TRAIN_ID_NO,EVT_DT,TRK_NUMB
9,XVAWVEA924H,2022-10-28,6398
12,XINBVEA912H,2022-10-16,4201
27,XVAWVEA924H,2022-10-28,6398
31,XLVWPBN910H,2022-10-14,6399
35,XVAWHBG923H,2022-10-26,6298
...,...,...,...
87461,XRGTHBG926H,2020-11-28,6294
87468,XVAWOAN909H,2020-11-14,6399
87476,XKAHGWN908H,2021-02-10,6299
87482,XTACHBG928H,2020-12-01,6298


In [1928]:
dwell_data_x['TRAIN_ID'].unique()

array(['XBERBSB928H2021-05-28', 'XCROBSB921A2023-09-21',
       'XCROEGE922A2023-09-22', 'XCROHBG919A2023-09-19',
       'XPOWBNY925A2021-09-25', 'XPOWBRE920A2021-09-20',
       'XPOWBSB901A2021-10-01', 'XPOWEGE902A2021-10-02',
       'XPOWGWN903A2021-10-03', 'XPOWCSN921A2021-09-21',
       'XWFDEGE911I2021-09-11', 'XCROKND913A2022-09-13',
       'XKALBSB907A2021-09-07', 'XKALKND915H2020-06-15',
       'XLVWCSN910H2020-09-10', 'XVAWOAN913H2023-11-13',
       'XINBBRE911H2023-10-11', 'XTACBNY918H2021-11-18',
       'XINBBNY925H2023-11-25', 'XVAWPBN922H2023-11-22',
       'XKAHBSB920H2022-04-20', 'XWHICSN903H2022-10-03',
       'XWAWPBN911H2021-09-11', 'XINBKND931H2020-08-31',
       'XINBVEA908H2021-10-08', 'XKAHBND904H2022-10-04',
       'XKAHBNY921H2020-10-21', 'XINBVEA926H2023-01-26',
       'XINBPBN909H2023-05-09', 'XWHIPBN906H2023-03-06',
       'XINBOTV904H2020-11-04', 'XKALVEA907H2023-06-07',
       'XLVWCSN922H2022-03-22', 'XKAHBRE919H2021-09-19',
       'XKALOAN903H2023-01-03',

In [1929]:
# Check the length of each unique TRAIN_ID
dwell_data_x['TRAIN_ID'].str.len().unique()

array([21])

In [1930]:
dwell_data_x['TRAIN_ID_NO'] = dwell_data_x['TRAIN_ID'].str[:11]

In [1931]:
dwell_data_x[['TRAIN_ID', 'TRAIN_ID_NO']].head()

Unnamed: 0,TRAIN_ID,TRAIN_ID_NO
0,XBERBSB928H2021-05-28,XBERBSB928H
2,XCROBSB921A2023-09-21,XCROBSB921A
3,XCROEGE922A2023-09-22,XCROEGE922A
4,XCROHBG919A2023-09-19,XCROHBG919A
5,XPOWBNY925A2021-09-25,XPOWBNY925A


In [1932]:
dwell_data_x.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO'],
      dtype='object')

In [1933]:
# Ensure both date columns are datetime type
dwell_data_x['TA'] = pd.to_datetime(dwell_data_x['TA'], utc=True)
Lineup_df['EVT_DT'] = pd.to_datetime(Lineup_df['EVT_DT'], utc=True)

# Sort both dataframes by date 
dwell_data_x = dwell_data_x.sort_values('TA')
Lineup_df = Lineup_df.sort_values('EVT_DT')

# Merge with 5-day tolerance
merged_df = pd.merge_asof(
    dwell_data_x,
    Lineup_df,
    left_on='TA',
    right_on='EVT_DT',
    left_by='TRAIN_ID_NO',
    right_by='TRAIN_ID_NO',
    tolerance=pd.Timedelta('5 days'),
    direction='nearest'
)

In [1934]:
merged_df.head()

Unnamed: 0,TRAIN_ID,STN_333_x,STN_ST_x,STN_TYPE_CD,STN_SEQ_NBR_x,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,TRN_PRTY,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO
0,XKALBSB902H2020-05-02,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,
1,XINBOTV902H2020-05-02,MINOT,ND,C,2160,1334.9,1,E,N,General,...,,,,,,,,,,
2,XKAHOTV915A2020-05-15,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,
3,XKAHOTV906A2020-06-06,MINOT,ND,C,2350,1434.5,1,E,N,General,...,A,BNSF,Pln Depart Time modified,,,ZDADMIN,2020-06-11T23:28:40.638-05:00,2020-06-14T04:31:07.680-05:00,2020.0,6.0
4,XKALHBG911H2020-06-11,MINOT,ND,C,2350,1434.5,1,E,N,General,...,H,BNSF,,,,ZDADMIN,2020-06-14T09:40:43.117-05:00,2020-06-17T04:30:37.692-05:00,2020.0,6.0


In [1935]:
# Create MAINLINE flag : 1 if track number is 6399 or 6398, 0 otherwise
merged_df['MAINLINE'] = merged_df['TRK_NUMB'].isin(['6399', '6398']).astype(int)

In [1936]:
merged_df.head()

Unnamed: 0,TRAIN_ID,STN_333_x,STN_ST_x,STN_TYPE_CD,STN_SEQ_NBR_x,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO,MAINLINE
0,XKALBSB902H2020-05-02,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,0
1,XINBOTV902H2020-05-02,MINOT,ND,C,2160,1334.9,1,E,N,General,...,,,,,,,,,,0
2,XKAHOTV915A2020-05-15,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,0
3,XKAHOTV906A2020-06-06,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,Pln Depart Time modified,,,ZDADMIN,2020-06-11T23:28:40.638-05:00,2020-06-14T04:31:07.680-05:00,2020.0,6.0,0
4,XKALHBG911H2020-06-11,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,,,,ZDADMIN,2020-06-14T09:40:43.117-05:00,2020-06-17T04:30:37.692-05:00,2020.0,6.0,0


In [1937]:
# Count how many are mainline trains (MAINLINE=1)
print(f"Number of mainline trains: {(merged_df['MAINLINE'] == 1).sum()}")
print(f"Total trains: {len(merged_df)}")

Number of mainline trains: 82
Total trains: 693


In [1938]:
# Show which trains are mainline (MAINLINE=1)
merged_df[merged_df['MAINLINE'] == 1].head()

Unnamed: 0,TRAIN_ID,STN_333_x,STN_ST_x,STN_TYPE_CD,STN_SEQ_NBR_x,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO,MAINLINE
6,XKALBRE914H2020-06-14,MINOT,ND,C,550,430.2,1,E,N,General,...,BNSF,Fuel out date recorded,,,ZDADMIN,2020-06-15T18:56:20.695-05:00,2020-06-18T12:21:37.790-05:00,2020.0,6.0,1
20,XKAHHBG917H2020-07-17,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,Pull time applied Trk: 6398,,,ZDADMIN,2020-07-20T12:08:19.023-05:00,2020-07-23T04:00:56.174-05:00,2020.0,7.0,1
29,XINBLDG904H2020-09-04,MINOT,ND,C,2160,1334.9,1,E,N,General,...,BNSF,Fuel out date recorded,,,ZDADMIN,2020-09-09T02:43:38.324-05:00,2020-09-11T04:12:41.636-05:00,2020.0,9.0,1
30,XLVWBNY906H2020-09-06,MINOT,ND,C,2350,1430.1,1,E,N,General,...,BNSF,YDSTRK msg sent to MP,,,ZDADMIN,2020-09-09T12:10:01.932-05:00,2020-09-12T04:01:03.188-05:00,2020.0,9.0,1
40,XKALBRE924H2020-09-24,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,Plan/Access Pt changed to FUEL,,,ZDADMIN,2020-09-29T14:08:28.723-05:00,2020-10-02T04:00:42.214-05:00,2020.0,9.0,1


In [1939]:
merged_df.columns

Index(['TRAIN_ID', 'STN_333_x', 'STN_ST_x', 'STN_TYPE_CD', 'STN_SEQ_NBR_x',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO', 'TRN_SYM', 'TRN_DAY',
       'TRN_TYPE', 'TRN_SECT', 'STN_333_y', 'STN_ST_y', 'EVT_CD', 'EVST_CD',
       'PROC_DTTM', 'STN_SEQ_NBR_y', 'EVT_DT', 'EVT_TM', 'CANC_USER_ID',
       'LOG_USER', 'LOG_ACTION', 'PGM_NME', 'LOG_TID', 'TRK_NUMB', 'FILL_CD',
       'TRN_PRTY', 'CO_ABBR', 'COMMENTS', 'HAZ_TRN_FLG', 'IBND_CMNT',
       'LST_MAINT_ID', 'LST_MAINT_TS', 'AZ_LOAD_TS', 'PARTTN_YR', 'PARTTN_MO',
       'MAINLINE'],
      dtype='object')

In [1940]:
merged_df['Train Priority'].head()

0    H
1    H
2    A
3    A
4    H
Name: Train Priority, dtype: object

In [1941]:
# Box plot for dwell time by mainline status
fig = go.Figure()

# Add box plot for non-mainline trains (MAINLINE=0)
non_mainline_data = merged_df[merged_df['MAINLINE'] == 0]['DWELL_TIME']
fig.add_trace(go.Box(y=non_mainline_data, name='Non-Mainline'))

# Add box plot for mainline trains (MAINLINE=1)
mainline_data = merged_df[merged_df['MAINLINE'] == 1]['DWELL_TIME']
fig.add_trace(go.Box(y=mainline_data, name='Mainline'))

fig.update_layout(
    title='Dwell Time Distribution: Mainline vs Non-Mainline Trains',
    xaxis_title='Train Type',
    yaxis_title='Dwell Time (hours)',
    showlegend=False
)

fig.show()

In [1942]:
# Box plot for dwell time by mainline status (without outliers)
fig = go.Figure()

# Filter for non-outliers only
non_outlier_data = merged_df[merged_df['IS_OUTLIER'] == False]

# Add box plot for non-mainline trains (MAINLINE=0)
non_mainline_data = non_outlier_data[non_outlier_data['MAINLINE'] == 0]['DWELL_TIME']
fig.add_trace(go.Box(y=non_mainline_data, name='Non-Mainline'))

# Add box plot for mainline trains (MAINLINE=1)
mainline_data = non_outlier_data[non_outlier_data['MAINLINE'] == 1]['DWELL_TIME']
fig.add_trace(go.Box(y=mainline_data, name='Mainline'))

fig.update_layout(
    title='Dwell Time Distribution: Mainline vs Non-Mainline Trains (Without Outliers)',
    xaxis_title='Train Type',
    yaxis_title='Dwell Time (hours)',
    showlegend=False
)

fig.show()

In [1943]:
    dwell_data_x_no_outliers.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,DWELL_TIME_BIN,IS_OUTLIER,IS_HOLIDAY,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard,TRN_PRTY
2,XCROBSB921A2023-09-21,MINOT,ND,C,90,63.6,1,E,N,General,...,"(0.5, 1.0]",False,0,3.0,0.0,3.0,1.0,0.0,1.0,A
3,XCROEGE922A2023-09-22,MINOT,ND,C,90,63.6,1,E,N,General,...,"(1.0, 1.5]",False,0,1.0,1.0,0.0,1.0,1.0,0.0,A
4,XCROHBG919A2023-09-19,MINOT,ND,C,90,63.6,1,E,N,General,...,"(1.0, 1.5]",False,0,6.0,4.0,2.0,5.0,3.0,2.0,A
5,XPOWBNY925A2021-09-25,MINOT,ND,C,110,80.0,1,E,N,General,...,"(0.5, 1.0]",False,0,2.0,1.0,1.0,2.0,1.0,1.0,A
6,XPOWBRE920A2021-09-20,MINOT,ND,C,110,80.0,1,E,N,General,...,"(2.0, 2.5]",False,0,2.0,0.0,2.0,1.0,0.0,1.0,A


In [1944]:
dwell_data_x.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,IS_OUTLIER,IS_HOLIDAY,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard,Train Priority,TRAIN_ID_NO
229,XKALBSB902H2020-05-02,MINOT,ND,C,2050,1308.4,1,E,N,General,...,True,0,4.0,3.0,1.0,3.0,3.0,0.0,H,XKALBSB902H
382,XINBOTV902H2020-05-02,MINOT,ND,C,2160,1334.9,1,E,N,General,...,False,0,5.0,2.0,3.0,3.0,1.0,2.0,H,XINBOTV902H
246,XKAHOTV915A2020-05-15,MINOT,ND,C,2050,1308.4,1,E,N,General,...,False,0,2.0,1.0,1.0,1.0,0.0,1.0,A,XKAHOTV915A
538,XKAHOTV906A2020-06-06,MINOT,ND,C,2350,1434.5,1,E,N,General,...,False,0,8.0,3.0,5.0,5.0,2.0,3.0,A,XKAHOTV906A
482,XKALHBG911H2020-06-11,MINOT,ND,C,2350,1434.5,1,E,N,General,...,True,0,7.0,3.0,4.0,5.0,1.0,4.0,H,XKALHBG911H


In [1945]:
dwell_data_x.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO'],
      dtype='object')

In [1946]:
df_clean.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'INSPECTION_REQUIRED',
       'TA_DAY', 'TA_MONTH', 'TA_HOUR', 'IS_WEEKEND', 'DAY_TYPE', 'TD_DAY',
       'TD_MONTH', 'TD_HOUR'],
      dtype='object')

In [1947]:
df_clean.head()

Unnamed: 0,TRAIN_ID,STN_333,STN_ST,STN_TYPE_CD,STN_SEQ_NBR,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,IS_HOLIDAY,INSPECTION_REQUIRED,TA_DAY,TA_MONTH,TA_HOUR,IS_WEEKEND,DAY_TYPE,TD_DAY,TD_MONTH,TD_HOUR
2,XCROBSB921A2023-09-21,MINOT,ND,C,90,63.6,1,E,N,General,...,0,N,Friday,September,0,0,Weekday,Friday,September,1
3,XCROEGE922A2023-09-22,MINOT,ND,C,90,63.6,1,E,N,General,...,0,N,Friday,September,23,0,Weekday,Saturday,September,0
4,XCROHBG919A2023-09-19,MINOT,ND,C,90,63.6,1,E,N,General,...,0,N,Tuesday,September,21,0,Weekday,Tuesday,September,22
5,XPOWBNY925A2021-09-25,MINOT,ND,C,110,80.0,1,E,N,General,...,0,N,Saturday,September,20,1,Weekend,Saturday,September,21
6,XPOWBRE920A2021-09-20,MINOT,ND,C,110,80.0,1,E,N,General,...,0,N,Tuesday,September,1,0,Weekday,Tuesday,September,3


In [1948]:
merged_df.columns

Index(['TRAIN_ID', 'STN_333_x', 'STN_ST_x', 'STN_TYPE_CD', 'STN_SEQ_NBR_x',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO', 'TRN_SYM', 'TRN_DAY',
       'TRN_TYPE', 'TRN_SECT', 'STN_333_y', 'STN_ST_y', 'EVT_CD', 'EVST_CD',
       'PROC_DTTM', 'STN_SEQ_NBR_y', 'EVT_DT', 'EVT_TM', 'CANC_USER_ID',
       'LOG_USER', 'LOG_ACTION', 'PGM_NME', 'LOG_TID', 'TRK_NUMB', 'FILL_CD',
       'TRN_PRTY', 'CO_ABBR', 'COMMENTS', 'HAZ_TRN_FLG', 'IBND_CMNT',
       'LST_MAINT_ID', 'LST_MAINT_TS', 'AZ_LOAD_TS', 'PARTTN_YR', 'PARTTN_MO',
       'MAINLINE'],
      dtype='object')

In [1949]:
merged_df['TRN_TYPE'].head(10)

0    NaN
1    NaN
2    NaN
3      X
4      X
5      X
6      X
7      X
8      X
9      X
Name: TRN_TYPE, dtype: object

In [1950]:
merged_df.head()

Unnamed: 0,TRAIN_ID,STN_333_x,STN_ST_x,STN_TYPE_CD,STN_SEQ_NBR_x,TRN_MILES_TOT_TD,crew_order,DPT_DIR,REQ_INSP,case,...,CO_ABBR,COMMENTS,HAZ_TRN_FLG,IBND_CMNT,LST_MAINT_ID,LST_MAINT_TS,AZ_LOAD_TS,PARTTN_YR,PARTTN_MO,MAINLINE
0,XKALBSB902H2020-05-02,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,0
1,XINBOTV902H2020-05-02,MINOT,ND,C,2160,1334.9,1,E,N,General,...,,,,,,,,,,0
2,XKAHOTV915A2020-05-15,MINOT,ND,C,2050,1308.4,1,E,N,General,...,,,,,,,,,,0
3,XKAHOTV906A2020-06-06,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,Pln Depart Time modified,,,ZDADMIN,2020-06-11T23:28:40.638-05:00,2020-06-14T04:31:07.680-05:00,2020.0,6.0,0
4,XKALHBG911H2020-06-11,MINOT,ND,C,2350,1434.5,1,E,N,General,...,BNSF,,,,ZDADMIN,2020-06-14T09:40:43.117-05:00,2020-06-17T04:30:37.692-05:00,2020.0,6.0,0


In [1951]:
# Select features 
features = ['IS_HOLIDAY', 'AllTrains', 'MainlineTrains', 'YardTrains', 
            'PriorityTrains', 'PriorityMainline', 'PriorityYard']

# Calculate correlation matrix
correlation_matrix = dwell_data_x_no_outliers[features].corr()

correlation_matrix.round(2)

Unnamed: 0,IS_HOLIDAY,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard
IS_HOLIDAY,1.0,0.04,0.01,0.04,0.06,0.01,0.07
AllTrains,0.04,1.0,0.5,0.83,0.85,0.38,0.75
MainlineTrains,0.01,0.5,1.0,-0.07,0.4,0.82,-0.03
YardTrains,0.04,0.83,-0.07,1.0,0.73,-0.1,0.88
PriorityTrains,0.06,0.85,0.4,0.73,1.0,0.47,0.86
PriorityMainline,0.01,0.38,0.82,-0.1,0.47,1.0,-0.05
PriorityYard,0.07,0.75,-0.03,0.88,0.86,-0.05,1.0


In [1952]:
import plotly.graph_objects as go

# Create heatmap
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    text=correlation_matrix.round(2),
    texttemplate='%{text}',
    colorscale='RdBu',
    zmid=0
))

fig.update_layout(
    title='Feature Correlation Heatmap',
    width=800,
    height=700
)

fig.show()

In [1953]:
merged_df.columns

Index(['TRAIN_ID', 'STN_333_x', 'STN_ST_x', 'STN_TYPE_CD', 'STN_SEQ_NBR_x',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO', 'TRN_SYM', 'TRN_DAY',
       'TRN_TYPE', 'TRN_SECT', 'STN_333_y', 'STN_ST_y', 'EVT_CD', 'EVST_CD',
       'PROC_DTTM', 'STN_SEQ_NBR_y', 'EVT_DT', 'EVT_TM', 'CANC_USER_ID',
       'LOG_USER', 'LOG_ACTION', 'PGM_NME', 'LOG_TID', 'TRK_NUMB', 'FILL_CD',
       'TRN_PRTY', 'CO_ABBR', 'COMMENTS', 'HAZ_TRN_FLG', 'IBND_CMNT',
       'LST_MAINT_ID', 'LST_MAINT_TS', 'AZ_LOAD_TS', 'PARTTN_YR', 'PARTTN_MO',
       'MAINLINE'],
      dtype='object')

In [1954]:
dwell_data_x_no_outliers.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'TRN_PRTY'],
      dtype='object')

In [1955]:
df_clean.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'INSPECTION_REQUIRED',
       'TA_DAY', 'TA_MONTH', 'TA_HOUR', 'IS_WEEKEND', 'DAY_TYPE', 'TD_DAY',
       'TD_MONTH', 'TD_HOUR'],
      dtype='object')

In [1956]:
dwell_data_x_no_outliers.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'TRN_PRTY'],
      dtype='object')

In [1957]:
non_outlier_data.columns

Index(['TRAIN_ID', 'STN_333_x', 'STN_ST_x', 'STN_TYPE_CD', 'STN_SEQ_NBR_x',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'REQ_INSP', 'case', 'TA',
       'TD', 'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'AllTrains',
       'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline',
       'PriorityYard', 'Train Priority', 'TRAIN_ID_NO', 'TRN_SYM', 'TRN_DAY',
       'TRN_TYPE', 'TRN_SECT', 'STN_333_y', 'STN_ST_y', 'EVT_CD', 'EVST_CD',
       'PROC_DTTM', 'STN_SEQ_NBR_y', 'EVT_DT', 'EVT_TM', 'CANC_USER_ID',
       'LOG_USER', 'LOG_ACTION', 'PGM_NME', 'LOG_TID', 'TRK_NUMB', 'FILL_CD',
       'TRN_PRTY', 'CO_ABBR', 'COMMENTS', 'HAZ_TRN_FLG', 'IBND_CMNT',
       'LST_MAINT_ID', 'LST_MAINT_TS', 'AZ_LOAD_TS', 'PARTTN_YR', 'PARTTN_MO',
       'MAINLINE'],
      dtype='object')

In [1958]:
# Find the actual common columns between df_clean and non_outlier_data
common_columns = list(set(df_clean.columns) & set(non_outlier_data.columns))
print("Common columns between df_clean and non_outlier_data:")
print(sorted(common_columns))
print(f"\nNumber of common columns: {len(common_columns)}")

Common columns between df_clean and non_outlier_data:
['ArrivalDestination', 'DATE', 'DESTINATION', 'DISTANCE', 'DPT_DIR', 'DWELL_TIME', 'DWELL_TIME_BIN', 'IS_HOLIDAY', 'IS_OUTLIER', 'LAST_CREW_STATION', 'REQ_INSP', 'SECOND_LAST_CREW_STATION', 'STN_SEQ_NBR_DEST', 'STN_TYPE_CD', 'TA', 'TD', 'TRAIN_ID', 'TRAVEL_TIME', 'TRN_MILES_TOT_DEST', 'TRN_MILES_TOT_TD', 'case', 'crew_order']

Number of common columns: 22


In [1959]:
# First, let's drop the TD columns we don't need from df_clean
df_clean = df_clean.drop(['TD_DAY', 'TD_MONTH', 'TD_HOUR'], axis=1)

# Check and convert datetime columns - using UTC aware
df_clean['TD'] = pd.to_datetime(df_clean['TD'], utc=True)
df_clean['TA'] = pd.to_datetime(df_clean['TA'], utc=True)
df_clean['DATE'] = pd.to_datetime(df_clean['DATE'], utc=True)

non_outlier_data['TD'] = pd.to_datetime(non_outlier_data['TD'], utc=True)
non_outlier_data['TA'] = pd.to_datetime(non_outlier_data['TA'], utc=True)
non_outlier_data['DATE'] = pd.to_datetime(non_outlier_data['DATE'], utc=True)

# Verify the data types
print("df_clean datetime columns:")
print(df_clean[['TA', 'TD', 'DATE']].dtypes)
print("\nnon_outlier_data datetime columns:")
print(non_outlier_data[['TA', 'TD', 'DATE']].dtypes)

# Check the shape of both dataframes
print(f"\ndf_clean shape: {df_clean.shape}")
print(f"non_outlier_data shape: {non_outlier_data.shape}")

# Show a sample to verify everything looks good
print("\nSample of df_clean dates:")
print(df_clean[['TRAIN_ID', 'TA', 'TD', 'DATE']].head(3))

df_clean datetime columns:
TA      datetime64[ns, UTC]
TD      datetime64[ns, UTC]
DATE    datetime64[ns, UTC]
dtype: object

non_outlier_data datetime columns:
TA      datetime64[ns, UTC]
TD      datetime64[ns, UTC]
DATE    datetime64[ns, UTC]
dtype: object

df_clean shape: (403, 31)
non_outlier_data shape: (403, 63)

Sample of df_clean dates:
                TRAIN_ID                        TA                        TD  \
2  XCROBSB921A2023-09-21 2023-09-22 00:53:00+00:00 2023-09-22 01:49:00+00:00   
3  XCROEGE922A2023-09-22 2023-09-22 23:36:00+00:00 2023-09-23 00:37:00+00:00   
4  XCROHBG919A2023-09-19 2023-09-19 21:21:00+00:00 2023-09-19 22:45:00+00:00   

                       DATE  
2 2023-09-21 00:00:00+00:00  
3 2023-09-22 00:00:00+00:00  
4 2023-09-19 00:00:00+00:00  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1960]:
# Merge the dataframes using the common columns
features_df = pd.merge(df_clean, non_outlier_data, 
                     on=['TRAIN_ID', 'DATE', 'DESTINATION', 'ArrivalDestination', 
                         'DPT_DIR', 'REQ_INSP', 'case', 'TA', 'TD', 
                         'TRN_MILES_TOT_TD', 'crew_order', 'TRN_MILES_TOT_DEST', 
                         'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION', 
                         'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 
                         'DWELL_TIME_BIN', 'IS_OUTLIER', 'IS_HOLIDAY', 'STN_TYPE_CD'], 
                     how='inner')

# Define the features order
features_order = ['REQ_INSP', 'IS_HOLIDAY', 'TRN_PRTY', 'MAINLINE', 'TA_DAY', 'DAY_TYPE', 
                 'TA_MONTH', 'TA_HOUR', 'AllTrains', 'MainlineTrains', 
                 'YardTrains', 'PriorityTrains', 'PriorityMainline', 'PriorityYard']

# Reorder columns - put features at the end
other_columns = [col for col in features_df.columns if col not in features_order]
features_df = features_df[other_columns + features_order]

# Check the result
print(f"Features dataframe shape: {features_df.shape}")
print(f"\nLast 15 columns (should be your features):")
for col in features_df.columns[-15:]:
   print(f"  - {col}")
   
# Verify all required features are present
missing_features = [f for f in features_order if f not in features_df.columns]
if missing_features:
   print(f"\nMissing features: {missing_features}")
else:
   print("\nAll required features are present!")

Features dataframe shape: (403, 72)

Last 15 columns (should be your features):
  - PARTTN_MO
  - REQ_INSP
  - IS_HOLIDAY
  - TRN_PRTY
  - MAINLINE
  - TA_DAY
  - DAY_TYPE
  - TA_MONTH
  - TA_HOUR
  - AllTrains
  - MainlineTrains
  - YardTrains
  - PriorityTrains
  - PriorityMainline
  - PriorityYard

All required features are present!


In [1961]:
# Rename the feature columns
rename_mapping = {
   'REQ_INSP': 'Inspection Requirement',
   'IS_HOLIDAY': 'Is Holiday',
   'TRN_PRTY': 'Train Priority',
   'MAINLINE': 'Mainline',
   'TA_DAY': 'Day of Week',
   'DAY_TYPE': 'Day Type',
   'TA_MONTH': 'Month',
   'TA_HOUR': 'Hour of Day',
   'AllTrains': 'AllTrains',
   'MainlineTrains': 'MainlineTrains',
   'YardTrains': 'YardTrains',
   'PriorityTrains': 'PriorityTrains',
   'PriorityMainline': 'PriorityMainline',
   'PriorityYard': 'PriorityYard'
}

# Apply the renaming
features_df = features_df.rename(columns=rename_mapping)

# Check the last columns to verify renaming
print("Last 15 columns after renaming:")
for col in features_df.columns[-15:]:
   print(f"  - {col}")

Last 15 columns after renaming:
  - PARTTN_MO
  - Inspection Requirement
  - Is Holiday
  - Train Priority
  - Mainline
  - Day of Week
  - Day Type
  - Month
  - Hour of Day
  - AllTrains
  - MainlineTrains
  - YardTrains
  - PriorityTrains
  - PriorityMainline
  - PriorityYard


In [1962]:
# Display the first few rows showing TRAIN_ID and all features
print("Features DataFrame - First 5 rows:")
print(features_df.head())

# Also show just the feature columns to see them more clearly
feature_cols = ['Inspection Requirement', 'Is Holiday', 'Train Priority', 'Mainline', 
               'Day of Week', 'Day Type', 'Month', 'Hour of Day', 'AllTrains', 
               'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline', 'PriorityYard']

Features DataFrame - First 5 rows:
                TRAIN_ID STN_333 STN_ST STN_TYPE_CD  STN_SEQ_NBR  \
0  XCROBSB921A2023-09-21   MINOT     ND           C           90   
1  XCROEGE922A2023-09-22   MINOT     ND           C           90   
2  XCROHBG919A2023-09-19   MINOT     ND           C           90   
3  XPOWBNY925A2021-09-25   MINOT     ND           C          110   
4  XPOWBRE920A2021-09-20   MINOT     ND           C          110   

   TRN_MILES_TOT_TD  crew_order DPT_DIR     case                        TA  \
0              63.6           1       E  General 2023-09-22 00:53:00+00:00   
1              63.6           1       E  General 2023-09-22 23:36:00+00:00   
2              63.6           1       E  General 2023-09-19 21:21:00+00:00   
3              80.0           1       E  General 2021-09-25 20:49:00+00:00   
4              80.0           1       E  General 2021-09-21 01:41:00+00:00   

   ... Day of Week Day Type      Month  Hour of Day  AllTrains  \
0  ...      Friday  W

In [1963]:
features_df[['TRAIN_ID'] + feature_cols].head()

Unnamed: 0,TRAIN_ID,Inspection Requirement,Is Holiday,Train Priority,Train Priority.1,Mainline,Day of Week,Day Type,Month,Hour of Day,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard
0,XCROBSB921A2023-09-21,N,0,A,A,1,Friday,Weekday,September,0,3.0,0.0,3.0,1.0,0.0,1.0
1,XCROEGE922A2023-09-22,N,0,A,A,1,Friday,Weekday,September,23,1.0,1.0,0.0,1.0,1.0,0.0
2,XCROHBG919A2023-09-19,N,0,A,A,0,Tuesday,Weekday,September,21,6.0,4.0,2.0,5.0,3.0,2.0
3,XPOWBNY925A2021-09-25,N,0,A,A,0,Saturday,Weekend,September,20,2.0,1.0,1.0,2.0,1.0,1.0
4,XPOWBRE920A2021-09-20,N,0,A,A,0,Tuesday,Weekday,September,1,2.0,0.0,2.0,1.0,0.0,1.0


In [1964]:
features_df['Inspection Requirement'].unique()

array(['N', 'C', 'V', 'X'], dtype=object)

In [1965]:
# Change Inspection Requirement values: keep 'N' as 'N', change all others to 'Y'
features_df['Inspection Requirement'] = features_df['Inspection Requirement'].apply(lambda x: 'N' if x == 'N' else 'Y')

# Verify the change
print(features_df['Inspection Requirement'].unique())

['N' 'Y']


In [1966]:
# Remove duplicate columns
features_df = features_df.loc[:, ~features_df.columns.duplicated()]

In [1967]:
features_df['Train Priority'].unique()

array(['A', 'H', 'M', 'E'], dtype=object)

In [1968]:
features_df[['TRAIN_ID'] + feature_cols].head()

Unnamed: 0,TRAIN_ID,Inspection Requirement,Is Holiday,Train Priority,Mainline,Day of Week,Day Type,Month,Hour of Day,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard
0,XCROBSB921A2023-09-21,N,0,A,1,Friday,Weekday,September,0,3.0,0.0,3.0,1.0,0.0,1.0
1,XCROEGE922A2023-09-22,N,0,A,1,Friday,Weekday,September,23,1.0,1.0,0.0,1.0,1.0,0.0
2,XCROHBG919A2023-09-19,N,0,A,0,Tuesday,Weekday,September,21,6.0,4.0,2.0,5.0,3.0,2.0
3,XPOWBNY925A2021-09-25,N,0,A,0,Saturday,Weekend,September,20,2.0,1.0,1.0,2.0,1.0,1.0
4,XPOWBRE920A2021-09-20,N,0,A,0,Tuesday,Weekday,September,1,2.0,0.0,2.0,1.0,0.0,1.0


In [1969]:
# Create a copy for numeric conversion
numeric_df = features_df.copy()

# Direct mapping for categorical variables
numeric_df['Day of Week'] = numeric_df['Day of Week'].map({'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6})
numeric_df['Month'] = numeric_df['Month'].map({'January': 0, 'February': 1, 'March': 2, 'April': 3, 'May': 4, 'June': 5, 'July': 6, 'August': 7, 'September': 8, 'October': 9, 'November': 10, 'December': 11})
numeric_df['Inspection Requirement'] = numeric_df['Inspection Requirement'].map({'N': 0, 'Y': 1})
numeric_df['Train Priority'] = numeric_df['Train Priority'].map({'A': 0, 'H': 1, 'M': 2, 'E': 3})
numeric_df['Day Type'] = numeric_df['Day Type'].map({'Weekday': 0, 'Weekend': 1})

# Select all features
features = ['Inspection Requirement', 'Is Holiday', 'Train Priority', 'Mainline', 
           'Day of Week', 'Day Type', 'Month', 'Hour of Day', 'AllTrains', 
           'MainlineTrains', 'YardTrains', 'PriorityTrains', 'PriorityMainline', 'PriorityYard']

# Calculate correlation matrix
correlation_matrix = numeric_df[features].corr()

# Display correlation matrix
correlation_matrix

Unnamed: 0,Inspection Requirement,Is Holiday,Train Priority,Mainline,Day of Week,Day Type,Month,Hour of Day,AllTrains,MainlineTrains,YardTrains,PriorityTrains,PriorityMainline,PriorityYard
Inspection Requirement,1.0,-0.025445,0.048974,-0.073767,0.084912,0.051743,0.058548,-0.031137,0.045015,0.009735,0.045533,0.002479,-0.013921,0.010893
Is Holiday,-0.025445,1.0,-0.060673,-0.073767,-0.023934,-0.057622,0.075617,-0.070378,0.036306,0.009735,0.035506,0.063654,0.005973,0.068653
Train Priority,0.048974,-0.060673,1.0,-0.081876,0.056212,0.064128,0.061014,-0.006549,-0.035732,-0.038634,-0.016158,0.000316,0.008357,-0.004495
Mainline,-0.073767,-0.073767,-0.081876,1.0,0.010613,-0.019687,0.064662,0.043761,-0.008542,0.033921,-0.031769,-0.020289,0.044122,-0.048609
Day of Week,0.084912,-0.023934,0.056212,0.010613,1.0,0.782452,0.052855,0.04219,0.02903,-0.002897,0.035296,0.037863,-0.013289,0.050616
Day Type,0.051743,-0.057622,0.064128,-0.019687,0.782452,1.0,0.058383,0.0751,0.03206,0.022762,0.022193,0.036714,0.016426,0.032059
Month,0.058548,0.075617,0.061014,0.064662,0.052855,0.058383,1.0,-0.012265,0.088764,0.098141,0.038737,0.05503,0.081488,0.015031
Hour of Day,-0.031137,-0.070378,-0.006549,0.043761,0.04219,0.0751,-0.012265,1.0,0.036663,0.057864,0.004795,0.031778,0.068875,-0.00399
AllTrains,0.045015,0.036306,-0.035732,-0.008542,0.02903,0.03206,0.088764,0.036663,1.0,0.499478,0.828367,0.852691,0.379404,0.745801
MainlineTrains,0.009735,0.009735,-0.038634,0.033921,-0.002897,0.022762,0.098141,0.057864,0.499478,1.0,-0.071553,0.396811,0.824143,-0.028974


In [1970]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(
   z=correlation_matrix.values,
   x=features,
   y=features,
   text=correlation_matrix.round(2).values,
   texttemplate='%{text}',
   colorscale='Blues'
))

fig.update_layout(
   width=1000,
   height=800,
   xaxis_tickangle=-45
)

fig.show()

In [1971]:


fig = go.Figure(data=go.Heatmap(
   z=correlation_matrix.values,
   x=features,
   y=features,
   text=correlation_matrix.round(2).values,
   texttemplate='%{text}',
   colorscale='Blues',
   zmin=0,
   zmax=1
))

fig.update_layout(
   width=1000,
   height=800,
   xaxis_tickangle=-45
)

fig.show()

In [1972]:
features_df.columns

Index(['TRAIN_ID', 'STN_333', 'STN_ST', 'STN_TYPE_CD', 'STN_SEQ_NBR',
       'TRN_MILES_TOT_TD', 'crew_order', 'DPT_DIR', 'case', 'TA', 'TD',
       'DESTINATION', 'ArrivalDestination', 'TRN_MILES_TOT_DEST',
       'STN_SEQ_NBR_DEST', 'DISTANCE', 'LAST_CREW_STATION',
       'SECOND_LAST_CREW_STATION', 'TRAVEL_TIME', 'DWELL_TIME', 'DATE',
       'DWELL_TIME_BIN', 'IS_OUTLIER', 'INSPECTION_REQUIRED', 'IS_WEEKEND',
       'STN_333_x', 'STN_ST_x', 'STN_SEQ_NBR_x', 'Train Priority',
       'TRAIN_ID_NO', 'TRN_SYM', 'TRN_DAY', 'TRN_TYPE', 'TRN_SECT',
       'STN_333_y', 'STN_ST_y', 'EVT_CD', 'EVST_CD', 'PROC_DTTM',
       'STN_SEQ_NBR_y', 'EVT_DT', 'EVT_TM', 'CANC_USER_ID', 'LOG_USER',
       'LOG_ACTION', 'PGM_NME', 'LOG_TID', 'TRK_NUMB', 'FILL_CD', 'CO_ABBR',
       'COMMENTS', 'HAZ_TRN_FLG', 'IBND_CMNT', 'LST_MAINT_ID', 'LST_MAINT_TS',
       'AZ_LOAD_TS', 'PARTTN_YR', 'PARTTN_MO', 'Inspection Requirement',
       'Is Holiday', 'Mainline', 'Day of Week', 'Day Type', 'Month',
       'Hour 

In [1973]:
col_needed = ['TRAIN_ID','TA','Inspection Requirement', 'Is Holiday', 'Mainline', 'Day of Week', 'Month', 'Hour of Day', 'AllTrains', 'MainlineTrains','DWELL_TIME']

In [1974]:
features_df = features_df[col_needed]
features_df.head()

Unnamed: 0,TRAIN_ID,TA,Inspection Requirement,Is Holiday,Mainline,Day of Week,Month,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME
0,XCROBSB921A2023-09-21,2023-09-22 00:53:00+00:00,N,0,1,Friday,September,0,3.0,0.0,0.933
1,XCROEGE922A2023-09-22,2023-09-22 23:36:00+00:00,N,0,1,Friday,September,23,1.0,1.0,1.017
2,XCROHBG919A2023-09-19,2023-09-19 21:21:00+00:00,N,0,0,Tuesday,September,21,6.0,4.0,1.4
3,XPOWBNY925A2021-09-25,2021-09-25 20:49:00+00:00,N,0,0,Saturday,September,20,2.0,1.0,0.983
4,XPOWBRE920A2021-09-20,2021-09-21 01:41:00+00:00,N,0,0,Tuesday,September,1,2.0,0.0,2.1


In [1975]:
features_df_encded = pd.get_dummies(features_df,columns= ['Day of Week','Month','Inspection Requirement'
                                                                        ], dtype = int)
features_df_encded.head()

Unnamed: 0,TRAIN_ID,TA,Is Holiday,Mainline,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME,Day of Week_Friday,Day of Week_Monday,...,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,Inspection Requirement_N,Inspection Requirement_Y
0,XCROBSB921A2023-09-21,2023-09-22 00:53:00+00:00,0,1,0,3.0,0.0,0.933,1,0,...,0,0,0,0,0,0,0,1,1,0
1,XCROEGE922A2023-09-22,2023-09-22 23:36:00+00:00,0,1,23,1.0,1.0,1.017,1,0,...,0,0,0,0,0,0,0,1,1,0
2,XCROHBG919A2023-09-19,2023-09-19 21:21:00+00:00,0,0,21,6.0,4.0,1.4,0,0,...,0,0,0,0,0,0,0,1,1,0
3,XPOWBNY925A2021-09-25,2021-09-25 20:49:00+00:00,0,0,20,2.0,1.0,0.983,0,0,...,0,0,0,0,0,0,0,1,1,0
4,XPOWBRE920A2021-09-20,2021-09-21 01:41:00+00:00,0,0,1,2.0,0.0,2.1,0,0,...,0,0,0,0,0,0,0,1,1,0


In [1976]:
features_df_encded.dtypes

TRAIN_ID                                 object
TA                          datetime64[ns, UTC]
Is Holiday                                int64
Mainline                                  int64
Hour of Day                               int32
AllTrains                               float64
MainlineTrains                          float64
DWELL_TIME                              float64
Day of Week_Friday                        int64
Day of Week_Monday                        int64
Day of Week_Saturday                      int64
Day of Week_Sunday                        int64
Day of Week_Thursday                      int64
Day of Week_Tuesday                       int64
Day of Week_Wednesday                     int64
Month_April                               int64
Month_August                              int64
Month_December                            int64
Month_February                            int64
Month_January                             int64
Month_July                              

In [1977]:
features_df_encded['TA'] = features_df_encded['TA'].astype(str)
features_df_encded['TA'].unique()

array(['2023-09-22 00:53:00+00:00', '2023-09-22 23:36:00+00:00',
       '2023-09-19 21:21:00+00:00', '2021-09-25 20:49:00+00:00',
       '2021-09-21 01:41:00+00:00', '2021-10-01 03:45:00+00:00',
       '2021-10-02 01:05:00+00:00', '2021-09-22 05:52:00+00:00',
       '2021-09-08 03:13:00+00:00', '2020-06-16 08:16:00+00:00',
       '2023-11-13 23:25:00+00:00', '2023-10-12 04:36:00+00:00',
       '2022-04-21 07:16:00+00:00', '2022-10-04 09:56:00+00:00',
       '2021-09-12 05:47:00+00:00', '2020-09-01 06:09:00+00:00',
       '2021-10-09 04:55:00+00:00', '2022-10-05 11:30:00+00:00',
       '2023-01-27 13:18:00+00:00', '2023-03-08 05:49:00+00:00',
       '2020-11-05 21:01:00+00:00', '2023-03-31 12:46:00+00:00',
       '2020-10-06 23:55:00+00:00', '2023-02-03 15:45:00+00:00',
       '2020-06-15 23:05:00+00:00', '2023-05-13 12:19:00+00:00',
       '2023-10-02 22:13:00+00:00', '2022-02-10 14:17:00+00:00',
       '2023-10-05 08:53:00+00:00', '2023-02-03 06:22:00+00:00',
       '2022-06-30 04:44:

In [1978]:
features_df_encded.head()

Unnamed: 0,TRAIN_ID,TA,Is Holiday,Mainline,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME,Day of Week_Friday,Day of Week_Monday,...,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,Inspection Requirement_N,Inspection Requirement_Y
0,XCROBSB921A2023-09-21,2023-09-22 00:53:00+00:00,0,1,0,3.0,0.0,0.933,1,0,...,0,0,0,0,0,0,0,1,1,0
1,XCROEGE922A2023-09-22,2023-09-22 23:36:00+00:00,0,1,23,1.0,1.0,1.017,1,0,...,0,0,0,0,0,0,0,1,1,0
2,XCROHBG919A2023-09-19,2023-09-19 21:21:00+00:00,0,0,21,6.0,4.0,1.4,0,0,...,0,0,0,0,0,0,0,1,1,0
3,XPOWBNY925A2021-09-25,2021-09-25 20:49:00+00:00,0,0,20,2.0,1.0,0.983,0,0,...,0,0,0,0,0,0,0,1,1,0
4,XPOWBRE920A2021-09-20,2021-09-21 01:41:00+00:00,0,0,1,2.0,0.0,2.1,0,0,...,0,0,0,0,0,0,0,1,1,0


In [1979]:
features_df_encded_new = features_df_encded.drop(columns ='TRAIN_ID')
# features_df_encded.drop(columns='TRAIN_ID',inplace=True)

In [1980]:
features_df_encded_new.head()

Unnamed: 0,TA,Is Holiday,Mainline,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,...,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,Inspection Requirement_N,Inspection Requirement_Y
0,2023-09-22 00:53:00+00:00,0,1,0,3.0,0.0,0.933,1,0,0,...,0,0,0,0,0,0,0,1,1,0
1,2023-09-22 23:36:00+00:00,0,1,23,1.0,1.0,1.017,1,0,0,...,0,0,0,0,0,0,0,1,1,0
2,2023-09-19 21:21:00+00:00,0,0,21,6.0,4.0,1.4,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,2021-09-25 20:49:00+00:00,0,0,20,2.0,1.0,0.983,0,0,1,...,0,0,0,0,0,0,0,1,1,0
4,2021-09-21 01:41:00+00:00,0,0,1,2.0,0.0,2.1,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [1981]:
train_data_dwell = features_df_encded_new[features_df_encded_new['TA']< '2024-01-01 00:00:00+00:00']
test_data_dwell  = features_df_encded_new[features_df_encded_new['TA']>= '2024-01-01 00:00:00+00:00']
print(train_data_dwell.shape)
print(test_data_dwell.shape)

(318, 28)
(85, 28)


In [1982]:
train_data_dwell.drop(columns = 'TA',inplace = True)
test_data_dwell.drop(columns = 'TA',inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [1983]:
# Dwell time Model_Training
from sklearn.linear_model import ElasticNet
regressor_EN = ElasticNet(alpha= 0.00001,
                          l1_ratio= 1)
X_train_dwell , y_train_dwell = train_data_dwell.drop(columns = ['DWELL_TIME']) , train_data_dwell['DWELL_TIME']
X_test_dwell,y_test_dwell = test_data_dwell.drop(columns = ['DWELL_TIME']) , test_data_dwell['DWELL_TIME']
X_train_dwell = X_train_dwell.reset_index(drop = True)
y_train_dwell = y_train_dwell.reset_index(drop = True)
# Model Fitting
model_DT = regressor_EN.fit(X_train_dwell ,y_train_dwell)


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.592e+00, tolerance: 1.476e-02



In [1984]:
features_df_encded.columns

Index(['TRAIN_ID', 'TA', 'Is Holiday', 'Mainline', 'Hour of Day', 'AllTrains',
       'MainlineTrains', 'DWELL_TIME', 'Day of Week_Friday',
       'Day of Week_Monday', 'Day of Week_Saturday', 'Day of Week_Sunday',
       'Day of Week_Thursday', 'Day of Week_Tuesday', 'Day of Week_Wednesday',
       'Month_April', 'Month_August', 'Month_December', 'Month_February',
       'Month_January', 'Month_July', 'Month_June', 'Month_March', 'Month_May',
       'Month_November', 'Month_October', 'Month_September',
       'Inspection Requirement_N', 'Inspection Requirement_Y'],
      dtype='object')

In [1985]:
test_data_dwell_new  = features_df_encded[features_df_encded['TA']>= '2024-01-01 00:00:00+00:00']

In [1986]:
cols = ['Is Holiday', 'Mainline', 'Hour of Day', 'AllTrains',
       'MainlineTrains', 'DWELL_TIME', 'Day of Week_Friday',
       'Day of Week_Monday', 'Day of Week_Saturday', 'Day of Week_Sunday',
       'Day of Week_Thursday', 'Day of Week_Tuesday', 'Day of Week_Wednesday',
       'Month_April', 'Month_August', 'Month_December', 'Month_February',
       'Month_January', 'Month_July', 'Month_June', 'Month_March', 'Month_May',
       'Month_November', 'Month_October', 'Month_September',
       'Inspection Requirement_N', 'Inspection Requirement_Y']

In [1987]:
test_data_dwell_new.columns

Index(['TRAIN_ID', 'TA', 'Is Holiday', 'Mainline', 'Hour of Day', 'AllTrains',
       'MainlineTrains', 'DWELL_TIME', 'Day of Week_Friday',
       'Day of Week_Monday', 'Day of Week_Saturday', 'Day of Week_Sunday',
       'Day of Week_Thursday', 'Day of Week_Tuesday', 'Day of Week_Wednesday',
       'Month_April', 'Month_August', 'Month_December', 'Month_February',
       'Month_January', 'Month_July', 'Month_June', 'Month_March', 'Month_May',
       'Month_November', 'Month_October', 'Month_September',
       'Inspection Requirement_N', 'Inspection Requirement_Y'],
      dtype='object')

In [1988]:
test_data_dwell.columns

Index(['Is Holiday', 'Mainline', 'Hour of Day', 'AllTrains', 'MainlineTrains',
       'DWELL_TIME', 'Day of Week_Friday', 'Day of Week_Monday',
       'Day of Week_Saturday', 'Day of Week_Sunday', 'Day of Week_Thursday',
       'Day of Week_Tuesday', 'Day of Week_Wednesday', 'Month_April',
       'Month_August', 'Month_December', 'Month_February', 'Month_January',
       'Month_July', 'Month_June', 'Month_March', 'Month_May',
       'Month_November', 'Month_October', 'Month_September',
       'Inspection Requirement_N', 'Inspection Requirement_Y'],
      dtype='object')

In [1989]:
X_train_dwell.columns

Index(['Is Holiday', 'Mainline', 'Hour of Day', 'AllTrains', 'MainlineTrains',
       'Day of Week_Friday', 'Day of Week_Monday', 'Day of Week_Saturday',
       'Day of Week_Sunday', 'Day of Week_Thursday', 'Day of Week_Tuesday',
       'Day of Week_Wednesday', 'Month_April', 'Month_August',
       'Month_December', 'Month_February', 'Month_January', 'Month_July',
       'Month_June', 'Month_March', 'Month_May', 'Month_November',
       'Month_October', 'Month_September', 'Inspection Requirement_N',
       'Inspection Requirement_Y'],
      dtype='object')

In [1990]:
# # Storing the results into new dataframe.
# dwell_time_result = pd.DataFrame()
# dwell_time_result[cols] = test_data_dwell[cols]
pred_val = model_DT.predict(X_test_dwell)
dwell_time_result['DWELL_TIME'] = np.round(pred_val,3)

In [1991]:
dwell_time_result  = dwell_time_result.reset_index(drop=True)

In [1992]:
test_data_dwell_new.shape

(85, 29)

In [1993]:
test_data_dwell_new.head()

Unnamed: 0,TRAIN_ID,TA,Is Holiday,Mainline,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME,Day of Week_Friday,Day of Week_Monday,...,Month_January,Month_July,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,Inspection Requirement_N,Inspection Requirement_Y
318,XINBHBG913H2024-06-13,2024-06-14 12:36:00+00:00,0,0,12,4.0,1.0,2.75,1,0,...,0,0,1,0,0,0,0,0,1,0
319,XVAWBRE922H2024-07-22,2024-07-23 07:58:00+00:00,0,0,7,2.0,0.0,0.9,0,0,...,0,1,0,0,0,0,0,0,1,0
320,XINBOAN913H2024-04-13,2024-04-14 00:34:00+00:00,0,1,0,1.0,0.0,0.967,0,0,...,0,0,0,0,0,0,0,0,1,0
321,XKALGWN909H2024-04-09,2024-04-10 17:39:00+00:00,0,0,17,3.0,2.0,1.867,0,0,...,0,0,0,0,0,0,0,0,1,0
322,XKALAYR902H2024-07-02,2024-07-04 19:19:00+00:00,1,0,19,6.0,1.0,1.883,0,0,...,0,1,0,0,0,0,0,0,1,0


In [1994]:
total_test = pd.concat([test_data_dwell_new,dwell_time_result],axis=1)
total_test.head()

Unnamed: 0,TRAIN_ID,TA,Is Holiday,Mainline,Hour of Day,AllTrains,MainlineTrains,DWELL_TIME,Day of Week_Friday,Day of Week_Monday,...,Month_June,Month_March,Month_May,Month_November,Month_October,Month_September,Inspection Requirement_N,Inspection Requirement_Y,index,DWELL_TIME.1
318,XINBHBG913H2024-06-13,2024-06-14 12:36:00+00:00,0.0,0.0,12.0,4.0,1.0,2.75,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,
319,XVAWBRE922H2024-07-22,2024-07-23 07:58:00+00:00,0.0,0.0,7.0,2.0,0.0,0.9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,
320,XINBOAN913H2024-04-13,2024-04-14 00:34:00+00:00,0.0,1.0,0.0,1.0,0.0,0.967,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,
321,XKALGWN909H2024-04-09,2024-04-10 17:39:00+00:00,0.0,0.0,17.0,3.0,2.0,1.867,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,
322,XKALAYR902H2024-07-02,2024-07-04 19:19:00+00:00,1.0,0.0,19.0,6.0,1.0,1.883,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,
