## 1. Setup

In [1]:
# Load the rpy2 IPython extension; the %%R cells further down depend on it.
%load_ext rpy2.ipython
# autoreload mode 2: re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Select the inline matplotlib backend first, then set the default figure size.
# NOTE(review): figsize is (width, height) in inches, so (16, 100) is unusually
# tall — confirm this is intended for the plots in this notebook.
%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
# Blanket filter: hides every Python-side warning for the whole session,
# including ones that may point at real problems.
warnings.filterwarnings("ignore") # Ignore all warnings
# Narrower alternative: silence only rpy2's R runtime warnings and keep the rest.
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [2]:
%%javascript
// Disable auto-scrolling so long cell outputs are rendered in full.
// The `IPython` global is provided by the classic Notebook front end; on
// front ends that do not define it (e.g. JupyterLab) skip the patch instead
// of raising a ReferenceError in this cell.
if (typeof IPython !== 'undefined' && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports.
#
# require() only warns and returns FALSE when a package is missing, so a
# missing dependency goes unnoticed until a later cell fails. Packages the
# notebook needs are loaded with library(), which stops with an error instead.

# flexplot is treated as optional here: load it when installed, otherwise say
# so explicitly rather than emitting a warning that is easy to miss.
# NOTE(review): if later cells call flexplot functions, make this a plain
# library(flexplot) so the failure happens here.
if (requireNamespace('flexplot', quietly = TRUE)) {
    library(flexplot)
} else {
    message("Package 'flexplot' is not installed; continuing without it.")
}

library(tidyverse)
library(ggrepel)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Loading required package: flexplot
Loading required package: tidyverse
In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE,  :
  there is no package called ‘flexplot’


## 2. Load census variable file

In [4]:
%%R -o df
# Read the census-variable CSV into `df`; the cell's `-o df` flag then
# exports the resulting data frame to the Python side.
df <- read_csv(file = 'data/intermediary/2023_subway_censusvar.csv')

Rows: 18051 Columns: 50
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (28): Common Name, Equipment Description, Executive Comment, Outage Cod...
dbl  (18): GEOID, Outage, Station MRN, Station ID, Complex ID, lat, long, AD...
dttm  (4): Out of Service Date, Estimated Return to Service Date, Actual Ret...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [5]:
# Sanity check: (rows, columns) of the frame handed over from the R cell.
df.shape

(18051, 50)

## 3. Creating 'downtime' and 'external' columns for elevators

In [6]:
%%R -o df

# Add a logical `external` column: TRUE for rows whose `Common Name`
# contains an 'X', FALSE otherwise.
df <- mutate(df, external = grepl(pattern = 'X', x = `Common Name`))

In [7]:
# Create the 'downtime' column: hours elapsed between 'Out of Service Date'
# and 'Actual Return to Service Date'.

df['Out of Service Date'] = pd.to_datetime(df['Out of Service Date'])
df['Actual Return to Service Date'] = pd.to_datetime(df['Actual Return to Service Date'])

# Convert the timedelta to fractional hours. Rows where either date is
# missing end up with NaN downtime and are exported separately below.
outage_duration = df['Actual Return to Service Date'] - df['Out of Service Date']
df['downtime'] = outage_duration.dt.total_seconds() / 3600

# Rename 'Outage Code' to 'outage_code'. Rebinding `df` (instead of
# inplace=True) keeps the cell safe to re-run.
df = df.rename(columns={'Outage Code': 'outage_code'})

# Make sure the destination folders exist so the exports do not fail on a
# fresh checkout.
full_export_path = Path('data/output/2023_subway_downtime-external.csv')
missing_export_path = Path('data/missing-value/2023_null_downtime.csv')
for export_path in (full_export_path, missing_export_path):
    export_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(full_export_path, index=False)

# Missing values to be dealt with: rows whose downtime could not be computed.
df[df['downtime'].isna()].to_csv(missing_export_path, index=False)

In [8]:
# Count rows by the `external` flag (True = `Common Name` contains 'X').
# NOTE(review): this counts rows of df, which look like individual outage
# records rather than distinct elevators — confirm before quoting the numbers.
df['external'].value_counts()

external
False    17457
True       594
Name: count, dtype: int64

In [9]:
# Summary statistics for downtime, in hours. describe() skips NaN values, so
# `count` can be lower than the number of rows in df.
df['downtime'].describe()

count    15012.000000
mean         7.418431
std         55.326444
min          0.000000
25%          1.566667
50%          3.433333
75%          5.000000
max       3211.600000
Name: downtime, dtype: float64

## 4. t-test comparing downtime between MTA and external elevators

In [10]:
%%R -i df

# Two-sample t-test (t.test() defaults to the Welch unequal-variance form)
# comparing downtime of external elevators against all other elevators.
# pull() extracts `downtime` as a plain numeric vector; select() would pass
# t.test() a one-column data frame instead. Naming the two samples also gives
# the printed "data:" line a readable label.
external_downtime <- df %>% filter(external == 1) %>% pull(downtime)
mta_downtime <- df %>% filter(external == 0) %>% pull(downtime)

t.test(external_downtime, mta_downtime)


	Welch Two Sample t-test

data:  df %>% filter(external == 1) %>% select(downtime) and df %>% filter(external == 0) %>% select(downtime)
t = 7.809, df = 591.09, p-value = 2.639e-14
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 45.75343 76.50092
sample estimates:
mean of x mean of y 
66.143192  5.016017 

