## 1. Setup

In [1]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
# Default size for every matplotlib figure in this notebook, as
# (width, height) in inches.
# NOTE(review): 100 inches tall looks unusually large -- confirm it is intended.
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
# Silence every Python warning from here on. This also hides deprecation and
# data-quality warnings, so switch to the narrower, commented-out filter below
# (RRuntimeWarning only) when debugging.
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from datetime import datetime

In [2]:
%%javascript
// Disable auto-scrolling: make every output area report that it should not
// collapse into a scrollable box, whatever its line count.
//
// `IPython` is a global provided by the classic Notebook front end. Other
// front ends (e.g. JupyterLab) do not define it, so guard the patch instead of
// letting the cell fail with a ReferenceError there.
if (typeof IPython !== 'undefined' && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

# flexplot was not installed in the environment that produced the saved output
# and none of the cells visible here call it, so treat it as optional: attach it
# when available, otherwise say so explicitly instead of relying on require()'s
# easy-to-miss warning. It is attached first to keep the original masking order.
if (requireNamespace('flexplot', quietly = TRUE)) {
    library(flexplot)
} else {
    message('Optional package flexplot is not installed; skipping it.')
}

# library() stops with an error when a package is missing, whereas require()
# only warns and returns FALSE, which would let the failure surface later in an
# unrelated cell.
library(tidyverse)
library(ggrepel)
library(ggbeeswarm)


── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Loading required package: flexplot
Loading required package: tidyverse
In library(package, lib.loc = lib.loc, character.only = TRUE, logical.return = TRUE,  :
  there is no package called ‘flexplot’


## 2. Load data

In [4]:
%%R -o df
# Load the downtime CSV with readr; the `-o df` flag on the cell magic exports
# the resulting data frame to Python as `df`.
df <- readr::read_csv(file = 'data/output/2023_subway_downtime-external.csv')

Rows: 18051 Columns: 52
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (28): Common Name, Equipment Description, Executive Comment, outage_cod...
dbl  (19): GEOID, Outage, Station MRN, Station ID, Complex ID, lat, long, AD...
lgl   (1): external
dttm  (4): Out of Service Date, Estimated Return to Service Date, Actual Ret...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


## 3. Visualize the data

#### 1 -  Elevators with the highest downtime

In [5]:
# Sum downtime per 'Common Name', keep the ten largest totals, and return them
# as a two-column frame whose key column is renamed to snake_case.
df_long_downtime = (
    df.groupby('Common Name')['downtime']
    .sum()
    .sort_values(ascending=False)
    .head(10)
    .rename_axis('common_name')
    .reset_index()
)
df_long_downtime

Unnamed: 0,common_name,downtime
0,EL279X,4045.233333
1,EL295X,3792.216667
2,EL404,3510.433333
3,EL728X,3335.083333
4,EL407,3292.566667
5,EL290X,2263.3
6,EL327X,2229.816667
7,EL727X,2102.4
8,EL445X,1979.666667
9,EL287X,1728.466667


In [6]:
# Hours in 2023 (365 days x 24 h; 2023 is not a leap year). Denominator for the
# share of the year each elevator was out of service.
# NOTE(review): assumes `downtime` is measured in hours -- confirm upstream.
HOURS_IN_2023 = 365 * 24

# Descriptive attributes to attach to each of the top-10 elevators.
# De-duplicating on the key alone guarantees one row per elevator, so the merge
# stays one-to-one; de-duplicating on all four columns would keep several rows
# for an elevator whose attributes vary, and each extra row would add another
# bar segment for it in the chart.
elevator_attrs = (
    df[['Common Name', 'external', 'Stop Name', 'Daytime Routes']]
    .drop_duplicates(subset='Common Name')
    .rename(columns={
        'Common Name': 'common_name',
        'Stop Name': 'stop_name',
        'Daytime Routes': 'daytime_routes',
    })
)

# Attach the attributes and compute the fraction (0-1, not 0-100) of the year
# spent out of service; the plotting cell formats it as a percentage.
df_merged = (
    df_long_downtime
    .merge(elevator_attrs, on='common_name')
    .assign(percent_out=lambda d: d['downtime'] / HOURS_IN_2023)
)

In [7]:
%%R -i df_merged -w 800 -h 300

# Horizontal bar chart of the elevators in df_merged: one bar per elevator,
# bar length = fraction of the year out of service, fill = who manages it,
# annotated with the stop name and the routes served.
top_downtime_plot <- df_merged %>%
    # Order the bars by the share of the year out of service.
    mutate(common_name = fct_reorder(common_name, percent_out)) %>%
    ggplot() +
    aes(y = common_name, x = percent_out, fill = external) +
    geom_col() +
    labs(
        title = 'Out of service elevators are mostly managed by third parties',
        subtitle = '10 elevators with the highest percentage of downtime in 2023'
    ) +
    theme_minimal() +
    theme(
        legend.title = element_blank(),
        panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        axis.line.x = element_line(color = "grey80", linewidth = .4),
        axis.ticks.x = element_line(color = "grey80", linewidth = .4),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        axis.text.y = element_text(hjust = 0),
        plot.margin = margin(10, 15, 10, 15)
    ) +
    scale_x_continuous(
        expand = c(0, 0),
        limits = c(0, .55),
        labels = scales::label_percent()
    ) +
    # Name the labels by the value they describe so they cannot be swapped or
    # fail to line up when only one of TRUE/FALSE is present in the data.
    scale_fill_discrete(
        labels = c('FALSE' = 'MTA managed', 'TRUE' = 'Third party managed')
    ) +
    # Routes to the right of the bar end, stop name to the left. A fixed seed
    # makes ggrepel's label placement reproducible between runs.
    geom_text_repel(
        aes(label = daytime_routes),
        nudge_x = 0.05,
        segment.size = 0,
        color = 'black',
        hjust = 0.0,
        vjust = 0.5,
        seed = 42
    ) +
    geom_text_repel(
        aes(label = stop_name),
        nudge_x = -0.05,
        segment.size = 0,
        color = 'black',
        hjust = 0.0,
        vjust = 0.5,
        seed = 42
    )

# Render the figure in the notebook explicitly rather than depending on
# auto-printing of a mid-cell expression.
print(top_downtime_plot)

# Save the exact plot object instead of whatever last_plot() happens to be.
# Width and height are not given, so ggsave uses the size of the current
# graphics device. No dpi is passed: SVG is a vector format.
ggsave('top_10_elevators_downtime.svg', plot = top_downtime_plot)

Saving 11.1 x 4.17 in image
