<a href="https://colab.research.google.com/github/saminyc/Subway_Data_Analysis/blob/main/Subway_Incidents_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Subway Incident Analysis
*   Data Source: [MTA Subway Major Incidents: Beginning 2020 - January 2025](https://data.ny.gov/Transportation/MTA-Subway-Major-Incidents-Beginning-2020/j6d2-s8m2/about_data)
*   Data Overview: https://data.ny.gov/api/views/j6d2-s8m2/files/4a8a54a1-49cd-41a7-abc0-d337c7dbba2c?download=true&filename=MTA_SubwayMajorIncidents_Overview.pdf

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## File reading

In [None]:
# Location of the exported MTA major-incidents CSV (a path under /content/drive).
# NOTE(review): presumably requires Google Drive to be mounted before this cell runs — confirm.
csv_path = '/content/drive/MyDrive/Data_Analysis_Projects/Subway_Incidents_Analysis/MTA_Subway_Major_Incidents__Beginning_2020_20250207.csv'
df = pd.read_csv(csv_path)
# Preview the first rows to check the file loaded as expected.
df.head()

## Data cleaning

In [None]:
# Distinct values of the division column before any cleaning.
df.division.unique()

In [None]:
# Distinct values of the line column before any cleaning.
df.line.unique()

In [None]:
# Distinct incident categories present in the data.
df.category.unique()

In [None]:
# Distinct values of the count column. Label-based access is required here
# because `count` is also the name of a DataFrame method.
df.loc[:, 'count'].unique()

### Filling null values

In [None]:
# Replace missing division values with the placeholder 'Unlabeled'.
df = df.assign(division=df['division'].fillna('Unlabeled'))

In [None]:
# Re-check the distinct division values after filling the missing ones.
df.division.unique()

In [None]:
# Replace missing line values with the placeholder 'Unlabeled'.
df = df.assign(line=df['line'].fillna('Unlabeled'))

In [None]:
# Re-check the distinct line values after filling the missing ones.
df.line.unique()

### Converting numeric day-type codes to labels

In [None]:
# Distinct day-type codes before they are converted to labels.
df.day_type.unique()

In [None]:
# Translate the numeric day-type codes into readable labels:
#   1 -> 'weekday'
#   2 -> 'weekend'
# An explicit mapping via replace() leaves every other value untouched, so
# missing or unexpected codes are not silently counted as 'weekend', and
# re-running this cell does not turn already-converted rows into 'weekend'.
day_type_labels = {1: 'weekday', 2: 'weekend'}
df['day_type'] = df['day_type'].replace(day_type_labels)

In [None]:
# Distinct day-type values after the conversion step.
df.day_type.unique()

### Renaming columns

In [None]:
# Give the day-type column a name that describes its contents.
df = df.rename(columns={'day_type': 'weekday/weekend'})
# Preview only the first rows rather than rendering the whole frame.
df.head()

In [None]:
# The column named 'month' is parsed as a date further down, so rename it to
# 'date'. This frees the name 'month' for the derived month column.
# (The 'day_type' rename was already applied in the previous cell, so it is
# not repeated here.)
df = df.rename(columns={'month': 'date'})

## Analysis

### Major incidents occurring per month

In [None]:
# Distinct values of the date column.
df['date'].unique()

In [None]:
# Parse the date column and store the month number as a string left-padded
# with zeros to two characters (e.g. '01').
parsed_dates = pd.to_datetime(df['date'])
month_numbers = parsed_dates.dt.month
df['month'] = month_numbers.astype(str).str.zfill(2)
df.head()

In [None]:
# Parse the date column and store the calendar year of each record.
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.year
df.head()

In [None]:
# Sum the incident counts for each month value (all years combined).
monthly_counts = df.groupby('month', as_index=False)['count'].sum()
monthly_counts

In [None]:
# Line chart of the summed incident count for each month.
months = monthly_counts['month']
totals = monthly_counts['count']
plt.plot(months, totals, marker='o')
plt.title('Monthly Incident Counts')
plt.xlabel('Month')
plt.ylabel('Count')
plt.show()

### Division based incident analysis

In [None]:
# Sum the incident counts for each division.
division_count = df.groupby('division', as_index=False)['count'].sum()
division_count

In [None]:
# Bar chart of total incidents per division.
# The x tick labels come straight from the grouped data, so no separate
# hard-coded label list is passed: a per-bar `label` list must match the number
# of bars exactly, and it was never displayed because no legend is drawn.
fig, ax = plt.subplots()
bar_colors = ['tab:red', 'tab:blue', 'tab:grey']
ax.bar(division_count['division'], division_count['count'], color=bar_colors)

ax.set_xlabel('Division')
ax.set_ylabel('Count of incidents')
ax.set_title('Count of incidents per division')

plt.show()

### Subway lines experiencing the highest number of major incidents

In [None]:
# Total incidents per subway line, ordered from highest to lowest.
df_count_sorted_desc = (
    df.groupby('line')['count']
    .sum()
    .sort_values(ascending=False)
)
df_count_sorted_desc

In [None]:
# Horizontal bar chart of the per-line incident totals.
ax = df_count_sorted_desc.plot.barh(figsize=(10, 8))
ax.set_title('Subway Line Incident Counts')
ax.set_xlabel('Incident Count')
ax.set_ylabel('Subway Lines')
plt.show()

In [None]:
# First entry of the descending series, i.e. the line with the most incidents.
df_count_sorted_desc.iloc[:1]

### Subway lines prone to specific types of incidents

In [None]:
# For every subway line, find the incident category with the highest TOTAL count.
# Counts are summed per (line, category) pair first. Taking idxmax on the raw
# rows would only pick each line's single largest record, not the category that
# accumulates the most incidents over the whole dataset.
# dropna=False keeps rows whose category is missing, so no line is dropped.
line_category_totals = df.groupby(['line', 'category'], as_index=False, dropna=False)['count'].sum()
top_category_rows = line_category_totals.groupby('line')['count'].idxmax()
df_category_accident_count = (
    line_category_totals
    .loc[top_category_rows, ['line', 'category', 'count']]
    .reset_index(drop=True)
)
df_category_accident_count

### Incidents on weekdays vs weekends

In [None]:
# Rename the weekday/weekend column to 'day_of_week' for this section.
column_renames = {'weekday/weekend': 'day_of_week'}
df = df.rename(columns=column_renames)
df.head()

In [None]:
# Sum the incident counts for each day_of_week value.
df_weekday_weekend = df.groupby('day_of_week', as_index=False)['count'].sum()
df_weekday_weekend

In [None]:
# Bar chart comparing total incidents on weekdays and weekends, with a legend.
fig, ax = plt.subplots()
bar_colors = ['tab:red', 'tab:blue']
bar_labels = ['Weekday', 'Weekend']
ax.bar(
    df_weekday_weekend['day_of_week'],
    df_weekday_weekend['count'],
    color=bar_colors,
    label=bar_labels,
)
ax.set(
    xlabel='Day of the week',
    ylabel='Count',
    title='Incidents on weekday vs weekend',
)
ax.legend(title='Weekday/Weekend')

plt.show()

### Incident Categories

In [None]:
# Total incidents per category, ordered from highest to lowest.
df_category_count = (
    df.groupby('category', as_index=False)['count']
    .sum()
    .sort_values(by='count', ascending=False)
)
df_category_count

In [None]:
# Horizontal bar chart of the per-category incident totals.
categories = df_category_count['category']
incident_totals = df_category_count['count']
plt.barh(categories, incident_totals)
plt.title('Incidents by Categories')
plt.xlabel('Incidents')
plt.ylabel('Categories')

plt.show()

### Subway incidents over the years

In [None]:
# Distinct years present in the derived year column.
df.year.unique()

In [None]:
# Sum the incident counts for each year.
df_over_years = df.groupby('year', as_index=False)['count'].sum()
df_over_years

In [None]:
# Line chart of yearly incident totals, with one x tick per year in the data.
years = df_over_years['year']
yearly_totals = df_over_years['count']
plt.plot(years, yearly_totals)
plt.xticks(years)
plt.title('Subway Incidents Over The Years')
plt.xlabel('Years')
plt.ylabel('Incidents')
plt.show()