# Background



During my time in NS SPF, in Public Transport Security Command (Transcom), i have noticed that :

1. Certain MRT stations often require more police attention due to three main factors, namely higher passenger volumes, higher incident rates and proximity of the station to key locations.
2. For some background, Singapore's MRT networks are used by 3.49 million commuters daily (Chelvan, 2026) and crime rates are a possible concern with 19,969 cases reported in 2024 (Singapore Police Force, 2024). Hence it is vital we implement data-driven models to monitor, identify and potentially reduce the amount of crime we encounter in public transport networks.
3. With 143 MRT stations across Singapore and more to come, I believe that we should find a systematic way to determine where to deploy officers based on what stations should be patrolled more, where incidents happen and what factors increase risk.



# Problems Identified

# Why this problem needs to be adressed

# Importing Libraries

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Reading the data

In [69]:
# Path to file
mrt_file_path = '/kaggle/input/datasets/kathirvelsasikumar/mrt-stations/mrt_stations.csv'

# Read data
mrt_data = pd.read_csv(mrt_file_path, index_col=0)

# Brief look at the data
print(mrt_data.head())
print(mrt_data.shape)

  code   station_name               line color     opening type blk_no  \
0  NS1    Jurong East  North-South Line    Red  1990-03-10  MRT     10   
1  NS2    Bukit Batok  North-South Line    Red  1990-03-10  MRT     10   
2  NS3   Bukit Gombak  North-South Line    Red  1990-03-10  MRT    802   
3  NS4  Choa Chu Kang  North-South Line    Red  1990-03-10  MRT     10   
4  NS5        Yew Tee  North-South Line    Red  1996-02-10  MRT     61   

                   road_name                              building  \
0      JURONG EAST STREET 12  JURONG EAST MRT STATION (EW24 / NS1)   
1        BUKIT BATOK CENTRAL         BUKIT BATOK MRT STATION (NS2)   
2  BUKIT BATOK WEST AVENUE 5        BUKIT GOMBAK MRT STATION (NS3)   
3     CHOA CHU KANG AVENUE 4       CHOA CHU KANG MRT STATION (NS4)   
4        CHOA CHU KANG DRIVE             YEW TEE MRT STATION (NS5)   

                                             address  postal             x  \
0  10 JURONG EAST STREET 12 JURONG EAST MRT STATI...  60

# Exploration of data

In [70]:
# Columns
print("\nColumn names:")
print(mrt_data.columns.tolist())

# Check for missing values
print("\nMissing values:")
print(mrt_data.isnull().sum())

# Summary statistics
print("\nBasic stats:")
print(mrt_data.describe())


Column names:
['code', 'station_name', 'line', 'color', 'opening', 'type', 'blk_no', 'road_name', 'building', 'address', 'postal', 'x', 'y', 'latitude', 'longitude', 'planning_area_ura', 'region_ura']

Missing values:
code                 0
station_name         0
line                 0
color                0
opening              0
type                 0
blk_no               0
road_name            0
building             0
address              0
postal               0
x                    0
y                    0
latitude             0
longitude            0
planning_area_ura    0
region_ura           0
dtype: int64

Basic stats:
              postal             x             y    latitude   longitude
count     211.000000    211.000000    211.000000  211.000000  211.000000
mean   441537.971564  29121.983379  35827.055648    1.340281  103.843400
std    250959.496389   7055.693044   5070.772616    0.045858    0.063400
min     18925.000000   6150.862788  27545.853426    1.265389  103.63699

# Feature Engineering

In [71]:
# Convert opening date to datetime
mrt_data['opening'] = pd.to_datetime(mrt_data['opening'])
mrt_data['opening_year'] = mrt_data['opening'].dt.year
mrt_data['station_age'] = 2026 - mrt_data['opening_year']

# Is it an interchange station?
# Interchange stations appear multiple times (different lines at same location)
# Count how many times each station name appears
station_counts = mrt_data['station_name'].value_counts()
mrt_data['is_interchange'] = mrt_data['station_name'].map(station_counts)
mrt_data['is_interchange'] = (mrt_data['is_interchange'] > 1).astype(int)
mrt_data['num_lines'] = mrt_data['station_name'].map(station_counts)

# Create dummy variables for regions
region_dummies = pd.get_dummies(mrt_data['region_ura'], prefix='region')
mrt_data = pd.concat([mrt_data, region_dummies], axis=1)

# Create features for CBD proximity
cbd_areas = ['DOWNTOWN CORE', 'ORCHARD', 'MUSEUM', 'MARINA SOUTH', 'STRAITS VIEW']
mrt_data['near_cbd'] = mrt_data['planning_area_ura'].isin(cbd_areas).astype(int)

# Nightlife/entertainment areas
nightlife = ['ORCHARD', 'CLARKE QUAY', 'RIVER VALLEY', 'BUGIS']
mrt_data['nightlife_zone'] = mrt_data['planning_area_ura'].isin(nightlife).astype(int)

# Count stations per area (density)
area_station_counts = mrt_data['planning_area_ura'].value_counts()
mrt_data['area_density'] = mrt_data['planning_area_ura'].map(area_station_counts)

print("\nNew features created:")
print(f"- station_age: {mrt_data['station_age'].min()} to {mrt_data['station_age'].max()} years")
print(f"- is_interchange: {mrt_data[mrt_data['is_interchange']==1]['station_name'].nunique()} unique interchange stations")
print(f"- interchange entries: {mrt_data['is_interchange'].sum()} (counting all line instances)")
print(f"- near_cbd: {mrt_data['near_cbd'].sum()} stations")
print(f"- nightlife_zone: {mrt_data['nightlife_zone'].sum()} stations")

# Show examples
print("\nSample interchange stations:")
interchange_examples = mrt_data[mrt_data['is_interchange']==1].groupby('station_name')['line'].apply(list).head(5)
for station, lines in interchange_examples.items():
    print(f"  {station}: {len(lines)} lines")
print()


New features created:
- station_age: 2 to 39 years
- is_interchange: 29 unique interchange stations
- interchange entries: 61 (counting all line instances)
- near_cbd: 24 stations
- nightlife_zone: 5 stations

Sample interchange stations:
  Bayfront: 2 lines
  Bishan: 2 lines
  Botanic Gardens: 2 lines
  Bugis: 2 lines
  Bukit Panjang: 2 lines



# Create target variable (simulated incident risk)

Note: In a real scenario, I'd have actual incident data from SPF. However , i cannot access actual SPF incident data due to confidentiality and security restrictions hence , to abid by ethical guidelands, I would create a realistic simulation model that generates incident data based on known risk factors.


In [72]:
# Varibles
def generate_incident_risk(row):
    """Generate risk score based on station characteristics"""
    base_risk = 5  # ← LOWER baseline
    
    # Interchange stations have more incidents (but with variation)
    if row['is_interchange']:
        base_risk += np.random.uniform(8, 18)  # ← RANDOM RANGE 8-18
    
    # CBD areas have higher risk (but not all the same)
    if row['near_cbd']:
        base_risk += np.random.uniform(5, 15)  # ← RANDOM RANGE 5-15
    
    # Nightlife zones peak at night
    if row['nightlife_zone']:
        base_risk += np.random.uniform(3, 12)  # ← RANDOM RANGE 3-12
    
    # Older stations might have more issues (not always)
    if row['station_age'] > 20:
        base_risk += np.random.uniform(0, 8)  # ← RANDOM RANGE 0-8
    
    # High density areas (variable crowding effects)
    if row['area_density'] > 5:
        base_risk += np.random.uniform(2, 10)  # ← RANDOM RANGE 2-10
    
    # Add significant randomness to simulate unpredictable factors
    noise = np.random.normal(0, 8)  # ← BIGGER noise (was 3, now 8)
    
    # Some stations just have random spikes (NEW!)
    if np.random.random() < 0.1:  # 10% chance
        noise += np.random.uniform(5, 15)  # Random spike
    
    return max(0, base_risk + noise)

# Generate risk scores
mrt_data['monthly_incidents'] = mrt_data.apply(generate_incident_risk, axis=1)

# Create binary classification target (high risk = 1, low risk = 0)
risk_threshold = mrt_data['monthly_incidents'].median()
mrt_data['high_risk'] = (mrt_data['monthly_incidents'] > risk_threshold).astype(int)

print(f"\nTarget variable created:")
print(f"- Average incidents per station: {mrt_data['monthly_incidents'].mean():.1f}")
print(f"- High risk stations: {mrt_data['high_risk'].sum()}")
print(f"- Low risk stations: {(1-mrt_data['high_risk']).sum()}")


Target variable created:
- Average incidents per station: 18.3
- High risk stations: 105
- Low risk stations: 106


# Select features for modeling

In [73]:
# Choose which columns to use as numerical features

feature_cols = [
    'station_age', 
    'is_interchange', 
    'near_cbd', 
    'nightlife_zone',
    'area_density',
    'latitude',
    'longitude'
] + [col for col in mrt_data.columns if col.startswith('region_')]

# Remove any non-numeric columns
X = mrt_data[feature_cols].copy()
# Keep only numeric columns
X = X.select_dtypes(include=[np.number])

y = mrt_data['high_risk'].copy()

print(f"\nFeatures selected: {len(X.columns)}")
print("Feature list:", X.columns.tolist())


Features selected: 7
Feature list: ['station_age', 'is_interchange', 'near_cbd', 'nightlife_zone', 'area_density', 'latitude', 'longitude']


# Split data into training and validation sets

In [74]:
# Break off validation set from training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.25)

print(f"\nTraining set size: {len(train_X)}")
print(f"Validation set size: {len(val_X)}")


Training set size: 158
Validation set size: 53


# Build Model

In [75]:
# Define the model
# Using Random Forest
rf_model = GradientBoostingClassifier(n_estimators=100, random_state=1, max_depth=5)

# Fit model
rf_model.fit(train_X, train_y)

# Make predictions
train_preds = rf_model.predict(train_X)
val_preds = rf_model.predict(val_X)

print("MODEL PERFORMANCE")

# Check accuracy
train_accuracy = (train_preds == train_y).mean()
val_accuracy = (val_preds == val_y).mean()

print(f"\nTraining Accuracy: {train_accuracy:.3f}")
print(f"Validation Accuracy: {val_accuracy:.3f}")

# Detailed classification report
print("\nValidation Set Performance:")
print(classification_report(val_y, val_preds, 
                          target_names=['Low Risk', 'High Risk']))


MODEL PERFORMANCE

Training Accuracy: 1.000
Validation Accuracy: 0.717

Validation Set Performance:
              precision    recall  f1-score   support

    Low Risk       0.65      0.74      0.69        23
   High Risk       0.78      0.70      0.74        30

    accuracy                           0.72        53
   macro avg       0.72      0.72      0.72        53
weighted avg       0.72      0.72      0.72        53



# Feature Importance

In [76]:
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
          feature  importance
6       longitude    0.296596
5        latitude    0.294812
1  is_interchange    0.287067
0     station_age    0.059816
4    area_density    0.042487
2        near_cbd    0.016414
3  nightlife_zone    0.002809


# Make Predictions on Full Dataset

In [77]:
# Probability predictions for all stations
mrt_data['risk_probability'] = rf_model.predict_proba(X)[:, 1]
mrt_data['predicted_high_risk'] = rf_model.predict(X)

# Create risk categories
mrt_data['risk_category'] = pd.cut(mrt_data['risk_probability'], 
                                     bins=[0, 0.3, 0.7, 1.0],
                                     labels=['Low', 'Medium', 'High'])

print("RISK ASSESSMENT SUMMARY")

print(mrt_data['risk_category'].value_counts().sort_index())


RISK ASSESSMENT SUMMARY
risk_category
Low       102
Medium     11
High       98
Name: count, dtype: int64


# Identify High Priority Stations

In [78]:
# Sort by risk probability
high_priority = mrt_data.nlargest(20, 'risk_probability')[
    ['station_name', 'line', 'planning_area_ura', 'is_interchange', 
     'near_cbd', 'risk_probability', 'risk_category']
]

print("\n" + "="*60)
print("TOP 20 STATIONS REQUIRING INCREASED PATROL")
print("="*60)
print(high_priority.to_string())


TOP 20 STATIONS REQUIRING INCREASED PATROL
      station_name                     line planning_area_ura  is_interchange  near_cbd  risk_probability risk_category
170  Choa Chu Kang        Bukit Panjang LRT     CHOA CHU KANG               1         0          0.999201          High
3    Choa Chu Kang        North-South Line      CHOA CHU KANG               1         0          0.998883          High
66     Dhoby Ghaut          North East Line            MUSEUM               1         1          0.998531          High
38           Bugis           East-West Line            ROCHOR               1         0          0.998168          High
90       Serangoon              Circle Line         SERANGOON               1         0          0.998000          High
72       Serangoon          North East Line         SERANGOON               1         0          0.997962          High
25      Marina Bay        North-South Line      DOWNTOWN CORE               1         1          0.997705          H

# Deployment Recommendation (Theorectical)

**Assumptions** :  
- 3-man patrol groups cover multiple nearby stations
- Each group gets 2 × 1-hour breaks during their shift
- Groups prioritize high-risk stations within their region

In [79]:
# Calculate officer allocation per station
total_officers = 70  # Average per shift
mrt_data['recommended_officers'] = (mrt_data['risk_probability'] * total_officers / mrt_data['risk_probability'].sum()).round()

# Ensure at least 1 officer at high risk stations
mrt_data.loc[mrt_data['risk_category'] == 'High', 'recommended_officers'] = \
    mrt_data.loc[mrt_data['risk_category'] == 'High', 'recommended_officers'].clip(lower=1)



# Define regions - combine Central and South (CBD areas are high-risk)
regions_mapping = {
    'NORTH REGION': 'NORTH REGION',
    'SOUTH REGION': 'CENTRAL REGION (CBD)',  # South is part of CBD
    'EAST REGION': 'EAST REGION',
    'WEST REGION': 'WEST REGION',
    'CENTRAL REGION': 'CENTRAL REGION (CBD)',
    'NORTH-EAST REGION': 'NORTH-EAST REGION'
}

# Map regions in data
mrt_data['patrol_region'] = mrt_data['region_ura'].map(regions_mapping)

# Define 4 patrol groups
patrol_regions = ['NORTH REGION', 'CENTRAL REGION (CBD)', 'EAST REGION', 'WEST REGION']

# Create 4 patrol groups (one per region)
for region_idx, region in enumerate(patrol_regions, 1):
    print(f"\n{'='*70}")
    print(f"PATROL GROUP {region_idx}: {region}")
    print(f"{'='*70}")
    
    # Get stations in this region, sorted by risk
    region_stations = mrt_data[mrt_data['patrol_region'] == region].copy()
    
    if len(region_stations) == 0:
        print(f"   No stations in {region}")
        continue
    
    # Get unique stations sorted by risk
    unique_stations = region_stations.groupby('station_name').agg({
        'risk_probability': 'mean',
        'is_interchange': 'first',
        'num_lines': 'first',
        'planning_area_ura': 'first',
        'latitude': 'first',
        'longitude': 'first'
    }).sort_values('risk_probability', ascending=False)
    
    # Get top 5 high-risk stations in this region
    top_stations = unique_stations.head(5)

    
    print(f"{'Priority':<10} {'Station':<25} {'Type':<20} {'Risk':<10} {'Area'}")
    print("-" * 70)
    
    for idx, (station_name, row) in enumerate(top_stations.iterrows(), 1):
        station_type = f"Interchange ({int(row['num_lines'])} lines)" if row['is_interchange'] else "Regular"
        risk_score = f"{row['risk_probability']:.3f}"
        
        print(f"#{idx:<9} {station_name:<25} {station_type:<20} {risk_score:<10} {row['planning_area_ura']}")
    
    # Calculate distance clustering 
    if len(top_stations) >= 2:
        print(f"\n Patrol Route Suggestion:")
        print(f"   Start → ", end="")
        for i, station in enumerate(top_stations.head(3).index):
            if i < len(top_stations.head(3)) - 1:
                print(f"{station} → ", end="")
            else:
                print(f"{station}")
        print(f"   (Covering stations in close proximity)")


PATROL GROUP 1: NORTH REGION
Priority   Station                   Type                 Risk       Area
----------------------------------------------------------------------
#1         Woodlands                 Interchange (2 lines) 0.993      WOODLANDS
#2         Khatib                    Regular              0.951      YISHUN
#3         Yishun                    Regular              0.646      YISHUN
#4         Sembawang                 Regular              0.025      SEMBAWANG
#5         Marsiling                 Regular              0.015      WOODLANDS

 Patrol Route Suggestion:
   Start → Woodlands → Khatib → Yishun
   (Covering stations in close proximity)

PATROL GROUP 2: CENTRAL REGION (CBD)
Priority   Station                   Type                 Risk       Area
----------------------------------------------------------------------
#1         Bugis                     Interchange (2 lines) 0.998      ROCHOR
#2         Dhoby Ghaut               Interchange (3 lines) 0.998   