In [1]:
import pandas as pd
import numpy as np

# Constants
num_samples = 1000  # number of synthetic route records to generate

# Data Generation
# Seed NumPy's global RNG so every run produces the same synthetic dataset.
np.random.seed(42)

# Columns are drawn one after another from the shared global RNG, in the order
# listed here, so adding, removing or reordering a column shifts the values of
# every column that follows it.
# Each column name appears exactly once: a repeated key in a dict literal
# silently keeps only the last value while still consuming an extra RNG draw.
df = pd.DataFrame({
    # Route identity
    'RouteID': np.arange(num_samples),
    'POL': np.random.choice(['PortA', 'PortB', 'PortC'], num_samples),
    'POD': np.random.choice(['PortX', 'PortY', 'PortZ'], num_samples),
    # Incident counts (randint's upper bound is exclusive)
    'High_Incidents': np.random.randint(0, 10, num_samples),
    'Medium_Incidents': np.random.randint(0, 20, num_samples),
    'Low_Incidents': np.random.randint(0, 50, num_samples),
    # Operational attributes
    'No_Of_Transhipments': np.random.randint(0, 7, num_samples),
    'Average_Transit_Days': np.random.normal(30, 5, num_samples),
    'Distance': np.random.normal(2000, 300, num_samples),
    'Crew_Experience_Level': np.random.normal(10, 2, num_samples),
    'Training_Programs': np.random.choice(['regular', 'irregular'], num_samples),
    # Environmental attributes
    'Navigational_Risks': np.random.choice(['congested areas', 'shallow waters', 'clear'], num_samples),
    'Carbon_Emissions': np.random.normal(10000, 2000, num_samples),
    # Financial attributes
    'Operational_Cost': np.random.normal(500000, 100000, num_samples),
    'Insurance_Premium': np.random.normal(50000, 10000, num_samples),
    'Total_Revenue': np.random.normal(1000000, 200000, num_samples),
    # Compliance attributes
    'Regulatory_Compliance': np.random.choice(['compliant', 'non-compliant'], num_samples),
    'Previous_Safety_Awards': np.random.choice(['yes', 'no'], num_samples),
    'Emergency_Response_Plans': np.random.choice(['in place', 'not in place'], num_samples),
    # Cargo / vessel attributes
    'Cargo_Type': np.random.choice(['hazardous', 'non-hazardous'], num_samples),
    'Ship_Type': np.random.choice(['container', 'bulk carrier'], num_samples),
    'Average_Vessel_Age': np.random.normal(12, 5, num_samples),
})

# Relative importance of each feature group in the combined risk score.
# The five weights add up to 1.0.
weights = {
    'Historical_Incidents': 0.30,   # high / medium / low incident counts
    'Operational_Factors': 0.25,    # transhipments, transit days, distance, crew, training
    'Environmental_Factors': 0.20,  # navigational risks, carbon emissions
    'Financial_Factors': 0.15,      # operational cost, insurance premium, revenue
    'Compliance_Factors': 0.10,     # regulatory compliance, safety awards, emergency plans
}

def calculate_risk_score(row, group_weights=None):
    """Return the weighted, unscaled risk score for a single route record.

    Five group sub-scores (historical incidents, operational, environmental,
    financial, compliance) are each multiplied by their group weight and
    summed.

    Args:
        row: Any object supporting ``row['column']`` lookup (for example a
            DataFrame row passed by ``df.apply(..., axis=1)`` or a plain
            dict) that provides every column read below.
        group_weights: Optional mapping with the keys
            ``'Historical_Incidents'``, ``'Operational_Factors'``,
            ``'Environmental_Factors'``, ``'Financial_Factors'`` and
            ``'Compliance_Factors'``. When omitted, the module-level
            ``weights`` mapping is used.

    Returns:
        The combined risk score as a number. It is not normalised; a higher
        value means higher risk.

    Raises:
        KeyError: If ``row`` or the weight mapping lacks a required key.
    """
    # Resolve the default lazily so the module-level mapping is looked up at
    # call time rather than captured when the function is defined.
    if group_weights is None:
        group_weights = weights

    # NOTE(review): inputs are summed raw, without normalisation, so columns
    # with large magnitudes (e.g. Operational_Cost) outweigh small ones such
    # as the 0/1 categorical flags. Confirm this is intended.

    # Historical Incidents: more severe incidents count more per occurrence.
    historical_incidents_score = (
        row['High_Incidents'] * 1.5 +
        row['Medium_Incidents'] * 1.0 +
        row['Low_Incidents'] * 0.5
    ) * group_weights['Historical_Incidents']

    # Operational Factors
    operational_factors_score = (
        row['No_Of_Transhipments'] * 1.2 +
        row['Average_Transit_Days'] * 1.1 +
        row['Distance'] * 1.0 +
        (row['Crew_Experience_Level'] > 8) * -1 +  # experience above 8 lowers the score by 1
        (row['Training_Programs'] == 'irregular') * 1.5
    ) * group_weights['Operational_Factors']

    # Environmental Factors: 'clear' waters add nothing; the two comparisons
    # are mutually exclusive, so at most one penalty applies.
    environmental_factors_score = (
        (row['Navigational_Risks'] == 'congested areas') * 1.5 +
        (row['Navigational_Risks'] == 'shallow waters') * 1.2 +
        row['Carbon_Emissions'] * 1.0
    ) * group_weights['Environmental_Factors']

    # Financial Factors
    financial_factors_score = (
        row['Operational_Cost'] * 0.8 +
        row['Insurance_Premium'] * 0.5 -
        row['Total_Revenue'] * 0.2  # Higher revenue reduces risk
    ) * group_weights['Financial_Factors']

    # Compliance Factors: each unfavourable status adds a fixed penalty.
    compliance_factors_score = (
        (row['Regulatory_Compliance'] == 'non-compliant') * 2.0 +
        (row['Previous_Safety_Awards'] == 'no') * 1.5 +
        (row['Emergency_Response_Plans'] == 'not in place') * 1.2
    ) * group_weights['Compliance_Factors']

    # Combine all contributions to get the final risk score
    risk_score = (
        historical_incidents_score +
        operational_factors_score +
        environmental_factors_score +
        financial_factors_score +
        compliance_factors_score
    )

    return risk_score

# Apply the risk score calculation to the DataFrame (one call per row)
df['RiskScore'] = df.apply(calculate_risk_score, axis=1)

# Scale the RiskScore within 0-100 (min-max scaling: the lowest score maps to
# 0 and the highest to 100)
min_risk_score = df['RiskScore'].min()
max_risk_score = df['RiskScore'].max()
risk_score_range = max_risk_score - min_risk_score

if risk_score_range == 0:
    # Every route has the same score; avoid 0/0 (NaN) and place them all at
    # the bottom of the scale.
    df['Scaled_RiskScore'] = 0.0
else:
    df['Scaled_RiskScore'] = (df['RiskScore'] - min_risk_score) / risk_score_range * 100

# Categorize the RiskScore
bins = [0, 25, 50, 75, 100]
labels = ['Low', 'Medium', 'High', 'Critical']
# pd.cut intervals are right-closed by default, i.e. (0, 25], (25, 50], ...
# include_lowest=True also closes the first interval on the left so that the
# minimum-risk route (scaled to exactly 0) is labelled 'Low' instead of NaN.
df['Risk_Category'] = pd.cut(
    df['Scaled_RiskScore'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)


# Persist the full dataset, including the derived score columns, without the
# DataFrame index.
df.to_csv('enhanced_shipping_data_with_route_and_risks19.csv', index=False)
# Display results



   RouteID    POL    POD     RiskScore  Scaled_RiskScore Risk_Category
0        0  PortC  PortZ  40167.917997         56.208462          High
1        1  PortA  PortZ  46531.010567         64.737024          High
2        2  PortC  PortZ  33677.356734         47.509052        Medium
3        3  PortC  PortZ  45601.777762         63.491557          High
4        4  PortA  PortX  30593.758499         43.376052        Medium
