# In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
data = pd.read_csv("Volkswagen_OEM_AsiaPacific_2023_SyntheticDataset_withAge.csv")

# Preprocess the dataset
# 1) Parse the two date-like columns into pandas datetimes.
for column in ('Buying_Date', 'Timestamp'):
    data[column] = pd.to_datetime(data[column])

# 2) Replace missing values in each score column with that column's own mean.
for column in ('Efficiency_Score', 'Energy_Rating'):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

# --- PLOT 1: High-Risk Machines for Predictive Maintenance ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Train Predictive Maintenance Model
# Features are three numeric columns; the target is 1 where the row's status is 'Off'.
# NOTE(review): 'Machine_Temperature' is not mean-imputed like the other two
# features during preprocessing — confirm it contains no missing values.
X = data[['Efficiency_Score', 'Energy_Rating', 'Machine_Temperature']]
y = (data['Operational_Status'] == 'Off').astype(int)

maintenance_model = RandomForestClassifier(random_state=42)

# Predict downtime probabilities and add to the dataset.
# Use out-of-fold predictions: every row is scored by a model that did not see
# that row during training. Scoring the training rows with the fitted forest
# itself lets it largely recall each row's own label, which drives the
# probability of rows that are not 'Off' towards 0 and makes the ranking
# below uninformative.
data['Downtime_Probability'] = cross_val_predict(
    maintenance_model, X, y, cv=5, method='predict_proba'
)[:, 1]

# Fit on the full dataset so `maintenance_model` stays usable for scoring
# unseen rows later on (cross_val_predict only fits internal clones).
maintenance_model.fit(X, y)

# Filter machines that are not already "Off" and keep the 10 highest-probability rows.
# NOTE(review): ranking is per row; if a Machine_ID occurs in several rows the
# bars for that ID will share one x position — confirm one row per machine.
high_risk_machines = data[data['Operational_Status'] != 'Off'].nlargest(10, 'Downtime_Probability')

# Plot
fig1 = px.bar(
    high_risk_machines,
    x='Machine_ID',
    y='Downtime_Probability',
    color='Operational_Status',
    title="Top 10 High-Risk Machines for Downtime (Excluding 'Off')",
    labels={'Downtime_Probability': 'Downtime Probability'},
    text='Downtime_Probability'
)
# Print each bar's probability above it, rounded to two decimals.
fig1.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig1.show()

# --- PLOT 2: Energy Consumption by Machine Type ---
# Sum energy consumption within each machine type, then flatten to a two-column frame.
energy_by_type = data.groupby('Machine_Type')['Energy_Consumption'].sum()
machine_energy = energy_by_type.reset_index()

# Bar chart: one bar per machine type, labelled with its total.
bar_options = {
    'x': 'Machine_Type',
    'y': 'Energy_Consumption',
    'text': 'Energy_Consumption',
    'title': "Total Energy Consumption by Machine Type",
    'labels': {'Energy_Consumption': 'Energy (kWh)', 'Machine_Type': 'Machine Type'},
}
fig2 = px.bar(machine_energy, **bar_options)

# Show values above the bars with two decimals and slant the x tick labels.
fig2.update_traces(
    textposition='outside',
    texttemplate='%{text:.2f}',
).update_layout(xaxis_tickangle=45)
fig2.show()

# --- PLOT 3: Seasonal Energy Consumption Trends ---
# Mean energy consumption per season, with 'Season' kept as a regular column.
# NOTE(review): groupby orders the seasons by sorted label, so the line follows
# that order rather than calendar order — confirm this is the intended x-axis order.
seasonal_energy = data.groupby('Season', as_index=False)['Energy_Consumption'].mean()

# Line chart with a marker at each season.
season_axis_labels = {'Energy_Consumption': 'Average Energy (kWh)', 'Season': 'Season'}
fig3 = px.line(
    seasonal_energy,
    x='Season',
    y='Energy_Consumption',
    markers=True,
    labels=season_axis_labels,
    title="Seasonal Trends in Energy Consumption",
)
fig3.show()

# --- PLOT 4: Factory Performance Comparison ---
# Per factory: total energy consumption and the number of rows whose status is 'Off'.
factory_performance = (
    data.groupby('Factory_ID')
    .agg(
        Energy_Consumption=('Energy_Consumption', 'sum'),
        Downtime_Count=('Operational_Status', lambda status: status.eq('Off').sum()),
    )
    .reset_index()
)

# Bubble chart: energy on x, downtime count on y, bubble size scaled by energy.
factory_axis_labels = {
    'Energy_Consumption': 'Total Energy (kWh)',
    'Downtime_Count': 'Downtime Count',
}
fig4 = px.scatter(
    factory_performance,
    x='Energy_Consumption',
    y='Downtime_Count',
    size='Energy_Consumption',
    color='Factory_ID',
    hover_name='Factory_ID',
    labels=factory_axis_labels,
    title="Factory Performance: Energy vs Downtime",
)
fig4.show()

# --- PLOT 5: Operational Status Distribution ---
# Number of rows per operational status, as a frame with columns
# 'Operational_Status' and 'Count'.
status_counts = (
    data['Operational_Status']
    .value_counts()
    .rename_axis('Operational_Status')
    .reset_index(name='Count')
)

# Donut chart (pie with a 40% hole) of the status shares.
fig5 = px.pie(
    status_counts,
    names='Operational_Status',
    values='Count',
    hole=0.4,
    title="Operational Status Distribution",
)
fig5.show()

# --- PLOT 6: Machine Efficiency vs Energy Consumption ---
# Row-level scatter coloured by machine type; hovering a point also shows
# its factory and machine identifiers.
efficiency_axis_labels = {
    'Efficiency_Score': 'Efficiency Score',
    'Energy_Consumption': 'Energy (kWh)',
}
hover_columns = ['Factory_ID', 'Machine_ID']

fig6 = px.scatter(
    data,
    x='Efficiency_Score',
    y='Energy_Consumption',
    color='Machine_Type',
    hover_data=hover_columns,
    labels=efficiency_axis_labels,
    title="Efficiency vs Energy Consumption",
)
fig6.show()

# --- PLOT 7: Energy Consumption Heatmap by Factory and Season ---
# Mean energy consumption per (factory, season) pair, pivoted so that
# factories are rows and seasons are columns.
mean_energy = data.groupby(['Factory_ID', 'Season'])['Energy_Consumption'].mean()
heatmap_data = mean_energy.unstack('Season')

# Draw the annotated heatmap on an explicitly created 12x6 inch axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".1f", cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title("Energy Consumption Heatmap by Factory and Season")
heatmap_ax.set_xlabel("Season")
heatmap_ax.set_ylabel("Factory ID")
plt.show()
