In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Attach Google Drive to the Colab runtime, then read the wide-format CSV.
# The CSV path is relative, so it resolves against the current working directory.
drive.mount('/content/drive')
total_capture_7k = pd.read_csv(
    filepath_or_buffer=r'drive/My Drive/correlation_wide.csv',
)

Mounted at /content/drive


In [2]:
# --------------------- Imports ---------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

# --------------------- Matplotlib Setup ---------------------
# Text sizes used by every figure drawn after this cell.
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.titlesize'] = 15
mpl.rcParams['axes.labelsize'] = 12
mpl.rcParams['xtick.labelsize'] = 11
mpl.rcParams['ytick.labelsize'] = 11
mpl.rcParams['legend.fontsize'] = 11
# Resolution for on-screen and saved figures, plus automatic layout.
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['figure.autolayout'] = True

# --------------------- Load Data ---------------------
# Mount Drive, then read the CSV through a path relative to the working directory.
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
total_capture_7k = pd.read_csv(
    filepath_or_buffer='drive/My Drive/correlation_wide.csv',
)
print(f"Loaded dataset with shape: {total_capture_7k.shape}")

# --------------------- Identify Unique Static Parameter Sets ---------------------
# Columns treated as per-file inputs; a single value per file_id is taken below.
static_cols = [
    'MikeSorghum', 'Quartz', 'Plagioclase', 'Apatite', 'Ilmenite',
    'Diopside_Mn', 'Diopside', 'Olivine', 'Alkali-feldspar',
    'Montmorillonite', 'Glass', 'temp', 'shift', 'year'
]

# Number of rows recorded for each file_id, stored as "num_timesteps".
file_lengths = (
    total_capture_7k
    .groupby('file_id')
    .size()
    .reset_index(name="num_timesteps")
)

# First value of every static column per file_id, joined with the row counts.
static_rows = pd.merge(
    total_capture_7k.groupby('file_id')[static_cols].first().reset_index(),
    file_lengths,
    on='file_id',
)

# Keep one file_id per distinct static-parameter combination
# (drop_duplicates retains the first occurrence).
unique_static_rows = static_rows.drop_duplicates(subset=static_cols)
unique_file_ids = unique_static_rows['file_id'].tolist()

# --------------------- Extract Time Series Data ---------------------
# Restrict to the retained file_ids, then keep at most the first 101 rows of
# each file_id in their existing row order.
# NOTE(review): this assumes rows are already in time order within a file — confirm.
filtered_df = (
    total_capture_7k
    .loc[total_capture_7k['file_id'].isin(unique_file_ids)]
    .copy()
    .groupby('file_id')
    .head(101)
    .reset_index(drop=True)
)

# --------------------- Static Feature Table ---------------------
# One row per retained file_id holding the first value of each static column.
Input_Link_Table = (
    filtered_df
    .groupby('file_id')
    .agg(dict.fromkeys(static_cols, 'first'))
    .reset_index()
)
print(f"Static feature table created: Input_Link_Table.shape = {Input_Link_Table.shape}")

# --------------------- Time Series Structuring ---------------------
# Build a (num_file_ids x max_timesteps) matrix of Total_CO2_capture with one
# row per file_id, in order of first appearance in filtered_df.
result = filtered_df[['Total_CO2_capture', 'year', 'file_id']]
file_ids = result['file_id'].unique()
num_file_ids = len(file_ids)
max_timesteps = 101
relevant_data = np.zeros((num_file_ids, max_timesteps))
# Float array (np.zeros default dtype); later code casts it with .astype(int).
file_id_order = np.zeros(num_file_ids)

# Single pass over the frame instead of one boolean scan per file_id.
# groupby(sort=False) yields groups in first-appearance order, which is the
# same order .unique() returns, and keeps the original row order inside a group.
co2_by_file = result.groupby('file_id', sort=False)['Total_CO2_capture']
for i, (file_id, series) in enumerate(co2_by_file):
    # Clip to the matrix width; series shorter than max_timesteps leave the
    # remaining cells at their initial 0.0.
    file_data = series.values[:max_timesteps]
    relevant_data[i, :len(file_data)] = file_data
    file_id_order[i] = file_id
print(f"Time series matrix constructed: relevant_data.shape = {relevant_data.shape}")

# --------------------- Clustering ---------------------
# Standardise every timestep column, then assign each series to one of 8 clusters.
# NOTE(review): n_init is left at the library default here, while the elbow
# cell passes n_init=10 — confirm the two are meant to differ.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(relevant_data)
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(normalized_data)
print("Performed KMeans clustering into 8 clusters")

# Compute boundary stats
# For each cluster, reduce its members column-wise (min / median / mean / max)
# in the scaled space and map the resulting curve back to the original scale.
cluster_boundaries = []
for cluster_id in range(kmeans.n_clusters):
    cluster_data = normalized_data[clusters == cluster_id]
    min_v, median_v, mean_v, max_v = (
        scaler.inverse_transform(reducer(cluster_data, axis=0).reshape(1, -1)).flatten()
        for reducer in (np.min, np.median, np.mean, np.max)
    )
    cluster_boundaries.append((min_v, median_v, mean_v, max_v))
# Stacked as (cluster, statistic, timestep).
cluster_boundaries = np.array(cluster_boundaries)
print(f"Cluster boundary stats calculated: cluster_boundaries.shape = {cluster_boundaries.shape}")

# --------------------- Merge Static Features with Clusters ---------------------
# Link table: one row per file_id (cast to int) with its cluster label, sorted by file_id.
Clustering_link_table = (
    pd.DataFrame({'file_id': file_id_order.astype(int), 'cluster': clusters})
    .sort_values(by='file_id')
    .reset_index(drop=True)
)
# Default merge is an inner join on the shared key.
merged_df = Input_Link_Table.merge(Clustering_link_table, on='file_id')
print(f"Final input features (static + cluster): merged_df.shape = {merged_df.shape}")

# --------------------- Create Output Time Series DataFrame ---------------------
# Flatten the (file x timestep) matrix into long-format rows [file_id, timestep, CO2].
data = [
    [series_id.astype(int), t, co2]
    for series_id, series in zip(file_id_order, relevant_data)
    for t, co2 in enumerate(series[:max_timesteps])
]
df_output = (
    pd.DataFrame(data, columns=['file_id', 'timestep', 'CO2'])
    .sort_values(by=['file_id', 'timestep'])
)
print(f"Final output time series: df_output.shape = {df_output.shape}")

# --------------------- Summary ---------------------
print("Data Preparation Summary:")
print(f"Static Input Table: merged_df [{merged_df.shape[0]} rows × {merged_df.shape[1]} columns]")
print(f"Time Series Output: df_output [{df_output.shape[0]} rows × 3 columns]")
print(f"Cluster Boundaries: cluster_boundaries [{cluster_boundaries.shape}]")

Mounting Google Drive and loading dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset with shape: (1192157, 17)
Static feature table created: Input_Link_Table.shape = (2703, 15)
Time series matrix constructed: relevant_data.shape = (2703, 101)
Performed KMeans clustering into 8 clusters
Cluster boundary stats calculated: cluster_boundaries.shape = (8, 4, 101)
Final input features (static + cluster): merged_df.shape = (2703, 16)
Final output time series: df_output.shape = (273003, 3)
Data Preparation Summary:
Static Input Table: merged_df [2703 rows × 16 columns]
Time Series Output: df_output [273003 rows × 3 columns]
Cluster Boundaries: cluster_boundaries [(8, 4, 101)]


Elbow method

In [5]:
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler

# 🛠 Big fonts for consistency with LaTeX paper
mpl.rcParams.update({
    'font.size': 20,
    'axes.titlesize': 24,
    'axes.labelsize': 22,
    'xtick.labelsize': 18,
    'ytick.labelsize': 18,
    'legend.fontsize': 20,
    'figure.dpi': 300,
    'savefig.dpi': 300,
    'figure.autolayout': True,
})

# Step 1: Normalize data
# Cell-local names are used for the scaler and the per-k models so that the
# fitted `scaler` and 8-cluster `kmeans` from the data-preparation cell are
# not overwritten by this diagnostic sweep.
elbow_scaler = StandardScaler()
data_normalized = elbow_scaler.fit_transform(relevant_data)

# Step 2: Calculate SSE (KMeans inertia) for k = 1..15
sse = []
K_range = range(1, 16)
for k in K_range:
    elbow_model = KMeans(n_clusters=k, random_state=42, n_init=10)
    elbow_model.fit(data_normalized)
    sse.append(elbow_model.inertia_)

# Step 3: Plot Elbow Curve with 8 manually highlighted
plt.figure(figsize=(10, 6))
plt.plot(K_range, sse, 'o-', markersize=8, linewidth=3)

# Highlight k=8
optimal_k = 8
# Position of optimal_k inside K_range; stays correct if the range start changes.
optimal_k_index = K_range.index(optimal_k)
plt.axvline(x=optimal_k, color='red', linestyle='--', linewidth=2)
# The label is placed 0.5 to the right of the line and 20000 SSE units above the curve.
plt.text(optimal_k + 0.5, sse[optimal_k_index] + 20000, f'Estimated clusters: {optimal_k}', color='red', fontsize=18)

plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Errors (SSE)')
plt.title('Elbow Method for Optimal Number of Clusters')

plt.tight_layout(pad=2.5)

# 💾 Save Final PDF
plt.savefig('drive/My Drive/DSSM-Figures-final2/Shoulder_Method_Clustering.pdf', format='pdf', bbox_inches='tight')
plt.close()

In [6]:
# Pair every file_id (cast to int) with the cluster label at the same position.
data = dict(file_id=file_id_order.astype(int), cluster=clusters)

# Two-column link table, in the same row order as the input arrays.
Clustering_link_table = pd.DataFrame(data)
print(Clustering_link_table)

      file_id  cluster
0           1        1
1          10        1
2        1000        3
3        1001        3
4        1006        3
...       ...      ...
2698      982        3
2699      986        3
2700       99        2
2701      992        3
2702      999        3

[2703 rows x 2 columns]


In [7]:
# Order the link table by file_id and renumber the index from 0.
Clustering_link_table = Clustering_link_table.sort_values('file_id', ignore_index=True)
Clustering_link_table

Unnamed: 0,file_id,cluster
0,1,1
1,2,1
2,3,1
3,4,1
4,6,1
...,...,...
2698,6866,0
2699,6876,0
2700,6892,0
2701,6896,5


In [8]:
# Inner-join the static feature table with the cluster labels on file_id.
merged_df = Input_Link_Table.merge(Clustering_link_table, on='file_id')

In [9]:
# Display the merged table (static inputs plus the cluster label per file_id).
merged_df

Unnamed: 0,file_id,MikeSorghum,Quartz,Plagioclase,Apatite,Ilmenite,Diopside_Mn,Diopside,Olivine,Alkali-feldspar,Montmorillonite,Glass,temp,shift,year,cluster
0,1,0.01,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
1,2,0.10,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
2,3,1.00,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
3,4,10.00,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
4,6,1.00,0.01,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2698,6866,1.00,1.00,1.0,1.0,0.01,1.00,1.0,1.0,1.0,1.0,1.0,25,300,0.0,0
2699,6876,0.01,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,25,350,0.0,0
2700,6892,1.00,1.00,1.0,0.1,1.00,1.00,1.0,1.0,1.0,1.0,1.0,25,350,0.0,0
2701,6896,1.00,1.00,1.0,1.0,1.00,0.01,1.0,1.0,1.0,1.0,1.0,25,350,0.0,5


In [10]:
# Write the merged table to Drive as CSV, leaving out the DataFrame index.
file_path = 'drive/My Drive/merged_df.csv'
merged_df.to_csv(path_or_buf=file_path, index=False)

In [11]:
# --------------------- Imports ---------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
from google.colab import drive

# NOTE(review): this cell repeats the earlier data-preparation cell; on a
# top-to-bottom run it reloads the CSV and recomputes the same variables.

# --------------------- Matplotlib Setup ---------------------
# Text sizes used by every figure drawn after this cell.
mpl.rcParams['font.size'] = 14
mpl.rcParams['axes.titlesize'] = 15
mpl.rcParams['axes.labelsize'] = 12
mpl.rcParams['xtick.labelsize'] = 11
mpl.rcParams['ytick.labelsize'] = 11
mpl.rcParams['legend.fontsize'] = 11
# Resolution for on-screen and saved figures, plus automatic layout.
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['figure.autolayout'] = True

# --------------------- Load Data ---------------------
# Mount Drive, then read the CSV through a path relative to the working directory.
print("Mounting Google Drive and loading dataset...")
drive.mount('/content/drive')
total_capture_7k = pd.read_csv(
    filepath_or_buffer='drive/My Drive/correlation_wide.csv',
)
print(f"Loaded dataset with shape: {total_capture_7k.shape}")

# --------------------- Identify Unique Static Parameter Sets ---------------------
# Columns treated as per-file inputs; a single value per file_id is taken below.
static_cols = [
    'MikeSorghum', 'Quartz', 'Plagioclase', 'Apatite', 'Ilmenite',
    'Diopside_Mn', 'Diopside', 'Olivine', 'Alkali-feldspar',
    'Montmorillonite', 'Glass', 'temp', 'shift', 'year'
]

# Number of rows recorded for each file_id, stored as "num_timesteps".
file_lengths = (
    total_capture_7k
    .groupby('file_id')
    .size()
    .reset_index(name="num_timesteps")
)

# First value of every static column per file_id, joined with the row counts.
static_rows = pd.merge(
    total_capture_7k.groupby('file_id')[static_cols].first().reset_index(),
    file_lengths,
    on='file_id',
)

# Keep one file_id per distinct static-parameter combination
# (drop_duplicates retains the first occurrence).
unique_static_rows = static_rows.drop_duplicates(subset=static_cols)
unique_file_ids = unique_static_rows['file_id'].tolist()

# --------------------- Extract Time Series Data ---------------------
# Restrict to the retained file_ids, then keep at most the first 101 rows of
# each file_id in their existing row order.
# NOTE(review): this assumes rows are already in time order within a file — confirm.
filtered_df = (
    total_capture_7k
    .loc[total_capture_7k['file_id'].isin(unique_file_ids)]
    .copy()
    .groupby('file_id')
    .head(101)
    .reset_index(drop=True)
)

# --------------------- Static Feature Table ---------------------
# One row per retained file_id holding the first value of each static column.
Input_Link_Table = (
    filtered_df
    .groupby('file_id')
    .agg(dict.fromkeys(static_cols, 'first'))
    .reset_index()
)
print(f"Static feature table created: Input_Link_Table.shape = {Input_Link_Table.shape}")

# --------------------- Time Series Structuring ---------------------
# Build a (num_file_ids x max_timesteps) matrix of Total_CO2_capture with one
# row per file_id, in order of first appearance in filtered_df.
result = filtered_df[['Total_CO2_capture', 'year', 'file_id']]
file_ids = result['file_id'].unique()
num_file_ids = len(file_ids)
max_timesteps = 101
relevant_data = np.zeros((num_file_ids, max_timesteps))
# Float array (np.zeros default dtype); later code casts it with .astype(int).
file_id_order = np.zeros(num_file_ids)

# Single pass over the frame instead of one boolean scan per file_id.
# groupby(sort=False) yields groups in first-appearance order, which is the
# same order .unique() returns, and keeps the original row order inside a group.
co2_by_file = result.groupby('file_id', sort=False)['Total_CO2_capture']
for i, (file_id, series) in enumerate(co2_by_file):
    # Clip to the matrix width; series shorter than max_timesteps leave the
    # remaining cells at their initial 0.0.
    file_data = series.values[:max_timesteps]
    relevant_data[i, :len(file_data)] = file_data
    file_id_order[i] = file_id
print(f"Time series matrix constructed: relevant_data.shape = {relevant_data.shape}")

# --------------------- Clustering ---------------------
# Standardise every timestep column, then assign each series to one of 8 clusters.
# NOTE(review): n_init is left at the library default here, while the elbow
# cell passes n_init=10 — confirm the two are meant to differ.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(relevant_data)
kmeans = KMeans(n_clusters=8, random_state=42)
clusters = kmeans.fit_predict(normalized_data)
print("Performed KMeans clustering into 8 clusters")

# Compute boundary stats
# For each cluster, reduce its members column-wise (min / median / mean / max)
# in the scaled space and map the resulting curve back to the original scale.
cluster_boundaries = []
for cluster_id in range(kmeans.n_clusters):
    cluster_data = normalized_data[clusters == cluster_id]
    min_v, median_v, mean_v, max_v = (
        scaler.inverse_transform(reducer(cluster_data, axis=0).reshape(1, -1)).flatten()
        for reducer in (np.min, np.median, np.mean, np.max)
    )
    cluster_boundaries.append((min_v, median_v, mean_v, max_v))
# Stacked as (cluster, statistic, timestep).
cluster_boundaries = np.array(cluster_boundaries)
print(f"Cluster boundary stats calculated: cluster_boundaries.shape = {cluster_boundaries.shape}")

# --------------------- Merge Static Features with Clusters ---------------------
# Link table: one row per file_id (cast to int) with its cluster label, sorted by file_id.
Clustering_link_table = (
    pd.DataFrame({'file_id': file_id_order.astype(int), 'cluster': clusters})
    .sort_values(by='file_id')
    .reset_index(drop=True)
)
# Default merge is an inner join on the shared key.
merged_df = Input_Link_Table.merge(Clustering_link_table, on='file_id')
print(f"Final input features (static + cluster): merged_df.shape = {merged_df.shape}")

# --------------------- Create Output Time Series DataFrame ---------------------
# Flatten the (file x timestep) matrix into long-format rows [file_id, timestep, CO2].
data = [
    [series_id.astype(int), t, co2]
    for series_id, series in zip(file_id_order, relevant_data)
    for t, co2 in enumerate(series[:max_timesteps])
]
df_output = (
    pd.DataFrame(data, columns=['file_id', 'timestep', 'CO2'])
    .sort_values(by=['file_id', 'timestep'])
)
print(f"Final output time series: df_output.shape = {df_output.shape}")

# --------------------- Summary ---------------------
print("Data Preparation Summary:")
print(f"Static Input Table: merged_df [{merged_df.shape[0]} rows × {merged_df.shape[1]} columns]")
print(f"Time Series Output: df_output [{df_output.shape[0]} rows × 3 columns]")
print(f"Cluster Boundaries: cluster_boundaries [{cluster_boundaries.shape}]")

Mounting Google Drive and loading dataset...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded dataset with shape: (1192157, 17)
Static feature table created: Input_Link_Table.shape = (2703, 15)
Time series matrix constructed: relevant_data.shape = (2703, 101)
Performed KMeans clustering into 8 clusters
Cluster boundary stats calculated: cluster_boundaries.shape = (8, 4, 101)
Final input features (static + cluster): merged_df.shape = (2703, 16)
Final output time series: df_output.shape = (273003, 3)
Data Preparation Summary:
Static Input Table: merged_df [2703 rows × 16 columns]
Time Series Output: df_output [273003 rows × 3 columns]
Cluster Boundaries: cluster_boundaries [(8, 4, 101)]


In [12]:
# Display the merged table (static inputs plus the cluster label per file_id).
merged_df

Unnamed: 0,file_id,MikeSorghum,Quartz,Plagioclase,Apatite,Ilmenite,Diopside_Mn,Diopside,Olivine,Alkali-feldspar,Montmorillonite,Glass,temp,shift,year,cluster
0,1,0.01,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
1,2,0.10,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
2,3,1.00,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
3,4,10.00,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
4,6,1.00,0.01,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,5,100,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2698,6866,1.00,1.00,1.0,1.0,0.01,1.00,1.0,1.0,1.0,1.0,1.0,25,300,0.0,0
2699,6876,0.01,1.00,1.0,1.0,1.00,1.00,1.0,1.0,1.0,1.0,1.0,25,350,0.0,0
2700,6892,1.00,1.00,1.0,0.1,1.00,1.00,1.0,1.0,1.0,1.0,1.0,25,350,0.0,0
2701,6896,1.00,1.00,1.0,1.0,1.00,0.01,1.0,1.0,1.0,1.0,1.0,25,350,0.0,5


In [13]:
# Positional split of merged_df: the first column is skipped, the last column
# becomes the target, and everything in between becomes the feature matrix.
X, y = merged_df.iloc[:, 1:-1], merged_df.iloc[:, -1]

In [14]:
# Flatten the (file x timestep) matrix into long-format rows
# [file_id as int, timestep index, CO2 value].
data = [
    [file_id.astype(int), timestep, co2_value]
    for file_id, series in zip(file_id_order, relevant_data)
    for timestep, co2_value in enumerate(series)
]

# Long table in matrix order, then the same rows ordered by file_id and timestep.
df = pd.DataFrame(data, columns=['file_id', 'timestep', 'CO2'])
df_output = df.sort_values(by=['file_id', 'timestep'])

# Show the sorted table
print(df_output)

        file_id  timestep       CO2
0             1         0  0.002554
1             1         1  0.335405
2             1         2  0.636192
3             1         3  0.879135
4             1         4  1.071001
...         ...       ...       ...
253505     6912        96  1.764417
253506     6912        97  1.768134
253507     6912        98  1.771791
253508     6912        99  1.775448
253509     6912       100  1.779215

[273003 rows x 3 columns]


In [15]:
# Write the long-format output table to Drive as CSV, leaving out the DataFrame index.
file_path = 'drive/My Drive/output_df.csv'
df_output.to_csv(path_or_buf=file_path, index=False)

**Print out the cluster boundaries figure**

In [17]:
import matplotlib.pyplot as plt
import matplotlib as mpl

# Legend proxy artists come from these two matplotlib submodules.
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

# 🛠 Big Fonts for Large Figure
mpl.rcParams['font.size'] = 24
mpl.rcParams['axes.titlesize'] = 28
mpl.rcParams['axes.labelsize'] = 26
mpl.rcParams['xtick.labelsize'] = 22
mpl.rcParams['ytick.labelsize'] = 22
mpl.rcParams['legend.fontsize'] = 22
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['figure.autolayout'] = True

# 📈 Create Big Figure
plt.figure(figsize=(15, 10))

# One colour per cluster, indexed by the cluster's position in cluster_boundaries.
colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray']

for cluster_pos, bounds in enumerate(cluster_boundaries):
    lower, median_curve, mean_curve, upper = bounds
    shade = colors[cluster_pos]
    # Shaded band spans min..max; dashed line is the median, solid line the mean.
    plt.fill_between(range(relevant_data.shape[1]), lower, upper, color=shade, alpha=0.2)
    plt.plot(median_curve, color=shade, linestyle='--', linewidth=3)
    plt.plot(mean_curve, color=shade, linestyle='-', linewidth=3)

# 📚 Representative legend: neutral-coloured proxies explain the three encodings
# instead of listing every cluster separately.
legend_handles = [
    mpatches.Patch(color='gray', alpha=0.2, label='Shaded: Cluster Range'),
    mlines.Line2D([], [], color='black', linestyle='--', linewidth=3, label='Dashed: Cluster Median'),
    mlines.Line2D([], [], color='black', linestyle='-', linewidth=3, label='Solid: Cluster Mean'),
]
shaded_patch, median_line, mean_line = legend_handles

plt.legend(
    handles=legend_handles,
    loc='lower right',
    frameon=True,
    fancybox=True,
    edgecolor='black',
)

plt.title('Cluster Boundaries, Medians, and Means')
plt.xlabel('Time Steps')
plt.ylabel('Original Scale Value')

plt.tight_layout(pad=3.0)

# 💾 Save Final PDF
plt.savefig('drive/My Drive/DSSM-Figures-final2/Clustering_All_Boundaries.pdf', format='pdf', bbox_inches='tight')
plt.close()

# Interpretation

In [18]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Pivot table for the heatmap: for every (temp, shift) cell, the most frequent
# cluster label among the rows of merged_df in that cell.
# The 'cluster' column is aggregated directly, so this cell depends only on
# merged_df rather than on a separately defined `y` Series and its index
# alignment. mode() returns its values sorted, so [0] picks the smallest label
# when several labels tie.
heatmap_data = pd.pivot_table(
    merged_df,
    values='cluster',
    index='temp',
    columns='shift',
    aggfunc=lambda labels: labels.mode()[0],
)

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(heatmap_data, cmap='coolwarm', annot=True, fmt='.0f')
plt.title('Heatmap of Clusters based on Temp and Shift')
plt.xlabel('Shift')
plt.ylabel('Temp')
plt.tight_layout(pad=3.0)

# 💾 Save the final PDF
plt.savefig('drive/My Drive/DSSM-Figures-final2/Temp_Shift_Heatmaps2.pdf', format='pdf', bbox_inches='tight')
plt.close()

In [19]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import numpy as np

# 🛠 Big Fonts for Large Figure
mpl.rcParams.update({
    # text sizes
    **{
        'font.size': 20,
        'axes.titlesize': 24,
        'axes.labelsize': 22,
        'xtick.labelsize': 18,
        'ytick.labelsize': 18,
        'legend.fontsize': 20,
    },
    # resolution and layout
    **{
        'figure.dpi': 300,
        'savefig.dpi': 300,
        'figure.autolayout': True,
    },
})

# Step 1: Count rows for every (temp, shift, cluster) combination
grouped = (
    merged_df
    .groupby(['temp', 'shift', 'cluster'])
    .size()
    .reset_index(name='counts')
)

# Step 2: Within each (temp, shift) pair, locate the row with the largest count
# and lay the winning cluster labels out as a temp x shift grid
idx = grouped.groupby(['temp', 'shift'])['counts'].idxmax()
mode_data = grouped.loc[idx].pivot(index='temp', columns='shift', values='cluster')

# Step 3: Share of each (temp, shift) pair held by its dominant cluster, in
# percent and rounded to whole numbers
total_counts = merged_df.groupby(['temp', 'shift']).size().unstack(fill_value=0)
percentage_data = (
    grouped.loc[idx]
    .pivot(index='temp', columns='shift', values='counts')
    .div(total_counts)
    .mul(100)
    .round(0)
)

# Replace any missing entries with np.nan so seaborn receives plain NaNs
percentage_data = percentage_data.where(percentage_data.notna(), np.nan)

# Cell label for the percentage heatmap.
def annot_fmt(x):
    """Return '' for a missing value, otherwise the integer part of x as text."""
    return '' if pd.isna(x) else f'{int(x)}'

# Text annotations for every cell of the percentage grid (blank where missing).
annot_text = np.empty_like(percentage_data.values, dtype=object)
for (row_pos, col_pos), cell in np.ndenumerate(percentage_data.values):
    annot_text[row_pos, col_pos] = annot_fmt(cell)

# Step 4: Two heatmaps side by side
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Left panel: most frequent cluster per (temp, shift)
sns.heatmap(
    mode_data,
    cmap='coolwarm',
    annot=True,
    fmt='g',
    ax=axes[0],
    cbar=True,
    annot_kws={"size": 16},
    cbar_kws={'label': 'Cluster ID'},
)

# Right panel: percentage of rows belonging to that dominant cluster,
# annotated with the pre-built text grid
sns.heatmap(
    percentage_data,
    cmap='Blues',
    annot=annot_text,
    fmt='',
    ax=axes[1],
    cbar=True,
    annot_kws={"size": 14},
    cbar_kws={'label': 'Percentage (%)'},
)

# Shared labelling: per-panel title, common axis names, vertical x tick labels
panel_titles = ('Most Frequent Cluster at Temp and Shift', 'Percentage of Dominant Cluster')
for panel, panel_title in zip(axes, panel_titles):
    panel.set_title(panel_title)
    panel.set_xlabel('Shift')
    panel.set_ylabel('Temp')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=90)

# Layout adjustments
plt.tight_layout(pad=3.0)

# 💾 Save the final PDF
plt.savefig('drive/My Drive/DSSM-Figures-final2/Temp_Shift_Heatmaps.pdf', format='pdf', bbox_inches='tight')
plt.close()

In [20]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# 🛠 Big Fonts for Large Figures
mpl.rcParams['font.size'] = 20
mpl.rcParams['axes.titlesize'] = 24
mpl.rcParams['axes.labelsize'] = 22
mpl.rcParams['xtick.labelsize'] = 18
mpl.rcParams['ytick.labelsize'] = 18
mpl.rcParams['legend.fontsize'] = 20
mpl.rcParams['figure.dpi'] = 300
mpl.rcParams['savefig.dpi'] = 300
mpl.rcParams['figure.autolayout'] = True

# Features to plot
features = ['temp', 'shift']

# One figure per feature: its distribution within each cluster, saved as a PDF
for feature in features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=merged_df, x='cluster', y=feature)

    plt.title(f'Cluster-wise Boxplot for {feature}')
    plt.xlabel('Cluster')
    plt.ylabel(f'{feature} Value')
    plt.tight_layout(pad=3.0)

    # 💾 Save, then close the figure so open figures do not accumulate across iterations
    filename = f'drive/My Drive/DSSM-Figures-final2/Boxplot_{feature}.pdf'
    plt.savefig(filename, format='pdf', bbox_inches='tight')
    plt.close()


In [21]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# 🛠 Big Fonts for Large Figure
mpl.rcParams.update({
    # text sizes
    **{
        'font.size': 20,
        'axes.titlesize': 24,
        'axes.labelsize': 22,
        'xtick.labelsize': 18,
        'ytick.labelsize': 18,
        'legend.fontsize': 20,
    },
    # resolution and layout
    **{
        'figure.dpi': 300,
        'savefig.dpi': 300,
        'figure.autolayout': True,
    },
})

# Positional split of merged_df: the first column is skipped, the last column
# is the target, and everything in between is a feature.
X, y = merged_df.iloc[:, 1:-1], merged_df.iloc[:, -1]

# Fit a 100-tree random forest on the full table (fit returns the estimator itself)
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)

# Importances, with positions ordered from most to least important
importances = rf.feature_importances_
indices = np.flip(np.argsort(importances))

# Bar chart of the importances in descending order, labelled by column name
plt.figure(figsize=(10, 6))
plt.bar(range(X.shape[1]), importances[indices], align='center')
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)

plt.title('Feature Importances')
plt.xlabel('Features')
plt.ylabel('Importance')

plt.tight_layout(pad=3.0)

# 💾 Save the figure
plt.savefig('drive/My Drive/DSSM-Figures-final2/Feature_Importances_RF.pdf', format='pdf', bbox_inches='tight')
plt.close()
