In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the input table from a pickle file stored relative to the notebook.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../data/df.pkl')

In [None]:
# Columns pulled out of `df` to build the plot-level feature table.
# The order below is the column order of the resulting frame.
# (Group headings are inferred from the column names.)
plot_features = [
    # identifier
    'plot_id',
    # curvature summaries
    'curve_mean', 'curve_min', 'curve_max',
    'pro_curve_mean', 'pro_curve_min', 'pro_curve_max',
    'plan_curve_mean', 'plan_curve_min', 'plan_curve_max',
    # elevation summaries
    'elev_min', 'elev_max', 'elev_mean',
    'elev_dev_min', 'elev_dev_max', 'elev_dev_mean',
    'total_relief',
    # area
    'area_m2', 'area_ha',
    # aspect, stored as cos/sin pairs
    'aspect_min_cos', 'aspect_min_sin',
    'aspect_max_cos', 'aspect_max_sin',
    'aspect_mean_cos', 'aspect_mean_sin',
    # slope and relief
    'slope_rad', 'slope_grad', 'slope_x', 'slope_y',
    'local_relief', 'total_relief_log',
    # soil columns
    'sandtotal_r', 'silttotal_r', 'claytotal_r',
    'awc_r', 'cec7_r', 'om_r', 'ph1to1h2o_r', 'ec_r',
    'profile_depth', 'max_depth',
    'frag3to10_r', 'fraggt10_r',
    'dbovendry_r', 'caco3_r',
]

In [None]:
# Independent copy of `df` restricted to the plot-level feature columns.
plot_features_df = df.loc[:, plot_features].copy()
# weekly_df = df[weekly_cols].copy()

In [None]:
# features = pd.read_pickle('../data/plot_elev_features.pkl')
# cov_df = pd.read_pickle('../data/ndvi/plots/coefficient_of_var.pkl')

In [None]:
# Keep one copy of each distinct feature row, then renumber the index from 0.
deduplicated_rows = plot_features_df.drop_duplicates()
plot_features_df = deduplicated_rows.reset_index(drop=True)

In [None]:
# Plot-level health score: average the NDVI columns per plot, then combine them.
# NOTE(review): `variance` holds the per-plot mean of `ndvi_std` (a standard
# deviation, not a variance); it is squared when added to the score below.
# NOTE(review): this cell reads NDVI columns from the loaded `df`; a later cell
# rebinds `df` to the merged plot table, so it cannot be re-run after that.
ndvi_aggregations = {
    'mean_vigor': ('ndvi_mean', 'mean'),
    'mean_stability': ('ndvi_cov', 'mean'),
    'variance': ('ndvi_std', 'mean'),
}
health = df.groupby('plot_id').agg(**ndvi_aggregations)

# health = mean_vigor + mean_stability + variance^2
health['health'] = (
    health['mean_vigor']
    + health['mean_stability']
    + health['variance'] ** 2
)

In [None]:
# Display the per-plot health table (indexed by plot_id).
health

In [None]:
# Attach the health metrics to the plot features; the inner join keeps only
# plot_ids present on both sides.
# NOTE: this rebinds `df` - from here on it is the plot-level table, not the
# table loaded from disk.
df = pd.merge(plot_features_df, health, how='inner', on='plot_id')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA

# Terrain predictors used for modelling.
# Disabled candidates: elev_dev_min, elev_dev_max, total_relief, area_ha,
# aspect_min_sin/cos, aspect_max_sin/cos.
terrain_cols = [
    'slope_x', 'slope_y',
    'pro_curve_mean', 'plan_curve_mean',
    'elev_min', 'elev_max', 'elev_mean', 'elev_dev_mean',
    'aspect_mean_sin', 'aspect_mean_cos',
]

# Soil predictors used for modelling.
# Disabled candidates: profile_depth, max_depth, frag3to10_r, fraggt10_r.
soil_cols = [
    'sandtotal_r', 'silttotal_r', 'claytotal_r',
    'awc_r', 'cec7_r', 'om_r', 'ph1to1h2o_r', 'ec_r',
    'dbovendry_r', 'caco3_r',
]

# Plot-level target columns produced by the health aggregation.
health_cols = ['mean_vigor', 'mean_stability', 'health']

In [None]:
# Predictor matrix: terrain columns followed by soil columns.
X = df.loc[:, terrain_cols + soil_cols].copy()
# Target frame: the plot-level health metrics.
y = df.loc[:, health_cols].copy()

In [None]:
# Standardise predictors and targets with SEPARATE scalers.
# Previously one scaler was fitted on X and then refitted on y, so `scaler`
# ended up holding the means/scales of the targets and could no longer be used
# to transform (or inverse-transform) the predictors.
scaler = StandardScaler()          # fitted on the predictors only
X_scaled = scaler.fit_transform(X)

target_scaler = StandardScaler()   # fitted on the health targets only
y_scaled = target_scaler.fit_transform(y)

In [None]:

def _leaf_embedding(estimator, features, target):
    """Fit `estimator` on (features, target) and return its per-sample leaf indices.

    The result of `estimator.apply` is flattened to 2-D with one row per sample.
    """
    estimator.fit(features, target)
    return estimator.apply(features).reshape(len(features), -1)


# The same estimator object is refitted for each target; after this cell
# `model` is the one fitted on `health`.
model = GradientBoostingRegressor(random_state=98723)
leaf_emb_vigor = _leaf_embedding(model, X_scaled, y['mean_vigor'])
leaf_emb_stability = _leaf_embedding(model, X_scaled, y['mean_stability'])
leaf_emb_health = _leaf_embedding(model, X_scaled, y['health'])

data_for_clustering = np.hstack([leaf_emb_vigor, leaf_emb_stability, leaf_emb_health])
# data_for_clustering = leaf_emb_vigor

# `model` was last fitted on `health`, so applying it again would only
# recompute `leaf_emb_health`; reuse that array instead.
leaf_emb = leaf_emb_health

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

# Fit a PCA with all components to see how much variance each one adds.
pca = PCA()
X_scaled_pca = pca.fit_transform(X_scaled)

# Scree plot: cumulative explained variance against the NUMBER of components.
# The x values start at 1 so the axis matches its label (plotting against the
# default 0-based index shifted every point one component to the left).
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure(figsize=(6,4))
plt.plot(component_counts, cumulative_variance_pct, marker='o')
plt.xlabel('Number of PCs')
plt.ylabel('Cumulative Explained Variance (%)')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()


In [None]:
# Project the standardised predictors onto the first two principal components.
pca = PCA(n_components=2)
coords = pca.fit_transform(X_scaled)

# Loadings: one row per input feature, one column per component.
component_labels = [f'PC{number}' for number in range(1, pca.n_components_ + 1)]
loadings = pd.DataFrame(pca.components_.T, index=X.columns, columns=component_labels)

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Elbow analysis must run on the matrix the final KMeans is fitted on.
# The final model is fitted on `coords` (the 2-D PCA projection), so the
# inertia curve is computed on `coords` as well; computing it on the leaf
# embeddings described a different feature space from the one clustered.
data_for_clustering = coords

k_values = range(1, 10)
inertia = []

for k in k_values:
    km = KMeans(n_clusters=k, random_state=4893)
    km.fit(data_for_clustering)
    inertia.append(km.inertia_)

In [None]:
# Elbow curve: within-cluster sum of squares for each candidate k.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, inertia, 'o-', color='blue', linewidth=2)
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Within-cluster Sum of Squares (Inertia)")
ax.set_title("Elbow Method for Optimal k")
ax.grid(True, alpha=0.3)
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Final clustering: KMeans with k clusters on the 2-D PCA coordinates.
k = 3
km = KMeans(n_clusters=k, random_state=864587)
km.fit(coords)

# Store the label of each plot next to its features.
df['cluster'] = km.labels_
df_copy = df.copy()

In [None]:
# Mean PC1/PC2 position of each cluster.
cluster_means = pd.DataFrame(coords, columns=['PC1', 'PC2']).assign(cluster=df['cluster'])
cluster_summary = cluster_means.groupby('cluster').mean()
print(cluster_summary)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Scatter of the plots in the 2-D PCA space, coloured by KMeans cluster.
# `coords` has exactly two columns (PCA(n_components=2)); slice explicitly to 2.
plot_coords = coords[:, :2]

# Cluster centres as the mean PCA position of each cluster's members.
clusters = df['cluster'].values
unique_clusters = np.unique(clusters)
centers = np.array([plot_coords[clusters == c].mean(axis=0) for c in unique_clusters])

# Discrete viridis colormap with one colour per cluster.
# `plt.get_cmap` is used because `plt.cm.get_cmap` was removed in Matplotlib 3.9.
n_clusters = len(unique_clusters)
cmap = plt.get_cmap('viridis', n_clusters)

# Plot
plt.figure(figsize=(8,6))
scatter = plt.scatter(plot_coords[:,0], plot_coords[:,1],
                      c=clusters, cmap=cmap, s=80, alpha=0.2)

# Cluster centers in red
plt.scatter(centers[:,0], centers[:,1], color='red', s=150, marker='X', label='Cluster Centers')

# Colorbar with one tick per actual cluster label
cbar = plt.colorbar(scatter, ticks=unique_clusters)
cbar.set_label("Cluster")
cbar.set_ticklabels([f'Cluster {c}' for c in unique_clusters])

plt.title("Vineyard Block Clusters in PCA Space")
plt.xlabel("PC1")
plt.ylabel("PC2")
# plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('../img/cluster_pca.png')
plt.show()


In [None]:
import pandas as pd
import numpy as np

# Rebuild the loadings table from the currently fitted `pca`
# (rows = input features, columns = principal components).
feature_names = X.columns
component_labels = [f'PC{number}' for number in range(1, pca.n_components_ + 1)]

loadings = pd.DataFrame(
    pca.components_.T,
    index=feature_names,
    columns=component_labels,
)

In [None]:
# Display the feature-by-component loadings table.
loadings

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Grouped bar chart of each feature's loading on PC1 and PC2.
pc_features = loadings[['PC1', 'PC2']]

# One slot per feature; the two bars of a slot sit either side of its centre.
x = np.arange(len(pc_features))
width = 0.35
bar_specs = (
    (-width/2, 'PC1', 'steelblue'),
    (width/2, 'PC2', 'orange'),
)

fig, ax = plt.subplots(figsize=(10,6))
for offset, component, bar_color in bar_specs:
    ax.bar(x + offset, pc_features[component], width, label=component, color=bar_color)

ax.set_xticks(x)
ax.set_xticklabels(pc_features.index, rotation=45, ha='right')
ax.set_ylabel('Feature Weight')
ax.set_title('Feature Contributions to PC1 and PC2')
ax.legend()
plt.tight_layout()
plt.savefig('feat_cont.png')
plt.show()


In [None]:
# Display the plot-level table (features, health metrics and cluster label).
df

In [None]:
# For each principal component, list the ten features with the largest
# absolute loading.
for i, component in enumerate(loadings.columns[:pca.n_components_]):
    print(f"\nTop features for PC{i+1}:")
    ranked = loadings[component].abs().sort_values(ascending=False)
    display(ranked.head(10))


In [None]:
# Columns summarised per cluster: soil, then terrain, then health metrics.
special_cols = [*soil_cols, *terrain_cols, *health_cols]
# special_cols.append('cluster')

In [None]:
# Restrict the plot table to the columns used for the per-cluster summaries.
df_copy = df.loc[:, special_cols]

In [None]:
# Standardise the summary columns with a dedicated scaler, so the scaler that
# was fitted on the predictors is not refitted (and silently changed) here.
# The original index is carried over because the cluster labels are attached
# afterwards by index alignment.
df_copy_scaled = pd.DataFrame(
    StandardScaler().fit_transform(df_copy),
    columns=df_copy.columns,
    index=df_copy.index,
)

In [None]:
# Attach each plot's cluster label to the standardised table.
# Assignment of a Series aligns on index, so both frames must share the same index.
df_copy_scaled['cluster'] = df['cluster'].copy() 

In [None]:
# for feature in special_cols:
#     plt.figure(figsize=(8, 6))
#     # Create boxplot grouped by cluster
#     data_to_plot = [df_copy_scaled[df_copy_scaled['cluster'] == c][feature] for c in sorted(df['cluster'].unique())]
#     plt.boxplot(data_to_plot, labels=sorted(df_copy_scaled['cluster'].unique()), patch_artist=True)
#     plt.title(f"{feature} distribution by cluster")
#     plt.xlabel("Cluster")
#     plt.ylabel(feature)
#     plt.show()

In [None]:
from scipy.stats import kruskal

# Kruskal-Wallis test of `mean_stability` across clusters (3-decimal report).
stability_by_cluster = [
    df.loc[df['cluster'] == label, 'mean_stability']
    for label in sorted(df['cluster'].unique())
]
h_stat, p_value = kruskal(*stability_by_cluster)

print(f"Kruskal-Wallis H-statistic: {h_stat:.3f}, p-value: {p_value:.3f}")

# Same test for every health metric, one line per target.
for target in ['mean_vigor', 'mean_stability', 'health']:
    groups = [
        df.loc[df['cluster'] == label, target]
        for label in sorted(df['cluster'].unique())
    ]
    h, p = kruskal(*groups)
    print(f"{target}: H={h:.2f}, p={p:.4f}")

In [None]:
import pandas as pd
from scipy.stats import kruskal

# Collect the Kruskal-Wallis statistic and p-value of each health metric
# into one table (rows = targets).
targets = ['mean_vigor', 'mean_stability', 'health']
unique_clusters = sorted(df['cluster'].unique())

results = {}
for target in targets:
    samples = [df.loc[df['cluster'] == label, target] for label in unique_clusters]
    statistic, pvalue = kruskal(*samples)
    results[target] = {'H_stat': statistic, 'p_value': pvalue}

pd.DataFrame(results).T


In [None]:
# import sys
# print(sys.executable)

# !{sys.executable} -m pip install scikit-posthocs
import scikit_posthocs as sp

# Pairwise Dunn post-hoc comparisons between clusters for each target,
# with Bonferroni-adjusted p-values.
for target in targets:
    print(f"\nPost-hoc pairwise Dunn test for {target}:")
    data = df.loc[:, [target, 'cluster']]
    dunn = sp.posthoc_dunn(
        data,
        val_col=target,
        group_col='cluster',
        p_adjust='bonferroni',
    )
    print(dunn)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per health metric, grouped by cluster.
for target in targets:
    ax = sns.boxplot(data=df, x='cluster', y=target)
    ax.set_title(target)
    plt.show()


In [None]:
def kruskal_eta_squared(H, n_total, n_groups=None):
    """Effect size for a Kruskal-Wallis H statistic.

    Parameters
    ----------
    H : float
        Kruskal-Wallis H statistic.
    n_total : int
        Total number of observations across all groups (must be >= 2).
    n_groups : int, optional
        Number of groups compared. When given, the eta-squared estimate
        ``(H - k + 1) / (n - k)`` is returned.

    Returns
    -------
    float
        With ``n_groups=None`` (the original behaviour):
        ``H * (n + 1) / (n**2 - 1)``, which simplifies to ``H / (n - 1)``.
        This is the epsilon-squared estimate, not eta-squared, despite the
        function name. With ``n_groups`` given: eta-squared.

    Raises
    ------
    ValueError
        If ``n_total < 2``, or if ``n_groups`` is given and
        ``n_total <= n_groups`` (the denominator would be zero or negative).
    """
    if n_total < 2:
        raise ValueError("n_total must be at least 2")
    if n_groups is None:
        # Backward-compatible default: epsilon-squared.
        return H * (n_total + 1) / (n_total**2 - 1)
    if n_total <= n_groups:
        raise ValueError("n_total must be greater than n_groups")
    return (H - n_groups + 1) / (n_total - n_groups)


In [None]:
from sklearn.metrics import silhouette_score
# The silhouette score has to be evaluated in the space the labels were fitted
# in. The labels in df['cluster'] come from KMeans on `coords`, so score them
# against `coords` (previously they were scored against the leaf embeddings,
# which measures a different geometry).
sil = silhouette_score(coords, df['cluster'])
print(f"Silhouette score: {sil:.3f}")

In [None]:
# Mean of every standardised summary column within each cluster.
# Note: this rebinds `cluster_means`, which earlier held the PCA coordinates table.
cluster_means = df_copy_scaled.groupby('cluster')[special_cols].mean()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Radar chart of the per-cluster mean of every soil, terrain and health column.
# `plt.get_cmap` is used below because `matplotlib.cm.get_cmap` was removed in
# Matplotlib 3.9 and importing it fails there.
features = soil_cols + terrain_cols + health_cols
n_features = len(features)

# Cluster-wise means of the standardised columns.
cluster_means = df_copy_scaled.groupby('cluster')[features].mean()

# Min-max rescale each feature across clusters to 0-1 for plotting.
# A feature whose cluster means are all equal has zero span; divide by 1 there
# so it plots at 0 instead of producing NaN and breaking the polygon.
feature_min = cluster_means.min()
feature_span = cluster_means.max() - feature_min
feature_span = feature_span.where(feature_span != 0, 1.0)
cluster_means_norm = (cluster_means - feature_min) / feature_span

# Angles for radar plot
angles = np.linspace(0, 2*np.pi, n_features, endpoint=False).tolist()
angles += angles[:1]  # close the loop

# Discrete colormap: assign one color per cluster
n_clusters = cluster_means_norm.shape[0]
cmap = plt.get_cmap('viridis', n_clusters)
colors = [cmap(i) for i in range(n_clusters)]

fig, ax = plt.subplots(figsize=(10,10), subplot_kw=dict(polar=True))

# Plot each cluster with discrete color
for i, (cluster_id, row) in enumerate(cluster_means_norm.iterrows()):
    values = row.tolist()
    values += values[:1]  # close the loop
    ax.plot(angles, values, label=f'Cluster {cluster_id}', linewidth=2, color=colors[i])
    ax.fill(angles, values, alpha=0.25, color=colors[i])

# Feature labels
ax.set_xticks(angles[:-1])
ax.set_xticklabels(features, rotation=45, ha='right')

# Labels on the right half of the circle are left-aligned, the rest
# right-aligned, so the text grows away from the plot.
for label, angle in zip(ax.get_xticklabels(), angles[:-1]):
    angle_deg = np.degrees(angle)
    if 0 <= angle_deg <= 90 or angle_deg > 270:
        label.set_horizontalalignment('left')
    else:
        label.set_horizontalalignment('right')

# Emphasise the health metrics among the feature labels.
for label in ax.get_xticklabels():
    if label.get_text() in ['health', 'mean_vigor', 'mean_stability']:
        label.set_fontweight('bold')

ax.set_title('Cluster Analysis\nTopography Effect on Vineyard Plot NDVI', fontsize=14, y = 1.07)
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.savefig('../img/radar_cluster.png')
plt.show()


In [None]:
# Display the clustered plot-level table.
df

In [None]:
# Export view: plot id, cluster label and the health metrics.
# Note: this rebinds `clusters`, which earlier held the array of cluster labels.
clusters = df[['plot_id', 'cluster', 'mean_vigor', 'mean_stability','variance','health']]

In [None]:
# Write the export view to a CSV in the notebook's working directory.
# The frame's index is written as the first (unnamed) column.
clusters.to_csv('clusters.csv')

In [None]:
# Persist the full clustered table as a pickle in the notebook's working directory.
df.to_pickle('df_clustered.pkl')

In [None]:
# Overlaid histograms of the health score, one per cluster.
# Iterating over the labels actually present replaces three copy-pasted calls
# hard-coded to clusters 0, 1 and 2, so the cell still works if k changes.
for cluster_id in sorted(df['cluster'].unique()):
    plt.hist(
        df.loc[df['cluster'] == cluster_id, 'health'],
        label=str(cluster_id),
        alpha=0.3,
    )

plt.xlabel('Health')
plt.ylabel('Count')
plt.title('Health by Cluster')
plt.legend(title='Cluster')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Kernel density estimate of the health score for each cluster, with the
# cluster mean marked by a dashed vertical line.
# Labels are sorted so colours and legend order do not depend on row order.
clusters = np.sort(df['cluster'].unique())

plt.figure(figsize=(10, 6))

for c in clusters:
    subset = df[df['cluster'] == c]['health'].dropna()

    # gaussian_kde cannot be built from fewer than two distinct values.
    if subset.nunique() < 2:
        continue

    mean = np.mean(subset)

    # Build the KDE and evaluate it on a grid spanning the cluster's data range.
    kde = gaussian_kde(subset)
    x_vals = np.linspace(subset.min(), subset.max(), 300)
    pdf_vals = kde(x_vals)

    plt.plot(x_vals, pdf_vals, label=f'Cluster {c}')

    # axvline spans the full axes height, replacing a line hard-coded to y in [0, 6].
    plt.axvline(mean, linestyle='--', color='k')

plt.xlabel('Health')
plt.ylabel('Probability Density')
plt.title('KDE (PDF) of Health by Cluster')
plt.legend()
plt.show()
