In [None]:
%load_ext autoreload
%autoreload 2
import sys

# instead of creating a package using setup.py or building from a docker/singularity file,
# import the sister directory of src code to be called on in notebook.
# This keeps the notebook free from code to only hold visualizations and is easier to test
# It also helps keep the state of variables clean such that cells aren't run out of order with a mysterious state
sys.path.append("..")

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans

# import sklearn.cluster.hierarchical as hclust
from sklearn import preprocessing
import seaborn as sns

from src import most_recent_mesonet_data
from src import most_recent_mesonet_time
from src import landtype_describe
from src.plotting_scripts import landtype
import os
import pandas as pd
import cartopy.crs as crs
import cartopy.feature as cfeature

In [None]:
def plurality_plot(df, lon_col="lon", lat_col="lat"):
    """Draw the stations in ``df`` on a Plate Carree map, one marker per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the coordinate columns named by ``lon_col`` / ``lat_col``
        and a ``"color"`` column, which is passed to ``scatter`` as the marker
        colors.
    lon_col, lat_col : str, optional
        Names of the longitude / latitude columns (default ``"lon"``, ``"lat"``).

    Returns
    -------
    None
        A new figure is created with ``plt.subplots``; nothing is returned.
    """
    proj = crs.PlateCarree()

    # Bounding box of the data, padded by one degree on every side.
    lat_north = df[lat_col].max() + 1
    lat_south = df[lat_col].min() - 1
    lon_east = df[lon_col].max() + 1
    lon_west = df[lon_col].min() - 1

    fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={"projection": proj}, dpi=400)
    ax.set_extent([lon_west, lon_east, lat_south, lat_north], crs=proj)

    # Base map layers.
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle="--")
    ax.add_feature(cfeature.LAKES, alpha=0.5)
    ax.add_feature(cfeature.STATES)

    gridliner = ax.gridlines(
        crs=proj,
        draw_labels=True,
        linewidth=2,
        color="black",
        alpha=0.5,
        linestyle="--",
    )
    # The label switches live on the Gridliner returned by gridlines();
    # setting look-alike attributes on the axes has no effect.
    gridliner.top_labels = False
    gridliner.right_labels = False

    ax.scatter(
        x=df[lon_col],
        y=df[lat_col],
        c=df["color"],
        s=40,
        marker="o",
        edgecolor="black",
        transform=proj,
    )
    ax.set_title("Mesonet Site Groups", size=16)
    ax.set_xlabel("Longitude", size=14)
    ax.set_ylabel("Latitude", size=14)
    ax.tick_params(axis="x", labelsize=12)
    ax.tick_params(axis="y", labelsize=12)
    ax.grid()

In [None]:
# Input file locations (absolute paths on the original author's machine).
# NOTE(review): ny_mesonet_data_path is not referenced by any other cell
# visible in this notebook.
ny_mesonet_data_path = "/home/aevans/nysm/archive/nysm/netcdf/proc"
# CSV that is read into ok_df; it supplies the "elon" / "nlat" coordinate columns.
ok_mesonet_data_path = "/home/aevans/landtype/geoinfo.csv"

In [None]:
# Load the OK mesonet station table (geoinfo.csv) into a dataframe
ok_df = pd.read_csv(ok_mesonet_data_path)

In [None]:
# Station longitudes ("elon") and latitudes ("nlat") as plain lists, in ok_df row order.
# These lists are later assigned positionally to the station-sorted cluster tables.
ok_df_lons = ok_df["elon"].to_list()
ok_df_lats = ok_df["nlat"].to_list()

In [None]:
# Per-station table used for the first clustering pass (land use / land cover,
# judging by the variable name and path — TODO confirm); has "site" and "station" columns.
df_lulc = pd.read_csv("/home/aevans/landtype/data/OKbuffer_10_percent.csv")

In [None]:
# Drop the identifier columns ("site", "station"); every remaining column is
# treated as a clustering feature.
features = df_lulc.drop(["site", "station"], axis=1)

In [None]:
# Summary statistics of the features before scaling
features.describe()

In [None]:
# Normalize the data: min-max scale every feature column.
# fit_transform returns a plain array, so column names are not carried over.
scaler = preprocessing.MinMaxScaler()
features_normal = scaler.fit_transform(features)

In [None]:
# Summary statistics of the scaled features (columns are numbered because the scaler returns an array)
pd.DataFrame(features_normal).describe()

In [None]:
# K-Means elbow scan: fit one model per k = 1..19 and record its inertia.
inertia = []
K = range(1, 20)
for k in K:
    # A single fit per k is enough; the fixed seed makes the curve reproducible
    # across kernel restarts.
    kmeanModel = KMeans(n_clusters=k, random_state=1).fit(features_normal)
    inertia.append(kmeanModel.inertia_)

In [None]:
# Elbow curve: inertia as a function of the number of clusters k
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(K, inertia, "bx-")
elbow_ax.set_xlabel("k")
elbow_ax.set_ylabel("Inertia")
plt.show()

In [None]:
# Final model for this feature set: 6 clusters (picked by eye from the elbow
# curve). The fixed seed keeps the cluster labels stable between runs.
kmeans = KMeans(n_clusters=6, random_state=1).fit(features_normal)

In [None]:
# Wrap the fitted labels in a one-column frame (column name 0) so they can be
# joined back onto the unscaled features by row index.
labels = pd.DataFrame(kmeans.labels_)
labeledClasses = pd.concat([features, labels], axis=1).rename(columns={0: "labels"})

In [None]:
# Preview the features with their cluster label appended
labeledClasses.head()

In [None]:
# Pairwise scatter matrix of all features, colored by cluster label
sns.pairplot(labeledClasses, hue="labels")

In [None]:
# Dummy column holding one constant value; it gives the strip/swarm plots a
# single shared x category. The value itself is arbitrary.
labeledClasses[
    "Constant"
] = "Data"

In [None]:
# 4x5 grid of panels, one strip plot per feature; unused panels stay empty.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)  # leave room between panels
# The last two columns ("labels" and "Constant") are not features, so skip them.
# Feature i goes to row i // 5, column i % 5 (more than 20 features raises IndexError).
for i, col in enumerate(labeledClasses.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.stripplot(
        x=labeledClasses["Constant"],
        y=labeledClasses[col].values,
        hue=labeledClasses["labels"],
        jitter=True,
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Same 4x5 layout as the strip plots, drawn as swarm plots instead.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)
# Skip the trailing "labels" and "Constant" columns; feature i lands in
# row i // 5, column i % 5.
for i, col in enumerate(labeledClasses.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.swarmplot(
        x=labeledClasses["Constant"],
        y=labeledClasses[col].values,
        hue=labeledClasses["labels"],
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Station id next to its land-cover cluster label, ordered by station.
classes = pd.concat([df_lulc["station"], labels], axis=1).rename(
    columns={0: "Cluster"}
)
sortclasses = classes.sort_values(by="station")
pd.set_option("display.max_rows", 1000)
# NOTE(review): the coordinate lists are assigned by position, so this assumes
# ok_df has one row per station in the same (sorted) station order — confirm.
sortclasses["lon"] = ok_df_lons
sortclasses["lat"] = ok_df_lats

In [None]:
# One named color per possible cluster id (ids 0..11).
colors = [
    "black",
    "blue",
    "green",
    "red",
    "orange",
    "yellow",
    "white",
    "purple",
    "cyan",
    "plum",
    "coral",
    "pink",
]
legend = list(range(len(colors)))

# cluster id -> color name
colordict = dict(zip(legend, colors))

sortclasses["color"] = sortclasses["Cluster"].map(colordict)

In [None]:
# Map of the stations colored by their land-cover cluster
plurality_plot(sortclasses)

# Let's cluster on Elevation

In [None]:
# Per-station elevation table; carries "station", "lon", "lat" and an "Unnamed: 0" column
elev_df = pd.read_csv("/home/aevans/landtype/elevation/data/OK/elev/ok30_df.csv")
elev_df.head()

In [None]:
# Drop the station id, the coordinates and "Unnamed: 0" (presumably a saved
# index column — confirm); the remaining columns are the elevation features.
features_elev = elev_df.drop(["station", "Unnamed: 0", "lon", "lat"], axis=1)

In [None]:
# Normalize the data: min-max scale every elevation feature column
scaler = preprocessing.MinMaxScaler()
features_norm = scaler.fit_transform(features_elev)

In [None]:
# Summary statistics of the scaled elevation features
pd.DataFrame(features_norm).describe()

In [None]:
# K-Means elbow scan on the elevation features: one model per k = 1..14.
inertia_elev = []
K = range(1, 15)
for k in K:
    # A single fit per k is enough; the fixed seed makes the curve reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=1).fit(features_norm)
    inertia_elev.append(kmeanModel.inertia_)

In [None]:
# Elbow curve for the elevation features
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(K, inertia_elev, "bx-")
elbow_ax.set_xlabel("k")
elbow_ax.set_ylabel("Inertia")
plt.show()

In [None]:
# Final elevation model: 6 clusters (picked by eye from the elbow curve).
# The fixed seed keeps the cluster labels stable between runs.
kmeans_elev = KMeans(n_clusters=6, random_state=1).fit(features_norm)

In [None]:
# One-column frame of elevation cluster labels (column name 0), joined onto the
# unscaled elevation features by row index.
labels_elev = pd.DataFrame(kmeans_elev.labels_)
labeledElev = pd.concat([features_elev, labels_elev], axis=1).rename(
    columns={0: "labels"}
)

In [None]:
# Preview the elevation features with their cluster label appended
labeledElev.head()

In [None]:
# Pairwise scatter matrix of the elevation features, colored by cluster label
sns.pairplot(labeledElev, hue="labels")

In [None]:
# Dummy constant column: a single shared x category for the strip/swarm plots.
# The value itself is arbitrary.
labeledElev[
    "Constant"
] = "Data"

In [None]:
# 4x5 grid of panels, one strip plot per elevation feature; unused panels stay empty.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)  # leave room between panels
# The last two columns ("labels" and "Constant") are not features, so skip them.
# Feature i goes to row i // 5, column i % 5 (more than 20 features raises IndexError).
for i, col in enumerate(labeledElev.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.stripplot(
        x=labeledElev["Constant"],
        y=labeledElev[col].values,
        hue=labeledElev["labels"],
        jitter=True,
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Same 4x5 layout as the strip plots, drawn as swarm plots instead.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)
# Skip the trailing "labels" and "Constant" columns; feature i lands in
# row i // 5, column i % 5.
for i, col in enumerate(labeledElev.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.swarmplot(
        x=labeledElev["Constant"],
        y=labeledElev[col].values,
        hue=labeledElev["labels"],
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Station id next to its elevation cluster label, ordered by station.
# labels_elev follows the row order of elev_df (the frame the features came
# from), so the station ids must come from elev_df as well; taking them from
# df_lulc only works if both files happen to list stations identically.
elevs = elev_df["station"]
elevs = pd.concat((elevs, labels_elev), axis=1)
elevs = elevs.rename({0: "Cluster"}, axis=1)
sortelevs = elevs.sort_values(["station"])
pd.set_option("display.max_rows", 1000)
# NOTE(review): the coordinate lists are assigned by position, so this assumes
# ok_df has one row per station in the same (sorted) station order — confirm.
sortelevs["lon"] = ok_df_lons
sortelevs["lat"] = ok_df_lats

In [None]:
# One named color per possible cluster id (ids 0..11).
colors = [
    "black",
    "blue",
    "green",
    "red",
    "orange",
    "yellow",
    "white",
    "purple",
    "cyan",
    "plum",
    "coral",
    "pink",
]
legend = list(range(len(colors)))

# cluster id -> color name
colordict = dict(zip(legend, colors))

sortelevs["color"] = sortelevs["Cluster"].map(colordict)

In [None]:
# Map of the stations colored by their elevation cluster
plurality_plot(sortelevs)

# Let's cluster on Slope

In [None]:
# Per-station slope table; has "site" and "station" identifier columns
slope_df = pd.read_csv(
    "/home/aevans/landtype/elevation/data/OK/slope/sl_percent_30.csv"
)
slope_df.head()

In [None]:
# Drop the identifier columns ("site", "station"); every remaining column is
# treated as a slope feature.
features_slope = slope_df.drop(["site", "station"], axis=1)

In [None]:
# Normalize the data: min-max scale every slope feature column
scaler = preprocessing.MinMaxScaler()
features_norm_sl = scaler.fit_transform(features_slope)

In [None]:
# Summary statistics of the scaled slope features
pd.DataFrame(features_norm_sl).describe()

In [None]:
# K-Means elbow scan on the slope features: one model per k = 1..14.
inertia_slope = []
K = range(1, 15)
for k in K:
    # A single fit per k is enough; the fixed seed makes the curve reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=1).fit(features_norm_sl)
    inertia_slope.append(kmeanModel.inertia_)

In [None]:
# Elbow curve for the slope features
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(K, inertia_slope, "bx-")
elbow_ax.set_xlabel("k")
elbow_ax.set_ylabel("Inertia")
plt.show()

In [None]:
# Final slope model: 4 clusters (picked by eye from the elbow curve).
# The fixed seed keeps the cluster labels stable between runs.
kmeans_slope = KMeans(n_clusters=4, random_state=1).fit(features_norm_sl)

In [None]:
# One-column frame of slope cluster labels (column name 0), joined onto the
# unscaled slope features by row index.
labels_slope = pd.DataFrame(kmeans_slope.labels_)
labeledSlope = pd.concat([features_slope, labels_slope], axis=1).rename(
    columns={0: "labels"}
)

In [None]:
# Preview the slope features with their cluster label appended
labeledSlope.head()

In [None]:
# Pairwise scatter matrix of the slope features, colored by cluster label
sns.pairplot(labeledSlope, hue="labels")

In [None]:
# Dummy constant column: a single shared x category for the strip/swarm plots.
# The value itself is arbitrary.
labeledSlope[
    "Constant"
] = "Data"

In [None]:
# 4x5 grid of panels, one strip plot per slope feature; unused panels stay empty.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)  # leave room between panels
# The last two columns ("labels" and "Constant") are not features, so skip them.
# Feature i goes to row i // 5, column i % 5 (more than 20 features raises IndexError).
for i, col in enumerate(labeledSlope.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.stripplot(
        x=labeledSlope["Constant"],
        y=labeledSlope[col].values,
        hue=labeledSlope["labels"],
        jitter=True,
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Same 4x5 layout as the strip plots, drawn as swarm plots instead.
f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
f.subplots_adjust(hspace=0.2, wspace=0.7)
# Skip the trailing "labels" and "Constant" columns; feature i lands in
# row i // 5, column i % 5.
for i, col in enumerate(labeledSlope.columns[:-2]):
    grid_row, grid_col = divmod(i, 5)
    ax = sns.swarmplot(
        x=labeledSlope["Constant"],
        y=labeledSlope[col].values,
        hue=labeledSlope["labels"],
        ax=axes[grid_row, grid_col],
    )
    ax.set_title(col)

In [None]:
# Station id next to its slope cluster label, ordered by station.
# labels_slope follows the row order of slope_df (the frame the features came
# from), so the station ids must come from slope_df as well; taking them from
# df_lulc only works if both files happen to list stations identically.
slopes = slope_df["station"]
slopes = pd.concat((slopes, labels_slope), axis=1)
slopes = slopes.rename({0: "Cluster"}, axis=1)
sortslopes = slopes.sort_values(["station"])
pd.set_option("display.max_rows", 1000)
# NOTE(review): the coordinate lists are assigned by position, so this assumes
# ok_df has one row per station in the same (sorted) station order — confirm.
sortslopes["lon"] = ok_df_lons
sortslopes["lat"] = ok_df_lats

In [None]:
# One named color per possible cluster id (ids 0..11).
colors = [
    "black",
    "blue",
    "green",
    "red",
    "orange",
    "yellow",
    "white",
    "purple",
    "cyan",
    "plum",
    "coral",
    "pink",
]
legend = list(range(len(colors)))

# cluster id -> color name
colordict = dict(zip(legend, colors))

sortslopes["color"] = sortslopes["Cluster"].map(colordict)

In [None]:
# Draw the three single-variable cluster maps one after another; each print
# emits a text label for the map that follows it.
print("Slopes")
plurality_plot(sortslopes)
print("Landtype")
plurality_plot(sortclasses)
print("Elevation")
plurality_plot(sortelevs)

In [None]:
# for i in np.arange(1,7):
#     slopes_cat = sortslopes.loc[sortslopes['Cluster'] == i]
#     slopes_cat = slopes_cat.loc[sortelevs['Cluster'] == i]
#     print(f'category {i}', slopes_cat)
#     slopes_cat.to_csv(f'/home/aevans/landtype/data/clean_cats/ok/df_{i}.csv')

In [None]:
# dirs = sorted(os.listdir('/home/aevans/landtype/data/clean_cats/ok/'))

In [None]:
# for n,_ in enumerate(dirs):
#     df = pd.read_csv(f'/home/aevans/landtype/data/clean_cats/ok/{dirs[n]}')
#     plurality_plot(df)

# Let's Cluster on All Vars

In [None]:
# Join the three per-station tables on "station" (pd.merge defaults to an inner
# join, so only stations present in all three survive). Columns that exist in
# more than one input get _x / _y suffixes, e.g. "site_x" and "site_y".
comb = pd.merge(df_lulc, elev_df, on="station")
comb_df = pd.merge(comb, slope_df, on="station")

In [None]:
# Preview the combined table
comb_df.head()

In [None]:
# Keep only the feature columns: drop the identifiers, the coordinates,
# "Unnamed: 0" and "variance".
# NOTE(review): why "variance" is excluded is not evident from this notebook — confirm.
features_all = comb_df.drop(
    ["site_x", "station", "variance", "Unnamed: 0", "lon", "lat", "site_y"], axis=1
)

In [None]:
# Preview the combined feature matrix
features_all.head()

In [None]:
# Normalize the data: min-max scale every combined feature column
scaler = preprocessing.MinMaxScaler()
features_norm_a = scaler.fit_transform(features_all)

In [None]:
# K-Means elbow scan on the combined features: one model per k = 1..19.
# NOTE(review): this reuses the name inertia_elev and so overwrites the
# elevation-only list; the name is kept because the elbow-plot cell that
# follows reads it.
inertia_elev = []
K = range(1, 20)
for k in K:
    # A single fit per k is enough; the fixed seed makes the curve reproducible.
    kmeanModel = KMeans(n_clusters=k, random_state=1).fit(features_norm_a)
    inertia_elev.append(kmeanModel.inertia_)

In [None]:
# Elbow curve for the combined features (read from inertia_elev)
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(K, inertia_elev, "bx-")
elbow_ax.set_xlabel("k")
elbow_ax.set_ylabel("Inertia")
plt.show()

In [None]:
# Final combined model: 6 clusters (picked by eye from the elbow curve).
# The fixed seed keeps the cluster labels stable between runs.
kmeans_all = KMeans(n_clusters=6, random_state=1).fit(features_norm_a)

In [None]:
# One-column frame of combined-model cluster labels (column name 0), joined
# onto the unscaled combined features by row index.
labels_a = pd.DataFrame(kmeans_all.labels_)
labeledAll = pd.concat([features_all, labels_a], axis=1).rename(
    columns={0: "labels"}
)

In [None]:
# Pairwise scatter matrix of the combined features, colored by cluster label
sns.pairplot(labeledAll, hue="labels")

In [None]:
# Dummy constant column: a single shared x category for strip/swarm plots.
# The value itself is arbitrary.
labeledAll[
    "Constant"
] = "Data"

In [None]:
# f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False) #create a 4x5 grid of empty figures where we will plot our feature plots. We will have a couple empty ones.
# f.subplots_adjust(hspace=0.2, wspace=0.7) #Scooch em apart, give em some room
# #In this for loop, I step through every column that I want to plot. This is a 4x5 grid, so I split this up by rows of 5 in the else if statements
# for i in range(0,len(list(labeledAll))-2): #minus two because I don't want to plot labels or constant
#     col = labeledAll.columns[i]
#     if i < 5:
#         ax = sns.stripplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],jitter=True,ax=axes[0,(i)])
#         ax.set_title(col)
#     elif i >= 5 and i<10:
#         ax = sns.stripplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],jitter=True,ax=axes[1,(i-5)]) #so if i=6 it is row 1 column 1
#         ax.set_title(col)
#     elif i >= 10 and i<15:
#         ax = sns.stripplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],jitter=True,ax=axes[2,(i-10)])
#         ax.set_title(col)
#     elif i >= 15:
#         ax = sns.stripplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],jitter=True,ax=axes[3,(i-15)])
#         ax.set_title(col)

In [None]:
# f, axes = plt.subplots(4, 5, figsize=(20, 25), sharex=False)
# f.subplots_adjust(hspace=0.2, wspace=0.7)
# for i in range(0,len(list(labeledAll))-2):
#     col = labeledAll.columns[i]
#     if i < 5:
#         ax = sns.swarmplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],ax=axes[0,(i)])
#         ax.set_title(col)
#     elif i >= 5 and i<10:
#         ax = sns.swarmplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],ax=axes[1,(i-5)])
#         ax.set_title(col)
#     elif i >= 10 and i<15:
#         ax = sns.swarmplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],ax=axes[2,(i-10)])
#         ax.set_title(col)
#     elif i >= 15:
#         ax = sns.swarmplot(x=labeledAll['Constant'],y=labeledAll[col].values,hue=labeledAll['labels'],ax=axes[3,(i-15)])
#         ax.set_title(col)

In [None]:
# Station id next to its combined-model cluster label, ordered by station.
# labels_a follows the row order of comb_df (the merged frame the features came
# from), so the station ids must come from comb_df as well; df_lulc can contain
# stations that the inner merges dropped.
alls = comb_df["station"]
alls = pd.concat((alls, labels_a), axis=1)
alls = alls.rename({0: "Cluster"}, axis=1)
sortAll = alls.sort_values(["station"])
pd.set_option("display.max_rows", 1000)
# NOTE(review): the coordinate lists are assigned by position, so this assumes
# ok_df has one row per station in the same (sorted) station order — confirm.
sortAll["lon"] = ok_df_lons
sortAll["lat"] = ok_df_lats

In [None]:
# One named color per possible cluster id (ids 0..11).
colors = [
    "black",
    "blue",
    "green",
    "red",
    "orange",
    "yellow",
    "grey",
    "purple",
    "cyan",
    "plum",
    "coral",
    "pink",
]
legend = list(range(len(colors)))

# cluster id -> color name
colordict = dict(zip(legend, colors))

sortAll["color"] = sortAll["Cluster"].map(colordict)

In [None]:
# Map of the stations colored by their combined-model cluster
plurality_plot(sortAll)

In [None]:
# Additional per-station table, merged into each cluster's rows on "station" below
df_land = pd.read_csv("/home/aevans/landtype/data/buffer_10_km/avg_lulc_ok_10.csv")

In [None]:
# Build one enriched table per cluster: the cluster's stations joined with the
# combined features and with df_land. Iterate over the cluster ids that are
# actually present rather than a fixed 0..9 range, which produced empty tables
# for ids that do not exist.
for i in np.sort(sortAll["Cluster"].unique()):
    slopes_cat = sortAll.loc[sortAll["Cluster"] == i]
    # Both sortAll and comb_df carry "lon"/"lat", so the merged frame holds
    # them as lon_x/lat_x and lon_y/lat_y.
    slopes_cat = pd.merge(slopes_cat, comb_df, on="station")
    slopes_cat = pd.merge(slopes_cat, df_land, on="station")
    # slopes_cat.to_csv(f'/home/aevans/landtype/data/clean_cats/ok/df_{i}.csv')

In [None]:
def plurality_plot_x(df, lon_col="lon_x", lat_col="lat_x"):
    """Draw the stations in ``df`` on a Plate Carree map, one marker per row.

    Variant of the station map for frames whose coordinate columns carry the
    ``_x`` merge suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the coordinate columns named by ``lon_col`` / ``lat_col``
        and a ``"color"`` column, which is passed to ``scatter`` as the marker
        colors.
    lon_col, lat_col : str, optional
        Names of the longitude / latitude columns (default ``"lon_x"``,
        ``"lat_x"``).

    Returns
    -------
    None
        A new figure is created with ``plt.subplots``; nothing is returned.
    """
    proj = crs.PlateCarree()

    # Bounding box of the data, padded by one degree on every side.
    lat_north = df[lat_col].max() + 1
    lat_south = df[lat_col].min() - 1
    lon_east = df[lon_col].max() + 1
    lon_west = df[lon_col].min() - 1

    fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={"projection": proj}, dpi=400)
    ax.set_extent([lon_west, lon_east, lat_south, lat_north], crs=proj)

    # Base map layers.
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle="--")
    ax.add_feature(cfeature.LAKES, alpha=0.5)
    ax.add_feature(cfeature.STATES)

    gridliner = ax.gridlines(
        crs=proj,
        draw_labels=True,
        linewidth=2,
        color="black",
        alpha=0.5,
        linestyle="--",
    )
    # The label switches live on the Gridliner returned by gridlines();
    # setting look-alike attributes on the axes has no effect.
    gridliner.top_labels = False
    gridliner.right_labels = False

    ax.scatter(
        x=df[lon_col],
        y=df[lat_col],
        c=df["color"],
        s=40,
        marker="o",
        edgecolor="black",
        transform=proj,
    )
    ax.set_title("Mesonet Site Groups", size=16)
    ax.set_xlabel("Longitude", size=14)
    ax.set_ylabel("Latitude", size=14)
    ax.tick_params(axis="x", labelsize=12)
    ax.tick_params(axis="y", labelsize=12)
    ax.grid()

In [None]:
# Sorted listing of the per-cluster CSV directory.
# NOTE(review): `dirs` is not read by any later cell visible in this notebook.
dirs = sorted(os.listdir("/home/aevans/landtype/data/clean_cats/ok/"))

In [None]:
# Saved per-cluster table df_0.csv (presumably cluster 0 — confirm)
df_a = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_0.csv")

In [None]:
# Map the stations in df_a using their lon_x / lat_x coordinates
plurality_plot_x(df_a)

In [None]:
# Saved per-cluster table df_1.csv (presumably cluster 1 — confirm)
df_b = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_1.csv")

In [None]:
# Map the stations in df_b using their lon_x / lat_x coordinates
plurality_plot_x(df_b)

In [None]:
# Saved per-cluster table df_2.csv (presumably cluster 2 — confirm)
df_c = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_2.csv")

In [None]:
# Map the stations in df_c using their lon_x / lat_x coordinates
plurality_plot_x(df_c)

In [None]:
# Saved per-cluster table df_3.csv (presumably cluster 3 — confirm)
df_d = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_3.csv")

In [None]:
# Map the stations in df_d using their lon_x / lat_x coordinates
plurality_plot_x(df_d)

In [None]:
# Saved per-cluster table df_4.csv (presumably cluster 4 — confirm)
df_e = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_4.csv")

In [None]:
# Map the stations in df_e using their lon_x / lat_x coordinates
plurality_plot_x(df_e)

In [None]:
# Saved per-cluster table df_5.csv (presumably cluster 5 — confirm)
df_f = pd.read_csv("/home/aevans/landtype/data/clean_cats/ok/df_5.csv")

In [None]:
# Map the stations in df_f using their lon_x / lat_x coordinates
plurality_plot_x(df_f)

In [None]:
# df_new_cats = pd.concat([df_a, df_b, df_c, df_d, df_e, df_f])
# df_new_cats.to_csv('/home/aevans/landtype/df_new_cats_ok.csv')

# Let's Explain the Clusters

In [None]:
from kmeans_interp.kmeans_feature_imp import KMeansInterp

# Refit an interpretable K-Means with as many clusters as sortAll currently
# holds, so per-cluster feature importances can be read off the model.
# NOTE(review): this model is fitted on the unscaled features_all, whereas
# kmeans_all above was fitted on the min-max scaled features_norm_a — confirm
# that the difference is intended.
kms = KMeansInterp(
    n_clusters=len(sortAll["Cluster"].unique()),
    random_state=1,
    ordered_feature_names=features_all.keys(),
    feature_importance_method="wcss_min",
).fit(features_all)
# Use a dedicated name so the notebook-level `labels` DataFrame from the
# land-cover clustering is not overwritten.
labels_interp = kms.labels_
# sortAll is ordered by station while the labels follow features_all row order,
# so align on the row index instead of assigning by position.
sortAll["Cluster"] = pd.Series(labels_interp, index=features_all.index)

In [None]:
# Long-format table of the fitted centroids: one row per (cluster, feature)
# pair, holding the centroid coordinate and the cluster it belongs to.
centroids = kms.cluster_centers_
num_clusters = kms.n_clusters
n_features = kms.n_features_in_
centroids_labels = pd.DataFrame(
    {
        # Centroid rows laid end to end, cluster after cluster.
        "centroid_dim": np.asarray(centroids).ravel(),
        # Each cluster id repeated once per feature, matching the layout above.
        "cluster_label": np.repeat(np.arange(num_clusters), n_features),
    }
)

In [None]:
# Centroid coordinates of cluster 0
centroids_labels[centroids_labels["cluster_label"] == 0]["centroid_dim"]

In [None]:
# Feature-importance entries for cluster 1, all but the last one
kms.feature_importances_[1][:-1]

In [None]:
# One bar chart per cluster showing the first 15 (feature, weight) entries of
# that cluster's feature-importance list.
for cluster_label, feature_weights in kms.feature_importances_.items():
    top_weights = pd.DataFrame(feature_weights[:15], columns=["Feature", "Weight"])

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.barplot(data=top_weights, x="Feature", y="Weight")
    plt.xticks(rotation=-45, ha="left")
    ax.tick_params(axis="both", which="major", labelsize=22)
    ax.set_title(
        f"Highest Weight Features in Cluster {cluster_label}", fontsize="xx-large"
    )
    ax.set_xlabel("Feature", fontsize=18)
    ax.set_ylabel("Weight", fontsize=18)

    plt.show()

    # Blank lines between consecutive charts in the cell output.
    print("\n\n")