In [None]:
%load_ext autoreload
%autoreload 2
import sys

# Make the sibling `src` package importable by putting the parent directory on
# sys.path, instead of installing it with setup.py or building a
# Docker/Singularity image.
# Keeping the logic in `src` leaves the notebook for visualisations, makes the
# code easier to test, and reduces hidden state from cells run out of order.
# NOTE: ".." is relative, so it is resolved against the kernel's working directory.
sys.path.append("..")

In [None]:
from numpy import unique
from numpy import where
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AffinityPropagation
from src import most_recent_mesonet_data
from src import most_recent_mesonet_time
from src import landtype_describe
from src.plotting_scripts import landtype

import seaborn as sns
import os
import pandas as pd
import numpy as np
import cartopy.crs as crs
import cartopy.feature as cfeature

In [None]:
# Input locations. These are absolute paths on the author's machine and must be
# edited before the notebook is run anywhere else.
# Directory handed to the `src` loaders below to build the NY Mesonet dataframe
# (from which station lons and lats are taken).
ny_mesonet_data_path = "/home/aevans/nysm/archive/nysm/netcdf/proc/"
# Geo-info CSV, presumably for the Oklahoma Mesonet (inferred from the name).
# NOTE(review): not referenced by any cell shown in this notebook.
ok_mesonet_data_path = "/home/aevans/landtype/geoinfo.csv"

In [None]:
# Create the NY Mesonet dataframe from the archive directory, then pass it
# through most_recent_time(). Both helpers live in `src`; presumably the result
# holds the latest observation per station — TODO confirm against the helpers.
# Only the `lon` and `lat` columns of this frame are used below.
ny_df = most_recent_mesonet_data.current_time_mesonet_df(ny_mesonet_data_path)
ny_df = most_recent_mesonet_time.most_recent_time(ny_df, ny_mesonet_data_path)

In [None]:
# Station coordinates as plain lists. They are later assigned positionally to
# frames sorted with sort_values(["station"]), so the row order of ny_df has to
# match that sorted station order — NOTE(review): nothing here checks this.
ny_df_lons = ny_df["lon"].to_list()
ny_df_lats = ny_df["lat"].to_list()

In [None]:
def plurality_plot(df):
    """Draw the stations in ``df`` on a PlateCarree map, coloured per point.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``lat`` and ``lon`` columns for the point positions and a
        ``color`` column whose values are passed to ``scatter`` as colours.

    Returns
    -------
    None
        The figure is created through pyplot and left for the notebook to show.
    """
    projPC = crs.PlateCarree()

    # Map extent: the bounding box of the stations padded by one degree.
    latN = df["lat"].max() + 1
    latS = df["lat"].min() - 1
    lonE = df["lon"].max() + 1
    lonW = df["lon"].min() - 1

    fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={"projection": projPC})
    ax.set_extent([lonW, lonE, latS, latN], crs=projPC)

    # Background layers.
    ax.add_feature(cfeature.LAND)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.BORDERS, linestyle="--")
    ax.add_feature(cfeature.LAKES, alpha=0.5)
    ax.add_feature(cfeature.STATES)

    # Label switches live on the Gridliner returned by gridlines(); setting
    # them as attributes on the Axes has no effect.
    # (`top_labels` / `right_labels` are the cartopy >= 0.18 names.)
    gridliner = ax.gridlines(
        crs=projPC,
        draw_labels=True,
        linewidth=2,
        color="black",
        alpha=0.5,
        linestyle="--",
    )
    gridliner.top_labels = False
    gridliner.right_labels = False

    ax.scatter(
        x=df["lon"],
        y=df["lat"],
        c=df["color"],
        s=40,
        marker="o",
        transform=projPC,
    )
    ax.set_title("Mesonet Site Groups", size=16)
    ax.set_xlabel("Longitude", size=14)
    ax.set_ylabel("Latitude", size=14)
    ax.tick_params(axis="x", labelsize=12)
    ax.tick_params(axis="y", labelsize=12)
    ax.grid()

In [None]:
# Land-use table for the NY stations. Judging by the path, these are average
# LULC values within a 10 km buffer — TODO confirm. One `station` column plus
# the columns used for clustering below.
df = pd.read_csv("/home/aevans/landtype/data/buffer_10_km/avg_lulc_ny_10.csv")

In [None]:
# Clustering inputs: everything except the station id, the saved index column
# and the three *mode_class label columns.
features = df.drop(
    columns=[
        "station",
        "Unnamed: 0",
        "firstmode_class",
        "secondmode_class",
        "thirdmode_class",
    ]
)

In [None]:
# Pairwise correlation matrix of the land-use features (input to the heat map).
cor = features.corr()

In [None]:
# Heat map of the land-use feature correlations, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor, square=True, ax=ax)
plt.show()

In [None]:
# Standardise the land-use features before clustering.
# NOTE: a later cell adds a "Cluster" column to `features`; re-running this cell
# after that point would feed the cluster labels back in as a feature.
scaler = StandardScaler()
X_std = scaler.fit_transform(features)

In [None]:
# Affinity propagation estimator for the land-use features; the number of
# clusters is read from the fitted model rather than set here.
# NOTE(review): no random_state is passed — confirm whether reproducible labels
# are required. This same object is refitted later on the combined features.
clt = AffinityPropagation(damping=0.5, max_iter=500, affinity="euclidean")

In [None]:
# Fit on the standardised land-use features and report the number of cluster
# centres found. `model` is rebound by every later fit cell in this notebook.
model = clt.fit(X_std)
n_clusters_ = len(model.cluster_centers_indices_)
print("Number of Clusters: ", n_clusters_)

In [None]:
# Take the labels from the fit performed in the previous cell. Calling
# fit_predict() here would refit the estimator a second time, and with no
# random_state set that second fit is not guaranteed to reproduce the solution
# whose cluster count was just printed.
clusters = pd.DataFrame(model.labels_)
# Positional assignment: one label per row of `features`.
features["Cluster"] = model.labels_

In [None]:
# Station names side by side with their features and cluster label (both share
# the row index of `df`); a column labelled 0, if any, is renamed to "Cluster".
stations = pd.concat([df["station"], features], axis=1).rename(
    columns={0: "Cluster"}
)
sortstations = stations.sort_values(by="station")

# Coordinates are attached by position, i.e. this assumes ny_df lists the
# stations in the same order as the sorted frame — NOTE(review): confirm.
sortstations["lon"] = ny_df_lons
sortstations["lat"] = ny_df_lats

In [None]:
# Fixed palette: cluster label i is drawn in colors[i].
colors = [
    "black", "blue", "white", "coral", "pink", "red", "magenta", "gray",
    "lime", "forestgreen", "green", "olive", "brown", "slategray",
    "darkorchid", "plum", "indigo", "purple", "yellow", "gold", "orange",
    "cyan",
]
legend = list(range(len(colors)))

# label -> colour lookup; labels beyond the palette have no entry and map to NaN.
colordict = dict(zip(legend, colors))

sortstations["color"] = sortstations["Cluster"].map(colordict)

In [None]:
# Map the stations coloured by their land-use cluster.
plurality_plot(sortstations)

In [None]:
# Cluster label per station for the land-use clustering.
fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(
    sortstations["station"], sortstations["Cluster"], c=sortstations["Cluster"], s=50
)
ax.set_title("AffinityPropagation Clustering")
# The axes show station names and cluster labels, not generic features, so
# label them as such and rotate the station names to keep them legible.
ax.set_xlabel("Station")
ax.set_ylabel("Cluster")
ax.tick_params(axis="x", labelrotation=90)
plt.colorbar(scatter)
plt.show()

# Let's cluster on elevation

In [None]:
# Elevation table for the NY stations (has `station`, `lon` and `lat` columns
# next to the clustering inputs); preview the first rows.
elev_df = pd.read_csv("/home/aevans/landtype/elevation/data/NY/elev/ny30_df.csv")
elev_df.head()

In [None]:
# Clustering inputs: everything except the station id, the saved index column
# and the coordinates.
features_elev = elev_df.drop(columns=["station", "Unnamed: 0", "lon", "lat"])

In [None]:
# Pairwise correlation matrix of the elevation features.
cor_e = features_elev.corr()

In [None]:
# Heat map of the elevation feature correlations, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor_e, square=True, ax=ax)
plt.show()

In [None]:
# Standardise the elevation features (rebinds `scaler` to a fresh instance).
# NOTE: `features_elev` gains a "Cluster" column further down; re-running this
# cell afterwards would include the labels as a feature.
scaler = StandardScaler()
X_std_e = scaler.fit_transform(features_elev)

In [None]:
# Separate affinity propagation estimator for the elevation features, with the
# same settings as the land-use one. NOTE(review): no random_state is passed.
clt_e = AffinityPropagation(damping=0.5, max_iter=500, affinity="euclidean")

In [None]:
# Fit on the standardised elevation features and report the number of cluster
# centres found. This rebinds `model` and `n_clusters_` from the previous section.
model = clt_e.fit(X_std_e)
n_clusters_ = len(model.cluster_centers_indices_)
print("Number of Clusters: ", n_clusters_)

In [None]:
# Take the labels from the fit performed in the previous cell. Calling
# fit_predict() here would refit the estimator a second time, and with no
# random_state set that second fit is not guaranteed to reproduce the solution
# whose cluster count was just printed.
clusters_e = pd.DataFrame(model.labels_)
# Positional assignment: one label per row of `features_elev`.
features_elev["Cluster"] = model.labels_

In [None]:
# Preview the elevation features together with the new "Cluster" column.
features_elev.head()

In [None]:
# Station names side by side with their elevation features and cluster label
# (both share the row index of `elev_df`); a column labelled 0, if any, is
# renamed to "Cluster".
stations_elev = pd.concat([elev_df["station"], features_elev], axis=1).rename(
    columns={0: "Cluster"}
)
sortstations_e = stations_elev.sort_values(by="station")

# Coordinates are attached by position, i.e. this assumes ny_df lists the
# stations in the same order as the sorted frame — NOTE(review): confirm.
sortstations_e["lon"] = ny_df_lons
sortstations_e["lat"] = ny_df_lats

In [None]:
# Fixed palette: cluster label i is drawn in colors[i].
colors = [
    "black", "blue", "white", "coral", "pink", "red", "magenta", "gray",
    "lime", "forestgreen", "green", "olive", "brown", "slategray",
    "darkorchid", "plum", "indigo", "purple", "yellow", "gold", "orange",
    "cyan",
]
legend = list(range(len(colors)))

# label -> colour lookup; labels beyond the palette have no entry and map to NaN.
colordict = dict(zip(legend, colors))

sortstations_e["color"] = sortstations_e["Cluster"].map(colordict)

In [None]:
# Map the stations coloured by their elevation cluster.
plurality_plot(sortstations_e)

In [None]:
# Cluster label per station for the elevation clustering.
fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(
    sortstations_e["station"],
    sortstations_e["Cluster"],
    c=sortstations_e["Cluster"],
    s=50,
)
ax.set_title("AffinityPropagation Clustering")
# The axes show station names and cluster labels, not generic features, so
# label them as such and rotate the station names to keep them legible.
ax.set_xlabel("Station")
ax.set_ylabel("Cluster")
ax.tick_params(axis="x", labelrotation=90)
plt.colorbar(scatter)
plt.show()

# Let's cluster on slope

In [None]:
# Slope table for the NY stations (the station column is capitalised here:
# "Station"); preview the first rows.
slope_df = pd.read_csv("/home/aevans/landtype/elevation/data/NY/elev/slope30_ny_df.csv")
slope_df.head()

In [None]:
# Clustering inputs: everything except the station id, the saved index column
# and the Direction / Slope / color columns.
features_slope = slope_df.drop(
    columns=["Station", "Unnamed: 0", "Direction", "Slope", "color"]
)

In [None]:
# Pairwise correlation matrix of the slope features.
cor_s = features_slope.corr()

In [None]:
# Heat map of the slope feature correlations, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor_s, square=True, ax=ax)
plt.show()

In [None]:
# Standardise the slope features (rebinds `scaler` to a fresh instance).
# NOTE: `features_slope` gains a "Cluster" column further down; re-running this
# cell afterwards would include the labels as a feature.
scaler = StandardScaler()
X_std_s = scaler.fit_transform(features_slope)

In [None]:
# Separate affinity propagation estimator for the slope features, with the
# same settings as the other sections. NOTE(review): no random_state is passed.
clt_s = AffinityPropagation(damping=0.5, max_iter=500, affinity="euclidean")

In [None]:
# Fit on the standardised slope features and report the number of cluster
# centres found. This rebinds `model` and `n_clusters_` from the previous section.
model = clt_s.fit(X_std_s)
n_clusters_ = len(model.cluster_centers_indices_)
print("Number of Clusters: ", n_clusters_)

In [None]:
# Take the labels from the fit performed in the previous cell. Calling
# fit_predict() here would refit the estimator a second time, and with no
# random_state set that second fit is not guaranteed to reproduce the solution
# whose cluster count was just printed.
clusters_s = pd.DataFrame(model.labels_)
# Positional assignment: one label per row of `features_slope`.
features_slope["Cluster"] = model.labels_

In [None]:
# Preview the slope features together with the new "Cluster" column.
features_slope.head()

In [None]:
# Station names side by side with their slope features and cluster label; the
# capitalised "Station" column is renamed to match the other tables.
stations_slope = slope_df["Station"]
stations_slope = pd.concat((stations_slope, features_slope), axis=1)
stations_slope = stations_slope.rename({"Station": "station", 0: "Cluster"}, axis=1)
sortstations_s = stations_slope.sort_values(["station"])

# Attach the coordinates to the slope frame. The original wrote them to
# `sortstations_e` (copied from the elevation section), which left the frame
# plotted and merged below without coordinates from this cell.
# Assignment is positional, so this assumes ny_df lists the stations in the same
# order as the sorted frame — NOTE(review): confirm.
sortstations_s["lon"] = ny_df_lons
sortstations_s["lat"] = ny_df_lats

In [None]:
# Preview the station-sorted slope table.
sortstations_s.head()

In [None]:
# Fixed palette: cluster label i is drawn in colors[i].
colors = [
    "black", "blue", "white", "coral", "pink", "red", "magenta", "gray",
    "lime", "forestgreen", "green", "olive", "brown", "slategray",
    "darkorchid", "plum", "indigo", "purple", "yellow", "gold", "orange",
    "cyan",
]
legend = list(range(len(colors)))

# label -> colour lookup; labels beyond the palette have no entry and map to NaN.
colordict = dict(zip(legend, colors))

sortstations_s["color"] = sortstations_s["Cluster"].map(colordict)

In [None]:
# Map the stations coloured by their slope cluster (needs `lon`, `lat` and
# `color` columns on the frame).
plurality_plot(sortstations_s)

In [None]:
# Cluster label per station for the slope clustering.
fig, ax = plt.subplots(figsize=(10, 10))
scatter = ax.scatter(
    sortstations_s["station"],
    sortstations_s["Cluster"],
    c=sortstations_s["Cluster"],
    s=50,
)
ax.set_title("AffinityPropagation Clustering")
# The axes show station names and cluster labels, not generic features, so
# label them as such and rotate the station names to keep them legible.
ax.set_xlabel("Station")
ax.set_ylabel("Cluster")
ax.tick_params(axis="x", labelrotation=90)
plt.colorbar(scatter)
plt.show()

# Let's try them all together

In [None]:
# Station names from the land-use table.
# NOTE(review): `stations_ls` is not referenced by any cell shown in this notebook.
stations_ls = df["station"]

In [None]:
# Preview the slope table again before merging (same call as an earlier cell).
sortstations_s.head()

In [None]:
# Join the land-use and elevation tables on the station name; `daddy` is kept
# because the export loop further down merges against it.
daddy = sortstations.merge(sortstations_e, on="station")

# Add the slope table, then strip everything that is not a clustering input:
# the station id, the earlier cluster labels, coordinates, plot colours and the
# sum_total / Sum_total / Divide / Count columns. Every listed column must
# exist, otherwise drop() raises KeyError.
dad_df = daddy.merge(sortstations_s, on="station").drop(
    columns=[
        "station",
        "Cluster_x",
        "Cluster",
        "Cluster_y",
        "lon",
        "lat",
        "lon_x",
        "lat_x",
        "lon_y",
        "lat_y",
        "color",
        "color_x",
        "color_y",
        "sum_total",
        "Sum_total",
        "Divide",
        "Count",
    ]
)
dad_df

In [None]:
# Pairwise correlation matrix of the combined feature table.
cor_d = dad_df.corr()

In [None]:
# Heat map of the combined feature correlations, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cor_d, square=True, ax=ax)
plt.show()

In [None]:
# Standardise the combined features (rebinds `scaler` to a fresh instance).
# NOTE: `dad_df` gains a "Cluster" column further down; re-running this cell
# afterwards would include the labels as a feature.
scaler = StandardScaler()
X_std_d = scaler.fit_transform(dad_df)

In [None]:
# Fit on the standardised combined features and report the number of cluster
# centres found.
# NOTE: this refits `clt`, the estimator object created for the land-use
# section, instead of constructing a new one as the other sections do.
model = clt.fit(X_std_d)
n_clusters_ = len(model.cluster_centers_indices_)
print("Number of Clusters: ", n_clusters_)

In [None]:
# Take the labels from the fit performed in the previous cell. Calling
# fit_predict() here would refit the estimator a second time, and with no
# random_state set that second fit is not guaranteed to reproduce the solution
# whose cluster count was just printed.
clusters_d = pd.DataFrame(model.labels_)
# Positional assignment: one label per row of `dad_df`.
dad_df["Cluster"] = model.labels_

In [None]:
# `dad_df` no longer has its station column, and its rows follow the order of
# the merged, station-sorted frames rather than the row order of `df`.
# Concatenating df["station"] by index therefore pairs features with the wrong
# station unless the CSV happens to be sorted already. Rebuild the keys with
# the same merge that produced `dad_df`, so each row gets its own station.
stations_d = pd.merge(daddy, sortstations_s, on="station")["station"]
stations_d = pd.concat((stations_d, dad_df), axis=1)
stations_d = stations_d.rename(columns={0: "Cluster"})
sortstations_d = stations_d.sort_values(["station"])

# Coordinates are attached by position, i.e. this assumes ny_df lists the
# stations in the same order as the sorted frame — NOTE(review): confirm.
sortstations_d["lon"] = ny_df_lons
sortstations_d["lat"] = ny_df_lats

In [None]:
# Preview the combined table with station, features, cluster and coordinates.
sortstations_d.head()

In [None]:
# Fixed palette: cluster label i is drawn in colors[i].
colors = [
    "black", "blue", "white", "coral", "pink", "red", "magenta", "gray",
    "lime", "forestgreen", "green", "olive", "brown", "slategray",
    "darkorchid", "plum", "indigo", "purple", "yellow", "gold", "orange",
    "cyan",
]
legend = list(range(len(colors)))

# label -> colour lookup; labels beyond the palette have no entry and map to NaN.
colordict = dict(zip(legend, colors))

sortstations_d["color"] = sortstations_d["Cluster"].map(colordict)

In [None]:
# Map the stations coloured by their cluster from the combined features.
plurality_plot(sortstations_d)

In [None]:
# Export one CSV per cluster label that actually occurs in the combined result.
# Iterating over the labels present, rather than a fixed range of 17, avoids
# writing header-only files for labels that were never assigned and no longer
# skips labels of 17 and above.
# NOTE: files left in the directory by earlier runs are not removed here.
for i in sorted(sortstations_d["Cluster"].dropna().unique()):
    slopes_cat = sortstations_d.loc[sortstations_d["Cluster"] == i]
    # Join this cluster's stations back onto the merged land-use/elevation table.
    slopes_cat = pd.merge(slopes_cat, daddy, on="station")
    # int() keeps the file name as df_<n>.csv even if the column dtype is float.
    slopes_cat.to_csv(f"/home/aevans/landtype/data/clean_cats/ny/aff/df_{int(i)}.csv")

In [None]:
# Every entry in the export directory, in sorted name order. This includes any
# files left over from earlier runs, not only those written by the loop above.
dirs = sorted(os.listdir("/home/aevans/landtype/data/clean_cats/ny/aff/"))

In [None]:
# Draw one map per exported cluster file.
# NOTE: this rebinds `df`, which until here held the land-use table; cells that
# read `df` must not be re-run after this one without reloading it.
for fname in dirs:
    df = pd.read_csv(f"/home/aevans/landtype/data/clean_cats/ny/aff/{fname}")
    plurality_plot(df)