In [385]:
%load_ext autoreload
%autoreload 2
import sys

# Make the parent directory importable so the `src` package next to this
# notebook can be imported without installing it (no setup.py, no
# docker/singularity image). Keeping the logic in `src` leaves the notebook
# for visualizations only, makes the code easier to test, and reduces the
# hidden state that builds up when cells are run out of order.
# NOTE(review): this append is unconditional, so every re-run of the cell adds
# another ".." entry to sys.path. `%load_ext` above also prints an
# "already loaded" notice on re-run; the notice itself suggests `%reload_ext`.
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [386]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.patches as mpatches

# import sklearn.cluster.hierarchical as hclust
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import silhouette_score

from src import most_recent_mesonet_data
from src import most_recent_mesonet_time
from src import landtype_describe
from src import get_represents

from src.plotting_scripts import landtype
from src.plotting_scripts import stackplot
from src.plotting_scripts import rose_plot
from src.plotting_scripts import stacks
from src.plotting_scripts import stat_scatterplot

import os
import pandas as pd
import cartopy.crs as crs
import cartopy.feature as cfeature
from scipy.stats import skew
import statistics

In [387]:
def format_df(df):
    """Expand a value/count table into one row per individual value.

    Args:
    - df: A pandas DataFrame with columns "Count" and "Value". Each "Count" is
      cast to an integer and gives the number of times that row's "Value"
      should appear in the output. Counts of zero (or below) contribute no rows.
      The index of `df` is irrelevant; rows are processed in their stored order.

    Returns:
    - A new pandas DataFrame with a single column "Value" containing the
      individual values, on a fresh 0..n-1 index.

    Example:
    input DataFrame:

    | Count | Value |
    |-------|-------|
    |   2   |  10   |
    |   3   |  20   |
    |   1   |  30   |

    output DataFrame:

    | Value |
    |-------|
    |  10   |
    |  10   |
    |  20   |
    |  20   |
    |  20   |
    |  30   |
    """
    # Nothing to expand: hand back an empty "Value" column.
    if len(df) == 0:
        return pd.DataFrame({"Value": []})

    # Work on plain arrays so the result never depends on the index labels of
    # `df` (a filtered or re-indexed frame must expand the same way).
    # Negative counts are clipped to zero because np.repeat rejects them.
    counts = df["Count"].astype(int).clip(lower=0).to_numpy()
    values = df["Value"].to_numpy()

    # np.repeat repeats values[i] counts[i] times, preserving row order.
    return pd.DataFrame({"Value": np.repeat(values, counts)})

In [388]:
def main(clim_div_int, climate_division, data_root="/home/aevans/nwp_bias/src"):
    """Build a normalized per-station score table for one climate division.

    Station-level CSVs (NYSM categories, land use/land cover, elevation,
    aspect/slope) are combined with the region-level CSVs of the requested
    climate division. The LULC, aspect/slope and elevation "sums" columns are
    added together and divided by their standard deviation.

    Args:
    - clim_div_int: value matched against the "climate_division" column to
      select the stations of interest.
    - climate_division: file-name prefix of the regional CSVs, i.e.
      "<climate_division>_lulc.csv", "<climate_division>_asp_slope.csv" and
      "<climate_division>_elevs.csv".
    - data_root: directory holding the "landtype/data" and "correlation/data"
      sub-directories. Defaults to the location the notebook was written
      against, so existing two-argument calls behave as before.

    Returns:
    - A pandas DataFrame with columns "sums" (combined score scaled by its
      standard deviation) and "station" (station ids of the selected division).
    """
    landtype_dir = os.path.join(data_root, "landtype", "data")
    corr_dir = os.path.join(data_root, "correlation", "data")
    geoinfo_dir = os.path.join(landtype_dir, "org_cats_geoinfo")

    # --- station-level tables -------------------------------------------------
    nysm_cats_df = pd.read_csv(os.path.join(landtype_dir, "nysm.csv"))
    # LULC
    lulc_df = pd.read_csv(os.path.join(corr_dir, "nlcd_nam.csv"))
    # elevation
    elev_df = pd.read_csv(os.path.join(corr_dir, "elev_nam.csv"))
    # aspect/slope
    asp_slop_df = pd.read_csv(os.path.join(corr_dir, "aspect_nam.csv"))

    # --- region-level tables for the requested climate division ---------------
    nlcd_df = pd.read_csv(os.path.join(geoinfo_dir, f"{climate_division}_lulc.csv"))
    nlcd_df = nlcd_df.drop(columns=["OID_", "Red", "Blue", "Green", "Opacity"])
    aspect_df = pd.read_csv(
        os.path.join(geoinfo_dir, f"{climate_division}_asp_slope.csv")
    )
    elevation_df = pd.read_csv(
        os.path.join(geoinfo_dir, f"{climate_division}_elevs.csv")
    )
    # expand the (Value, Count) table into one row per individual value
    elevation_df = format_df(elevation_df)

    # Share of each category in the region, in percent. Series.sum() skips
    # missing counts, so a single NaN no longer turns every percentage into NaN.
    df_y = aspect_df.assign(
        Percentage=aspect_df["Count"] / aspect_df["Count"].sum() * 100
    )
    df_x = nlcd_df.assign(Percentage=nlcd_df["Count"] / nlcd_df["Count"].sum() * 100)
    mean = elevation_df["Value"].mean()

    # --- stations belonging to the requested climate division -----------------
    # The four station tables are placed side by side (row-aligned on index).
    clim1_df = pd.concat([nysm_cats_df, lulc_df, elev_df, asp_slop_df], axis=1)
    clim1_df = clim1_df[clim1_df["climate_division"] == clim_div_int]
    sites = clim1_df["stid"].tolist()

    # LULC
    print("LULC")
    stack_df = lulc_df[lulc_df["station"].isin(clim1_df["stid"])]
    # get_represent is defined in src.get_represents; its result is expected to
    # carry a "sums" column, which is read in the final evaluation below.
    stack_df = get_represents.get_represent(stack_df, df_x)

    # Aspect/Slope
    print("Aspect/Slope")
    stack_df1 = asp_slop_df[asp_slop_df["station"].isin(clim1_df["stid"])]
    stack_df1 = get_represents.get_represent(stack_df1, df_y)

    # Elevation
    print("Elevation")
    stack_df2 = elev_df[elev_df["station"].isin(clim1_df["stid"])]
    stack_df2 = stack_df2.drop(
        columns=["Unnamed: 0", "station", "med_dist", "lat", "lon", "variance", "std"]
    )
    # Regional mean minus station elevation, scaled by the spread of the
    # station elevations.
    # NOTE(review): the sign is (region - station); confirm this is intended.
    stack_df2["elev"] = (mean - stack_df2["elev"]) / stack_df2["elev"].std()
    # Row-wise total of every remaining column, then scaled by its own std.
    stack_df2["sums"] = stack_df2.sum(axis=1)
    stack_df2["sums"] = (stack_df2["sums"]) / stack_df2["sums"].std()
    stack_df2["station"] = sites

    # final eval: the three partial scores are added index-aligned and rescaled
    df = pd.DataFrame()
    df["sums"] = stack_df["sums"] + stack_df1["sums"] + stack_df2["sums"]
    df["sums"] = df["sums"] / df["sums"].std()
    df["station"] = sites

    return df

In [389]:
# Score the stations of climate division 1, using the "west_plat" regional CSVs.
df = main(1, "west_plat")

LULC
Aspect/Slope
Elevation


In [390]:
# Display the resulting per-station table ("sums" and "station" columns).
df

Unnamed: 0,sums,station
0,0.339027,ADDI
6,1.22396,BELM
28,-0.044861,COHO
34,0.303897,DELE
43,2.501506,ELMI
51,-0.39956,GROV
55,-0.781571,HART
73,1.138078,OLEA
85,1.07555,RAND
