In [None]:
import pandas as pd
import os
from collections import Counter
from functools import reduce
from operator import add
import numpy as np
import matplotlib.pyplot as plt

# Lift the column limit so displayed DataFrames show every column.
pd.set_option("display.max_columns", None)

In [None]:
# Directory whose entries are listed and read with pd.read_parquet in the cells below.
base_path = "/media/muskrat/T7 Shield/eco_data/v4/occurences/step_4/processing/step_1/spatial_mapping/soil/split_retry_2"

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the chunks are
# always read (and therefore concatenated) in the same order.
files = sorted(os.listdir(base_path))

In [None]:
# Read every chunk first and concatenate once. Calling pd.concat inside the
# loop re-copies all previously accumulated rows on each iteration.
chunk_frames = [pd.read_parquet(os.path.join(base_path, file)) for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory listing is empty.
combined = (
    pd.concat(chunk_frames, ignore_index=True, axis=0)
    if chunk_frames
    else pd.DataFrame()
)

In [None]:
# Display the concatenated frame.
combined

In [None]:
# Load chunk_0 on its own. The path is derived from base_path instead of
# repeating the absolute directory, so the two cannot drift apart.
df_path = os.path.join(base_path, "chunk_0.parquet")

df = pd.read_parquet(df_path)

In [None]:
# Display the single-chunk frame.
df

In [None]:
# For each soil column of the single-chunk frame, report the largest per-row
# length found in that column.
for label, column in (
    ("id", "soil_id"),
    ("specific", "specific_soil_name"),
    ("dominant", "dominant_soil_name"),
):
    longest = df[column].str.len().max()
    print(f"max {label} length is {longest}")

In [None]:
# Replace each row's soil values with a plain dict of value -> occurrence count.
for column in ("soil_id", "specific_soil_name", "dominant_soil_name"):
    df[column] = df[column].apply(lambda values: dict(Counter(values)))

In [None]:
# Display the single-chunk frame after the count conversion.
df

In [None]:
# For each soil column of the single-chunk frame, report the largest number of
# distinct keys held by any row's dict.
for label, column in (
    ("id", "soil_id"),
    ("specific", "specific_soil_name"),
    ("dominant", "dominant_soil_name"),
):
    most_keys = df[column].map(len).max()
    print(f"max {label} keys is {most_keys}")

In [None]:
# keep=False flags every occurrence of a repeated scientific_name, not only
# the second and later ones.
is_repeated_name = df.duplicated(subset=["scientific_name"], keep=False)
duplicates = df[is_repeated_name]

duplicates

In [None]:
# Replace each row's soil values in the concatenated frame with a plain dict of
# value -> occurrence count.
for column in ("soil_id", "specific_soil_name", "dominant_soil_name"):
    combined[column] = combined[column].apply(lambda values: dict(Counter(values)))

In [None]:
# Display the concatenated frame after the count conversion.
combined

In [None]:
# For each soil column of the concatenated frame, report the largest number of
# distinct keys held by any row's dict.
for label, column in (
    ("id", "soil_id"),
    ("specific", "specific_soil_name"),
    ("dominant", "dominant_soil_name"),
):
    most_keys = combined[column].map(len).max()
    print(f"max {label} keys is {most_keys}")

In [None]:
def _merge_counts(count_maps):
    """Add up one group's per-row counts into a single Counter.

    Args:
        count_maps: Iterable of per-row entries. Each entry is either a mapping
            of key -> count or an iterable of raw keys; Counter.update accepts
            both, just as Counter(...) did in the previous reduce-based version.

    Returns:
        A Counter holding the summed count of every key seen in the group.
    """
    merged = Counter()
    for counts in count_maps:
        # Accumulate in place. reduce(add, ...) created a brand-new Counter for
        # every row, re-copying everything merged so far. Unlike `+`, update
        # does not discard non-positive counts; the counts produced by the
        # Counter tallies in the earlier cells are all positive.
        merged.update(counts)
    return merged


# One output row per distinct combination of the taxonomy columns;
# dropna=False keeps combinations that contain missing values.
combined_grouped = combined.groupby(
    ["scientific_name", "kingdom", "phylum", "class", "order", "family", "genus"],
    as_index=False,
    dropna=False,
)[["soil_id", "specific_soil_name", "dominant_soil_name"]].agg(_merge_counts)

In [None]:
# Display the grouped frame.
combined_grouped

In [None]:
# For each soil column of the grouped frame, report the largest number of
# distinct keys held by any row's merged counts.
for label, column in (
    ("id", "soil_id"),
    ("specific", "specific_soil_name"),
    ("dominant", "dominant_soil_name"),
):
    most_keys = combined_grouped[column].map(len).max()
    print(f"max {label} keys is {most_keys}")

In [None]:
# Store the aggregated counts of every soil column as plain dicts.
for column in ("soil_id", "specific_soil_name", "dominant_soil_name"):
    combined_grouped[column] = combined_grouped[column].apply(
        lambda counts: dict(Counter(counts))
    )

combined_grouped

In [None]:
# Rows of the grouped frame whose scientific_name still appears more than once;
# keep=False flags every occurrence.
is_repeated_name = combined_grouped.duplicated(subset=["scientific_name"], keep=False)
duplicates = combined_grouped[is_repeated_name]

duplicates

In [None]:
# Remove the `combined` name from the namespace; the cells below work on combined_grouped.
del combined

In [None]:
# Write the grouped frame as a JSON list of records, leaving non-ASCII
# characters unescaped.
soil_merge_json_path = "/media/muskrat/T7 Shield/eco_data/v4/occurences/step_4/processing/step_2/merge/soil_only_merge.json"
combined_grouped.to_json(soil_merge_json_path, orient="records", force_ascii=False)
# Parquet alternative, currently disabled:
# combined_grouped.to_parquet(
#     "/media/muskrat/T7 Shield/eco_data/v4/occurences/step_4/processing/step_2/merge/soil_only_merge.parquet"
# )

In [None]:
# Bar chart of the dominant_soil_name counts for the row at position 18990:
# the dict keys go on the x axis and their counts on the y axis.
# NOTE(review): 18990 is hard-coded; presumably it is the row reported by the
# idxmax cell further down - confirm before reusing on other data.
dominant_counts = combined_grouped.iloc[18990]["dominant_soil_name"]

plt.bar(list(dominant_counts.keys()), list(dominant_counts.values()))
plt.title("dominant_soil_name counts (row 18990)")
plt.xlabel("dominant_soil_name")
plt.ylabel("count")
# Vertical tick labels keep long names from overlapping.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar chart of the specific_soil_name counts for the row at position 18990:
# the dict keys go on the x axis and their counts on the y axis.
specific_counts = combined_grouped.iloc[18990]["specific_soil_name"]

plt.bar(list(specific_counts.keys()), list(specific_counts.values()))
plt.title("specific_soil_name counts (row 18990)")
plt.xlabel("specific_soil_name")
plt.ylabel("count")
# Vertical tick labels keep long names from overlapping.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Bar chart of the soil_id counts for the row at position 18990:
# the dict keys go on the x axis and their counts on the y axis.
soil_id_counts = combined_grouped.iloc[18990]["soil_id"]

plt.bar(list(soil_id_counts.keys()), list(soil_id_counts.values()))
plt.title("soil_id counts (row 18990)")
plt.xlabel("soil_id")
plt.ylabel("count")
# Vertical tick labels keep the ids from overlapping.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Index label of the row whose soil_id dict holds the most distinct keys.
(
    combined_grouped["soil_id"]
    .map(len)
    .idxmax()
)

In [None]:
# Show the row at integer position 18990.
combined_grouped.iloc[18990]

In [None]:
# For the row at position 18990: what share of all its soil_id counts belongs
# to the single most frequent soil_id.
row_soil_id_counts = combined_grouped.iloc[18990]["soil_id"].values()

total = sum(row_soil_id_counts)
max_value = max(row_soil_id_counts)
percentage = (max_value / total) * 100

print(f"percentage is {percentage}")

In [None]:
# New column max_soil_id_percentage: per row, the largest soil_id count as a
# percentage of that row's total soil_id count. Only the soil_id column is
# needed, so it is mapped directly rather than applying over whole rows.
soil_id_counts = combined_grouped["soil_id"]
largest_soil_id = soil_id_counts.map(lambda counts: max(counts.values(), default=0))
total_soil_id = soil_id_counts.map(lambda counts: sum(counts.values()))

# Element-wise division: a row with an empty dict yields 0 / 0 -> NaN instead
# of the ValueError that max() raises on an empty sequence.
combined_grouped["max_soil_id_percentage"] = largest_soil_id / total_soil_id * 100

combined_grouped

In [None]:
# New column max_specific_percentage: per row, the largest specific_soil_name
# count as a percentage of that row's total specific_soil_name count. Only that
# column is needed, so it is mapped directly rather than applying over whole rows.
specific_counts = combined_grouped["specific_soil_name"]
largest_specific = specific_counts.map(lambda counts: max(counts.values(), default=0))
total_specific = specific_counts.map(lambda counts: sum(counts.values()))

# Element-wise division: a row with an empty dict yields 0 / 0 -> NaN instead
# of the ValueError that max() raises on an empty sequence.
combined_grouped["max_specific_percentage"] = largest_specific / total_specific * 100

combined_grouped

In [None]:
# Keep only the rows whose kingdom is Plantae or Fungi.
is_plant_or_fungus = combined_grouped["kingdom"].isin(["Plantae", "Fungi"])
plants_and_fungi = combined_grouped[is_plant_or_fungus]

plants_and_fungi

In [None]:
# Split plants_and_fungi into cohorts by each row's total soil_id count.
# The "> N" cohorts are nested (every row in `hundred` is also in `fifty`,
# `twenty` and `ten`); `under_ten` is the complement of `ten`.
soil_id_totals = plants_and_fungi["soil_id"].apply(
    lambda counts: sum(counts.values())
)

hundred = plants_and_fungi[soil_id_totals > 100]
fifty = plants_and_fungi[soil_id_totals > 50]
twenty = plants_and_fungi[soil_id_totals > 20]
ten = plants_and_fungi[soil_id_totals > 10]
under_ten = plants_and_fungi[soil_id_totals <= 10]

In [None]:
# Split plants_and_fungi into cohorts by each row's total specific_soil_name
# count. The "> N" cohorts are nested; `under_ten_specific` is the complement
# of `ten_specific`.
specific_totals = plants_and_fungi["specific_soil_name"].apply(
    lambda counts: sum(counts.values())
)

hundred_specific = plants_and_fungi[specific_totals > 100]
fifty_specific = plants_and_fungi[specific_totals > 50]
twenty_specific = plants_and_fungi[specific_totals > 20]
ten_specific = plants_and_fungi[specific_totals > 10]
under_ten_specific = plants_and_fungi[specific_totals <= 10]

In [None]:
# Histogram of max_specific_percentage for the `hundred` cohort.
# bins=10 asks for ten equal-width bins spanning the observed values; it does
# not fix the bin width at 10.
plt.hist(hundred["max_specific_percentage"], bins=10)
plt.title("hundred: max_specific_percentage")
plt.xlabel("max_specific_percentage")
plt.ylabel("number of rows")
plt.show()

In [None]:
# Histogram (ten equal-width bins) of max_soil_id_percentage for the `hundred` cohort.
plt.hist(hundred["max_soil_id_percentage"], bins=10)
plt.title("hundred: max_soil_id_percentage")
plt.xlabel("max_soil_id_percentage")
plt.ylabel("number of rows")
plt.show()

In [None]:
# One histogram (ten equal-width bins) per percentage column for the `fifty`
# cohort. Each figure is titled so the two can be told apart.
for column in ("max_specific_percentage", "max_soil_id_percentage"):
    plt.hist(fifty[column], bins=10)
    plt.title(f"fifty: {column}")
    plt.xlabel(column)
    plt.ylabel("number of rows")
    plt.show()

In [None]:
# One histogram (ten equal-width bins) per percentage column for the `twenty`
# cohort. Each figure is titled so the two can be told apart.
for column in ("max_specific_percentage", "max_soil_id_percentage"):
    plt.hist(twenty[column], bins=10)
    plt.title(f"twenty: {column}")
    plt.xlabel(column)
    plt.ylabel("number of rows")
    plt.show()

In [None]:
# One histogram (ten equal-width bins) per percentage column for the `ten`
# cohort. Each figure is titled so the two can be told apart.
for column in ("max_specific_percentage", "max_soil_id_percentage"):
    plt.hist(ten[column], bins=10)
    plt.title(f"ten: {column}")
    plt.xlabel(column)
    plt.ylabel("number of rows")
    plt.show()

In [None]:
# For each cohort, print the percentage of its rows whose
# max_soil_id_percentage lies below that cohort's cutoff
# (hundred: 45, fifty: 25, twenty: 30, ten: 30).
for cohort, cutoff in ((hundred, 45), (fifty, 25), (twenty, 30), (ten, 30)):
    below_cutoff = cohort[cohort["max_soil_id_percentage"] < cutoff]
    print(len(below_cutoff) / len(cohort) * 100)

In [None]:
# For each cohort, print the percentage of its rows whose
# max_specific_percentage lies below that cohort's cutoff
# (hundred: 40, fifty: 45, twenty: 50, ten: 50).
for cohort, cutoff in ((hundred, 40), (fifty, 45), (twenty, 50), (ten, 50)):
    below_cutoff = cohort[cohort["max_specific_percentage"] < cutoff]
    print(len(below_cutoff) / len(cohort) * 100)

In [None]:
# New column soil_id_key_count: number of distinct keys in each row's soil_id dict.
combined_grouped["soil_id_key_count"] = combined_grouped["soil_id"].map(len)

combined_grouped