# Import libraries

* Use just one year (e.g. 2019)
* Train on 70 %, test on 30 % of the data 
* Random forests (classification)

* two classes
* decrease class means > -10%
* no change class is between -5 and 5 %

In [2]:
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import plotly

# ML libraries 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# SHAP values
import shap
#shap.initjs()

# Load the modeling dataframe

In [4]:
# Load the dataframe from a file 
df = pd.read_csv('D:/Stenka_Cliwac/Topic_1/05_RESULTS/20230525_modeling_df/all/modeling_df.csv')

# Delete the columns named "x" and "y"
columns_to_delete = ['x', 'y']
df = df.drop(columns=columns_to_delete)

# Print the modified dataframe
print(df)

# Subset to year 2019 

df_2019 = df[df["Year_NDVI_anom"] == 2019] # 12676 rows, 20 columns 
print(df_2019.shape) # (12676, 20)

# Get the unique values in the "Year_NDVI_anom" column
unique_years = df_2019['Year_NDVI_anom'].unique()

# Print the unique years
print(unique_years) 

       NDVI_anomaly  agriculture_proximity      aspect  canopyheight  \
0          1.416254             127.341949  167.068985     19.195433   
1          1.416254             115.573837  169.534698     18.430872   
2         -5.224010             101.598724  168.056488     19.391434   
3         -2.815495             261.209290  156.305817     20.143400   
4         24.508228             158.589233  156.574768     18.864574   
...             ...                    ...         ...           ...   
52671    -11.722970             357.211300  136.366700     22.712750   
52672    -14.182640             366.751200  205.665100     23.685390   
52673    -14.234450              64.125180  165.034800     20.886400   
52674    -13.254150             204.868300  259.138500     21.125610   
52675    -12.797440             245.438700  170.808900     22.881620   

        elevation  forest_proximity     slope        TCD  water_proximity  \
0       82.999863         56.206688  6.481580  56.059963  

# Create new classes

In [7]:
# Define the thresholds and corresponding categories
thresholds = [-np.inf, -10, -5, 5, 10, np.inf]
categories = ["large_decrease", "small_decrease", "no_change", "small_increase", "large_increase"]

# Create a new column "NDVI_categories" based on the classification
df_2019["NDVI_categories"] = pd.cut(df_2019["NDVI_anomaly"], bins=thresholds, labels=categories, right=False)

# number per category

category_counts = df_2019["NDVI_categories"].value_counts()
print(category_counts)

#no_change         5392
#small_decrease    4603
#large_decrease    2573
#small_increase      82
#large_increase      26
#Name: NDVI_categories, dtype: int64

no_change         5392
small_decrease    4603
large_decrease    2573
small_increase      82
large_increase      26
Name: NDVI_categories, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Remove increase classes

* too few samples in the % increase categories
* plus, my study is about why the forest suffers...
* I am also removing the "small_decrease" class now.

In [8]:
# Create a boolean mask for rows to be removed
mask = (df_2019["NDVI_categories"] == "small_increase") | (df_2019["NDVI_categories"] == "large_increase") | (df_2019["NDVI_categories"] == "small_decrease")

# Apply the mask to the dataframe to remove the rows
df_filtered = df_2019[~mask]

# categories
print(df_filtered["NDVI_categories"].unique()) # ['no_change', 'large_decrease']

print(df_filtered["NDVI_categories"].value_counts())
#no_change         5392
#large_decrease    2573
#small_decrease       0
#small_increase       0
#large_increase       0
#Name: NDVI_categories, dtype: int64

['no_change', 'large_decrease']
Categories (5, object): ['large_decrease' < 'small_decrease' < 'no_change' < 'small_increase' < 'large_increase']
no_change         5392
large_decrease    2573
small_decrease       0
small_increase       0
large_increase       0
Name: NDVI_categories, dtype: int64


# Make sample sizes per class equal

* less samples per category
* 2500 to start with

In [11]:
# 2500 to start with
# Set the desired number of samples per category
num_samples = 2500

# Group the dataframe by the "NDVI_categories" column
grouped = df_filtered.groupby("NDVI_categories")

# Create an empty list to store the sampled dataframes
sampled_dfs = []

# Iterate over each group
for category, group in grouped:
    # Check if the number of samples in the group is greater than the desired number
    if len(group) > num_samples:
        # Randomly sample the desired number of rows from the group
        sampled_group = group.sample(n=num_samples, random_state=42)
        # Add the sampled group to the list
        sampled_dfs.append(sampled_group)
    else:
        # If the group has fewer samples than the desired number, add all rows to the list
        sampled_dfs.append(group)

# Concatenate the sampled dataframes back into a single dataframe
df_sub_2019 = pd.concat(sampled_dfs)

# Optional: Reset the index of the resulting dataframe
df_sub_2019 = df_sub_2019.reset_index(drop=True)

print(df_sub_2019["NDVI_categories"].value_counts())
# large_decrease    2500
#no_change         2500
#small_decrease       0
#small_increase       0
#large_increase       0
#Name: NDVI_categories, dtype: int64

# save it
#df_sub_2019.to_csv("D:/Stenka_Cliwac/Topic_1/04_PROCESSED_DATA/20230614_modeling_df_2class/modeling_df_2class_2019.csv", index=False)

large_decrease    2500
no_change         2500
small_decrease       0
small_increase       0
large_increase       0
Name: NDVI_categories, dtype: int64
