# Import libraries

* Use just one year (e.g. 2019)
* Train on 70 %, test on 30 % of the data 
* Random forests (classification)

* two classes
* decrease class means > -10%
* no change class is between -5 and 5 %

In [2]:
# shapenv Anaconda environment
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import plotly

# ML libraries 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# SHAP values
import shap
#shap.initjs()

# Load the modeling dataframe

In [4]:
# Load the dataframe from a file 
df = pd.read_csv('D:/Stenka_Cliwac/Topic_1/04_PROCESSED_DATA/20230623_modeling_df/all/noNA/modeling_df_noNA.csv')
# Rows: 50,968
# Columns: 27

# Keep the columns named "x" and "y" this time

# Print the dataframe
print(df)
# [50968 rows x 27 columns]

# Subset to year 2019 

df_2019 = df[df["Year_NDVI_anom"] == 2019] # 12676 rows, 20 columns 
print(df_2019.shape) 
# (12281, 27)

# Get the unique values in the "Year_NDVI_anom" column
unique_years = df_2019['Year_NDVI_anom'].unique()

# Print the unique years
print(unique_years) # [2019]

               x          y  NDVI_anomaly  agriculture_proximity      aspect  \
0      13.694817  53.463234     -5.224010             101.598724  168.056488   
1      14.215839  53.391369     -4.695665             141.954056  166.101166   
2      14.215839  53.382386     -6.167371             340.478790  160.925415   
3      14.224823  53.382386    -12.489527             177.500885  137.270935   
4      14.215839  53.373403     -8.321093             251.918365  155.945984   
...          ...        ...           ...                    ...         ...   
50963  13.883463  51.379143    -11.596499             364.639801  193.677811   
50964  13.892446  51.379143    -17.068777             220.772079  213.901718   
50965  13.901429  51.379143    -18.853287             198.778305  201.734650   
50966  13.973294  51.379143    -11.722972             357.211304  136.366745   
50967  13.703800  51.370160    -14.234449              64.125183  165.034775   

       broadleaf_percentage  canopyheig

# Create new classes

In [5]:
# Define the thresholds and corresponding categories
thresholds = [-np.inf, -10, -5, 5, 10, np.inf]
categories = ["large_decrease", "small_decrease", "no_change", "small_increase", "large_increase"]

# Create a new column "NDVI_categories" based on the classification
df_2019["NDVI_categories"] = pd.cut(df_2019["NDVI_anomaly"], bins=thresholds, labels=categories, right=False)

# number per category

category_counts = df_2019["NDVI_categories"].value_counts()
print(category_counts)

#NDVI_categories
#no_change         5217
#small_decrease    4470
#large_decrease    2491
#small_increase      80
#large_increase      23
#Name: count, dtype: int64

NDVI_categories
no_change         5217
small_decrease    4470
large_decrease    2491
small_increase      80
large_increase      23
Name: count, dtype: int64



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Remove increase classes

* too few samples in the % increase categories
* plus, my study is about why the forest suffers...
* I am also removing the "small_decrease" class now.

In [6]:
# Create a boolean mask for rows to be removed
mask = (df_2019["NDVI_categories"] == "small_increase") | (df_2019["NDVI_categories"] == "large_increase") | (df_2019["NDVI_categories"] == "small_decrease")

# Apply the mask to the dataframe to remove the rows
df_filtered = df_2019[~mask]

# categories
print(df_filtered["NDVI_categories"].unique()) # ['no_change', 'large_decrease']

print(df_filtered["NDVI_categories"].value_counts())
#NDVI_categories
#no_change         5217
#large_decrease    2491
#small_decrease       0
#small_increase       0
#large_increase       0
#Name: count, dtype: int64

['no_change', 'large_decrease']
Categories (5, object): ['large_decrease' < 'small_decrease' < 'no_change' < 'small_increase' < 'large_increase']
NDVI_categories
no_change         5217
large_decrease    2491
small_decrease       0
small_increase       0
large_increase       0
Name: count, dtype: int64


# Make sample sizes per class equal

* less samples per category
* 2000 to start with

In [8]:
# 2000 to start with
# Set the desired number of samples per category
num_samples = 2000

# Group the dataframe by the "NDVI_categories" column
grouped = df_filtered.groupby("NDVI_categories")

# Create an empty list to store the sampled dataframes
sampled_dfs = []

# Iterate over each group
for category, group in grouped:
    # Check if the number of samples in the group is greater than the desired number
    if len(group) > num_samples:
        # Randomly sample the desired number of rows from the group
        sampled_group = group.sample(n=num_samples, random_state=42)
        # Add the sampled group to the list
        sampled_dfs.append(sampled_group)
    else:
        # If the group has fewer samples than the desired number, add all rows to the list
        sampled_dfs.append(group)

# Concatenate the sampled dataframes back into a single dataframe
df_sub_2019 = pd.concat(sampled_dfs)

# Optional: Reset the index of the resulting dataframe
df_sub_2019 = df_sub_2019.reset_index(drop=True)

print(df_sub_2019["NDVI_categories"].value_counts())
# large_decrease    2000
#no_change         2000
#small_decrease       0
#small_increase       0
#large_increase       0
#Name: NDVI_categories, dtype: int64

# save it
df_sub_2019.to_csv("D:/Stenka_Cliwac/Topic_1/04_PROCESSED_DATA/20230623_modeling_df/all/twoclass_subset/modeling_df_2class_2019.csv", index=False)

NDVI_categories
large_decrease    2000
no_change         2000
small_decrease       0
small_increase       0
large_increase       0
Name: count, dtype: int64
