In [97]:
import os
from subprocess import call

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import shap
from IPython.display import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
from xgboost import plot_tree, plot_importance

In [98]:
def encode_and_export(df, columns, output_dir="./encoders"):
    """
    Encode categorical columns in a DataFrame using LabelEncoder
    and export each fitted encoder with joblib.

    Only the listed columns whose dtype is ``object`` are encoded; any other
    listed column is left as-is and gets no encoder.

    Parameters:
    - df: pandas DataFrame (not modified; the encoding is applied to a copy)
    - columns: list of column names to be encoded
    - output_dir: directory the encoders are written to, one file per column
      named ``<output_dir>/<column>_encoder.joblib``. Created if it does not
      exist. Defaults to ``./encoders``.

    Returns:
    - df_encoded: DataFrame with encoded categorical columns
    - encoders: dictionary {column name: fitted LabelEncoder}
    """

    df_encoded = df.copy()
    encoders = {}

    for column in columns:
        if df[column].dtype == 'O':  # Only object-dtype columns are treated as categorical
            le = LabelEncoder()
            df_encoded[column] = le.fit_transform(df[column])
            encoders[column] = le

    # joblib.dump opens the target path directly and fails when the parent
    # directory is missing, so make sure it exists before writing anything.
    if encoders:
        os.makedirs(output_dir, exist_ok=True)

    # Export encoders using joblib
    for column, encoder in encoders.items():
        filename = f"{output_dir}/{column}_encoder.joblib"
        joblib.dump(encoder, filename)
        print(f"Encoder for '{column}' saved as {filename}")

    return df_encoded, encoders


def import_and_encode_new_data(new_data, encoders):
    """
    Encode new data with previously fitted encoders.

    Each encoder is applied to the column of the same name. A column is left
    unencoded (and a warning is printed) when the encoder raises ValueError
    for it, e.g. because it holds values the encoder was not fitted on.
    Encoders whose column is absent from ``new_data`` are skipped with a warning.

    Parameters:
    - new_data: pandas DataFrame containing new data to be encoded (not modified)
    - encoders: dictionary {column name: fitted encoder exposing ``transform``}

    Returns:
    - new_data_encoded: copy of ``new_data`` with the encodable columns replaced
    """

    new_data_encoded = new_data.copy()

    for column, encoder in encoders.items():
        if column not in new_data.columns:
            print(f"Warning: Column '{column}' not found in new_data. It will be ignored.")
            continue

        try:
            new_data_encoded[column] = encoder.transform(new_data[column])
        except ValueError:
            # Best effort: keep the raw values rather than aborting the whole frame.
            print(f"Warning: Column '{column}' in new_data contains values not present in the training data. It will be left unencoded.")

    return new_data_encoded


def load_encoders(encoder_data):
    """
    Read previously saved encoders back from disk.

    Parameters:
    - encoder_data: dictionary mapping each column name to the file its encoder was saved in {column: filename}

    Returns:
    - dictionary mapping each column name to the object loaded from its file
    """

    loaded = {}

    # NOTE(review): joblib.load unpickles the file contents; only point this at trusted files.
    for column in encoder_data:
        source_file = encoder_data[column]
        loaded[column] = joblib.load(source_file)
        print(f"Encoder for '{column}' loaded from {source_file}")

    return loaded

In [99]:
# Read the parquet file into `df`; the relative path resolves against the kernel's working directory.
df = pd.read_parquet('../django-api/app/fires_merged_comunas.parquet')

In [100]:
# Keep only the rows whose satellite value differs from the string '1'.
df = df.loc[df['satellite'].ne('1')]

In [101]:
# Labeled subset: rows that have a value in 'type'.
labeled = df.loc[df['type'].notna()]

In [102]:
# Show column dtypes and non-null counts for the labeled subset.
labeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 367685 entries, 0 to 465149
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   latitude    367685 non-null  float64       
 1   longitude   367685 non-null  float64       
 2   brightness  367685 non-null  float64       
 3   scan        367685 non-null  float64       
 4   track       367685 non-null  float64       
 5   acq_date    367685 non-null  datetime64[ns]
 6   acq_time    367685 non-null  int64         
 7   satellite   367685 non-null  object        
 8   instrument  367685 non-null  object        
 9   confidence  367685 non-null  object        
 10  version     367685 non-null  object        
 11  bright_t31  367685 non-null  float64       
 12  frp         367685 non-null  float64       
 13  daynight    367685 non-null  object        
 14  type        367685 non-null  float64       
 15  comuna      367362 non-null  object        
dtypes: date

In [103]:
# Unlabeled subset: rows where 'type' is missing.
unlabeled = df.loc[df['type'].isna()]

In [104]:
# Show column dtypes and non-null counts for the unlabeled subset.
unlabeled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80105 entries, 435741 to 580028
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   latitude    80105 non-null  float64       
 1   longitude   80105 non-null  float64       
 2   brightness  80105 non-null  float64       
 3   scan        80105 non-null  float64       
 4   track       80105 non-null  float64       
 5   acq_date    80105 non-null  datetime64[ns]
 6   acq_time    80105 non-null  int64         
 7   satellite   80105 non-null  object        
 8   instrument  80105 non-null  object        
 9   confidence  80105 non-null  object        
 10  version     80105 non-null  object        
 11  bright_t31  80105 non-null  float64       
 12  frp         80105 non-null  float64       
 13  daynight    80105 non-null  object        
 14  type        0 non-null      float64       
 15  comuna      80089 non-null  object        
dtypes: datetime64[ns](1),

In [105]:
# Work on a copy of the labeled rows.
train = labeled.copy()

# Columns handed to the encoder; encode_and_export only encodes those with object dtype.
categorical_columns = ['satellite', 'comuna', 'daynight']

# Fit one LabelEncoder per column, save each to disk, and keep the encoded frame.
# `e` receives the dictionary of fitted encoders returned by the call.
train_encoded, e = encode_and_export(train, categorical_columns)

Encoder for 'satellite' saved as ./encoders/satellite_encoder.joblib
Encoder for 'comuna' saved as ./encoders/comuna_encoder.joblib
Encoder for 'daynight' saved as ./encoders/daynight_encoder.joblib


In [106]:
# Preview the first rows of the encoded training frame.
train_encoded.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,comuna
0,-23.820446,-70.320282,301.51,0.74,0.76,2013-01-01,448,1,VIIRS,n,1,285.54,2.38,1,2.0,7
1,-23.823833,-70.318871,306.9,0.74,0.76,2013-01-01,448,1,VIIRS,n,1,285.8,2.33,1,2.0,7
2,-26.430983,-69.475632,299.73,0.58,0.7,2013-01-01,448,1,VIIRS,n,1,279.61,2.86,1,2.0,76
3,-32.760929,-71.47644,309.7,0.52,0.67,2013-01-01,448,1,VIIRS,n,1,285.42,2.5,1,3.0,218
4,-34.624073,-71.000023,319.97,0.44,0.63,2013-01-01,448,1,VIIRS,n,1,290.28,2.27,1,0.0,41


In [107]:
# Start from a copy so that `unlabeled` itself is not modified.
unlabeled_encoded = unlabeled.copy()

In [108]:
# File holding the saved encoder for each column, keyed by column name.
encoders_path = dict(
    satellite='./encoders/satellite_encoder.joblib',
    comuna='./encoders/comuna_encoder.joblib',
    daynight='./encoders/daynight_encoder.joblib',
)

# Read the encoders back from those files.
encoders = load_encoders(encoders_path)

Encoder for 'satellite' loaded from ./encoders/satellite_encoder.joblib
Encoder for 'comuna' loaded from ./encoders/comuna_encoder.joblib
Encoder for 'daynight' loaded from ./encoders/daynight_encoder.joblib


In [109]:
# Display the loaded encoders dictionary.
encoders

{'satellite': LabelEncoder(),
 'comuna': LabelEncoder(),
 'daynight': LabelEncoder()}

In [110]:
# Encode straight from `unlabeled` (import_and_encode_new_data copies its input).
# Passing the previous `unlabeled_encoded` back in made a re-run of this cell
# operate on columns that had already been encoded.
unlabeled_encoded = import_and_encode_new_data(unlabeled, encoders)

In [111]:
# Display the encoded unlabeled frame.
unlabeled_encoded

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,comuna
435741,-34.10435,-70.45564,305.62,0.57,0.52,2022-09-01,506,1,VIIRS,n,2.0NRT,275.52,1.92,1,,148
435742,-36.86266,-71.38131,298.44,0.55,0.51,2022-09-01,508,1,VIIRS,n,2.0NRT,268.38,2.18,1,,50
435743,-36.86740,-71.38251,308.81,0.55,0.51,2022-09-01,508,1,VIIRS,n,2.0NRT,264.69,2.18,1,,50
435744,-36.81532,-73.02235,297.67,0.33,0.55,2022-09-01,508,1,VIIRS,n,2.0NRT,277.15,0.54,1,,58
435745,-36.81627,-73.01860,297.46,0.33,0.55,2022-09-01,508,1,VIIRS,n,2.0NRT,277.20,0.54,1,,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580024,-34.10617,-70.45506,299.64,0.50,0.49,2024-01-01,616,1,VIIRS,n,2.0NRT,286.24,5.18,1,,148
580025,-39.42022,-71.93173,305.23,0.48,0.48,2024-01-01,616,1,VIIRS,n,2.0NRT,266.92,4.84,1,,189
580026,-39.42414,-71.93903,306.30,0.48,0.48,2024-01-01,616,1,VIIRS,n,2.0NRT,262.45,2.30,1,,189
580027,-33.53647,-70.82634,323.98,0.47,0.48,2024-01-01,616,1,VIIRS,n,2.0NRT,285.25,2.70,1,,151
