# Fine-tuning a DCNN to detect landfills in the Litoral region of Argentina

In [1]:
# Mount Google Drive inside the Colab runtime so the project files can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import sys, os
import numpy as np

In [3]:
# Every relative path used below (labels/, images/, LitoralArg_Model/) is
# resolved against this folder, so switch the working directory to it first.
project_directory = "/content/drive/MyDrive/Trabajo/Proyectos/Basurales_Carla/cientibeca"
os.chdir(project_directory)

current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

Current working directory: /content/drive/.shortcut-targets-by-id/1dmEJvNxrJ2abNEMaGXh_azjDZFbyVCn8/cientibeca


# Loading architecture and pre-trained weights

In [4]:
!pip install pytorch-ignite

Collecting pytorch-ignite
  Downloading pytorch_ignite-0.5.2-py3-none-any.whl.metadata (27 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=1.3->pytorch-ignite)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-

In [5]:
import torch

# Make the code under `modelo_italia` importable. The membership check keeps
# sys.path free of duplicates when this cell is executed more than once.
model_code_dir = os.path.join(project_directory, 'modelo_italia')
if model_code_dir not in sys.path:
    sys.path.append(model_code_dir)

# Importing model definition
from architecture.resnet50_fpn import Net

# Path of the saved state dictionary, relative to the working directory
STATE_DICT_PATH = "LitoralArg_Model/best_local_model.pth"

# Creating an instance of the model
model = Net(num_classes=1)

# Read the checkpoint onto the CPU. weights_only=True restricts unpickling to
# tensors and plain containers, so a tampered checkpoint file cannot execute
# arbitrary code while being loaded.
state_dict = torch.load(
    STATE_DICT_PATH,
    map_location=torch.device('cpu'),
    weights_only=True,
)

# Loading the weights into the model. Kept as the last expression of the cell
# so that the key-matching report returned by load_state_dict is displayed.
model.load_state_dict(state_dict)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


pretrained


100%|██████████| 97.8M/97.8M [00:00<00:00, 246MB/s]


<All keys matched successfully>

# Loading data

In [6]:
import pandas as pd
# Label tables, one per city.
db_parana = pd.read_csv("labels/parana_labels.csv")
db_rosario = pd.read_csv("labels/rosario_labels.csv")
db_santafe = pd.read_csv("labels/santafe_labels.csv")

# Lists of the image file names assigned to each split.
list_test_images = pd.read_csv("LitoralArg_Model/test_images.csv")
list_val_images = pd.read_csv("LitoralArg_Model/val_images.csv")
list_train_images = pd.read_csv("LitoralArg_Model/train_images.csv")


def _rows_in_split(city_labels, split_list):
    """Return the rows of `city_labels` whose 'file_name' is listed in `split_list`."""
    belongs_to_split = city_labels['file_name'].isin(split_list['file_name'])
    return city_labels[belongs_to_split]


# Paraná
db_parana_train = _rows_in_split(db_parana, list_train_images)
db_parana_val = _rows_in_split(db_parana, list_val_images)
db_parana_test = _rows_in_split(db_parana, list_test_images)

# Rosario
db_rosario_train = _rows_in_split(db_rosario, list_train_images)
db_rosario_val = _rows_in_split(db_rosario, list_val_images)
db_rosario_test = _rows_in_split(db_rosario, list_test_images)

# Santa Fe
db_santafe_train = _rows_in_split(db_santafe, list_train_images)
db_santafe_val = _rows_in_split(db_santafe, list_val_images)
db_santafe_test = _rows_in_split(db_santafe, list_test_images)

In [15]:
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms

# Channel statistics used for ImageNet normalization.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def _tensor_and_normalize():
    """Final two steps shared by both pipelines: tensor conversion + ImageNet normalization."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(mean=list(IMAGENET_MEAN), std=list(IMAGENET_STD)),
    ]


# Pipeline with random augmentation (used for the train and validation datasets).
train_val_transform = transforms.Compose([
    # Target resolution of 800 follows Torres et al. (2023).
    # NOTE(review): with an integer size, torchvision's Resize matches the
    # SHORTER image side to 800 and keeps the aspect ratio, so the output is
    # 800x800 only when the input patches are square - confirm they are.
    transforms.Resize(800),
    # Random horizontal flips as in Torres et al. (2023)
    transforms.RandomHorizontalFlip(),
    # NOTE(review): RandomRotation(90) draws an arbitrary angle from
    # [-90, +90] degrees; it is not limited to multiples of 90 degrees.
    # Confirm this is the augmentation intended from Torres et al. (2023).
    transforms.RandomRotation(90),
    *_tensor_and_normalize(),
])

# Deterministic pipeline (no augmentation).
test_transform = transforms.Compose([
    # Same resize as above; see the note about non-square inputs.
    transforms.Resize(800),
    *_tensor_and_normalize(),
])


class CustomImageDataset(Dataset):
    """Image/label dataset backed by a DataFrame and a directory of image files."""

    def __init__(self, dataframe, img_dir, transform=None):
        """
        Args:
            dataframe (pandas.DataFrame): DataFrame containing image file names
                (column 'file_name') and labels (column 'etiqueta'). Any other
                columns are ignored.
            img_dir (str): Directory containing the images.
            transform (callable, optional): Applied to the RGB PIL image before
                it is returned. When omitted (or otherwise falsy) the PIL image
                is returned unchanged.

        Raises:
            KeyError: If `dataframe` lacks the 'file_name' or 'etiqueta' column.
        """
        self.img_labels = dataframe[['file_name', 'etiqueta']]
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Number of samples (rows of the label table)."""
        return len(self.img_labels)

    def __getitem__(self, idx):
        """Return the `(image, label)` pair stored at positional index `idx`."""
        # Column 0 is 'file_name', column 1 is 'etiqueta' (see __init__).
        img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0])
        label = self.img_labels.iloc[idx, 1]

        # Image.open is lazy; convert() reads the pixel data and returns a new
        # image, so the file can be closed as soon as the block exits instead
        # of leaving the handle to be released implicitly.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        if self.transform:
            image = self.transform(image)

        return image, label

# Folders holding the image patches of each city.
PARANA_IMG_DIR = 'images/parana/patches_parana'
ROSARIO_IMG_DIR = 'images/rosario/patches_ros'
SANTAFE_IMG_DIR = 'images/santa_fe/patches_sfe'

# Full (unsplit) datasets, without augmentation.
parana_dataset = CustomImageDataset(db_parana, PARANA_IMG_DIR, test_transform)
rosario_dataset = CustomImageDataset(db_rosario, ROSARIO_IMG_DIR, test_transform)
santafe_dataset = CustomImageDataset(db_santafe, SANTAFE_IMG_DIR, test_transform)

# Per-city splits. Test uses the deterministic pipeline; train and validation
# use the randomly augmenting one.
# NOTE(review): because validation images are randomly flipped/rotated,
# validation outputs can differ from run to run - confirm this is intended.
parana_train_dataset = CustomImageDataset(db_parana_train, PARANA_IMG_DIR, train_val_transform)
parana_val_dataset = CustomImageDataset(db_parana_val, PARANA_IMG_DIR, train_val_transform)
parana_test_dataset = CustomImageDataset(db_parana_test, PARANA_IMG_DIR, test_transform)

rosario_train_dataset = CustomImageDataset(db_rosario_train, ROSARIO_IMG_DIR, train_val_transform)
rosario_val_dataset = CustomImageDataset(db_rosario_val, ROSARIO_IMG_DIR, train_val_transform)
rosario_test_dataset = CustomImageDataset(db_rosario_test, ROSARIO_IMG_DIR, test_transform)

santafe_train_dataset = CustomImageDataset(db_santafe_train, SANTAFE_IMG_DIR, train_val_transform)
santafe_val_dataset = CustomImageDataset(db_santafe_val, SANTAFE_IMG_DIR, train_val_transform)
santafe_test_dataset = CustomImageDataset(db_santafe_test, SANTAFE_IMG_DIR, test_transform)

In [8]:
from torch.utils.data import ConcatDataset

# Merge the three cities into one dataset per split. The order is always
# Paraná, Rosario, Santa Fe.
train_dataset = ConcatDataset(
    [parana_train_dataset, rosario_train_dataset, santafe_train_dataset]
)
val_dataset = ConcatDataset(
    [parana_val_dataset, rosario_val_dataset, santafe_val_dataset]
)
test_dataset = ConcatDataset(
    [parana_test_dataset, rosario_test_dataset, santafe_test_dataset]
)

# Report how many samples ended up in each split.
for split_label, combined in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test", test_dataset),
):
    print(f"Combined {split_label} Dataset Size: {len(combined)}")



Combined Train Dataset Size: 588
Combined Val Dataset Size: 196
Combined Test Dataset Size: 196


## Testing the model

In [9]:
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss

# Number of images per mini-batch, shared by all loaders.
batch_size = 8


def _ordered_loader(dataset):
    """Build a DataLoader that walks `dataset` in order (shuffle disabled)."""
    return DataLoader(dataset, shuffle=False, batch_size=batch_size)


# Create data loaders
train_loader = _ordered_loader(train_dataset)
val_loader = _ordered_loader(val_dataset)
test_loader = _ordered_loader(test_dataset)

def evaluate_model(model, dataloader, device):
    """Run `model` over every batch of `dataloader` without tracking gradients.

    The model is switched to eval mode and left in that mode.

    Args:
        model: Module called as `model(batch)`.
        dataloader: Iterable yielding `(data, target)` tensor pairs.
        device: Device the batches are moved to before the forward pass.

    Returns:
        `(predictions, targets)`: two lists with one entry per sample, obtained
        by iterating over the first axis of each batch's output / target after
        conversion to NumPy.
    """
    model.eval()

    output_batches = []
    target_batches = []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            batch_outputs = model(batch_inputs)
            output_batches.append(batch_outputs.cpu().numpy())
            target_batches.append(batch_labels.cpu().numpy())

    # Flatten the per-batch arrays into per-sample lists.
    predictions = [sample for batch in output_batches for sample in batch]
    targets = [sample for batch in target_batches for sample in batch]
    return predictions, targets


# Moving model to GPU if available
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Evaluating the model on every split, in the order train -> val -> test.
# NOTE(review): the train and val datasets were built with the randomly
# augmenting transform, so their outputs here are computed on flipped/rotated
# images - confirm this is intended.
print("Evaluating model...")
split_outputs = {}
for split_name, split_loader in (
    ("train", train_loader),
    ("val", val_loader),
    ("test", test_loader),
):
    print(f"Evaluating {split_name} set...")
    split_outputs[split_name] = evaluate_model(model, split_loader, device)

train_predictions, train_targets = split_outputs["train"]
val_predictions, val_targets = split_outputs["val"]
test_predictions, test_targets = split_outputs["test"]


Evaluating model...
Evaluating train set...
Evaluating val set...
Evaluating test set...


In [23]:
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Args:
        x: Scalar or array-like of logits (lists are accepted).

    Returns:
        Array with the same shape as `x` holding values in [0, 1]; a NumPy
        scalar when `x` is a scalar.
    """
    x = np.asarray(x)
    # exp() is only ever evaluated on non-positive numbers, so it cannot
    # overflow no matter how large |x| is.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return result[()] if result.ndim == 0 else result

def _scores_and_labels(logits):
    """Turn per-sample logits into (probability scores, hard 0/1 labels).

    `logits` is flattened first: evaluate_model collects one entry per sample,
    and when each entry is a one-element array a plain `.tolist()` would give
    nested lists ([[p], [p], ...]) that end up as list-strings in the CSV files.
    """
    flat_logits = np.asarray(logits).ravel()
    scores = sigmoid(flat_logits).tolist()
    # A logit above 0 is the same as a probability above 0.5.
    labels = (flat_logits > 0).astype(int).tolist()
    return scores, labels


train_predicted_scores, train_predicted_labels = _scores_and_labels(train_predictions)
val_predicted_scores, val_predicted_labels = _scores_and_labels(val_predictions)
test_predicted_scores, test_predicted_labels = _scores_and_labels(test_predictions)

In [24]:
# Output location
output_dir = "LitoralArg_Model"


def _export_split_results(city_frames, targets, scores, labels, csv_name):
    """Assemble the per-image results table of one split and write it to CSV.

    `city_frames` must be listed in the same order in which the per-city
    datasets were concatenated, so that file names line up with the model
    outputs (the loaders do not shuffle).

    Returns the list of file names and the results DataFrame.
    """
    file_names = []
    for frame in city_frames:
        file_names.extend(frame['file_name'].tolist())

    results = pd.DataFrame({
        'file_name': file_names,
        'labels': targets,
        'predicted_scores': scores,
        'predicted_labels': labels,
    })
    results.to_csv(os.path.join(output_dir, csv_name), index=False)
    return file_names, results


train_image_filenames, train_results_df = _export_split_results(
    [db_parana_train, db_rosario_train, db_santafe_train],
    train_targets, train_predicted_scores, train_predicted_labels,
    'train_results.csv',
)

val_image_filenames, val_results_df = _export_split_results(
    [db_parana_val, db_rosario_val, db_santafe_val],
    val_targets, val_predicted_scores, val_predicted_labels,
    'val_results.csv',
)

test_image_filenames, test_results_df = _export_split_results(
    [db_parana_test, db_rosario_test, db_santafe_test],
    test_targets, test_predicted_scores, test_predicted_labels,
    'test_results.csv',
)

In [20]:
def _positive_fraction(binary_values):
    """Sum of the 0/1 values divided by how many there are."""
    return np.sum(binary_values) / len(binary_values)


# Train split: ground truth vs. predictions.
frac_pos_train_labels = _positive_fraction(train_targets)
print(f"Fraction of positive labels in train set: {frac_pos_train_labels}")
frac_pos_train_predictions = _positive_fraction(train_predicted_labels)
print(f"Fraction of positive predictions in train set: {frac_pos_train_predictions}")

# Test split: ground truth vs. predictions.
frac_pos_test_labels = _positive_fraction(test_targets)
print(f"Fraction of positive labels in test set: {frac_pos_test_labels}")
frac_pos_test_predictions = _positive_fraction(test_predicted_labels)
print(f"Fraction of positive predictions in test set: {frac_pos_test_predictions}")

Fraction of positive labels in train set: 0.12755102040816327
Fraction of positive predictions in train set: 0.003401360544217687
Fraction of positive labels in test set: 0.09693877551020408
Fraction of positive predictions in test set: 0.00510204081632653


In [25]:
# Plain accuracy: share of samples whose predicted label equals the target.
train_hits = np.array(train_targets) == np.array(train_predicted_labels)
test_hits = np.array(test_targets) == np.array(test_predicted_labels)

acc_train = np.sum(train_hits) / len(train_targets)
acc_test = np.sum(test_hits) / len(test_targets)

for split_name, split_accuracy in (("train", acc_train), ("test", acc_test)):
    print(f"Accuracy in {split_name} set: {split_accuracy}")

Accuracy in train set: 0.8724489795918368
Accuracy in test set: 0.9081632653061225


In [29]:
def compute_balanced_accuracy(y_true, y_pred):
    """Balanced accuracy for binary labels: the mean of the per-class recalls.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1) with the same number
            of elements as `y_true`.

    Returns:
        Mean of the recall on class 1 and the recall on class 0. A class that
        never occurs in `y_true` has no defined recall and is left out of the
        mean. NaN is returned when neither class occurs (e.g. empty input).

    Raises:
        ValueError: If `y_true` and `y_pred` differ in number of elements.
    """
    # Flatten both inputs so that, for instance, (N, 1) predictions are
    # compared element-wise with (N,) targets instead of being broadcast
    # to an (N, N) grid.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    recalls = []
    for cls in (1, 0):
        support = np.sum(y_true == cls)
        if support > 0:  # skip absent classes rather than dividing 0 by 0
            correct = np.sum((y_true == cls) & (y_pred == cls))
            recalls.append(correct / support)

    if not recalls:
        return np.float64(np.nan)
    balanced_acc = sum(recalls) / len(recalls)
    return balanced_acc

# Balanced accuracy on the train and test splits.
bal_acc_train, bal_acc_test = (
    compute_balanced_accuracy(true_labels, predicted_labels)
    for true_labels, predicted_labels in (
        (train_targets, train_predicted_labels),
        (test_targets, test_predicted_labels),
    )
)

for split_name, split_score in (("train", bal_acc_train), ("test", bal_acc_test)):
    print(f"Balanced accuracy in {split_name} set: {split_score}")

Balanced accuracy in train set: 0.505692007797271
Balanced accuracy in test set: 0.5263157894736842
