In [23]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp311-cp311-macosx_10_9_x86_64.whl (12.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.13.1-cp311-cp311-macosx_10_9_x86_64.whl (39.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.3/39.3 MB[0m [31m458.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.0 scipy-1.

In [188]:
import os
from PIL import Image

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import copy

In [208]:
transform = transforms.Compose([
    transforms.RandomPerspective(distortion_scale=0.09, p=0.75, interpolation=3, fill=255),
    transforms.AutoAugment(policy=transforms.autoaugment.AutoAugmentPolicy.IMAGENET),
    transforms.RandomResizedCrop(232, scale=(0.1, 1), interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.ColorJitter(hue=(-0.5,0.5)),
    transforms.RandomEqualize(),
    transforms.RandomGrayscale(p=0.2),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    #transforms.Resize((224, 224)),
    #transforms.RandomHorizontalFlip(p=0.5),    # Apply horizontal flip with 50% probability
    #transforms.RandomVerticalFlip(p=0.5),      # Apply vertical flip with 50% probability
    #transforms.RandomRotation(degrees=15),     # Rotate the image by a random angle in the range (-15, 15)
    #transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Randomly change the brightness, contrast, saturation and hue
    #transforms.RandomGrayscale(p=0.1),         # Convert to grayscale with 10% probability
    #transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])     # Normalize the image
])


# Charger les données d'entraînement et de validation
dataset = ImageFolder(root='/Users/maillet/Desktop/Clean_Dirty/data/train', transform=transform)
print(len(dataset))

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

#val_dataset = ImageFolder(root='/Users/maillet/Desktop/Clean_Dirty/data/test', transform=transform)
#val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

40


In [209]:
model = models.resnet18(pretrained=True)

for param in model.parameters():
    param.requires_grad = False


num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 400),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(400, 1),
    #nn.Sigmoid()
)

# Déplacer le modèle sur le GPU si disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [210]:
# Définir la fonction de perte et l'optimiseur
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.003, amsgrad=True)

# Fonction d'entraînement
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device).float().view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
        
        # Calculate predictions
        preds = torch.sigmoid(outputs) > 0.5
        correct_predictions += (preds == labels).sum().item()
        total_samples += inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct_predictions / total_samples
    
    return epoch_loss, epoch_accuracy

In [211]:
def evaluate(model, val_loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device).float().view(-1, 1)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    epoch_loss = running_loss / len(val_loader.dataset)
    accuracy = correct / total
    return epoch_loss, accuracy

In [220]:
# Entraîner le modèle
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_accuracy = train(model, train_loader, criterion, optimizer, device)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}')

Epoch 1/10, Loss: 0.6308, Accuracy: 0.5938
Epoch 2/10, Loss: 0.5929, Accuracy: 0.7500
Epoch 3/10, Loss: 0.7102, Accuracy: 0.6250
Epoch 4/10, Loss: 0.5977, Accuracy: 0.7188
Epoch 5/10, Loss: 0.6973, Accuracy: 0.4375
Epoch 6/10, Loss: 0.6640, Accuracy: 0.6250
Epoch 7/10, Loss: 0.5892, Accuracy: 0.6562
Epoch 8/10, Loss: 0.5525, Accuracy: 0.7500
Epoch 9/10, Loss: 0.5029, Accuracy: 0.8750
Epoch 10/10, Loss: 0.7220, Accuracy: 0.5312


In [221]:
val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
print(f'Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')

Validation Loss: 0.6026, Accuracy: 0.5000


In [214]:
# Define transformations for the test data
test_transform = transforms.Compose([
    transforms.Resize(232, interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    #transforms.Resize((224, 224)),
    #transforms.RandomHorizontalFlip(p=0.5),    # Apply horizontal flip with 50% probability
    #transforms.RandomVerticalFlip(p=0.5),      # Apply vertical flip with 50% probability
    #transforms.RandomRotation(degrees=15),     # Rotate the image by a random angle in the range (-15, 15)
    #transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # Randomly change the brightness, contrast, saturation and hue
    #transforms.RandomGrayscale(p=0.1),         # Convert to grayscale with 10% probability
    #transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])     # Normalize the image
])

def load_images_from_folder(folder, transform):
    images = []
    filenames = []
    for filename in os.listdir(folder):
        if filename.endswith(('.png', '.jpg', '.jpeg')):
            img_path = os.path.join(folder, filename)
            image = Image.open(img_path).convert('RGB')
            image = transform(image)
            images.append(image)
            filenames.append(filename)
    return images, filenames

# Load test images
test_folder = '/Users/maillet/Desktop/Clean_Dirty/data/test'
test_images, test_filenames = load_images_from_folder(test_folder, test_transform)

# Convert list of images to a batch
test_images = torch.stack(test_images)

# Load the best model weights
model.eval()  # Set the model to evaluation mode

# Move to the same device as the model
test_images = test_images.to(device)

# Make predictions
with torch.no_grad():
    outputs = model(test_images)
    preds = torch.sigmoid(outputs) > 0.5  # Assuming binary classification (clean vs dirty)

# Convert predictions to numpy array and filenames to list
preds = preds.cpu().numpy().astype(int)
predictions = list(zip(test_filenames, preds))

# Print the predictions
for filename, pred in predictions:
    label = 'clean' if pred == 0 else 'dirty'
    if filename == '0000.jpg':
        print(f'{filename}: {label}')

0000.jpg: dirty


In [215]:
image_name, predict = [], []
for filename, pred in predictions:
    image_name.append(filename[0:4])
    if pred == 0:
        predict.append('clean')
    else:
        predict.append('dirty')

In [216]:
res = pd.DataFrame()

In [217]:
res['id'] = image_name
res['label'] = predict

In [218]:
res

Unnamed: 0,id,label
0,0071,clean
1,0717,dirty
2,0703,dirty
3,0065,dirty
4,0059,dirty
...,...,...
739,0040,dirty
740,0726,dirty
741,0732,dirty
742,0054,clean


In [219]:
csv_file_path = '/Users/maillet/Desktop/Clean_Dirty/test_with_predictions.csv'
res.to_csv(csv_file_path, index=False)

In [163]:
predictions

[('0071.jpg', array([0])),
 ('0717.jpg', array([0])),
 ('0703.jpg', array([1])),
 ('0065.jpg', array([1])),
 ('0059.jpg', array([0])),
 ('0515.jpg', array([1])),
 ('0273.jpg', array([1])),
 ('0267.jpg', array([1])),
 ('0501.jpg', array([1])),
 ('0529.jpg', array([1])),
 ('0298.jpg', array([0])),
 ('0461.jpg', array([1])),
 ('0307.jpg', array([1])),
 ('0313.jpg', array([0])),
 ('0475.jpg', array([1])),
 ('0449.jpg', array([1])),
 ('0105.jpg', array([1])),
 ('0663.jpg', array([1])),
 ('0677.jpg', array([1])),
 ('0111.jpg', array([1])),
 ('0139.jpg', array([1])),
 ('0688.jpg', array([1])),
 ('0689.jpg', array([1])),
 ('0138.jpg', array([1])),
 ('0676.jpg', array([1])),
 ('0110.jpg', array([1])),
 ('0104.jpg', array([1])),
 ('0662.jpg', array([1])),
 ('0448.jpg', array([1])),
 ('0312.jpg', array([1])),
 ('0474.jpg', array([0])),
 ('0460.jpg', array([1])),
 ('0306.jpg', array([1])),
 ('0299.jpg', array([1])),
 ('0528.jpg', array([1])),
 ('0266.jpg', array([1])),
 ('0500.jpg', array([1])),
 

In [164]:
df = pd.read_csv('/Users/maillet/Desktop/Clean_Dirty/sample_submission.csv')

In [165]:
df['predicted_label'] = df['predicted_label'].apply(lambda x: 'clean' if x == 0 else 'dirty')

KeyError: 'predicted_label'

In [14]:
model.eval()
# Function to map predictions to class labels
def map_predictions(predictions):
    return ['clean' if pred == 0 else 'dirty' for pred in predictions]

# Make Predictions on the test dataset
all_preds = []
all_filenames = []
for images, filenames in test_loader:
    outputs = model(images)
    _, preds = torch.max(outputs, 1)
    all_preds.extend(preds)
    all_filenames.extend(filenames)

# Map predictions to class labels
all_preds = map_predictions(all_preds)

In [15]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df.rename(columns={'id': 'image_id'}, inplace=True)

In [16]:
submission_df['image_id'].values

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [17]:
for filename, pred in zip(all_filenames, all_preds):
    c = os.path.splitext(os.path.basename(filename))[0]  # Extract ID from the filename
    if int(c) in submission_df['image_id'].values:
        submission_df.loc[submission_df['image_id'] == int(c), 'label'] = pred
# Save the updated CSV
submission_df = submission_df.rename(columns={'image_id': 'id'})
submission_df.to_csv('updated_sample_submission.csv', index=False)

print("Predictions updated in 'updated_sample_submission.csv'")

Predictions updated in 'updated_sample_submission.csv'


In [18]:
submission_df

Unnamed: 0,id,label
0,0,dirty
1,1,dirty
2,2,dirty
3,3,dirty
4,4,dirty
...,...,...
739,739,dirty
740,740,dirty
741,741,clean
742,742,clean


In [19]:
all_preds

['dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'dirty',
 'clean',
 'dirty',
 'dirty',
 'clean',
 'clean',
 'dirty',
 'clean',
 'clean',
 'dirty',
 'dirty',


In [118]:
s = 'abcd'

In [119]:
l = list(s)

In [120]:
print(l)

['a', 'b', 'c', 'd']
