In [2]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import torch
import torchvision.transforms as transforms
from torchvision import models
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action='ignore', category=UserWarning)


In [16]:
train = pd.read_csv('/root/train.csv')
test = pd.read_csv('/root/test.csv')


In [17]:
# not worrying about '_sd' columns for now
sd_columns = [col for col in train.columns if col.endswith('_sd')]
train = train.drop(columns=sd_columns)


In [18]:
# target columns
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

In [19]:
train['image_path'] = '/root/train_images/' + train['id'].astype(str) + '.jpeg'
test['image_path'] = '/root/test_images/' + test['id'].astype(str) + '.jpeg'

In [20]:
for column in mean_columns:
    upper_quantile = train[column].quantile(0.98)
    train = train[(train[column] < upper_quantile)]

In [21]:
mean_values = train[mean_columns].mean()
test[mean_columns] = mean_values
test[mean_columns]

Unnamed: 0,X4_mean,X11_mean,X18_mean,X50_mean,X26_mean,X3112_mean
0,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
1,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
2,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
3,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
4,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
...,...,...,...,...,...,...
6540,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
6541,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
6542,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107
6543,0.51282,15.790058,2.553687,1.578751,19.297984,1481.69107


In [None]:
#pip install efficientnet_pytorch

In [22]:
from efficientnet_pytorch import EfficientNet
import torch.nn.functional as F

def get_model():
    # Load EfficientNetB0 as the base model
    base_model = EfficientNet.from_pretrained('efficientnet-b0')

    # Modify the model's final fully connected layer for regression
    num_ftrs = base_model._fc.in_features
    base_model._fc = nn.Linear(num_ftrs, len(mean_columns))  # Output layer for regression with multiple outputs

    return base_model

# Create the model
model = get_model()

Loaded pretrained weights for efficientnet-b0


In [23]:
def load_and_preprocess_image(path, label):
    image = Image.open(path).convert('RGB')
    image = transforms.Resize((224, 224))(image)
    image = transforms.ToTensor()(image)
    image = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(image)
    return image, label

def create_dataset(df, target_col=None, batch_size=32):
    paths = df['image_path'].values
    
    if target_col is not None:
        labels = df[target_col].values.astype(np.float32)
    else:
        labels = np.zeros(len(df), dtype=np.float32)
    
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    dataset = [(transform(Image.open(path).convert('RGB')), label) for path, label in zip(paths, labels)]
    inputs, labels = zip(*dataset)
    inputs = torch.stack(inputs)
    labels = torch.tensor(labels, dtype=torch.float32)
    dataset = torch.utils.data.TensorDataset(inputs, labels)
    dataset = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return dataset

In [24]:
from PIL import Image
columns_to_train = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
models = {}

for column in columns_to_train:
    model = get_model()
    device = torch.device("cuda")
    model.to(device)
    
    train_df, test_df = train_test_split(train, test_size=0.2, random_state=42)

    train_dataset = create_dataset(train_df, column, batch_size=32)
    test_dataset = create_dataset(test_df, column, batch_size=32)
    
    criterion = nn.MSELoss()  # Using Mean Squared Error loss for regression
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to appropriate device
    
    print(f"Training model for {column}")
    for epoch in range(1):  # Assuming you want to train for 1 epoch
        model.train()
        running_loss = 0.0
        for inputs, labels in train_dataset:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.view(-1, 1))  # Assuming labels are reshaped to match output
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        # Print average training loss for each epoch
        print(f"Epoch {epoch+1} - Training Loss: {running_loss / len(train_dataset.dataset)}")
    
    # Validation
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for inputs, labels in test_dataset:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            test_loss += criterion(outputs, labels.view(-1, 1)).item() * inputs.size(0)
        
        # Print average test loss
        print(f"Test Loss: {test_loss / len(test_dataset.dataset)}")
    
    # Save the model
    models[column] = model
    torch.save(model.state_dict(), f'model_{column}.pt')

Loaded pretrained weights for efficientnet-b0


Training model for X4_mean
Epoch 1 - Training Loss: 0.021239181947938224
Test Loss: 0.020469692918916618
Loaded pretrained weights for efficientnet-b0
Training model for X11_mean
Epoch 1 - Training Loss: 46.747400992312706
Test Loss: 43.62801690076394
Loaded pretrained weights for efficientnet-b0
Training model for X18_mean
Epoch 1 - Training Loss: 13.072887767674327
Test Loss: 11.20801850649997
Loaded pretrained weights for efficientnet-b0
Training model for X50_mean
Epoch 1 - Training Loss: 0.35687438255804727
Test Loss: 0.32672436458942217
Loaded pretrained weights for efficientnet-b0
Training model for X26_mean
Epoch 1 - Training Loss: 2685.941923542746
Test Loss: 2390.626837089579
Loaded pretrained weights for efficientnet-b0
Training model for X3112_mean
Epoch 1 - Training Loss: 4885201.972233528
Test Loss: 3588214.9550010175


In [33]:
from PIL import Image
columns_to_train = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
models = {}

for column in columns_to_train:
    model = get_model()
    device = torch.device("cuda")
    model.to(device)
    
    train_df, test_df = train_test_split(train, test_size=0.2, random_state=42)

    train_dataset = create_dataset(train_df, column, batch_size=32)
    test_dataset = create_dataset(test_df, column, batch_size=32)
    
    criterion = nn.MSELoss()  # Using Mean Squared Error loss for regression
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move model to appropriate device
    
    print(f"Training model for {column}")
    for epoch in range(1):  # Assuming you want to train for 1 epoch
        model.train()
        running_loss = 0.0
        for inputs, labels in train_dataset:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.view(-1, 1))  # Assuming labels are reshaped to match output
            loss.backward()
            optimizer.step()
            running_loss += loss.item() * inputs.size(0)
        
        # Print average training loss for each epoch
        print(f"Epoch {epoch+1} - Training Loss: {running_loss / len(train_dataset.dataset)}")
    
    # Validation
    model.eval()
    with torch.no_grad():
        test_loss = 0.0
        for inputs, labels in test_dataset:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            test_loss += criterion(outputs, labels.view(-1, 1)).item() * inputs.size(0)
        
        # Print average test loss
        print(f"Test Loss: {test_loss / len(test_dataset.dataset)}")
    
    # Save the model
    models[column] = model
    torch.save(model.state_dict(), f'/root/model_{column}.pt')
    print(f"model_{column} saved")

Loaded pretrained weights for efficientnet-b0
Training model for X4_mean
Epoch 1 - Training Loss: 0.021585435825119306
Test Loss: 0.016378742850040293
model_X4_mean saved
Loaded pretrained weights for efficientnet-b0
Training model for X11_mean
Epoch 1 - Training Loss: 44.8337281892088
Test Loss: 41.138216396730805
model_X11_mean saved
Loaded pretrained weights for efficientnet-b0


In [26]:
predict_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean']

for column in predict_columns:
    model = models[column]  # Load the trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # Set the model to evaluation mode

    preds = []  # List to store predictions

    # Iterate over the test dataset
    with torch.no_grad():
        for inputs, _ in test_data_dataset:
            inputs = inputs.to(device)
            outputs = model(inputs)
            preds.extend(outputs.cpu().numpy().flatten())  # Flatten the predictions and store them

    # Assign the predictions to the respective column
    test[column] = preds

ValueError: Length of values (39270) does not match length of index (6545)