# 导入必要的库
包括数据处理、图像处理、模型构建和训练等。
# 配置 GPU 内存增长
提高 GPU 资源利用率，避免在训练大模型时出现内存不足的情况。

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import random

import math

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Enable memory growth on every visible GPU so TensorFlow allocates GPU memory
# as needed rather than claiming it all at start-up.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)

import warnings
# Suppress FutureWarning messages so they do not clutter the notebook output.
warnings.simplefilter('ignore', FutureWarning)

# 读取和预处理数据
去除无关列，添加图像路径。确保数据集中只有相关特征，并为每个样本提供图像文件路径，以便后续加载图像数据。

In [None]:
# Image directories of the competition data set.
train_image_folder = '/kaggle/input/planttraits2024/train_images'
test_image_folder = '/kaggle/input/planttraits2024/test_images'

# Training table: drop every column whose name ends in '_sd', then attach the
# path of the JPEG that belongs to each row (file name is "<id>.jpeg").
train = pd.read_csv('/kaggle/input/planttraits2024/train.csv')
sd_columns = [name for name in train.columns if name.endswith('_sd')]
train = train.drop(sd_columns, axis=1)
train['image_path'] = [
    os.path.join(train_image_folder, f"{image_id}.jpeg") for image_id in train['id']
]

# Test table: only the image path needs to be added.
test = pd.read_csv('/kaggle/input/planttraits2024/test.csv')
test['image_path'] = [
    os.path.join(test_image_folder, f"{image_id}.jpeg") for image_id in test['id']
]

# Target columns the model is trained to predict.
mean_columns = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

#limit train data for quick test
#train = train.head(1000)

# 定义图像增强和处理函数
图像增强可以增加数据多样性，减少过拟合；预处理可以规范化图像数据，适应模型输入要求。



In [None]:
# Define image augmentation operations
def augment_image(img):
    """Apply a fixed sequence of random augmentations and return the result.

    The operations run in this order: horizontal flip, vertical flip,
    brightness (max_delta=0.2), contrast (0.5-1.5), hue (max_delta=0.2),
    saturation (0.5-1.5) and finally a random crop of size [224, 224, 3].

    Args:
        img: Image tensor to augment.
            NOTE(review): the final crop presumably needs an input of at
            least 224x224 pixels with 3 channels — confirm for all images.

    Returns:
        The augmented image, cropped to [224, 224, 3].
    """
    # Each entry is (augmentation function, its keyword arguments); the order
    # of the entries is the order in which they are applied.
    random_ops = (
        (tf.image.random_flip_left_right, {}),
        (tf.image.random_flip_up_down, {}),
        (tf.image.random_brightness, {'max_delta': 0.2}),
        (tf.image.random_contrast, {'lower': 0.5, 'upper': 1.5}),
        (tf.image.random_hue, {'max_delta': 0.2}),
        (tf.image.random_saturation, {'lower': 0.5, 'upper': 1.5}),
        (tf.image.random_crop, {'size': [224, 224, 3]}),
    )
    for op, op_kwargs in random_ops:
        img = op(img, **op_kwargs)
    return img

# Process image with augmentation
def process_image(file_path):
    """Read a JPEG from disk and return an augmented, preprocessed image.

    The file is decoded to 3 channels, passed through ``augment_image`` and
    then through the EfficientNet ``preprocess_input`` function.

    NOTE(review): augmentation is random and always applied, so any caller
    that uses this function for validation or inference receives randomised
    inputs.
    NOTE(review): the Keras EfficientNet ``preprocess_input`` is documented
    as a pass-through — confirm whether explicit pixel rescaling is wanted
    for the model that consumes these images.

    Args:
        file_path: Path of the JPEG file to load.

    Returns:
        The processed image tensor.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return preprocess_input(augment_image(decoded))

# Define your dataset processing function
def process_path(file_path, tabular_data, targets):
    """Turn one (path, tabular row, targets) record into a model sample.

    Args:
        file_path: Path of the image file; loaded via ``process_image``.
        tabular_data: Tabular features of the sample, passed through as-is.
        targets: Regression targets of the sample, passed through as-is.

    Returns:
        ``((image, tabular_data), targets)`` — the inputs tuple first, then
        the targets.
    """
    model_inputs = (process_image(file_path), tabular_data)
    return model_inputs, targets

# 数据可视化
了解数据分布情况

In [None]:
def plot_data(df):
    """Draw one kernel-density plot per column listed in ``mean_columns``.

    Args:
        df: DataFrame that contains every column named in the module-level
            ``mean_columns`` list.
    """
    # Lay the panels out on a grid that is 6 panels wide; the number of rows
    # is whatever is needed to fit all columns.
    n_cols = 6
    n_rows = math.ceil(len(mean_columns) / n_cols)

    plt.figure(figsize=(15, 3))
    for position, col in enumerate(mean_columns, start=1):
        ax = plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[col], bw_adjust=0.5, fill=False, color='blue', ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Density')

    plt.tight_layout()
    plt.show()


plot_data(train)

# 数据清洗
去除异常值，提升模型的训练效果。

In [None]:
# Filter the targets one column at a time: keep only rows whose value is
# strictly positive and strictly below that column's 98th percentile. The
# percentile is computed on the rows that survived the previous columns.
for column in mean_columns:
    values = train[column]
    upper_quantile = values.quantile(0.98)
    in_range = (values > 0) & (values < upper_quantile)
    train = train[in_range]
plot_data(train)

# 数据标准化
对目标列（mean_columns）做标准化，使每个目标的均值为0、标准差为1，并保存原始均值和标准差，以便之后把预测值还原到原始尺度。

In [None]:
# Per-target mean and standard deviation, kept so that predictions can later
# be mapped back to the original scale.
original_means = {column: train[column].mean() for column in mean_columns}
original_stds = {column: train[column].std() for column in mean_columns}

# Standardise each target column: (value - mean) / std.
for column in mean_columns:
    centered = train[column] - original_means[column]
    train[column] = centered / original_stds[column]

plot_data(train)

# 归一化辅助数据
将数据缩放到相同范围，提高模型的训练效果。

In [None]:
# Tabular (auxiliary) features: every column except the id, the image path
# and the regression targets.
x = train.drop(columns=['id', 'image_path'] + mean_columns)

# Min-max scale every feature column to [0, 1] using the training statistics.
x_min = x.min()
x_range = x.max() - x_min
# A constant column has a zero range; dividing by it would turn the whole
# column into NaN (0 / 0) and poison training. Dividing by 1 instead maps
# such a column to all zeros.
x_range = x_range.replace(0, 1)
x = (x - x_min) / x_range

# 拆分数据集，创建 TensorFlow 数据集

In [None]:
y = train[mean_columns]
x_paths = train['image_path']

# Split features, targets and image paths in ONE call so that the three stay
# row-aligned by construction (previously the paths were split by a second
# call that only matched because it reused the same seed and length).
(train_tabular, val_tabular,
 train_targets, val_targets,
 train_paths, val_paths) = train_test_split(
    x, y, x_paths, test_size=0.2, random_state=42)


def load_validation_example(file_path, tabular_data, targets):
    """Load one validation sample WITHOUT random augmentation.

    Validation metrics should be comparable between epochs, so the image is
    decoded and centrally cropped (or zero-padded) to 224x224 instead of
    being randomly flipped, colour-jittered and cropped.

    Returns:
        ``((image, tabular_data), targets)`` — same structure as
        ``process_path``.
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize_with_crop_or_pad(img, 224, 224)
    img = preprocess_input(img)
    return (img, tabular_data), targets


train_ds = tf.data.Dataset.from_tensor_slices(
    (train_paths, train_tabular.to_numpy(), train_targets.to_numpy()))
val_ds = tf.data.Dataset.from_tensor_slices(
    (val_paths, val_tabular.to_numpy(), val_targets.to_numpy()))

# Reshuffle the training records every epoch. Shuffling happens before the
# images are loaded, so the buffer only holds paths and small numeric rows.
train_ds = train_ds.shuffle(buffer_size=max(len(train_paths), 1))

# Training samples are augmented; validation samples are loaded deterministically.
train_ds = train_ds.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.map(load_validation_example, num_parallel_calls=tf.data.AUTOTUNE)

# Batch and prefetch
train_ds = train_ds.batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(32).prefetch(tf.data.AUTOTUNE)


# 构建和编译模型: 
构建并编译一个双输入模型：图像分支（自定义 CNN）提取图像特征，表格分支处理辅助数据，两者拼接后回归输出 6 个目标值。

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# --- Image branch -----------------------------------------------------------
image_input = Input(shape=(224, 224, 3))

# Three Conv2D + MaxPooling2D stages with 32, 64 and 128 filters.
image_feature_layers = image_input
for n_filters in (32, 64, 128):
    image_feature_layers = Conv2D(
        n_filters, (3, 3), activation='relu', padding='same')(image_feature_layers)
    image_feature_layers = MaxPooling2D((2, 2))(image_feature_layers)

# Flatten the feature maps and compress them into a 256-unit embedding.
image_feature_layers = Flatten()(image_feature_layers)
image_feature_layers = Dense(256, activation='relu')(image_feature_layers)
image_feature_layers = Dropout(0.3)(image_feature_layers)  # regularization

# Input/output widths are taken from the prepared data frames.
tabular_input_shape = x.shape[1]
target_columns_shape = y.shape[1]

# --- Tabular branch ---------------------------------------------------------
tabular_input = Input(shape=(tabular_input_shape,))
tabular_dense = Dense(512, activation='relu')(tabular_input)
tabular_dense = Dropout(0.3)(tabular_dense)  # regularization

# --- Fusion head ------------------------------------------------------------
concat = Concatenate()([image_feature_layers, tabular_dense])
concat_dense = Dense(256, activation='relu')(concat)
concat_dense = Dropout(0.3)(concat_dense)  # regularization

# One linear unit per target column (regression output).
output = Dense(target_columns_shape, activation='linear')(concat_dense)

# Assemble the two-input model.
model = Model(inputs=[image_input, tabular_input], outputs=output)

# Mean squared error loss, mean absolute error reported as a metric.
model.compile(optimizer=Adam(learning_rate=1e-4), loss='mse', metrics=['mae'])

# Print the architecture for a visual check.
model.summary()

# 训练模型

In [None]:
# Train for 30 epochs on train_ds with val_ds passed as validation data; the
# object returned by fit() is kept in `history`.
history = model.fit(train_ds, validation_data=val_ds, epochs=30)

# 保存模型

In [None]:
# Write the trained model to "model.keras" in the current working directory.
model.save("model.keras")

# 预处理测试数据
确保测试数据的处理与训练数据一致，方便模型进行预测。

In [None]:
# Prepare tabular data (excluding 'id' and 'image_path')
test_tabular = test.drop(columns=['id', 'image_path'])

# Scale the test features with the TRAINING min/max, not the test set's own.
# The model was fitted on features scaled by the training statistics, so
# using different statistics here would map the same raw value to a
# different model input. `train` still holds the raw (unscaled) feature
# columns of the rows that were used for training.
# Raises KeyError if the test table has a feature column that `train` lacks.
train_features = train[test_tabular.columns]
feature_min = train_features.min()
# Guard constant columns: a zero range would produce NaN/inf on division.
feature_range = (train_features.max() - feature_min).replace(0, 1)
test_tabular = (test_tabular - feature_min) / feature_range

# 进行预测

In [None]:
test_tabular_np = test_tabular.to_numpy()


def load_test_image(file_path):
    """Load one test image deterministically (no random augmentation).

    Random flips, colour jitter and random crops at inference time make the
    predictions differ from run to run, so the image is only decoded and
    centrally cropped (or zero-padded) to 224x224 before preprocessing.
    """
    img = tf.io.read_file(file_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize_with_crop_or_pad(img, 224, 224)
    return preprocess_input(img)


# Dataset of test images, loaded without augmentation
test_images_ds = tf.data.Dataset.from_tensor_slices(test['image_path'])\
    .map(load_test_image, num_parallel_calls=tf.data.AUTOTUNE)

# Dataset of the matching tabular rows
test_tabular_ds = tf.data.Dataset.from_tensor_slices(test_tabular_np)

# Pair each image with its tabular row
test_ds = tf.data.Dataset.zip((test_images_ds, test_tabular_ds))

# Wrap the pair as ((image, tabular),) so predict() sees a single inputs tuple
# and no targets
test_ds_for_prediction = test_ds.map(lambda image, tabular: ((image, tabular),), num_parallel_calls=tf.data.AUTOTUNE)

# Batch the dataset and overlap loading with prediction
test_ds_batched = test_ds_for_prediction.batch(32).prefetch(tf.data.AUTOTUNE)

# Use the model to predict on the batched dataset
predictions = model.predict(test_ds_batched)

# Attach the predictions (still in standardized units) to the test table
predictions_df = pd.DataFrame(predictions, columns=mean_columns)
test = pd.concat([test.reset_index(drop=True), predictions_df], axis=1)

plot_data(test)

# Verify we didn't predict any NaNs.
print("NaN values\n", test[mean_columns].isna().sum())
test[mean_columns]

# 反归一化预测值/调整到原始缩放比例
将预测结果转化回原始尺度，方便理解和进一步使用。

In [None]:
# Map each predicted target back to its original scale by undoing the
# standardization: value * std + mean, using the statistics saved earlier.
for column in mean_columns:
    original_std, original_mean = original_stds[column], original_means[column]
    test[column] = test[column].mul(original_std).add(original_mean)

plot_data(test)
test[mean_columns]

# 保存结果，保存为.csv文件

In [None]:
# Keep only the id and the predicted target columns.
test = test.loc[:, ['id'] + mean_columns]

# Remove "_mean" from the column names, then write the submission file.
test = test.rename(columns=lambda name: name.replace('_mean', ''))
test.to_csv('submission.csv', index=False)

test