In [None]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

# Files

In [None]:
# List the top-level contents of the dataset directory (IPython shell escape).
!ls ../input/sorghum-id-fgvc-9

In [None]:
# Dataset root; both image directories are resolved relative to it.
BASE_DIR = "../input/sorghum-id-fgvc-9"
TRAIN_IMG_PATH, TEST_IMG_PATH = (
    os.path.join(BASE_DIR, subdir) for subdir in ("train_images", "test")
)

# Train data

In [None]:
# Read the train cultivar mapping CSV from the dataset root and preview it.
train_csv_path = os.path.join(BASE_DIR, "train_cultivar_mapping.csv")
train_df = pd.read_csv(train_csv_path)
print(f"Number of train images: {len(train_df)}")
train_df.head()

## Size of training data
### Check existence of images
* Note that some images listed in `train_cultivar_mapping.csv` do not exist in the `train_images` directory.

In [None]:
# Flag, row by row, whether the listed image file is present under
# TRAIN_IMG_PATH. Iterating the column directly avoids building a Series per
# row as DataFrame.iterrows() does. A non-string entry (e.g. NaN from a blank
# CSV field) is counted as a missing image instead of crashing os.path.join.
train_image_exists = [
    isinstance(name, str) and os.path.exists(os.path.join(TRAIN_IMG_PATH, name))
    for name in train_df["image"]
]
train_df["image_existence"] = train_image_exists

# Count rows, not cells: DataFrame.size is rows * columns, so the previous
# `df[mask].size` overstated the number of missing images by the column count.
n_missing_train = len(train_image_exists) - sum(train_image_exists)
print(f"Number of non-existent images: {n_missing_train}")

### Plot number of images

In [None]:
# Images per cultivar: every row listed in the CSV vs. only the rows whose
# image file was found on disk.
train_counts = train_df["cultivar"].value_counts()
has_image = train_df["image_existence"] == True
existed_train_counts = train_df[has_image]["cultivar"].value_counts()
print(f"Number of cultivars: {len(train_counts)}")

# Overlay the two distributions as semi-transparent bars on one axis.
fig, ax = plt.subplots(figsize=(18, 4))
ax.bar(
    train_counts.index, train_counts.values,
    color="black", alpha=0.5, label="All",
)
ax.bar(
    existed_train_counts.index, existed_train_counts.values,
    color="red", alpha=0.5, label="Existed",
)
ax.tick_params(axis="x", labelrotation=90)
ax.set(
    xlabel="Class label (cultivar)",
    ylabel="Frequency",
    title="Number of train images",
    ylim=(0, 350),
)
ax.legend()
plt.show()

### Show examples

In [None]:
n_cols = 5

# One example per cultivar: among rows whose image file exists, keep the first
# row of each cultivar. drop_duplicates preserves row order, so the images
# chosen are the same ones a first-seen scan would pick, without walking every
# row of the frame through iterrows().
example_rows = train_df[train_df.image_existence == True].drop_duplicates(
    subset="cultivar", keep="first"
)

# The grid shape does not change inside the loop, so compute it once instead
# of re-running value_counts() for every image. Sizing it from the rows that
# will actually be drawn guarantees every subplot index fits the grid.
n_rows = math.ceil(len(example_rows) / n_cols)

plt.figure(figsize=(20, 100))
for plot_index, (image_name, cultivar) in enumerate(
    zip(example_rows["image"], example_rows["cultivar"]), start=1
):
    img = cv2.imread(os.path.join(TRAIN_IMG_PATH, image_name))
    plt.subplot(n_rows, n_cols, plot_index)
    # OpenCV loads images as BGR; convert so matplotlib shows true colours.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(f"{cultivar}")

plt.show()

# Test data

In [None]:
# Read the sample submission CSV from the dataset root and preview it; its
# rows are used below as the list of test items.
test_csv_path = os.path.join(BASE_DIR, "sample_submission.csv")
test_df = pd.read_csv(test_csv_path)
print(f"Number of testing data: {len(test_df)}")
test_df.head()

## Check existence of images
* All test images exist.

In [None]:
# Flag, row by row, whether the listed test image is present under
# TEST_IMG_PATH. Iterating the column directly avoids building a Series per
# row as DataFrame.iterrows() does. A non-string entry (e.g. NaN from a blank
# CSV field) is counted as a missing image instead of crashing os.path.join.
test_image_exists = [
    isinstance(name, str) and os.path.exists(os.path.join(TEST_IMG_PATH, name))
    for name in test_df["filename"]
]
test_df["image_existence"] = test_image_exists

# Count rows, not cells: DataFrame.size is rows * columns, so the previous
# `df[mask].size` would overstate any missing images by the column count.
n_missing_test = len(test_image_exists) - sum(test_image_exists)
print(f"Number of non-existent images: {n_missing_test}")

### Show examples

In [None]:
# Preview the first `n_show_images` test images in an n_cols-wide grid.
n_cols = 5
n_show_images = 100
n_rows = math.ceil(n_show_images / n_cols)

plt.figure(figsize=(20, 100))
preview_names = test_df["filename"].iloc[:n_show_images]
for plot_index, filename in enumerate(preview_names, start=1):
    bgr = cv2.imread(os.path.join(TEST_IMG_PATH, filename))
    plt.subplot(n_rows, n_cols, plot_index)
    # OpenCV loads images as BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    plt.title(f"{filename}")

plt.show()