In [None]:
import random
import os
from multiprocessing import Pool


import numpy as np
import pandas as pd
import itertools
import glob
import cv2
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.patches as patches
from sklearn.cluster import KMeans
from IPython.display import Video

## Settings and data loading

In [None]:
# Notebook-wide switches and the location of the competition data.
debug = True
CONF_THRE = 0.3
BASE_DIR = '../input/nfl-health-and-safety-helmet-assignment'

labels = pd.read_csv(f'{BASE_DIR}/train_labels.csv')

# Debug runs read the train split; otherwise the test split is read.
if debug:
    tracking_csv = f'{BASE_DIR}/train_player_tracking.csv'
    helmets_csv = f'{BASE_DIR}/train_baseline_helmets.csv'
else:
    tracking_csv = f'{BASE_DIR}/test_player_tracking.csv'
    helmets_csv = f'{BASE_DIR}/test_baseline_helmets.csv'

tracking = pd.read_csv(tracking_csv)
helmets = pd.read_csv(helmets_csv)

### EDA

Let's plot the raw images along with the helmet bounding boxes given in the file `image_labels.csv`. This data will be crucial for training a model to detect helmets.

In [None]:
# Helmet bounding-box annotations for the still images.
image_labels_df = pd.read_csv(
    os.path.join(BASE_DIR, 'image_labels.csv'),
)

In [None]:
# Preview the first five annotation rows.
image_labels_df.head(n=5)

In [None]:
## Functions to plot image and bounding boxes together
def plot_image_bbox(image, left, width, top, height, ax):
    """
    Show ``image`` on ``ax`` and outline a single helmet on top of it.

    The outline is a red, unfilled rectangle anchored at ``(left, top)``
    that spans ``width`` by ``height``.
    """
    box_style = dict(edgecolor='r', facecolor="none", linewidth=2)
    anchor = (left, top)
    ax.imshow(image)
    ax.add_patch(patches.Rectangle(anchor, width, height, **box_style))
    
def plot_image_bboxes(image, image_name, labels_df, ax):
    """
    Plots image and bounding box for all the
    helmet instances in the image

    Parameters
    ----------
    image : image data accepted by ``ax.imshow``.
    image_name : value matched against the ``"image"`` column of ``labels_df``.
    labels_df : DataFrame with ``"image"``, ``"left"``, ``"width"``, ``"top"``
        and ``"height"`` columns; each matching row is drawn as one box.
    ax : Matplotlib axes to draw on.

    The image is rendered exactly once, so an image without any matching
    rows is still shown and an image with many boxes is not re-drawn
    once per box.
    """
    # Draw the picture a single time, independent of the number of boxes.
    ax.imshow(image)

    boxes = labels_df.loc[
        labels_df["image"] == image_name,
        ["left", "top", "width", "height"],
    ]
    for left, top, width, height in boxes.itertuples(index=False, name=None):
        rect = patches.Rectangle(
            (left, top), width, height,
            edgecolor='r', facecolor="none", linewidth=2,
        )
        ax.add_patch(rect)

In [None]:
# Show a 2x2 grid of sample images with their annotated helmet boxes.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 12))
axs = axs.ravel()
images_dir = os.path.join(BASE_DIR, "images")
# os.listdir returns entries in arbitrary order; sort so the same samples
# are displayed on every run.
image_names = sorted(os.listdir(images_dir))
# zip stops at the shorter sequence, so a directory with fewer files than
# axes leaves the remaining axes empty instead of raising IndexError.
for ax, image_name in zip(axs, image_names):
    image_path = os.path.join(images_dir, image_name)
    image = cv2.imread(filename=image_path)
    if image is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV decodes colour images as BGR; Matplotlib's imshow expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plot_image_bboxes(image, image_name, image_labels_df, ax=ax)
fig.suptitle("Sample images with helmet bbox")
# Lay out after everything is drawn, reserving the top strip for the suptitle.
fig.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()

Not bad... the helmet annotations look pretty neat!

Now let's take a look at the distribution of the number of helmets per image...

In [None]:
# Number of annotated helmet boxes in each image (non-null "left" values).
boxes_per_image = (
    image_labels_df
    .groupby("image")["left"]
    .count()
    .to_frame("count")
)

# The counts are integers, so use one bar per value (discrete=True) instead
# of automatically chosen continuous bins that can merge or split counts.
sns.displot(data=boxes_per_image, x="count", discrete=True, height=6, aspect=2)
plt.title("Distribution of number of helmets per image")
plt.xlabel("Number of helmets")
plt.show()

There is clearly a peak at `22`, which is the total number of players on the field at a time.

There are also values greater than 22, due to players on the sidelines.

In [None]:
# Share of all annotated helmets that falls into each label.
label_share = image_labels_df["label"].value_counts(normalize=True)

sns.barplot(x=label_share.values, y=label_share.index)
plt.title('Distribution of Helmet types')
plt.xlabel('Proportion of all helmets')
plt.grid()
plt.show()

Note that most of the helmets are normal while 3-4% of helmets are difficult to classify.

#### Stay tuned for more.....