## **Extract Jersey Color for Team Clustering**
I think the final winners' solutions will combine a variety of techniques: not only player tracking but also rich player-mapping methods. That is why I like the notebook that @coldfir3 shared to [detect jersey number using OCR](http://www.kaggle.com/coldfir3/jersey-number-detection-using-ocr). <br>
Here, I share a demo notebook to **extract jersey color for team clustering** using **OpenCV**. I hope this notebook inspires you to have fun with this competition.

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [None]:
img_path = '../input/sample-image/57583_000082_Sideline_Moment.jpg'

# Manually labelled player bounding boxes: (xmin, ymin, xmax, ymax, class)
bbox_list = [
    (157, 289, 190, 367, 'person_home'),
    (545, 80, 589, 147, 'person_home'),
    (439, 278, 478, 322, 'person_home'),
    (423, 303, 460, 341, 'person_home'),
    (384, 311, 421, 352, 'person_home'),
    (427, 328, 470, 380, 'person_home'),
    (388, 352, 426, 400, 'person_home'),
    (373, 374, 407, 424, 'person_home'),
    (325, 388, 362, 427, 'person_home'),
    (311, 409, 350, 471, 'person_home'),
    (330, 418, 381, 493, 'person_home'),
    (635, 90, 671, 152, 'person_visit'),
    (511, 268, 570, 320, 'person_visit'),
    (500, 303, 552, 356, 'person_visit'),
    (465, 338, 521, 398, 'person_visit'),
    (440, 398, 501, 461, 'person_visit'),
    (523, 408, 569, 486, 'person_visit'),
    (568, 469, 611, 555, 'person_visit'),
    (627, 400, 651, 478, 'person_visit'),
    (612, 372, 653, 435, 'person_visit'),
    (642, 322, 683, 392, 'person_visit'),
    (953, 357, 984, 433, 'person_visit'),
]

# Turn the list of tuples into a dataframe: each tuple becomes one row and the
# columns follow the tuple layout documented above.
bbox_df = pd.DataFrame(
    bbox_list,
    columns=['xmin', 'ymin', 'xmax', 'ymax', 'class'],
)

bbox_df

In [None]:
def visualize_img_with_bbox(bbox_df, img_path):
    """Draw every labelled bounding box on the image and show the result.

    Args:
        bbox_df: DataFrame whose first five columns are, in this order,
            xmin, ymin, xmax, ymax and the class label. Columns are read
            by position, not by name.
        img_path: Path of the image file to load with OpenCV.

    Raises:
        OSError: If OpenCV cannot read an image from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an obscure cvtColor error.
        raise OSError('Cannot read image: {}'.format(img_path))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(15, 100))
    font = cv2.FONT_HERSHEY_SIMPLEX
    for i in range(len(bbox_df)):
        # Plain Python ints keep the OpenCV drawing calls independent of the
        # numpy scalar types pandas hands back.
        xmin, ymin, xmax, ymax = (int(v) for v in bbox_df.iloc[i, :4])
        cls = bbox_df.iloc[i, 4]
        # The image is RGB at this point, so (255, 0, 0) is red and
        # (0, 0, 255) is blue.
        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (255, 0, 0), 3)
        cv2.putText(img, cls, (xmin + 2, ymin - 2), font, 0.5, (0, 0, 255), 2)
    plt.axis('off')
    plt.imshow(img)
    plt.show()
    
# Show the sample frame with all manually labelled boxes drawn on it
visualize_img_with_bbox(bbox_df=bbox_df, img_path=img_path)

In [None]:
def anno_player_boxes(bbox_df, img_path, size=(40, 40)):
    """Crop every bounding box out of the image and resize the crops.

    Args:
        bbox_df: DataFrame whose first four columns are, in this order,
            xmin, ymin, xmax, ymax. Columns are read by position, not by
            name.
        img_path: Path of the image file to load with OpenCV.
        size: Target size handed to ``cv2.resize`` for every crop.
            Defaults to (40, 40).

    Returns:
        A list with one resized RGB crop per row of ``bbox_df``, in row
        order.

    Raises:
        OSError: If OpenCV cannot read an image from ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising.
        raise OSError('Cannot read image: {}'.format(img_path))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    frame_list = []
    for i in range(len(bbox_df)):
        xmin, ymin, xmax, ymax = (int(v) for v in bbox_df.iloc[i, :4])
        # Rows are the y axis and columns the x axis of the image array.
        box = img[ymin:ymax, xmin:xmax]
        frame_list.append(cv2.resize(box, size))
    return frame_list

# cut out the player bbox
frame_list = anno_player_boxes(bbox_df, img_path)

# visualize the crops on a grid with a fixed number of columns and just
# enough rows to hold them all (the row count used to be len/2, which gave an
# 11x11 grid for 22 crops and zero rows for a single crop)
n_cols = 11
n_rows = (len(frame_list) + n_cols - 1) // n_cols  # ceiling division
plt.figure(figsize=(20, 18))
for i, frame in enumerate(frame_list):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(frame)
    plt.title('{}:{}'.format(bbox_list[i][4], i))
    plt.axis('off')
plt.show()

## Extract Jersey Colour with OpenCV

In [None]:
def calculate(image, mask, hist_bin):
    """Return the normalised histogram of channel 0 over the masked pixels.

    Args:
        image: Image handed to ``cv2.calcHist``; channel 0 is histogrammed.
            Assumes 8-bit pixel values (0-255) -- TODO confirm for other
            inputs.
        mask: Array whose non-zero entries select the pixels to count; it
            is cast to uint8 before use.
        hist_bin: Number of histogram bins.

    Returns:
        1-D array of length ``hist_bin`` with the fraction of counted pixels
        in each bin, or all zeros when the mask selects no pixel.
    """
    # The upper bound of an OpenCV histogram range is exclusive, so it must be
    # 256 for pixels of value 255 to be counted at all.
    hist = cv2.calcHist(
        [image], [0], mask.astype(np.uint8), [hist_bin], [0.0, 256.0]
    )
    hist = hist.flatten()
    total = hist.sum()
    if total == 0:
        # Nothing was counted: return the zero histogram instead of the NaNs
        # a 0/0 division would produce.
        return hist
    return hist / total

def get_green_mask(imgRGB):
    """Build a mask that keeps non-green pixels and drops the green background.

    Args:
        imgRGB: Array indexed as [row, column, channel] with the channels in
            R, G, B order.

    Returns:
        Integer array indexed as [row, column]: 1 where the pixel is kept,
        0 where it is masked out as green background.
    """
    # REF: https://codereview.stackexchange.com/questions/184044/processing-an-image-to-extract-green-screen-mask/184059#184059
    RED, GREEN, BLUE = (0, 1, 2)  # channel indices for RGB order

    reds = imgRGB[:, :, RED]
    greens = imgRGB[:, :, GREEN]
    blues = imgRGB[:, :, BLUE]

    # A pixel is kept when its green value is low (< 35) or when green is not
    # strictly the largest channel; the remaining pixels are treated as green
    # background.
    mask = ((greens < 35) | (reds >= greens) | (blues >= greens)) * 1  # 1 = keep, 0 = mask out
    return mask

def classify_hist_with_split(image, hbin):
    """Average the masked, normalised per-channel histograms of an image.

    The green-background mask is computed once from the full image, every
    channel returned by ``cv2.split`` is histogrammed with ``hbin`` bins via
    ``calculate``, and the summed histograms are divided by 3.
    """
    keep_mask = get_green_mask(image)
    channel_hists = [
        calculate(channel, keep_mask, hbin) for channel in cv2.split(image)
    ]

    accumulated = np.zeros(hbin)
    for channel_hist in channel_hists:
        accumulated = accumulated + channel_hist
    return accumulated / 3

def visualize_img_with_mask(image, mask):
    """Show the image with every masked-out pixel zeroed.

    The 2-D mask is copied onto three channels and multiplied element-wise
    with the image before plotting, so pixels whose mask value is 0 become 0.
    """
    mask_3ch = np.repeat(mask[:, :, np.newaxis], repeats=3, axis=-1)
    masked_image = image * mask_3ch
    plt.imshow(masked_image)
    plt.axis('off')
    plt.show()

def compare_color_hist(image1, image2, mask1, mask2, hist_bin):
    """Plot the masked, normalised histograms of two images on one chart.

    Each histogram comes from ``calculate`` with ``hist_bin`` bins; the first
    image is drawn in blue as 'player1' and the second in purple as
    'player2'.
    """
    curves = [
        (calculate(image1, mask1, hist_bin), 'blue', 'player1'),
        (calculate(image2, mask2, hist_bin), 'purple', 'player2'),
    ]
    for hist, colour, name in curves:
        plt.plot(hist, color=colour, label=name)
    plt.legend()
    plt.show()

In [None]:
# Pick one crop per team: index 0 is labelled person_home, index 11 person_visit
sample_img1, sample_img2 = frame_list[0], frame_list[11]

# Green-background masks for both crops
mask1, mask2 = (get_green_mask(crop) for crop in (sample_img1, sample_img2))

# Show each crop with its background masked out
visualize_img_with_mask(image=sample_img1, mask=mask1)
visualize_img_with_mask(image=sample_img2, mask=mask2)

# Overlay the two masked colour histograms (80 bins) for comparison
compare_color_hist(sample_img1, sample_img2, mask1, mask2, hist_bin=80)

## Team Clustering based on Jersey Colour 

In [None]:
# collect histograms of players (one averaged 80-bin histogram per crop)
frame_list = anno_player_boxes(bbox_df, img_path)
hist_sums = [classify_hist_with_split(frame, 80) for frame in frame_list]

# Kmeans clustering
kmeans = KMeans(n_clusters=2, random_state=0).fit(hist_sums)
bbox_df['pred_cluster'] = kmeans.labels_

# Evaluation
# KMeans cluster ids are arbitrary: nothing ties cluster 1 to the home team.
# Score both possible cluster-to-team mappings and keep the better one, so a
# perfect split is not reported as 0% just because the ids came out flipped.
is_home = bbox_df['class'] == 'person_home'
is_visit = bbox_df['class'] == 'person_visit'
in_cluster_0 = bbox_df['pred_cluster'] == 0
in_cluster_1 = bbox_df['pred_cluster'] == 1
count_home_as_1 = int(((is_home & in_cluster_1) | (is_visit & in_cluster_0)).sum())
count_home_as_0 = int(((is_home & in_cluster_0) | (is_visit & in_cluster_1)).sum())
count = max(count_home_as_1, count_home_as_0)
print('Accuracy：{:.2f}% ({}/{})'.format(100 * count / len(bbox_df), count, len(bbox_df)))

In [None]:
# Display the table, which now carries the pred_cluster column assigned above
bbox_df