Disclaimer: All the code here was written by Ito Takumi; I only translated the comments.

The translation itself may be rough, as it was done with Google Translate.

In [None]:
from multiprocessing import Pool, cpu_count
import glob, zipfile, os, itertools
from PIL import Image, ImageStat
from sklearn import *
import pandas as pd
import numpy as np

# 統計データを取得
# Get statistical data
def get_features(path):
    """Compute pixel statistics for the image stored at ``path``.

    The feature vector is the concatenation, in this order, of the
    ``sum``, ``mean``, ``rms``, ``var`` and ``stddev`` lists reported by
    ``ImageStat.Stat`` for the image.

    Args:
        path: Location of the image file to open.

    Returns:
        A two-element list ``[path, features]``. ``features`` is an empty
        list when the image could not be opened or analysed, so callers
        never receive a partially filled vector.
    """
    features = []
    try:
        # The context manager releases the file handle even if computing
        # the statistics fails.
        with Image.open(path) as img:
            stats = ImageStat.Stat(img)
            collected = []
            for values in (
                stats.sum,     # total
                stats.mean,    # average value
                stats.rms,     # root mean square
                stats.var,     # variance
                stats.stddev,  # standard deviation
            ):
                collected.extend(values)
        # Publish the vector only once every statistic was computed.
        features = collected
    except Exception:
        # Best effort: report the offending file and carry on with the
        # remaining images. KeyboardInterrupt/SystemExit still propagate.
        print(path)
    return [path, features]

# 並列処理
# Parallel processing
def normalize_img(paths):
    """Compute the ``get_features`` statistics for many images in parallel.

    Args:
        paths: Sequence of image file paths.

    Returns:
        A ``pandas.DataFrame`` with one row per entry of ``paths``, in the
        same order; each row holds the feature list that ``get_features``
        returned for that path.
    """
    # The context manager shuts the worker processes down once the map has
    # finished, instead of leaving the pool alive for the rest of the session.
    with Pool(cpu_count()) as pool:
        results = pool.map(get_features, paths)
    # Pool.map returns results in input order, so the rows already line up
    # with ``paths``; only the feature part of each [path, features] pair is
    # needed.
    return pd.DataFrame([features for _, features in results])

# 画像データのパスを読み込み
# Load path of image data
dog_bytes = pd.DataFrame(
    glob.glob('../input/all-dogs/all-dogs/**'),
    columns=['Path'],
)
# Append the per-image pixel statistics as extra columns next to each path.
dog_bytes = pd.concat(
    [dog_bytes, normalize_img(dog_bytes['Path'].values)],
    axis=1,
)
dog_bytes.head()

In [None]:
# KMeans法によって、画像データを100分類に分割
# Divide image data into 100 classifications by KMeans method
# Cluster on the 15 statistic columns (labelled 0..14) added by normalize_img.
# ``n_jobs`` is deliberately not passed: it was removed from KMeans in
# scikit-learn 1.0 and raises TypeError there. ``n_init=10`` pins the number
# of initialisations to the historical default so the grouping does not
# depend on the installed version's default.
dog_bytes['Group'] = cluster.KMeans(
    n_clusters=100, random_state=3, n_init=10
).fit_predict(dog_bytes[list(range(15))])
# Show the 5 most populated of the 100 groups.
dog_bytes['Group'].value_counts().head(5)

Inspiration
===========

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# 画像を表示するwindowを生成
# 単位はインチ
# Generate a window to display the image
# Unit is in inches
fig = plt.figure(figsize=(8, 80))
samples = []
# Take the first 5 image paths of every group that has at least 5 members.
for i in range(100):
    g = dog_bytes[dog_bytes['Group'] == i]
    if len(g) >= 5:
        samples += list(g['Path'].values[:5])

# One grid row per sampled group, 5 images per row. ``len(samples)`` is always
# a multiple of 5, and add_subplot requires an integer row count, hence the
# floor division.
n_rows = len(samples) // 5
# Only the first 50 samples (10 groups) are drawn.
for i, sample_path in enumerate(samples[:50]):
    ax = fig.add_subplot(n_rows, 5, i + 1, xticks=[], yticks=[])
    # Close the file as soon as the resized copy exists.
    with Image.open(sample_path) as src:
        # Scale to a width of 100 px, keeping the aspect ratio.
        # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
        img = src.resize(
            (100, int(src.size[1] / (src.size[0] / 100))),
            Image.LANCZOS,
        )
    # Keep the top-left 64x64 pixels.
    img = img.crop((0, 0, 64, 64))
    ax.imshow(img)

Motivation
==============

In [None]:
def sim_img(path):
    """Load an image as a 64x64 RGB thumbnail.

    The image is converted to RGB, scaled to a width of 100 pixels with its
    aspect ratio preserved, and then cropped to its top-left 64x64 region.

    Args:
        path: Location of the image file to open.

    Returns:
        A 64x64 ``PIL.Image.Image`` in RGB mode.
    """
    # convert() returns an independent copy, so the file can be closed
    # straight away instead of waiting for garbage collection.
    with Image.open(path) as src:
        img = src.convert('RGB')
    width, height = img.size
    # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias, which
    # was removed in Pillow 10.
    img = img.resize((100, int(height / (width / 100))), Image.LANCZOS)
    return img.crop((0, 0, 64, 64))

samples = []
for i in range(100):
    g = dog_bytes[dog_bytes['Group'] == i]
    # Only groups with at least 23 images contribute (253 pairs each).
    if len(g) >= 23:
        s = sorted(g['Path'].values[:23])
        # Every unordered pair of distinct images within the group.
        # combinations() over the sorted paths yields the same pairs, in the
        # same order, as generating all permutations, sorting each one and
        # de-duplicating (paths are assumed unique, as they come from glob).
        samples += [list(pair) for pair in itertools.combinations(s, 2)]
print(len(samples))

Submission
=============

In [None]:
# Never index past the pairs that are actually available: fewer than 10000
# pairs would otherwise raise IndexError half-way through the archive.
n_images = min(10000, len(samples))
# A plain ZipFile is enough here (PyZipFile only adds helpers for packaging
# Python modules); the context manager finalises the archive even if an
# unexpected error interrupts the loop.
with zipfile.ZipFile('images.zip', mode='w') as z:
    for i in range(n_images):
        p1, p2 = samples[i]
        try:
            # Mix two images of the same group into a new image:
            # out = p1 * (1 - 0.4) + p2 * 0.4
            im = Image.blend(sim_img(p1), sim_img(p2), alpha=0.4)
            f = str(i) + '.png'
            # Save to a temporary file, add it to the archive, then delete it.
            im.save(f, 'PNG')
            z.write(f)
            os.remove(f)
            if i % 1000 == 0:
                print(i)
        except Exception:
            # Best effort: report the failing pair and continue with the next.
            print(p1, p2)

    print(len(z.namelist()))