In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import re 
import os
from glob import glob
import matplotlib.pyplot as plt
import ast

%matplotlib inline

# make plots prettier
# The seaborn style sheet is called 'seaborn' on older matplotlib and 'seaborn-v0_8'
# on newer releases; use whichever this installation ships, else the default style.
_seaborn_style = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(_seaborn_style)

# EDA and Preprocessing

In this notebook there is a short EDA, and some preprocessing of the simplified training data to numpy arrays.

# File Paths

In [None]:
# Directory that holds the simplified training csv files.
train_dir =  "../input/train_simplified/"
# glob returns files in an arbitrary, filesystem-dependent order; sort them so that
# everything derived from this list (row order, positional lookups) is reproducible.
csv_files = sorted(glob(train_dir + "*.csv"))

# Training Classes

I wanted to find out how evenly distributed the classes in the training examples are.

In [None]:
def extract_classname(filename):
    """
    Return the class name encoded in a training csv path.

    The class name is the file name stripped of its directory and of its
    extension, e.g. "../input/train_simplified/hot dog.csv" -> "hot dog".
    This does not depend on what the containing directory is called.
    """
    return os.path.splitext(os.path.basename(filename))[0]

In [None]:
# One class name per csv file, in the same order as csv_files.
class_names = list(map(extract_classname, csv_files))

In [None]:
def count_lines(fname):
    """
    Count the data rows of a csv file, i.e. its lines excluding the header row.

    Returns 0 for an empty or header-only file.
    """
    with open(fname) as f:
        n_lines = sum(1 for _ in f)
    # The first line is the header; clamp so that an empty file gives 0, not -1.
    return max(n_lines - 1, 0)


In [None]:
%%time
#is a little slow, many lines to count
# Row count of every csv file, paired with the class name taken from its file name.
line_counts = list(map(count_lines, csv_files))
counts = pd.DataFrame({"line_counts": line_counts, "class": class_names})

Certain classes are far more common it seems.

In [None]:
# Histogram of the number of training rows per class. Drawn with matplotlib directly
# because sns.distplot is deprecated in recent seaborn releases.
fig, ax = plt.subplots()
ax.hist(counts.line_counts.values, bins="auto")
ax.set(xlabel="rows per class", ylabel="number of classes",
       title="Distribution of class sizes");

In [None]:
# Summary statistics of the per-class row counts.
counts.describe()

The most common classes are shown below.

In [None]:
# Pick the 10 largest classes as rows of the frame itself, so no index labels have
# to be passed to a positional indexer, and label the bars with the class names.
top_classes = counts.nlargest(10, "line_counts")
ax = top_classes.plot.bar(x="class", y="line_counts", legend=False)
ax.set(xlabel="class", ylabel="rows in csv file", title="10 most common classes");

# Reading Data

Since each csv file contains a lot of rows, we'll just read in a smaller sample

In [None]:
def read_csvs(csv_files, nrows=1000):
    """
    Load the first `nrows` rows of every csv file into a single DataFrame.

    The per-file frames are stacked under a fresh 0..N-1 index, and the
    `drawing` column is parsed from its string form into Python objects.
    """
    frames = [pd.read_csv(path, nrows=nrows) for path in csv_files]
    combined = pd.concat(frames, ignore_index=True)
    combined['drawing'] = combined['drawing'].apply(ast.literal_eval)
    return combined

In [None]:
%%time
#takes a little while: reads the first rows of every csv file and parses each drawing
df = read_csvs(csv_files)

# Number of strokes

I reckon that the number of strokes used could be a good indicator for the model of which type of image is being drawn, since certain images will naturally take more or fewer strokes.



In [None]:
# Length of each parsed drawing, i.e. how many strokes it consists of.
df["n_strokes"] = df["drawing"].map(len)

In [None]:
# Look at a subset of rabbit, sun and hot dog drawings. The words are matched exactly:
# a substring/regex match would also select any other class whose name merely
# contains one of them.
s = df[df.word.isin(["rabbit", "sun", "hot dog"])]

The number of strokes looks almost normally distributed, with differences in the mean depending on the thing being drawn.  

In [None]:
# Draw one KDE curve per word on a shared axis. The result is deliberately not
# assigned back to `s`, so the subset survives and this cell can be re-run on its own.
fig, ax = plt.subplots()
for word, strokes in s.groupby('word').n_strokes:
    strokes.plot.kde(ax=ax, label=word)
ax.set(xlabel="number of strokes", title="Number of strokes per drawing")
ax.legend();

# List 2 Numpy

In a later notebook I'll train a CNN but first I need to convert the list of points into an image.

In [None]:
def list2numpy(points_list,size=1):

    """
    Render a drawing to a 2-D boolean numpy array.

    points_list is a sequence of strokes; each stroke is read as
    stroke[0] (x values) and stroke[1] (y values) and drawn as a line on
    a canvas spanning 0-255 on both axes, with the y axis inverted.

    size is the side length passed to figsize, so the output side scales
    linearly with it (size=2 gives twice the side of size=1). The pixel
    count also depends on the figure dpi; 72 by 72 is what size=1 gives
    at 72 dpi - presumably the dpi in use here, TODO confirm.

    Returns an array that is True wherever the second (green) channel of
    the rendered pixel equals 255, which includes white pixels.
    """
    
    fig, ax = plt.subplots(figsize=(size,size))
    fig.tight_layout(pad=0)
    ax.grid(False)
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.set_axis_off()

    # The canvas extent is the same for every stroke, so fix it once up front
    # instead of resetting and re-inverting the axes on every loop iteration.
    ax.set_xlim(0,255)
    ax.set_ylim(0,255)
    ax.invert_yaxis()

    for points in points_list:
        ax.plot(points[0],points[1])

    fig.canvas.draw()

    # NOTE(review): this reads a private attribute of the canvas renderer;
    # confirm it is still available if the matplotlib version changes.
    X = np.array(fig.canvas.renderer._renderer)
    # Close exactly the figure created above rather than whichever is current.
    plt.close(fig)

    return X[:,:,1] == 255 

In [None]:
# Render one example drawing at a larger size and show it without axes.
plt.imshow(list2numpy(df.drawing[7],size=4),cmap="gray")
plt.axis('off')

In [None]:
#default size is 72 by 72
list2numpy(df.drawing[7],size=1).shape

In [None]:
#but larger sizes are possible
list2numpy(df.drawing[7],size=2).shape

# Unrecognized vs Recognized Images

I was curious as to what the unrecognized images looked like. Was it because they are poorly drawn, or because the algorithm wasn't smart enough?

In [None]:
def plot_images(df, w = 5, h =5):
    """
    Show the first w*h drawings of df in a grid, each titled with its word.

    df needs a `drawing` column holding image arrays that imshow can display
    and a `word` column. w and h are passed to plt.subplots as the number of
    grid rows and columns. If df has fewer than w*h rows, the remaining grid
    cells are left blank.
    """

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 or single-row grid.
    fig, axes = plt.subplots(w,h, figsize=(10,10), squeeze=False)

    for i, ax in enumerate(axes.flatten()):
        ax.set_axis_off()
        if i >= len(df):
            continue  # no drawing left for this cell
        # Positional access, so the frame's index does not have to be 0..N-1.
        ax.imshow(df.drawing.iloc[i], cmap="gray")
        ax.set_title(df.word.iloc[i])


For the most part it looks like many of these unrecognized images could be guessed by a human.  In some cases, people have written the word instead of drawing an image.

In [None]:
n = 5 # n**2 drawings are sampled for each of the plots below

In [None]:
#run cell a few times 
unrecognized = df[df.recognized == False].sample(n**2)
unrecognized.reset_index(inplace=True,drop=True)
unrecognized["drawing"] =   unrecognized.drawing.apply(list2numpy)
plot_images(unrecognized)

I think I'd say the recognized images tend to be drawn better.

In [None]:
# Same view for the drawings flagged as recognized. Built as one chain that returns
# a new frame, and the grid is sized from n so that changing n actually works.
recognized = (
    df[df.recognized == True]
    .sample(n**2)
    .reset_index(drop=True)
    .assign(drawing=lambda sampled: sampled.drawing.apply(list2numpy))
)
plot_images(recognized, n, n)