## Data Visualization with PCA and TSNE

The following code explores the image set for the Kaggle competition "Plant Seedlings Classification". The data for this repo can be downloaded from https://www.kaggle.com/pavanireddyv/plant-seedling-classification-cnn/data.

In [17]:
# Import Libraries
import os
import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from mpl_toolkits.axes_grid1 import ImageGrid

from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing import image

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

from random import randint

In [2]:
# Set up paths
train_path = 'train'
test_path = 'test'

In [3]:
categories = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent',
              'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet']
num_categories = len(categories)

In [4]:
for category in categories:
    print('{}: {} images'.format(category, len(os.listdir(os.path.join(train_path, category)))))

Black-grass: 263 images
Charlock: 390 images
Cleavers: 287 images
Common Chickweed: 611 images
Common wheat: 221 images
Fat Hen: 475 images
Loose Silky-bent: 654 images
Maize: 221 images
Scentless Mayweed: 516 images
Shepherds Purse: 231 images
Small-flowered Cranesbill: 496 images
Sugar beet: 385 images


In [5]:
train_list = []
for category_id, category in enumerate(categories):
    for file in os.listdir(os.path.join(train_path, category)):
        train_list.append([category_id, 'train/{}/{}'.format(category, file), category])
train_df = pd.DataFrame(train_list, columns=['category_id', 'file', 'category'])

In [6]:
def read_img(filepath, size):
    # Convert image to array
    img = image.load_img(filepath, target_size=size)
    img = image.img_to_array(img)
    return img

def display_img(filepath, size):
    # Display a specific image
    print(filepath)
    img = read_img(filepath['file'], size)
    plt.imshow(img/255.)
    plt.title(filepath['category'])
    plt.axis('off')
    
def display_random_image(df):
    # Display a random image
    r = randint(0,df.shape[0])
    file_loc = train_df.iloc[r]
    display_img(file_loc, (244,244))
    
def display_random_image_from_category(df, category):
    df = df[df['category'] == category]
    r = randint(0,df.shape[0])
    file_loc = df.iloc[r]
    display_img(file_loc, (244,244))

In [7]:
def append_all_images_to_array(train_df):
    img_array = []
    for i in range(0, len(train_df['file'])):
        temp_img = read_img(train_df['file'].iloc[i], (244,244))
        img_array.append(temp_img)
        
    return img_array

In [8]:
img_array = append_all_images_to_array(train_df)
train_df['img_array'] = img_array
train_df.shape

(4750, 4)

In [9]:
train_df.tail(5)

Unnamed: 0,category_id,file,category,img_array
4745,11,train/Sugar beet/fc293eacb.png,Sugar beet,"[[[91.0, 87.0, 92.0], [77.0, 76.0, 80.0], [81...."
4746,11,train/Sugar beet/fc441208c.png,Sugar beet,"[[[92.0, 63.0, 35.0], [96.0, 67.0, 38.0], [96...."
4747,11,train/Sugar beet/fed9406b2.png,Sugar beet,"[[[72.0, 56.0, 44.0], [75.0, 65.0, 53.0], [83...."
4748,11,train/Sugar beet/fef5e7066.png,Sugar beet,"[[[147.0, 143.0, 146.0], [145.0, 145.0, 145.0]..."
4749,11,train/Sugar beet/ffa401155.png,Sugar beet,"[[[99.0, 90.0, 71.0], [97.0, 83.0, 68.0], [101..."


In [18]:
pca = PCA(.8)

In [29]:
lower_dimensional_data = pca.fit_transform(train_df['img_array'])

ValueError: setting an array element with a sequence.

In [31]:
train_df['img_array'][0].shape

(244, 244, 3)