In [1]:
import os
import sys
import cv2
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from skimage.feature import local_binary_pattern
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import SGDClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


# --- Dataset / experiment configuration -------------------------------------
dataset_path = 'Dataset'   # root folder scanned by find_all_file_name_with_type
RANDOMSEED = 1277          # presumably the seed for later splits/clustering - not used in the cells shown here
num_type = 15              # presumably the number of classes - TODO confirm against the dataset folders
N_CLUSTERS = 100           # presumably the k-means vocabulary size - not used in the cells shown here
MAX_SAMPLES = 125          # not used in the cells shown here - TODO confirm meaning

# --- LBP configuration -------------------------------------------------------
radius = 2
n_points = radius * 8
n_bins = 256               # histogram length read by extract_lbp_features

In [2]:
def find_all_file_name_with_type(root=None):
    """Index every file of a ``<root>/<class folder>/<file>`` dataset.

    Class folders and the files inside them are visited in sorted order, so
    the integer code given to each class is the same on every run and every
    platform (``os.listdir`` alone returns entries in arbitrary order).
    Entries directly under ``root`` that are not directories are ignored
    instead of crashing the scan.

    Args:
        root: Dataset directory to scan. Defaults to the module-level
            ``dataset_path``.

    Returns:
        Three parallel lists ``(paths, class_names, class_codes)``:
        the path of each file, the name of the folder it lives in, and the
        zero-based position of that folder among the sorted class folders.
    """
    if root is None:
        root = dataset_path

    paths = []
    class_names = []
    class_codes = []

    # Only directories are classes; a stray file at the root would make the
    # inner os.listdir raise.
    class_dirs = sorted(
        entry for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )

    for code, class_name in enumerate(class_dirs):
        class_dir = os.path.join(root, class_name)
        for file in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file)
            # Nested folders are skipped; only regular files are indexed.
            if os.path.isfile(file_path):
                paths.append(file_path)
                class_names.append(class_name)
                class_codes.append(code)

    return paths, class_names, class_codes
# Parallel lists for the whole dataset: file path, class folder name, class code.
(file_name, file_type, file_code) = find_all_file_name_with_type()

In [3]:
def read_image(file_path):
    """Load the image stored at ``file_path`` as a single-channel grayscale array.

    The result is returned exactly as ``cv2.imread`` produced it; OpenCV
    returns ``None`` rather than raising when the file cannot be read.
    """
    grayscale = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    return grayscale

In [4]:
# Index table: one row per dataset file (path, class name, class code, pixels).
image_arrays = [read_image(path) for path in file_name]  # reads every file into memory
index_columns = {
    'file': file_name,
    'character': file_type,
    'code': file_code,
    'image_data': image_arrays,
}
index_df = pd.DataFrame(index_columns)

index_df.head()

Unnamed: 0,file,character,code,image_data
0,Dataset\Agriculture\001.jpg,Agriculture,0,"[[162, 161, 158, 150, 141, 137, 138, 140, 139,..."
1,Dataset\Agriculture\002.jpg,Agriculture,0,"[[73, 73, 74, 74, 74, 73, 72, 71, 97, 124, 104..."
2,Dataset\Agriculture\003.jpg,Agriculture,0,"[[46, 45, 45, 44, 44, 45, 45, 46, 46, 46, 46, ..."
3,Dataset\Agriculture\004.jpg,Agriculture,0,"[[168, 166, 167, 169, 165, 158, 156, 158, 157,..."
4,Dataset\Agriculture\005.jpg,Agriculture,0,"[[126, 125, 116, 99, 86, 84, 88, 90, 88, 90, 9..."


In [5]:
def extract_color_sift_features(img):
    """Run SIFT on every channel of an image and stack the descriptors.

    Args:
        img: Image array as produced by ``cv2.imread``. A 3-D array is
            treated as OpenCV BGR(A) and processed channel by channel after
            conversion to RGB. A 2-D (single-channel) array is processed as
            one channel, because ``COLOR_BGR2RGB`` rejects single-channel
            input. ``None`` or an empty array is accepted.

    Returns:
        A 2-D array with one 128-value SIFT descriptor per row, channels
        concatenated in order. When no descriptor is found (or there is no
        image) the result has shape ``(0, 128)`` so it can still be stacked
        with the descriptors of other images.
    """
    # NOTE(review): read_image in this notebook loads grayscale images, so the
    # per-channel colour path only runs if the caller passes a colour image.

    # cv2.imread signals an unreadable file with None; report "no descriptors"
    # instead of failing deep inside OpenCV.
    if img is None or img.size == 0:
        return np.empty((0, 128), dtype=np.float32)

    sift = cv2.SIFT_create()

    if img.ndim == 2:
        channels = [img]
    else:
        # BGR to RGB
        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        channels = cv2.split(rgb)

    img_descriptors = []
    for channel in channels:
        _, des = sift.detectAndCompute(channel, None)
        # detectAndCompute yields None when a channel has no keypoints.
        if des is not None:
            img_descriptors.append(des)

    if not img_descriptors:
        return np.empty((0, 128), dtype=np.float32)
    return np.vstack(img_descriptors)


In [6]:
# Extract SIFT descriptors for every loaded image, in row order, and report
# progress on a single console line that is rewritten in place.
images = index_df['image_data']
total_images = len(images)
sift_features = []

for i, img in enumerate(images):
    descriptors = extract_color_sift_features(img)
    sift_features.append(descriptors)
    print(f"Processing {i+1}/{total_images} ({(i+1)/total_images*100:.2f}%)", end='\r')

print()
# Keep the descriptors next to the images they came from.
index_df['sift_features'] = sift_features

Processing 12000/12000 (100.00%)


In [7]:
from skimage.feature import local_binary_pattern

def extract_lbp_features(image, radius=1, n_points=8, bins=None):
    """Describe an image's texture as a normalised histogram of LBP codes.

    Args:
        image: 2-D grayscale image.
        radius: Radius of the circle on which neighbours are sampled.
        n_points: Number of neighbours sampled on that circle.
        bins: Number of histogram bins, also used as the upper edge of the
            histogram range. Defaults to the module-level ``n_bins``.

    Returns:
        1-D array of length ``bins`` holding the density-normalised
        histogram of the LBP codes.
    """
    # NOTE(review): these defaults (radius=1, n_points=8) differ from the
    # notebook-level radius / n_points settings - confirm which is intended.
    if bins is None:
        bins = n_bins

    lbp = local_binary_pattern(image, n_points, radius, method='uniform')
    # With method='uniform' the codes lie in [0, n_points + 1], so any bin
    # beyond the first n_points + 2 is always zero; bins=n_points + 2 is the
    # tight choice.
    hist, _ = np.histogram(lbp.ravel(), bins=bins, range=(0, bins), density=True)
    return hist

# Compute the LBP histogram of every loaded image, in row order.
# NOTE(review): extract_lbp_features runs with its own defaults here
# (radius=1, n_points=8); the notebook-level radius / n_points are not passed.
images = index_df['image_data']
total_images = len(images)
lbp_features = []

for i, img in enumerate(images):
    histogram = extract_lbp_features(img)
    lbp_features.append(histogram)
    print(f"Processing {i+1}/{total_images} ({(i+1)/total_images*100:.2f}%)", end='\r')

print()
# Store the histograms as a new column of index_df.
index_df['lbp_features'] = lbp_features

Processing 12000/12000 (100.00%)


In [8]:
# Preview the first five rows, now including the sift_features and lbp_features columns.
index_df.head(5)

Unnamed: 0,file,character,code,image_data,sift_features,lbp_features
0,Dataset\Agriculture\001.jpg,Agriculture,0,"[[162, 161, 158, 150, 141, 137, 138, 140, 139,...","[[8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 46.0, 136.0, 5...","[0.0351104736328125, 0.073272705078125, 0.0432..."
1,Dataset\Agriculture\002.jpg,Agriculture,0,"[[73, 73, 74, 74, 74, 73, 72, 71, 97, 124, 104...","[[137.0, 73.0, 2.0, 3.0, 5.0, 12.0, 3.0, 7.0, ...","[0.0435791015625, 0.06549072265625, 0.05491638..."
2,Dataset\Agriculture\003.jpg,Agriculture,0,"[[46, 45, 45, 44, 44, 45, 45, 46, 46, 46, 46, ...","[[22.0, 37.0, 27.0, 2.0, 0.0, 0.0, 26.0, 90.0,...","[0.032928466796875, 0.074493408203125, 0.04006..."
3,Dataset\Agriculture\004.jpg,Agriculture,0,"[[168, 166, 167, 169, 165, 158, 156, 158, 157,...","[[2.0, 0.0, 9.0, 66.0, 15.0, 20.0, 4.0, 2.0, 1...","[0.050872802734375, 0.080810546875, 0.06632995..."
4,Dataset\Agriculture\005.jpg,Agriculture,0,"[[126, 125, 116, 99, 86, 84, 88, 90, 88, 90, 9...","[[0.0, 0.0, 7.0, 25.0, 6.0, 7.0, 4.0, 0.0, 4.0...","[0.0339508056640625, 0.0785369873046875, 0.046..."


In [9]:
# Save each column needed later as its own pickle under OUTPUT_DIR
# (the raw 'image_data' column is not written).
OUTPUT_DIR = "D:/temp"

# to_pickle does not create missing folders, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for column in ('file', 'character', 'code', 'sift_features', 'lbp_features'):
    index_df[column].to_pickle(f"{OUTPUT_DIR}/index_df_{column}.pkl")