# import

In [16]:
import pickle
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from skimage.feature import local_binary_pattern
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier
from sklearn import svm

import warnings
warnings.filterwarnings('ignore')

In [17]:
# Image size in pixels; reused below as the cv2.warpAffine output size and as the HOG window size.
pic_shape = (200, 200)

In [18]:
# Load the pickled train/test datasets. The context managers close the file
# handles deterministically instead of leaving them open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('E:/py/MachineLearing/MachineLearning-CourseExercise/my_train.pkl', 'rb') as train_file:
    train_dataset = pickle.load(train_file)
with open('E:/py/MachineLearing/MachineLearning-CourseExercise/my_test.pkl', 'rb') as test_file:
    test_dataset = pickle.load(test_file)
print('训练集长度:', len(train_dataset['data']), '测试集长度:', len(test_dataset['data']))

训练集长度: 4750 测试集长度: 794


数据处理

In [19]:
# Data augmentation: for every original training image append five variants
# (two flips and three rotations), each paired with a copy of the image's label.
# NOTE: this cell mutates train_dataset in place, so running it a second time
# without reloading the data augments the already-augmented set again.

# getRotationMatrix2D(center, angle in degrees, scale). The matrices do not
# depend on the image, so they are built once instead of once per image.
rotation_center = (int(pic_shape[0] * 0.5), int(pic_shape[1] * 0.5))
rotation_matrices = [
    cv2.getRotationMatrix2D(rotation_center, angle, 1) for angle in (45, 90, 135)
]

# Snapshot the original length first: the lists grow while we iterate.
n_original = len(train_dataset['data'])
for i in range(n_original):
    img = train_dataset['data'][i]
    target = train_dataset['target'][i]

    # Flip augmentation: flipCode -1 flips around both axes, 1 around the y-axis.
    variants = [cv2.flip(img, -1), cv2.flip(img, 1)]
    # Rotation augmentation: the output canvas keeps the pic_shape size.
    variants.extend(cv2.warpAffine(img, M, pic_shape) for M in rotation_matrices)

    for variant in variants:
        train_dataset['data'].append(variant)
        train_dataset['target'].append(target)

print('数据拓展')

数据拓展


In [20]:
def create_mask_for_plant(image):
    """Build a binary mask selecting the green regions of a BGR image.

    The image is converted to HSV and thresholded to hues within 60 +/- 35
    (saturation 100..255, value 50..255). The thresholded mask is then cleaned
    with a morphological close using an 11x11 elliptical kernel.

    Returns the single-channel mask produced by cv2.inRange followed by
    cv2.morphologyEx.
    """
    hue_center, hue_tolerance = 60, 35
    green_lo = np.array([hue_center - hue_tolerance, 100, 50])
    green_hi = np.array([hue_center + hue_tolerance, 255, 255])

    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # inRange yields a two-valued (black/white) image.
    in_range = cv2.inRange(hsv, green_lo, green_hi)

    # Morphological close with an elliptical structuring element.
    ellipse = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11, 11))
    return cv2.morphologyEx(in_range, cv2.MORPH_CLOSE, ellipse)


def segment_plant(image):
    """Return `image` restricted to the mask from create_mask_for_plant.

    The image is AND-ed with itself under the mask, so only the pixels
    selected by the mask are carried over into the result.
    """
    plant_mask = create_mask_for_plant(image)
    return cv2.bitwise_and(image, image, mask=plant_mask)


def sharpen_image(image):
    """Sharpen `image` by blending it with a Gaussian-blurred copy.

    The blurred copy uses sigma 3 (the kernel size is passed as (0, 0)).
    The result is the weighted sum 1.5 * image - 0.5 * blurred.
    """
    smoothed = cv2.GaussianBlur(image, (0, 0), 3)
    return cv2.addWeighted(image, 1.5, smoothed, -0.5, 0)

In [22]:
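# cv2.MORPH_OPEN      Opening: erosion followed by dilation. Removes small bright specks, separates objects at thin connections, and smooths the boundary of larger objects without noticeably changing their area.
# cv2.MORPH_CLOSE     Closing: dilation followed by erosion. Fills small dark holes inside foreground objects.
# cv2.MORPH_GRADIENT  Morphological gradient: highlights the edges of blobs while preserving the object's outline.
# cv2.MORPH_TOPHAT    Top-hat: highlights the parts that are brighter than the original outline.
# cv2.MORPH_BLACKHAT  Black-hat: highlights the parts that are darker than the original outline.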

特征提取

In [21]:
# HOG layout derived from the image size: blocks span 20% of each side and the
# block stride equals the block size; cells span 10% of each side.
winSize = pic_shape
blockSize = tuple(int(side * 0.2) for side in pic_shape)
blockStride = blockSize
cellSize = tuple(int(side * 0.1) for side in pic_shape)
nbins = 4
hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins)

# ORB detector created with nfeatures=50.
orb = cv2.ORB_create(nfeatures=50)

In [23]:
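# nfeatures: maximum number of keypoints to extract.
# scaleFactor: scale ratio between consecutive pyramid levels, similar to k in SIFT.
# nlevels: number of levels in the image pyramid.
# edgeThreshold: border threshold, chosen mainly according to patchSize; no keypoints are detected within edgeThreshold pixels of the image border.
# firstLevel: index of the first pyramid level, as in SIFT; defaults to 0 here.
# WTA_K: number of points used to produce each element of the BRIEF descriptor, usually 2 but it can also be 3 or 4. With WTA_K = 2 each test compares a pair of points and descriptors are matched with NORM_HAMMING; with WTA_K = 3 or 4 the descriptor compares 3 or 4 points per element, plain Hamming distance no longer applies, and NORM_HAMMING2 must be used for matching.
# scoreType: algorithm used to rank the keypoints; either HARRIS_SCORE or FAST_SCORE, the latter being only slightly faster than the former.
# patchSize: size of the neighbourhood around each keypoint used to compute the BRIEF descriptor.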

In [24]:
winStride = (8, 8)
padding = (8, 8)

# Shape of the fixed-size ORB feature block: up to ORB_MAX_KEYPOINTS descriptors
# of ORB_DESCRIPTOR_BYTES bytes each. ORB_MAX_KEYPOINTS must match the
# nfeatures value the `orb` detector was created with.
ORB_MAX_KEYPOINTS = 50
ORB_DESCRIPTOR_BYTES = 32
# With P=8 and the default method, local_binary_pattern emits codes 0..255.
LBP_N_CODES = 256


def _pad_orb_descriptors(descriptors):
    """Return ORB descriptors as a fixed (ORB_MAX_KEYPOINTS, ORB_DESCRIPTOR_BYTES) array.

    Missing rows are zero-filled; rows beyond ORB_MAX_KEYPOINTS are dropped.
    `descriptors` may be None, which is treated as "no keypoints found".
    A descriptor array with an unexpected width raises instead of being
    silently replaced by zeros.
    """
    padded = np.zeros((ORB_MAX_KEYPOINTS, ORB_DESCRIPTOR_BYTES), dtype=np.uint8)
    if descriptors is not None:
        kept = descriptors[:ORB_MAX_KEYPOINTS]
        padded[:len(kept)] = kept
    return padded


def _extract_features(images):
    """Compute HOG, LBP, ORB and thumbnail features for every image.

    Args:
        images: iterable of BGR images accepted by segment_plant.

    Returns:
        Tuple (hog, lbp, orb, gray) of lists, each holding one flattened
        feature vector per input image, in input order.
    """
    hog_features, lbp_features, orb_features, gray_features = [], [], [], []
    for img_data in tqdm(images):
        # Mask out the background, sharpen, then convert to grayscale.
        image_segmented = segment_plant(img_data)
        image_sharpen = sharpen_image(image_segmented)
        gray = cv2.cvtColor(image_sharpen, cv2.COLOR_BGR2GRAY)

        # Raw pixels of a 20x20 thumbnail.
        gray_features.append(cv2.resize(gray, (20, 20)).reshape((-1,)))

        # Normalised histogram of LBP codes. The bin range is fixed (one bin per
        # code) so that bin k means the same thing for every image; a range
        # derived from each image's own maximum would shift the bin edges.
        # Only `density` is passed: the `normed` keyword no longer exists in
        # current NumPy releases.
        lbp = local_binary_pattern(gray, P=8, R=3)
        lbp_hist, _ = np.histogram(
            lbp.reshape((-1,)), density=True, bins=LBP_N_CODES, range=(0, LBP_N_CODES)
        )
        lbp_features.append(lbp_hist)

        # ORB descriptors, zero-padded to a fixed number of rows.
        _, descriptors = orb.detectAndCompute(gray, None)
        orb_block = _pad_orb_descriptors(descriptors)
        assert orb_block.shape == (ORB_MAX_KEYPOINTS, ORB_DESCRIPTOR_BYTES)
        orb_features.append(orb_block.reshape((-1,)))

        # HOG descriptor of the grayscale image.
        hog_features.append(hog.compute(gray, winStride, padding).reshape((-1,)))
    return hog_features, lbp_features, orb_features, gray_features


(train_HOG_feature, train_LBP_feature,
 train_ORB_feature, train_GRAY_feature) = _extract_features(train_dataset['data'])

print('HOG特征维度',train_HOG_feature[0].shape)
print('LBP特征维度',train_LBP_feature[0].shape)
print('ORB特征维度',train_ORB_feature[0].shape)
print('GRAY特征维度',train_GRAY_feature[0].shape)

(test_HOG_feature, test_LBP_feature,
 test_ORB_feature, test_GRAY_feature) = _extract_features(test_dataset['data'])



100%|██████████| 28500/28500 [05:07<00:00, 92.79it/s]


HOG特征维度 (3600,)
LBP特征维度 (256,)
ORB特征维度 (1600,)
GRAY特征维度 (400,)


100%|██████████| 794/794 [00:08<00:00, 89.54it/s]


In [25]:
# Feature normalisation.
# Each feature family is min-max scaled with statistics learned on the training
# set only; the same fitted scaler is then applied to the test set so both
# splits share one scale. The scaled matrices are kept as the features:
# passing them back through inverse_transform would restore the raw values
# and leave the data un-normalised.


def _fit_minmax(train_values, test_values):
    """Fit a MinMaxScaler on train_values and apply it to both splits.

    Returns:
        Tuple (scaler, scaled_train, scaled_test).
    """
    scaler = preprocessing.MinMaxScaler()
    scaled_train = scaler.fit_transform(train_values)
    scaled_test = scaler.transform(test_values)
    return scaler, scaled_train, scaled_test


train_MinMax_HOG, train_MinMax_HOG_data, test_MinMax_HOG_data = _fit_minmax(train_HOG_feature, test_HOG_feature)
train_HOG_feature, test_HOG_feature = train_MinMax_HOG_data, test_MinMax_HOG_data

train_MinMax_LBP, train_MinMax_LBP_data, test_MinMax_LBP_data = _fit_minmax(train_LBP_feature, test_LBP_feature)
train_LBP_feature, test_LBP_feature = train_MinMax_LBP_data, test_MinMax_LBP_data

train_MinMax_ORB, train_MinMax_ORB_data, test_MinMax_ORB_data = _fit_minmax(train_ORB_feature, test_ORB_feature)
train_ORB_feature, test_ORB_feature = train_MinMax_ORB_data, test_MinMax_ORB_data

train_MinMax_GRAY, train_MinMax_GRAY_data, test_MinMax_GRAY_data = _fit_minmax(train_GRAY_feature, test_GRAY_feature)
train_GRAY_feature, test_GRAY_feature = train_MinMax_GRAY_data, test_MinMax_GRAY_data

# The test_MinMax_* names stay defined for any code that refers to them; they
# now alias the train-fitted scalers, because the test split must not be
# scaled with statistics of its own.
test_MinMax_HOG = train_MinMax_HOG
test_MinMax_LBP = train_MinMax_LBP
test_MinMax_ORB = train_MinMax_ORB
test_MinMax_GRAY = train_MinMax_GRAY

print('特征归一化完成')

特征归一化完成


In [26]:
# Feature fusion: join the HOG, LBP, ORB and grayscale blocks column-wise into
# one matrix per split. Remove entries from the tuples below to train on a
# subset of the feature families (e.g. HOG only).
train_feature = np.concatenate(
    [np.array(block) for block in (train_HOG_feature, train_LBP_feature, train_ORB_feature, train_GRAY_feature)],
    axis=1,
)
test_feature = np.concatenate(
    [np.array(block) for block in (test_HOG_feature, test_LBP_feature, test_ORB_feature, test_GRAY_feature)],
    axis=1,
)

print('train融合特征维度', train_feature.shape)
print('test融合特征维度', test_feature.shape)
print('特征提取结束')

train综合特征维度 (28500, 5856)
test综合特征维度 (794, 5856)
特征提取结束


In [38]:
# #数据降维
# print('数据降维开始')
# n_components=2000
# train_len=len(train_feature)
# data=np.vstack([train_feature,test_feature])

# pca_tsne = TSNE(n_components=3)
# newData_linear = pca_tsne.fit_transform(data)

# # sklearn_kpca = KernelPCA(n_components=n_components, kernel="rbf", gamma=15)
# # newData_nonlinear = sklearn_kpca.fit_transform(data)

# newData = newData_linear
# print(newData.shape)

数据降维开始
(29294, 3)


主成分分析（PCA）使用线性映射对数据进行降维；但实际数据从高维到低维的关系通常是非线性的，因此线性降维往往达不到预期的效果。核主成分分析（KPCA）通过选择适当的核函数（Kernel）将原始数据映射到高维空间，再在该高维空间中进行线性降维，是一种适用于非线性数据的降维工具，因此 KPCA 的核心就是核函数。同时，KPCA 采用了比较复杂的非线性映射，提高了对非线性数据的处理效果。

In [39]:
# # 降维后的数据
# train_feature_low=newData[0:train_len]
# assert train_len==len(train_feature_low)
# test_feature_low=newData[train_len:]

In [11]:
# #特征数据存储
# train_feature_dist=train_dataset.copy()
# train_feature_dist['data']=train_feature
# pickle.dump(train_feature_dist,open('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/train_feature_HOG.pkl','wb'))
# test_feature_dist=test_dataset.copy()
# test_feature_dist['data']=test_feature
# pickle.dump(test_feature_dist,open('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/test_feature_HOG.pkl','wb'))
# print('特征数据存储结束')


特征数据存储结束


In [43]:
# # SVM分类(降维)

# print('SVM分类开始')

# modelSVM = svm.SVC()
# modelSVM.fit(train_feature_low, train_dataset['target'])
# predictedSVM = modelSVM.predict(test_feature_low)

# print('114514\n')

SVM分类开始
114514



In [47]:
# # SVM分类

# print('SVM分类开始')

# modelSVM = svm.SVC()
# modelSVM.fit(train_feature, train_dataset['target'])
# predictedSVM = modelSVM.predict(test_feature)

# print('114514\n')

SVM分类开始
114514



In [45]:
# # 随机森林分类(降维)
# print('RF分类开始')

# modelRF = RandomForestClassifier()
# modelRF.fit(train_feature_low, train_dataset['target'])
# predictedRF = modelRF.predict(test_feature_low)

# print('114514\n')

RF分类开始
114514



In [None]:
# # 随机森林分类
# print('RF分类开始')

# modelRF = RandomForestClassifier()
# modelRF.fit(train_feature, train_dataset['target'])
# predictedRF = modelRF.predict(test_feature)

# print('114514\n')

In [40]:
# XGBoost classification on the dimensionality-reduced features.
# train_feature_low / test_feature_low are only created by the
# dimensionality-reduction cells, which are currently commented out. When they
# are missing, this step is skipped explicitly instead of aborting a
# "Restart & Run All" with a NameError.
if 'train_feature_low' in globals() and 'test_feature_low' in globals():
    print('Xgboost分类开始')

    model = XGBClassifier(max_depth=5)
    model.fit(train_feature_low, train_dataset['target'])
    predictedXG = model.predict(test_feature_low)

    print('114514\n')
else:
    print('跳过降维版XGBoost: train_feature_low / test_feature_low 未定义')



Xgboost分类开始
114514



In [32]:
# XGBoost classification on the full fused feature matrix.
print('Xgboost分类开始')

# fit() returns the estimator itself, so construction and training are chained.
model = XGBClassifier(max_depth=5).fit(train_feature, train_dataset['target'])
predictedXG = model.predict(test_feature)

print('114514\n')


Xgboost分类开始
114514



In [29]:
# def model_kf(my_model):
#     kf = KFold(5, shuffle=True, random_state=50).get_n_splits(train_feature)
#     result_list= np.sqrt(-cross_val_score(my_model, train_feature, train_dataset['target'], scoring="f1", cv = kf))
#     return(result_list)
# model_kf(model)

In [46]:
# 结果生成
# pred = np.exp(predicted)
# print(predicted)
# subSVM=pd.read_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/sample_submission.csv')
# subSVM['file'] = test_dataset['file_name']
# subSVM['species'] = list(map(lambda x:train_dataset['dict'][x], predictedSVM))
# subSVM.to_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/submission_SVM.csv', index=False)

# subRF=pd.read_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/sample_submission.csv')
# subRF['file'] = test_dataset['file_name']
# subRF['species'] = list(map(lambda x:train_dataset['dict'][x], predictedRF))
# subRF.to_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/submission_RF.csv', index=False)

# Build the XGBoost submission from the sample file: overwrite the file names
# and fill in the species, mapping each predicted label through train_dataset['dict'].
subXG = pd.read_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/sample_submission.csv')
subXG['file'] = test_dataset['file_name']
subXG['species'] = [train_dataset['dict'][label] for label in predictedXG]
subXG.to_csv('E:/py/MachineLearing/MachineLearning-CourseExercise/PlantSeedlingsClassification/submission_XG.csv', index=False)

#