In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from skimage.feature import local_binary_pattern
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA


dataset_path='Dataset'
RANDOMSEED=1277
num_type=15
N_CLUSTERS = 100    
MAX_SAMPLES = 125     
label_list = [
    "Agriculture", "Airport", "Beach", "City", "Desert", "Forest",
    "Grassland", "Highway", "Lake", "Mountain", "Parking", "Port",
    "Railway", "Residential", "River"
]

In [2]:
code = pd.read_pickle("D:/temp/index_df_code.pkl")
sift_features = pd.read_pickle("D:/temp/index_df_sift_features.pkl")

index_df = pd.concat([code, sift_features], axis=1)

In [3]:
def create_visual_vocabulary(descriptors_list, n_clusters=N_CLUSTERS, max_samples=MAX_SAMPLES):
    all_descriptors = np.vstack([d for d in descriptors_list if len(d) > 0])
    
    if len(all_descriptors) > max_samples:
        np.random.seed(RANDOMSEED)
        all_descriptors = all_descriptors[np.random.choice(len(all_descriptors), max_samples, replace=False)]
    
    # use MiniBatchKMeans
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=RANDOMSEED)
    kmeans.fit(all_descriptors)
    return kmeans

In [4]:
def extract_bow_features(descriptors_list, kmeans):
    n_clusters = kmeans.n_clusters
    features = []
    total = len(descriptors_list)

    for i, descriptors in enumerate(descriptors_list):
        print(f"take BOW features:{i+1}/{total}",end='\r')

        hist = np.zeros(n_clusters)
        if descriptors is not None and len(descriptors) > 0:
            labels = kmeans.predict(descriptors)
            counts = np.bincount(labels)
            if len(counts) < n_clusters:
                hist[:len(counts)] = counts
            else:
                hist = counts[:n_clusters]
            hist = hist.astype(np.float32)
            hist /= hist.sum() + 1e-7
        features.append(hist)

    return np.array(features)


In [5]:
totsize=len(index_df)
print(totsize)
train_size=int(totsize*0.8)
test_size=int(totsize*0.2)

train_per_code=train_size//num_type
test_per_code=test_size//num_type

train_df=pd.DataFrame()
test_df=pd.DataFrame()
# use train_test_split to split test and train set
for code in index_df['code'].unique():
    # every word, each word is code
    sample_code= index_df[index_df['code'] == code]
    train_samples, test_samples = train_test_split(
        sample_code, 
        test_size=test_per_code, 
        train_size=train_per_code,
        random_state=RANDOMSEED, 
        shuffle=True
    )
 
    train_df = pd.concat([train_df, train_samples], ignore_index=True)
    test_df = pd.concat([test_df, test_samples], ignore_index=True)

print('finish')

12000
finish


In [6]:
def printresult_pro(y_true, y_pred, y_proba, label_names=None, top_n=3):
    # 1. normal score
    print("Evaluation Metrics:")
    print("Accuracy Score:", metrics.accuracy_score(y_true, y_pred))
    print("Recall Score (macro):", metrics.recall_score(y_true, y_pred, average='macro'))
    print("F1 Score (macro):", metrics.f1_score(y_true, y_pred, average='macro'))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_true, y_pred))

    # 2. precision / recall / f1 sorce for every class
    print("\nPer-Class Performance:")
    report = metrics.classification_report(y_true, y_pred, target_names=label_names, digits=3)
    print(report)



KNN-sift_features

In [7]:
# Extract a subset of training SIFT descriptors for clustering
train_sift_sample, _ = train_test_split(train_df['sift_features'], test_size=0.7, random_state=RANDOMSEED)
kmeans = create_visual_vocabulary(train_sift_sample)
print("Visual vocabulary created.")

# Retrieve full SIFT features from training and test sets
full_train_sift = train_df['sift_features']
test_sift_features = test_df['sift_features']

# Convert SIFT descriptors into Bag of Words (BoW) feature vectors
X_train_bow = extract_bow_features(full_train_sift, kmeans)
X_test_bow = extract_bow_features(test_sift_features, kmeans)

# Extract class labels
y_train = np.array(train_df['code'])
y_test = np.array(test_df['code'])


[WinError 2] 系统找不到指定的文件。
  File "C:\Users\lele1\anaconda3\envs\comp9417\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\lele1\anaconda3\envs\comp9417\Lib\subprocess.py", line 550, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\lele1\anaconda3\envs\comp9417\Lib\subprocess.py", line 1028, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\lele1\anaconda3\envs\comp9417\Lib\subprocess.py", line 1540, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Visual vocabulary created.
take BOW features:2400/2400

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

pca = PCA(n_components=60)  #
X_train_bow = pca.fit_transform(X_train_bow)
X_test_bow = pca.transform(X_test_bow)

scaler = StandardScaler() # scaler
X_train_bow = scaler.fit_transform(X_train_bow)
X_test_bow = scaler.transform(X_test_bow)

In [9]:
knn_clf_bow = KNeighborsClassifier(n_neighbors=3)
knn_clf_bow.fit(X_train_bow, y_train)
knn_clf_bow_predict_y = knn_clf_bow.predict(X_test_bow)
knn_clf_bow_proba_y = knn_clf_bow.predict_proba(X_test_bow)



In [10]:
#result for KNN-sift_features
print("KNN-sift_features:")
printresult_pro(y_test, knn_clf_bow_predict_y, knn_clf_bow_proba_y,label_names=label_list)


KNN-sift_features:
Evaluation Metrics:
Accuracy Score: 0.41375
Recall Score (macro): 0.4137500000000001
F1 Score (macro): 0.40218994428189797
Confusion Matrix:
[[ 67  20   7  17   1   4   2   9   7   4   1   1   6  12   2]
 [  3  63   5  33   0   2   0  10   4   1   2   3  31   3   0]
 [ 12  11  47  23   8   7   2  10  10  12   2   2   5   3   6]
 [  1  18   3 103   0   1   0   3   0   5   0   5  17   3   1]
 [ 27   6  18   4  40  25   5   5   7  14   2   3   0   2   2]
 [  0   2   3   3   4 136   4   1   1   3   1   0   1   1   0]
 [ 19  11  11   3  12  29  45   6   5   1   5   0   1   9   3]
 [  6  23   3  17   4   2   0  60   1   1   1   1  34   5   2]
 [ 14  23  16  26   4  15   1   9  20  12   0   1   6   9   4]
 [  2   4   8  21   7   5   1   3   9  93   0   1   1   2   3]
 [  1  14   1   9   3   5   0   6   0   3 103   2   9   3   1]
 [  1  16   1  50   1   2   1   5   2   6   2  54  16   1   2]
 [  2  17   2  25   1   1   0  19   1   7   0   1  84   0   0]
 [  1  20   0  40   1

In [11]:
svm_rbf_sift = SVC(kernel='rbf', random_state=RANDOMSEED,probability=True)
svm_rbf_sift.fit(X_train_bow, y_train)
svm_rbf_sift_predict = svm_rbf_sift.predict(X_test_bow)
svm_rbf_sift_proba = svm_rbf_sift.predict_proba(X_test_bow)


In [12]:
print("SVM with SIFT (BOW features):")
printresult_pro(y_test, svm_rbf_sift_predict, svm_rbf_sift_proba,label_names=label_list)


SVM with SIFT (BOW features):
Evaluation Metrics:
Accuracy Score: 0.6104166666666667
Recall Score (macro): 0.6104166666666667
F1 Score (macro): 0.6059114406414307
Confusion Matrix:
[[ 92   1   4   1  10   0  14   7   5   2   1   2   3   7  11]
 [ 12  85   1   7   2   1   0   8   5   1   0   5  24   4   5]
 [  3   1  89   2  11   1  10   8  11   4   1   7   1   0  11]
 [  1  12   4 106   0   1   1   6   1   2   1  10   7   6   2]
 [  4   1   5   2  81   2  36   1   3  18   0   2   0   0   5]
 [  0   0   2   0   4 131  12   1   1   4   1   0   0   2   2]
 [ 13   0   5   0  22   8  95   3   5   0   1   0   0   3   5]
 [  1   6   2   1   3   1   3 107   7   0   4   0  11   5   9]
 [ 13   9  20   1   5   5   5  10  39  13   1   4   1   8  26]
 [  1   1   4   3  11   0   1   0   8 117   0   2   0   4   8]
 [  0   2   0   1   3   1   0  10   0   0 135   0   2   3   3]
 [  1  11   1  16   1   0   0   3   0   1   2 100  11   1  12]
 [  1  11   1  13   0   0   0  18   2   2   0   5 103   1   3]


In [13]:
rf_clf_bow = RandomForestClassifier(n_estimators=100, random_state=RANDOMSEED)
rf_clf_bow.fit(X_train_bow, y_train)
rf_clf_bow_predict = rf_clf_bow.predict(X_test_bow)
rf_clf_bow_prob = rf_clf_bow.predict_proba(X_test_bow)


In [14]:
print("Random Forest with SIFT (BOW features):")
printresult_pro(y_test, rf_clf_bow_predict, rf_clf_bow_prob,label_names=label_list)


Random Forest with SIFT (BOW features):
Evaluation Metrics:
Accuracy Score: 0.5575
Recall Score (macro): 0.5575
F1 Score (macro): 0.5484173462013172
Confusion Matrix:
[[ 95   3   9   2  11   0   4  11   6   1   2   1   1   7   7]
 [  7  73   1  11   1   1   0  14   1   2   1  10  23   4  11]
 [  7   5  85   4   8   2   3   8  12   9   0   5   2   1   9]
 [  1  10   3  99   1   0   0   4   0   1   3  14  14   9   1]
 [  7   0   7   1  87   7  18   1   5  14   2   5   0   1   5]
 [  0   0   3   0   4 128   8   1   4   6   1   0   0   2   3]
 [ 15   2  15   0  23   7  74   7   4   1   4   0   0   3   5]
 [  5  11   5   0   7   0   1  84   7   4   2   0  18  10   6]
 [  7   9  23   3   7   9   4  11  29  18   3   4   2  17  14]
 [  2   1   7   8   5   4   2   1   4 109   0   3   0   5   9]
 [  1   5   0   1   2   0   1   8   2   0 130   4   1   4   1]
 [  4  10   3  24   0   0   0   4   2   3   3  93  10   1   3]
 [  1  20   1  11   0   0   0  23   0   2   0   6  94   0   2]
 [  1   4   2 