In [22]:

import cv2
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from scipy.spatial.distance import euclidean
from scipy.stats import pearsonr
from utils.main import lbp,lpq
from skimage.feature import hog


### Short Summary

The `lbp` and `lpq` functions were imported from `practice 3`, and the HOG (Histogram of Oriented Gradients) image was built using SciPy.

In [23]:
img=cv2.imread("cr1.jpg",1)
img_gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
img.shape

(232, 217, 3)

In [24]:
img_lbp=lbp(img)
img_lbp.shape

(232, 217)

In [25]:
img_lpq=lpq(img_gray)
img_lpq.shape

(232, 217)

In [26]:
hog_features,img_hog = hog(
    img_gray, 
    orientations=9, 
    pixels_per_cell=(8, 8),
    cells_per_block=(2, 2), 
    visualize=True,
    channel_axis=None
)
img_hog.shape

(232, 217)

### Feature Count Summary

- **Calculation**: **\( 232 \times 217 = 50,344 \)**
- **Conclusion**: Each image has **50,344 features**.



In [27]:
lbp_flattened=img_lbp.flatten()
lpq_flattened=img_lpq.flatten()
hog_flattened=img_hog.flatten()
original_flattened=img_gray.flatten()

In [28]:


# Prepare the data for feature selection
X = lbp_flattened.reshape(-1, 1)
y = original_flattened

# Create a Linear Regression model
model = LinearRegression()

# Perform Sequential Feature Selection
sfs = SequentialFeatureSelector(model, n_features_to_select=10, direction='forward')

# Fit the model
sfs.fit(img_lbp, img_gray)

# Get the indices of the selected features
selected_features_idx = sfs.get_support(indices=True)

# Select the best features
selected_features = lbp_flattened[selected_features_idx]

# Compute Euclidean distance for selected features
distance = euclidean(original_flattened[selected_features_idx], selected_features)

# Compute Pearson correlation for selected features
correlation, _ = pearsonr(original_flattened[selected_features_idx], selected_features)

print(f"Selected features indices: {selected_features_idx}")
print(f"Euclidean distance: {distance}")
print(f"Pearson correlation coefficient: {correlation}")


Selected features indices: [ 18  20  35  44  45  46  71 199 203 208]
Euclidean distance: 324.9122958584362
Pearson correlation coefficient: -0.30040055077890676


In [29]:


# Perform Sequential Feature Selection
# sbs = SequentialFeatureSelector(model, n_features_to_select=10, direction='backward')

# # Fit the model
# sbs.fit(img_lbp, img_gray)

# # Get the indices of the selected features
# selected_features_idx = sbs.get_support(indices=True)

# # Select the best features
# selected_features = lbp_flattened[selected_features_idx]

# # Compute Euclidean distance for selected features
# distance = euclidean(original_flattened[selected_features_idx], selected_features)

# # Compute Pearson correlation for selected features
# correlation, _ = pearsonr(original_flattened[selected_features_idx], selected_features)

# print(f"Selected features indices: {selected_features_idx}")
# print(f"Euclidean distance: {distance}")
# print(f"Pearson correlation coefficient: {correlation}")

### Summary: Challenges of Using SBS and SBFS on High-Dimensional Image Data

Given the dimensionality of images (200x200 pixels), which results in a total of 40,000 features, employing Sequential Backward Selection (SBS) and Sequential Backward Floating Selection (SBFS) may be impractical due to the following reasons:

1. **High Dimensionality**: With 40,000 features, these methods would need to evaluate an enormous number of possible feature subsets, leading to an exponential increase in computation time and complexity.

2. **Iterative Process**: Both SBS and SBFS involve iterative processes of adding and removing features. For each iteration, the algorithm evaluates the model performance, which becomes prohibitively time-consuming with such a large feature space.

3. **Computational Load**: Evaluating each subset of features, especially when many features are involved, can significantly slow down the process, making it impractical for real-time or near-real-time applications.

4. **Scalability Issues**: These methods are more suited for datasets with a smaller number of features. Scaling them to high-dimensional data such as 200x200 images requires substantial computational resources and time, which may not be feasible in most scenarios.

### Alternative Solutions:

1. **Dimensionality Reduction**: Consider using techniques like Principal Component Analysis (PCA) or Singular Value Decomposition (SVD) to reduce the number of features before applying feature selection methods.
2. **Feature Engineering**: Extract meaningful features from the images (e.g., edges, textures) using methods like Convolutional Neural Networks (CNNs) which automatically learn and select relevant features.
3. **Randomized Algorithms**: Use randomized feature selection algorithms that can provide near-optimal solutions with significantly less computational effort.

By considering these alternatives, effective feature selection can be achieved without the prohibitive computational cost associated with SBS or SBFS on high-dimensional data.

In [30]:

from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


# Perform Sequential Floating Forward Selection (SFFS)
sffs = SFS(model, 
          k_features=10,  # Number of features to select
          forward=True,
          floating=True,
          scoring='neg_mean_squared_error',
          cv=0)  # No cross-validation for simplicity

# Fit the model
sffs.fit(img_hog, img_gray)

# Get the indices of the selected features
selected_features_idx = sffs.k_feature_idx_
print(selected_features_idx)
# Select the best features
selected_features = hog_flattened[list(selected_features_idx)] #lbp_flattened[selected_features_idx]
# Compute Euclidean distance for selected features 

# Compute Euclidean distance for selected features
# distance = euclidean(original_flattened[selected_features_idx], selected_features)
# correlation, _ = pearsonr(original_flattened[selected_features_idx], selected_features)
# print(f"Selected features indices: {selected_features_idx}")
# print(f"Euclidean distance: {distance}")


(28, 34, 35, 51, 60, 92, 123, 148, 179, 182)


In [31]:
distance = euclidean(original_flattened[list(selected_features_idx)], selected_features) 
print(f"Selected features indices: {selected_features_idx}")
print(f"Euclidean distance: {distance}")

Selected features indices: (28, 34, 35, 51, 60, 92, 123, 148, 179, 182)
Euclidean distance: 96.52978814852958
