# Logistic Regression Model for Pineapple Plantation Prediction

**Author**: Sage McGinley-Smith  
**Class**: CS 230: Deep Learning  
**Date**: November 2024

This notebook takes the dataset I will be using for my U-Net model and fits a logistic regression classifier on a pixel-by-pixel basis. Due to computational constraints, the model uses a subsample of only 26,000 pixels (25,000 background and 1,000 pineapple plantation pixels, split 80/20 into training and test sets) out of the millions of pixels in the dataset. This should serve as a basis for comparison with future deep learning segmentation models. As the evaluation cells below show, the precision and recall for classifying pixels as pineapple plantations are low.

# Install and Load Required Packages

In [None]:
!pip install rasterio
import rasterio
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Import Data From Drive Folder

In [None]:
# Mount Google Drive so the GeoTIFFs under "My Drive" are readable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Paths to the label masks and the matching multi-band images. The two lists
# are paired by position: entry i of each must describe the same quad.
# Only one quad (q2 2019) is listed at the moment; append more pairs here.
label_files = [
    "/content/drive/My Drive/Senior Project/Training_Data_Full_Quads/mask_q2_2019.tif",
]
band_files = [
    "/content/drive/My Drive/Senior Project/Training_Data_Full_Quads/image_q2_2019.tif",
]

# zip() silently drops unpaired entries, so fail loudly on a length mismatch.
if len(band_files) != len(label_files):
    raise ValueError(
        f"Got {len(band_files)} band files but {len(label_files)} label files; "
        "each image needs exactly one mask."
    )

# Store bands and labels in lists
bands_list = []
labels_list = []

for band_path, label_path in zip(band_files, label_files):
    with rasterio.open(band_path) as bands_file:
        bands = bands_file.read()  # Shape: (num_bands, height, width)

    with rasterio.open(label_path) as labels_file:
        labels = labels_file.read(1)  # Shape: (height, width)

    # A mask that does not line up pixel-for-pixel with its image would give
    # every pixel the wrong label once both are flattened.
    if bands.shape[1:] != labels.shape:
        raise ValueError(
            f"Spatial size mismatch: {band_path} is {bands.shape[1:]} "
            f"but {label_path} is {labels.shape}."
        )

    bands_list.append(bands)
    labels_list.append(labels)

# Check dimensions consistency: every image must have the same number of bands
# as the first one, otherwise the flattened pixel rows cannot be stacked.
# (height and width are those of the first image only.)
num_bands, height, width = bands_list[0].shape
for band_path, bands in zip(band_files, bands_list):
    if bands.shape[0] != num_bands:
        raise ValueError(
            f"{band_path} has {bands.shape[0]} bands; expected {num_bands}."
        )


# Reshape and Select X and Y Vectors

In [None]:
# Print the band descriptions stored in the first image, for reference only.
# The band order actually used below is the hard-coded `band_names` list.
with rasterio.open(band_files[0]) as src:
      print(src.descriptions)

# Assumed order of the bands along axis 0 of every image.
# NOTE(review): confirm this matches the descriptions printed above.
band_names = ['B2', 'B3', 'B4', 'B8', 'B11', 'NDVI', 'SAVI', 'NDMI', 'NDWI']

for bands in bands_list:
    print(f"Shape of X Original: {bands.shape}")
    # Column selection by name below is only meaningful if the band count
    # matches band_names; a hard-coded reshape(9, -1) could silently scramble
    # pixels for an image with a different number of bands.
    if bands.shape[0] != len(band_names):
        raise ValueError(
            f"Expected {len(band_names)} bands {band_names}, "
            f"got an image with {bands.shape[0]} bands."
        )
for labels in labels_list:
    print(f"Shape of Y Original: {labels.shape}")

# Flatten each image to one row per pixel and one column per band.
reshaped_bands_list = [bands.reshape(bands.shape[0], -1).T for bands in bands_list]  # Reshape to (H * W, num_bands)
reshaped_labels_list = [labels.reshape(-1, 1) for labels in labels_list]  # Reshape to (H * W, 1)

# Example: Print shapes after reshaping
for bands, labels in zip(reshaped_bands_list, reshaped_labels_list):
    print(f"Reshaped X: {bands.shape}")
    print(f"Reshaped Y: {labels.shape}")

# Stack all reshaped bands and labels into single arrays
x = np.vstack(reshaped_bands_list)  # Shape will be (total_pixels, num_bands)
y = np.vstack(reshaped_labels_list)  # Shape will be (total_pixels, 1)

# Identify row indices where y == 0 and y == 1.
# Pixels carrying any other label value are never sampled.
indices_0 = np.where(y == 0)[0]
indices_1 = np.where(y == 1)[0]

# Seeded generator so the subsample is reproducible, in line with the
# random_state=42 used for the train/test split and the model.
rng = np.random.default_rng(42)

# Randomly select 25,000 indices from class 0 and 1,000 from class 1
# (26,000 pixels in total). Sampling is without replacement, so this raises
# ValueError if a class has fewer pixels than requested.
selected_indices_0 = rng.choice(indices_0, size=25000, replace=False)
selected_indices_1 = rng.choice(indices_1, size=1000, replace=False)

# Combine selected indices and shuffle
selected_indices = np.concatenate([selected_indices_0, selected_indices_1])
rng.shuffle(selected_indices)

# Select the corresponding x and y values
x = x[selected_indices]
y = y[selected_indices]

# Identify the column indices for 'B2', 'B3', and 'B4'
selected_band_indices = [band_names.index(band) for band in ['B2', 'B3', 'B4']]

# Create a new x array with only the selected bands; labels are shared
x_rgb = x[:, selected_band_indices]
y_rgb = y

# Print the shapes of the final arrays
print(f"Final Shape of X (Selected): {x.shape}")
print(f"Final Shape of Y (Selected): {y.shape}")

# Split Data Into Train and Test and Fit Model (All Bands)


In [None]:
# Split the sampled pixels 80/20 into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y  # Stratify keeps the class ratio the same in both splits
)
# Initialize the logistic regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data.
# y_train is an (n, 1) column vector; flatten it to the 1-D target that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
log_reg_model.fit(x_train, y_train.ravel())
# Predict on the test set
y_pred = log_reg_model.predict(x_test)

# Model Evaluation (All Bands)

In [None]:
# Score the test-set predictions from the preceding fit cell: overall
# accuracy first, then scikit-learn's per-class classification report.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

# Split Data Into Train and Test and Fit Model (RGB Only)


In [None]:
# Split the RGB-only pixels 80/20 into training and testing sets.
# Stratify on y_rgb, the labels actually being split, rather than relying on
# it being an alias of y.
x_train, x_test, y_train, y_test = train_test_split(
    x_rgb, y_rgb, test_size=0.2, random_state=42, stratify=y_rgb  # Stratify keeps the class ratio the same in both splits
)
# Initialize the logistic regression model
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Train the model on the training data.
# y_train is an (n, 1) column vector; flatten it to the 1-D target that
# fit() expects so scikit-learn does not emit a DataConversionWarning.
log_reg_model.fit(x_train, y_train.ravel())
# Predict on the test set
y_pred = log_reg_model.predict(x_test)

# Model Evaluation (RGB Only)

In [None]:
# Score the test-set predictions from the preceding fit cell: overall
# accuracy first, then scikit-learn's per-class classification report.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)