Visualizing an example image

In [None]:
from PIL import Image

# Load one sample image; swap in the path of any image you want to inspect.
image = Image.open('data/cloudy/train_10563.jpg')

# The size attribute is unpacked as (width, height).
width = image.size[0]
height = image.size[1]

print(f"Width: {width}, Height: {height}")


In [None]:
from PIL import Image
import numpy as np

# Load the sample image, then shrink it to 100x100 (adjust the size as needed).
img = Image.open('data/cloudy/train_12.jpg')
img = img.resize((100, 100))

# Turn the pixels into a NumPy array and collapse all axes into one 1D vector.
img_array = np.array(img)
img_1d = img_array.flatten()

print("Shape of the original image:", img_array.shape)
print("Shape of the 1D array:", img_1d.shape)


ONLY FOR CLOUDY CLASS

In [None]:
import matplotlib.pyplot as plt

# Load the sample image at 100x100 and convert it to a NumPy array.
img = Image.open('data/cloudy/train_12.jpg').resize((100, 100))
img_array = np.array(img)

# Slice the four planes out of the last axis. This indexing needs a 4-plane image.
# NOTE(review): the planes are labelled as RGBA here; a 4-plane .jpg could also be
# CMYK, in which case plane 3 is not transparency -- confirm with img.mode.
r_channel = img_array[:, :, 0]  # Red channel
g_channel = img_array[:, :, 1]  # Green channel
b_channel = img_array[:, :, 2]  # Blue channel
a_channel = img_array[:, :, 3]  # Alpha (transparency) channel

# One panel per plane on a 2x2 grid: (pixels, colormap, title).
channel_panels = [
    (r_channel, 'Reds', 'Red Channel'),
    (g_channel, 'Greens', 'Green Channel'),
    (b_channel, 'Blues', 'Blue Channel'),
    (a_channel, 'gray', 'Alpha Channel'),  # alpha is drawn in grayscale
]

plt.figure(figsize=(10, 10))
for panel_index, (panel_pixels, panel_cmap, panel_title) in enumerate(channel_panels, start=1):
    plt.subplot(2, 2, panel_index)
    plt.imshow(panel_pixels, cmap=panel_cmap)
    plt.title(panel_title)
    plt.axis('off')

plt.show()

Assessing the value of 4th channel (cloudy class)

In [None]:
# Plane 3 of the array built in the previous cell is treated as alpha (assumes RGBA).
alpha_channel = img_array[:, :, 3]

# Average value over every pixel of that plane.
mean_alpha = alpha_channel.mean()

print(f"Mean of the Alpha channel: {mean_alpha}")


Making dataframe for each class

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image

# Path to the parent folder containing subfolders
parent_folder = 'data'

# List of subfolders (classes)
subfolders = ['desert', 'green_area', 'water', 'cloudy']

# Column layout shared by every class: 100 x 100 pixels x 4 channels, then the label
columns = [f'pixel_{i}' for i in range(100 * 100 * 4)] + ['label']

# Process each subfolder and create a separate DataFrame
for subfolder in subfolders:
    folder_path = os.path.join(parent_folder, subfolder)

    # Flattened pixel vectors, one per image. They are kept numeric: appending the
    # label string to a NumPy array would silently turn every pixel into a string.
    data = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order (and
    # therefore any later seeded shuffle/split) reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.jpg'):
            continue

        # Resize to 100x100; the context manager releases the file handle right away
        with Image.open(os.path.join(folder_path, filename)) as source_image:
            img = source_image.resize((100, 100))

        img_array = np.array(img)

        # Only 3- and 4-channel images fit the fixed pixel-column layout; fail with
        # the offending file name instead of an opaque IndexError / shape mismatch.
        if img_array.ndim != 3 or img_array.shape[2] not in (3, 4):
            raise ValueError(
                f"Unsupported image shape {img_array.shape} for "
                f"{os.path.join(folder_path, filename)}; expected 3 or 4 channels"
            )

        # 3-channel image: add a synthetic 4th channel filled with 255, using the
        # image's own dtype so the concatenation does not upcast the pixels
        if img_array.shape[2] == 3:
            synthetic_channel = np.full((100, 100, 1), 255, dtype=img_array.dtype)
            img_array = np.concatenate((img_array, synthetic_channel), axis=2)

        data.append(img_array.flatten())

    # Build the pixel table in one go (the reshape also covers an empty folder),
    # then add the label (subfolder name) as the last column
    pixels = np.array(data, dtype=np.uint8).reshape(len(data), len(columns) - 1)
    df = pd.DataFrame(pixels, columns=columns[:-1])
    df['label'] = subfolder

    # Expose the DataFrame as df_<subfolder> in the notebook namespace
    globals()[f'df_{subfolder}'] = df

    # Save the DataFrame to a CSV file in the working directory.
    # NOTE(review): the cell that builds the combined file reads '*.csv' from the
    # 'label_data' folder -- confirm how these files are expected to get there.
    df.to_csv(f'{subfolder}_data.csv', index=False)

    # Optionally print the head of each DataFrame
    print(f"DataFrame for {subfolder}:")
    print(df.head())


Separating the channel data for each class

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image

# Path to the parent folder containing subfolders
# NOTE(review): other cells in this notebook read from 'data' -- confirm that
# 'data/data_3' is the intended input folder here.
parent_folder = 'data/data_3'
output_folder = 'channel_data'

# List of subfolders (classes), including 'cloudy'
subfolders = ['desert', 'green_area', 'water', 'cloudy']

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Column layout of every per-channel table: 100 x 100 pixels, then the label
columns = [f'pixel_{i}' for i in range(100 * 100)] + ['label']


def _build_channel_frame(rows, label):
    """Stack flattened uint8 pixel rows into a DataFrame and append the label column."""
    pixels = np.array(rows, dtype=np.uint8).reshape(len(rows), len(columns) - 1)
    frame = pd.DataFrame(pixels, columns=columns[:-1])
    frame['label'] = label
    return frame


# Process each subfolder and create DataFrames for each channel
for subfolder in subfolders:
    folder_path = os.path.join(parent_folder, subfolder)

    # Flattened pixel vectors per channel. They are kept numeric: appending the
    # label string to a NumPy array would turn every pixel into a string.
    data_red = []
    data_green = []
    data_blue = []

    # Sorted listing keeps the row order reproducible (os.listdir order is arbitrary)
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.jpg'):
            continue

        # Resize to 100x100; the context manager releases the file handle right away
        with Image.open(os.path.join(folder_path, filename)) as source_image:
            img = source_image.resize((100, 100))

        img_array = np.array(img)

        # Planes 0..2 are read below, so fail clearly on images without them
        if img_array.ndim != 3 or img_array.shape[2] < 3:
            raise ValueError(
                f"Unsupported image shape {img_array.shape} for "
                f"{os.path.join(folder_path, filename)}; expected at least 3 channels"
            )

        # 4-channel image: keep only the first three planes.
        # NOTE(review): this treats the 4th plane as alpha; for a CMYK JPEG the
        # first three planes are not RGB -- confirm the image mode of the inputs.
        if img_array.shape[2] == 4:
            img_array = img_array[:, :, :3]

        data_red.append(img_array[:, :, 0].flatten())
        data_green.append(img_array[:, :, 1].flatten())
        data_blue.append(img_array[:, :, 2].flatten())

    # Create DataFrames for each channel
    df_red = _build_channel_frame(data_red, subfolder)
    df_green = _build_channel_frame(data_green, subfolder)
    df_blue = _build_channel_frame(data_blue, subfolder)

    # Save the DataFrames to CSV files
    df_red.to_csv(os.path.join(output_folder, f'df_red_{subfolder}.csv'), index=False)
    df_green.to_csv(os.path.join(output_folder, f'df_green_{subfolder}.csv'), index=False)
    df_blue.to_csv(os.path.join(output_folder, f'df_blue_{subfolder}.csv'), index=False)

    # Optionally print the head of each DataFrame
    print(f"DataFrames for {subfolder}:")
    print("Red channel DataFrame:")
    print(df_red.head())
    print("Green channel DataFrame:")
    print(df_green.head())
    print("Blue channel DataFrame:")
    print(df_blue.head())


Making the histograms for each channel of each class

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Folder holding the per-channel CSV files
csv_folder = 'channel_data'
# Folder that receives the histogram images
histogram_folder = 'channel_histograms'

# Create the histogram directory if it doesn't exist. exist_ok avoids the separate
# existence check and matches how the train/validation folders are created later.
os.makedirs(histogram_folder, exist_ok=True)

# Function to generate and save histograms for a given channel
def generate_histograms(channel_name):
    """Write one pixel-value histogram PNG per CSV file of the given channel.

    Looks in ``csv_folder`` for files named ``df_<channel_name>_<label>.csv``,
    pools every column except ``label`` into a single flat array, and saves a
    50-bin histogram into ``histogram_folder``.

    Args:
        channel_name: Channel to process. It forms the file-name prefix and is
            also handed to matplotlib as the bar colour.
    """
    prefix = f'df_{channel_name}_'
    suffix = '.csv'

    for csv_file in os.listdir(csv_folder):
        # Skip anything that is not a CSV of this channel
        if not (csv_file.startswith(prefix) and csv_file.endswith(suffix)):
            continue

        # What is left after removing prefix and extension is the class label
        label = csv_file.replace(prefix, '').replace(suffix, '')

        # Load the table and pool all pixel columns into one 1D array
        df_channel = pd.read_csv(os.path.join(csv_folder, csv_file))
        channel_data = df_channel.drop(columns='label').values.flatten()

        # Draw the histogram; title and axis labels are deliberately left off
        plt.figure(figsize=(8, 6))
        plt.hist(channel_data, bins=50, color=channel_name, edgecolor='black')
        plt.xlim(0, 255)  # x-axis limited to 0..255
        plt.grid(True)
        plt.xticks(fontsize=24)
        plt.yticks(fontsize=24)
        plt.tight_layout()

        # Write the figure to disk and close it
        histogram_file = os.path.join(histogram_folder, f'{channel_name}_channel_histogram_{label}.png')
        plt.savefig(histogram_file)
        plt.close()

        print(f"Saved histogram for {label} at {histogram_file}")

# Produce the histogram files for the blue, green and red channels, in that order
for channel in ('blue', 'green', 'red'):
    generate_histograms(channel_name=channel)


Making a combined DataFrame that holds the data of all classes

In [None]:
import pandas as pd
import glob
import os

# Path to the folder containing the CSV files
folder_path = 'label_data'

# List all CSV files in the folder. glob returns them in arbitrary order, so sort:
# the row order of the combined table feeds the seeded shuffle/split further down,
# and must be stable for those results to be reproducible.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Fail with a clear message instead of pandas' generic concat error
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in '{folder_path}'")

# Read each CSV file into a DataFrame and store them in a list
dataframes = [pd.read_csv(file) for file in csv_files]

# Concatenate all DataFrames vertically, renumbering the rows 0..n-1
combined_df = pd.concat(dataframes, axis=0, ignore_index=True)

# Optionally, save the combined DataFrame to a new CSV file
combined_df.to_csv('combined_file.csv', index=False)


In [None]:
import pandas as pd
# Reload the combined table from disk
combined_df = pd.read_csv(
    'combined_file.csv'
)

Applying Principal Component Analysis (PCA) to reduce the number of features

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Assuming 'combined_df' is your DataFrame with the combined data

# Number of principal components to keep. It drives the PCA, the column names and
# the output file name, so the three can no longer drift apart.
# NOTE(review): the model cells further down read 'pca1000_combined_file.csv',
# which this value (100) does not produce -- presumably this cell was also run
# with 1000 components; confirm.
n_components = 100

# Shuffle the entire DataFrame (seeded) and renumber the rows
shuffled_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate features and the label column
labels = shuffled_df['label']
features = shuffled_df.drop('label', axis=1)

# Standardize the data
# NOTE(review): scaler and PCA are fitted on all rows, i.e. before the train/test
# split done in the model cells, so test rows influence the features -- confirm
# this is acceptable for the reported scores.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Apply PCA. random_state keeps the result repeatable when scikit-learn uses one
# of its randomized SVD solvers.
pca = PCA(n_components=n_components, random_state=42)
features_pca = pca.fit_transform(features_scaled)

# Create a DataFrame with the PCA-transformed data (PC1, PC2, ...)
pca_df = pd.DataFrame(
    features_pca,
    columns=[f'PC{i+1}' for i in range(features_pca.shape[1])],
)

# Reattach the label column (both frames share the 0..n-1 index)
pca_df['label'] = labels

# Optionally, save the PCA DataFrame to a new CSV file
pca_df.to_csv(f'pca{n_components}_combined_file.csv', index=False)

GridSearchCV (100 PCs)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read back the 100-component PCA table
pca_df = pd.read_csv('pca100_combined_file.csv')

# Every column except 'label' is a feature; 'label' is the target
X = pca_df.drop(columns='label')
y = pca_df['label']

# Map the labels to integer class ids (the encoder is reused to decode predictions)
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Base XGBoost classifier that the grid search clones and tunes
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Hyper-parameter values to try (every combination is evaluated)
param_grid = dict(
    n_estimators=[100, 150, 200],   # number of boosting rounds
    max_depth=[3, 6, 10],           # maximum depth of a tree
    learning_rate=[0.01, 0.1],      # step size shrinkage
    gamma=[0, 0.1],                 # minimum loss reduction required for a further partition
    subsample=[0.8, 1],             # fraction of samples used for fitting
)

# 3-fold cross-validated grid search scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the held-out rows with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Turn class ids back into the original labels before scoring
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Score the held-out predictions
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
report = classification_report(y_test_decoded, y_pred_decoded)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Read the PCA feature table.
# NOTE(review): no visible cell writes 'pca1000_combined_file.csv' (the PCA cell
# saves 'pca100_combined_file.csv') -- confirm where this file comes from.
pca_df = pd.read_csv('pca1000_combined_file.csv')

# Every column except 'label' is a feature; 'label' is the target
X = pca_df.drop(columns='label')
y = pca_df['label']

# Map the labels to integer class ids (the encoder is reused to decode predictions)
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Base XGBoost classifier that the grid search clones and tunes
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Hyper-parameter values to try (every combination is evaluated)
param_grid = dict(
    n_estimators=[100, 150, 200],   # number of boosting rounds
    max_depth=[3, 6, 10],           # maximum depth of a tree
    learning_rate=[0.01, 0.1],      # step size shrinkage
    gamma=[0, 0.1],                 # minimum loss reduction required for a further partition
    subsample=[0.8, 1],             # fraction of samples used for fitting
)

# 3-fold cross-validated grid search scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the held-out rows with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Turn class ids back into the original labels before scoring
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Support-weighted averages are used for the per-class metrics
weighted = dict(average='weighted')
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
precision = precision_score(y_test_decoded, y_pred_decoded, **weighted)
recall = recall_score(y_test_decoded, y_pred_decoded, **weighted)
f1 = f1_score(y_test_decoded, y_pred_decoded, **weighted)
report = classification_report(y_test_decoded, y_pred_decoded)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(report)

CNN

In [None]:
import os
import shutil
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Where the class subfolders live, and where the train / validation copies go
source_dir = 'data'
train_dir = 'split/train'
val_dir = 'split/val'

# Make sure both target folders exist before anything is copied into them
for split_dir in (train_dir, val_dir):
    os.makedirs(split_dir, exist_ok=True)

# Function to split data into train and validation sets
def split_data(source_dir, train_dir, val_dir, test_size=0.3, random_state=42):
    """Copy every class folder of ``source_dir`` into train and validation folders.

    Each sub-directory of ``source_dir`` is treated as one class. Its files are
    split with ``train_test_split`` and copied to ``train_dir/<class>`` and
    ``val_dir/<class>``; the originals are left in place.

    Args:
        source_dir: Folder containing one sub-directory per class.
        train_dir: Destination root for the training copies.
        val_dir: Destination root for the validation copies.
        test_size: Share of each class that goes to validation.
        random_state: Seed passed to ``train_test_split``.
    """
    for class_name in sorted(os.listdir(source_dir)):
        class_path = os.path.join(source_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        # Create subfolders for each class in train/val directories
        os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
        os.makedirs(os.path.join(val_dir, class_name), exist_ok=True)

        # List the files of the class folder. They are sorted because os.listdir
        # order is arbitrary: without a stable order the seeded split can differ
        # between runs, and a re-run could then copy the same image into both
        # train and val. Sub-directories are skipped (shutil.copy cannot copy them).
        images = sorted(
            entry for entry in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, entry))
        )

        # Split images into training and validation sets
        train_images, val_images = train_test_split(images, test_size=test_size, random_state=random_state)

        # Copy images to train/val class subfolders
        for image in train_images:
            shutil.copy(os.path.join(class_path, image), os.path.join(train_dir, class_name, image))
        for image in val_images:
            shutil.copy(os.path.join(class_path, image), os.path.join(val_dir, class_name, image))

# Copy the images into the train / validation folders (default 70/30 split, seed 42)
split_data(source_dir=source_dir, train_dir=train_dir, val_dir=val_dir)

# Function to add a synthetic 4th channel (alpha) to the image
def add_fourth_channel(img):
    """Return ``img`` with an extra constant plane of 255 if it has exactly 3 channels.

    Arrays whose last axis is not of length 3 are handed back untouched.
    """
    if img.shape[-1] != 3:
        return img

    rows, cols = img.shape[0], img.shape[1]
    opaque_plane = np.full((rows, cols, 1), 255, dtype=np.uint8)
    return np.concatenate((img, opaque_plane), axis=-1)

# Function to load and preprocess the image with the added 4th channel
def preprocess_image(image_path):
    """Load an image at 256x256, make sure it has a 4th channel, and scale by 1/255.

    NOTE(review): load_img is called without color_mode; presumably it then
    yields 3-channel RGB, so the 4th channel would always be the synthetic
    constant -- confirm against the Keras load_img documentation.
    """
    loaded = load_img(image_path, target_size=(256, 256))
    pixels = add_fourth_channel(img_to_array(loaded))
    # Divide by 255 so that 8-bit pixel values end up in [0, 1]
    return pixels / 255.0

# Function to load dataset (images and labels)
def load_dataset(directory):
    """Load every image below ``directory`` together with its integer class label.

    Each sub-directory is one class. Class names are sorted before indices are
    assigned, so two folders with the same class sub-directories (e.g. train and
    validation) always get the same name -> index mapping, whatever order the
    file system lists them in. Entries that are not directories are ignored and
    do not count as classes.

    Args:
        directory: Folder containing one sub-directory per class.

    Returns:
        A tuple ``(images, labels, class_names)``: the preprocessed images as one
        array, the class index of each image, and the sorted list of class names
        (position in the list == class index).
    """
    images = []
    labels = []

    # Only directories are classes; sorting makes the index assignment deterministic
    class_names = sorted(
        entry for entry in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, entry))
    )
    class_map = {class_name: idx for idx, class_name in enumerate(class_names)}  # Map class names to indices

    for class_name in class_names:
        class_folder = os.path.join(directory, class_name)
        # Sorted so the sample order is reproducible as well
        for img_file in sorted(os.listdir(class_folder)):
            img_path = os.path.join(class_folder, img_file)
            images.append(preprocess_image(img_path))  # Preprocess the image
            labels.append(class_map[class_name])  # Use the class index for the label

    images = np.array(images)
    labels = np.array(labels)
    return images, labels, class_names

# Read both splits into memory; the class-name list is taken from the training split
train_images, train_labels, class_names = load_dataset(train_dir)
val_images, val_labels, _ = load_dataset(val_dir)

# One-hot encode the integer labels, one column per class
num_classes = len(class_names)
train_labels = to_categorical(train_labels, num_classes=num_classes)
val_labels = to_categorical(val_labels, num_classes=num_classes)

# Define the CNN model
def create_cnn(input_shape=(256, 256, 4), num_classes=4):
    """Build and compile the convolutional classifier.

    The network stacks three Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, then Flatten, Dense(128, relu), Dropout(0.3) and a
    softmax output layer with ``num_classes`` units. It is compiled with the
    Adam optimizer, categorical cross-entropy loss and the accuracy metric.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled Sequential model.
    """
    layer_stack = [
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Build the network with one output unit per class found in the training folder
model = create_cnn(input_shape=(256, 256, 4), num_classes=len(class_names))

# Fit for 10 epochs in batches of 32, monitoring the validation split
model.fit(
    train_images,
    train_labels,
    epochs=10,
    batch_size=32,
    validation_data=(val_images, val_labels),
)

# Class probabilities -> predicted class index; one-hot rows -> true class index
y_pred = model.predict(val_images)
y_pred = y_pred.argmax(axis=1)
y_true = val_labels.argmax(axis=1)

# Overall accuracy on the validation split
accuracy = accuracy_score(y_true, y_pred)
print("Validation Accuracy:", accuracy)

# Per-class precision / recall / F1, labelled with the class names
print(classification_report(y_true, y_pred, target_names=class_names))


SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Read the PCA feature table.
# NOTE(review): no visible cell writes 'pca1000_combined_file.csv' (the PCA cell
# saves 'pca100_combined_file.csv') -- confirm where this file comes from.
pca_df = pd.read_csv('pca1000_combined_file.csv')

# Every column except 'label' is a feature; 'label' is the target
X = pca_df.drop(columns='label')
y = pca_df['label']

# Map the labels to integer class ids (the encoder is reused to decode predictions)
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Standardize with statistics learned from the training rows only (important for SVM)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Base SVM classifier that the grid search clones and tunes
model = SVC(verbose=True)

# Hyper-parameter values to try (every combination is evaluated)
param_grid = dict(
    C=[0.1, 1, 10],                       # regularization parameter
    kernel=['linear', 'rbf', 'sigmoid'],  # kernel type
    gamma=['scale', 'auto'],              # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
)

# 3-fold cross-validated grid search scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the held-out rows with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Turn class ids back into the original labels before scoring
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Support-weighted averages are used for the per-class metrics
weighted = dict(average='weighted')
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
precision = precision_score(y_test_decoded, y_pred_decoded, **weighted)
recall = recall_score(y_test_decoded, y_pred_decoded, **weighted)
f1 = f1_score(y_test_decoded, y_pred_decoded, **weighted)
report = classification_report(y_test_decoded, y_pred_decoded)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(report)


Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Read the PCA feature table.
# NOTE(review): no visible cell writes 'pca1000_combined_file.csv' (the PCA cell
# saves 'pca100_combined_file.csv') -- confirm where this file comes from.
pca_df = pd.read_csv('pca1000_combined_file.csv')

# Every column except 'label' is a feature; 'label' is the target
X = pca_df.drop(columns='label')
y = pca_df['label']

# Map the labels to integer class ids (the encoder is reused to decode predictions)
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Standardize with statistics learned from the training rows only
# (important for Logistic Regression)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Base Logistic Regression classifier that the grid search clones and tunes
model = LogisticRegression(solver='liblinear', max_iter=1000)

# Hyper-parameter values to try (every combination is evaluated)
param_grid = dict(
    C=[0.1, 1, 10],          # regularization strength
    penalty=['l1', 'l2'],    # regularization type
    solver=['liblinear'],    # solver for optimization
)

# 3-fold cross-validated grid search scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the held-out rows with the best estimator
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Turn class ids back into the original labels before scoring
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Support-weighted averages are used for the per-class metrics
weighted = dict(average='weighted')
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
precision = precision_score(y_test_decoded, y_pred_decoded, **weighted)
recall = recall_score(y_test_decoded, y_pred_decoded, **weighted)
f1 = f1_score(y_test_decoded, y_pred_decoded, **weighted)
report = classification_report(y_test_decoded, y_pred_decoded)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(report)


Decision Tree

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the PCA-transformed DataFrame. The path is relative to the working directory,
# as in the XGBoost / SVM / Logistic Regression cells, instead of the Colab-only
# '/content/...' location (on Colab the default working directory is /content, so
# the same file is found there).
# NOTE(review): no visible cell writes 'pca1000_combined_file.csv' (the PCA cell
# saves 'pca100_combined_file.csv') -- confirm where this file comes from.
pca_df = pd.read_csv('pca1000_combined_file.csv')

# Separate features and label
X = pca_df.drop('label', axis=1)
y = pca_df['label']

# Encode the labels as numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets (30% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Standardize the features using statistics of the training rows only.
# Tree splits do not depend on feature scale, so this step is optional here; it is
# kept so the inputs are prepared the same way as in the other model cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Decision Tree classifier (seeded for repeatable trees)
model = DecisionTreeClassifier(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'criterion': ['gini', 'entropy'],       # The function to measure the quality of a split
    'max_depth': [5, 10, 20],         # The maximum depth of the tree
    'min_samples_split': [2, 10, 20],       # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 5, 10],         # The minimum number of samples required to be at a leaf node
}

# Perform grid search: 3-fold cross-validation, scored by accuracy, on all CPU cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Decode the predictions back to original labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Calculate metrics with zero_division set to 0 (classes that are never predicted
# score 0 instead of triggering a warning)
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
precision = precision_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
recall = recall_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
f1 = f1_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
report = classification_report(y_test_decoded, y_pred_decoded, zero_division=0)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(report)

Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the PCA-transformed DataFrame. The path is relative to the working directory,
# as in the XGBoost / SVM / Logistic Regression cells, instead of the Colab-only
# '/content/...' location (on Colab the default working directory is /content, so
# the same file is found there).
# NOTE(review): no visible cell writes 'pca1000_combined_file.csv' (the PCA cell
# saves 'pca100_combined_file.csv') -- confirm where this file comes from.
pca_df = pd.read_csv('pca1000_combined_file.csv')

# Separate features and label
X = pca_df.drop('label', axis=1)
y = pca_df['label']

# Encode the labels as numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets (30% held out)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Standardize the features using statistics of the training rows only.
# Tree splits do not depend on feature scale, so this step is optional here; it is
# kept so the inputs are prepared the same way as in the other model cells.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the Random Forest classifier (seeded for repeatable forests)
model = RandomForestClassifier(random_state=42)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [ 10, 20, 30],        # Maximum depth of the tree
    'min_samples_split': [2, 10, 20],       # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 5, 10],         # The minimum number of samples required to be at a leaf node
    # The number of features to consider when looking for the best split.
    # 'auto' is intentionally not listed: scikit-learn removed it in 1.3 (every fit
    # using it fails there), and for classifiers it was an alias of 'sqrt'.
    'max_features': ['sqrt', 'log2'],
}

# Perform grid search: 3-fold cross-validation, scored by accuracy, on all CPU cores
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Decode the predictions back to original labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)
y_test_decoded = label_encoder.inverse_transform(y_test)

# Calculate metrics with zero_division set to 0 (classes that are never predicted
# score 0 instead of triggering a warning)
accuracy = accuracy_score(y_test_decoded, y_pred_decoded)
precision = precision_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
recall = recall_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
f1 = f1_score(y_test_decoded, y_pred_decoded, average='weighted', zero_division=0)
report = classification_report(y_test_decoded, y_pred_decoded, zero_division=0)

# Print metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print('Classification Report:')
print(report)