In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, precision_score
from scipy import ndimage, fft
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

# Class to preprocess light flux data
class LightFluxProcessor:
    """Configurable preprocessing pipeline for light-flux feature matrices.

    Each row of the input is treated as one sample (one light curve) and each
    column as one flux measurement.  The enabled steps run in a fixed order:
    Fourier magnitude spectrum -> per-row normalization -> Gaussian smoothing
    -> per-feature standardization.

    Args:
        fourier: Replace every row by the first half of its FFT magnitude
            spectrum.
        normalize: Scale every row with ``sklearn.preprocessing.normalize``.
        gaussian: Smooth every row with a 1-D Gaussian filter.
        standardize: Standardize every feature with a ``StandardScaler``
            fitted on the training data only.
        gaussian_sigma: Standard deviation of the Gaussian kernel used by the
            smoothing step.
    """

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True,
                 gaussian_sigma=10):
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize
        self.gaussian_sigma = gaussian_sigma

    def fourier_transform(self, X):
        """Return the FFT magnitude spectrum of a single row (a pandas Series)."""
        return np.abs(fft.fft(X.values, n=X.size))

    @staticmethod
    def _half_spectrum(df_x):
        """Return the first half of the row-wise FFT magnitude spectrum."""
        values = np.asarray(df_x, dtype=float)
        # One vectorised FFT along the feature axis instead of a per-row apply.
        spectrum = np.abs(fft.fft(values, axis=1))
        # The magnitude spectrum of real-valued input is symmetrical, so the
        # second half carries no additional information.
        return spectrum[:, :spectrum.shape[1] // 2]

    def _smooth_rows(self, x):
        """Smooth each row independently with a 1-D Gaussian filter."""
        values = np.asarray(x, dtype=float)
        # Filter along axis 1 only: a 2-D filter would also blur across rows
        # and blend unrelated samples into each other.
        return ndimage.gaussian_filter1d(values, sigma=self.gaussian_sigma, axis=1)

    def process(self, df_train_x, df_dev_x):
        """Run the enabled preprocessing steps on the train and dev features.

        Args:
            df_train_x: Training features, one sample per row.
            df_dev_x: Dev features, one sample per row.

        Returns:
            A ``(train, dev)`` tuple with the processed feature matrices.  The
            inputs are returned unchanged when every step is disabled.
        """
        # Apply fourier transform
        if self.fourier:
            print("Applying Fourier...")
            df_train_x = self._half_spectrum(df_train_x)
            df_dev_x = self._half_spectrum(df_dev_x)

        # Normalize
        if self.normalize:
            print("Normalizing...")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_dev_x = pd.DataFrame(normalize(df_dev_x))

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = self._smooth_rows(df_train_x)
            df_dev_x = self._smooth_rows(df_dev_x)

        if self.standardize:
            # Standardize X data; the scaler is fitted on the training set
            # only and then reused for the dev set.
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_dev_x = std_scaler.transform(df_dev_x)

        print("Finished Processing!")
        return df_train_x, df_dev_x

# Helper function to shuffle and split the dataframe
def np_X_Y_from_df(df):
    """Shuffle ``df`` and split it into a feature matrix and a boolean target.

    Returns:
        ``(X, Y)`` where ``X`` holds every column except ``LABEL`` and ``Y``
        is a column vector that is True wherever ``LABEL`` equals 2.
    """
    shuffled = shuffle(df)
    labels = shuffled['LABEL'].to_numpy()
    features = shuffled.drop(columns=['LABEL']).to_numpy()
    # Column vector of booleans: True marks the rows labelled 2.
    is_positive = labels.reshape(len(labels), 1) == 2
    return features, is_positive

# Function to train and evaluate SVM models with different kernels
def train_and_evaluate_kernel(X_train, Y_train, X_dev, Y_dev, kernels=('linear', 'rbf', 'poly')):
    """Train one SVM per kernel and report precision and confusion matrices.

    Args:
        X_train: Training feature matrix.
        Y_train: Training labels; flattened to 1-D before use.
        X_dev: Dev feature matrix.
        Y_dev: Dev labels; flattened to 1-D before use.
        kernels: Iterable of ``SVC`` kernel names to evaluate.  The ``'poly'``
            kernel is trained with ``degree=4``; every other kernel uses the
            ``SVC`` defaults.

    Returns:
        A list with one dict per kernel, holding the keys ``'kernel'``,
        ``'precision_train'``, ``'precision_dev'``, ``'confusion_train'`` and
        ``'confusion_dev'``.
    """
    # Flatten the labels once so fitting and scoring see the same 1-D targets.
    y_train = np.ravel(Y_train)
    y_dev = np.ravel(Y_dev)

    results = []

    # Loop through the kernels
    for kernel in kernels:
        print(f"Training SVM with {kernel} kernel...")

        # Initialize the SVM model with the corresponding kernel
        if kernel == 'poly':
            model = SVC(kernel=kernel, degree=4)  # Polynomial kernel with degree 4
        else:
            model = SVC(kernel=kernel)  # Linear or Gaussian (RBF) kernel

        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        train_predictions = model.predict(X_train)
        dev_predictions = model.predict(X_dev)

        # Calculate precision
        precision_train = precision_score(y_train, train_predictions)
        precision_dev = precision_score(y_dev, dev_predictions)

        # Confusion matrices
        confusion_train = confusion_matrix(y_train, train_predictions)
        confusion_dev = confusion_matrix(y_dev, dev_predictions)

        # Store the results
        results.append({
            'kernel': kernel,
            'precision_train': precision_train,
            'precision_dev': precision_dev,
            'confusion_train': confusion_train,
            'confusion_dev': confusion_dev,
        })

        # Display results for the current kernel
        print(f"Kernel: {kernel}")
        print(f"Train Precision: {precision_train}")
        print(f"Dev Precision: {precision_dev}")
        print(f"Confusion Matrix (Train):\n{confusion_train}")
        print(f"Confusion Matrix (Dev):\n{confusion_dev}")
        print("-" * 50)

    return results


# Locate and load the train/dev CSV files
root_dir = "/documents/kepler-20241219T145340Z-001/kepler/data_injected/"
train_dataset_path = os.path.join(root_dir, "exoTrain.csv")
dev_dataset_path = os.path.join(root_dir, "exoTest.csv")

print("Loading datasets...")
df_train = pd.read_csv(train_dataset_path, encoding="ISO-8859-1")
df_dev = pd.read_csv(dev_dataset_path, encoding="ISO-8859-1")
print("Loaded datasets!")

# Separate the feature columns from the LABEL column
df_train_x = df_train.drop(columns='LABEL')
df_dev_x = df_dev.drop(columns='LABEL')
df_train_y = df_train['LABEL']
df_dev_y = df_dev['LABEL']

# Run every preprocessing step on the features
LFP = LightFluxProcessor(fourier=True, normalize=True, gaussian=True, standardize=True)
df_train_x, df_dev_x = LFP.process(df_train_x, df_dev_x)

# Attach the labels to the processed features again
df_train_processed = pd.DataFrame(df_train_x).join(df_train_y.to_frame())
df_dev_processed = pd.DataFrame(df_dev_x).join(df_dev_y.to_frame())

# Shuffle and convert to numpy arrays (train first, then dev)
X_train, Y_train = np_X_Y_from_df(df_train_processed)
X_dev, Y_dev = np_X_Y_from_df(df_dev_processed)

# Fit and score one SVM per kernel
results = train_and_evaluate_kernel(X_train, Y_train, X_dev, Y_dev)

# Write one report section per kernel to a text file
with open("report_injection_assignment2_taskE.txt", "w") as f:
    for result in results:
        f.write(
            f"Kernel: {result['kernel']}\n"
            f"Train Precision: {result['precision_train']}\n"
            f"Dev Precision: {result['precision_dev']}\n"
            f"Confusion Matrix (Train):\n{result['confusion_train']}\n"
            f"Confusion Matrix (Dev):\n{result['confusion_dev']}\n"
            + "-" * 50 + "\n"
        )



Loading datasets...
Loaded datasets!
Applying Fourier...
Normalizing...
Applying Gaussian Filter...
Standardizing...
Finished Processing!
Training SVM with linear kernel...
Kernel: linear
Train Precision: 0.5624139964615688
Dev Precision: 0.5543859649122806
Confusion Matrix (Train):
[[   0 2226]
 [   0 2861]]
Confusion Matrix (Dev):
[[  0 254]
 [  0 316]]
--------------------------------------------------
Training SVM with rbf kernel...
Kernel: rbf
Train Precision: 0.5653296266878475
Dev Precision: 0.5543859649122806
Confusion Matrix (Train):
[[  37 2189]
 [  14 2847]]
Confusion Matrix (Dev):
[[  0 254]
 [  0 316]]
--------------------------------------------------
Training SVM with poly kernel...
Kernel: poly
Train Precision: 0.5687980574666127
Dev Precision: 0.5597345132743363
Confusion Matrix (Train):
[[  95 2131]
 [  50 2811]]
Confusion Matrix (Dev):
[[ 55 199]
 [ 63 253]]
--------------------------------------------------
Evaluation completed and results saved to report_injection_assignment2_taskE.txt