In [None]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (644.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.8/644.8 MB[0m [31m415.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hCollecting termcolor>=1.1.0
  Downloading termcolor-3.1.0-py3-none-any.whl (7.7 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1
  Downloading ml_dtypes-0.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3
  Downloading protobuf-5.29.5-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.9/319.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0ma [36

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [25]:
# Define dataset paths (update if necessary)
train_dataset_path = "kepler/data_no_injection/exoTrain.csv"
test_dataset_path = "kepler/data_no_injection/exoTest.csv"
dev_dataset_path = "kepler/data_no_injection/exoTest.csv"

# Load the dataset
df_train = pd.read_csv(train_dataset_path, encoding="ISO-8859-1")
df_test = pd.read_csv(test_dataset_path, encoding="ISO-8859-1")

# Prepare features (X) and labels (y)
X_train = df_train.drop(columns=["LABEL"]).values
Y_train = (df_train["LABEL"] == 2).astype(int)  # Binary: 1 = Planet, 0 = Not a planet

X_test = df_test.drop(columns=["LABEL"]).values
Y_test = (df_test["LABEL"] == 2).astype(int)

# Load development (test) dataset
df_dev = pd.read_csv(dev_dataset_path, encoding="ISO-8859-1")

# Prepare test features and labels
X_dev = df_dev.drop(columns=["LABEL"]).values
Y_dev = (df_dev["LABEL"] == 2).astype(int)  # Convert to binary classification (1: Planet, 0: Not a planet)

print("Loaded Test Data:")
print("X_dev shape:", X_dev.shape)
print("Y_dev shape:", Y_dev.shape)

# Normalize input data
scaler = tf.keras.layers.Normalization()
scaler.adapt(X_train)

Loaded Test Data:
X_dev shape: (570, 3197)
Y_dev shape: (570,)


In [26]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout

def build_network(shape):
      model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    return model

In [27]:
!pip install -U imbalanced-learn

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [28]:

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_sm, Y_train_sm = sm.fit_resample(X_train, Y_train)  # Balances the dataset

# Print new class distribution
print("Before SMOTE:", np.bincount(Y_train.ravel()))
print("After SMOTE:", np.bincount(Y_train_sm.ravel()))



Before SMOTE: [5050   37]
After SMOTE: [5050 5050]


  print("Before SMOTE:", np.bincount(Y_train.ravel()))
  print("After SMOTE:", np.bincount(Y_train_sm.ravel()))


In [29]:
# Ensure input shape is correctly set
input_shape = X_train_sm.shape[1:]

# Build the model
model = build_network(input_shape)

In [31]:
history = model.fit(X_train_sm, Y_train_sm, epochs=20, batch_size=32, validation_data=(X_dev, Y_dev), verbose=1)

Epoch 1/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8043 - loss: 61.7644 - val_accuracy: 0.9421 - val_loss: 19.2836
Epoch 2/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8531 - loss: 50.2436 - val_accuracy: 0.9754 - val_loss: 11.4387
Epoch 3/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.8130 - loss: 109.9028 - val_accuracy: 0.9035 - val_loss: 45.6459
Epoch 4/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.8502 - loss: 148.8736 - val_accuracy: 0.9404 - val_loss: 57.1645
Epoch 5/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8792 - loss: 17.8292 - val_accuracy: 0.9526 - val_loss: 60.9584
Epoch 6/20
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.9206 - loss: 19.2416 - val_accuracy: 0.9632 - val_loss: 66.4001
Epoch 7/

In [32]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Make predictions
y_pred = (model.predict(X_dev) > 0.5).astype("int32")

# Compute evaluation metrics
conf_matrix = confusion_matrix(Y_dev, y_pred)
precision = precision_score(Y_dev, y_pred)
recall = recall_score(Y_dev, y_pred)
f1 = f1_score(Y_dev, y_pred)

# Print results
print("Confusion Matrix:\n", conf_matrix)
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")  
print(f"F1-score: {f1:.4f}")  


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
Confusion Matrix:
 [[561   4]
 [  5   0]]
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000


In [33]:
#     hgvkhgvkjhbkjbhkjbkjbjkbkjkjhgo uybougbkyghbkjhgkj

In [7]:
import pandas as pd
import numpy as np
from scipy import ndimage, fft
from sklearn.preprocessing import normalize, StandardScaler, MinMaxScaler


class LightFluxProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(fft.fft(X.to_numpy(), n=X.size))  # Convert X to a NumPy array

    def process(self, df_train_x, df_dev_x):
        # Apply fourier transform
        if self.fourier:
            print("Applying Fourier...")
            shape_train = df_train_x.shape
            shape_dev = df_dev_x.shape
            df_train_x = df_train_x.apply(self.fourier_transform, axis=1)
            df_dev_x = df_dev_x.apply(self.fourier_transform, axis=1)

            df_train_x_build = np.zeros(shape_train)
            df_dev_x_build = np.zeros(shape_dev)

            for ii, x in enumerate(df_train_x):
                df_train_x_build[ii] = x

            for ii, x in enumerate(df_dev_x):
                df_dev_x_build[ii] = x

            df_train_x = pd.DataFrame(df_train_x_build)
            df_dev_x = pd.DataFrame(df_dev_x_build)

            # Keep first half of data as it is symmetrical after previous steps
            df_train_x = df_train_x.iloc[:, : (df_train_x.shape[1] // 2)].values
            df_dev_x = df_dev_x.iloc[:, : (df_dev_x.shape[1] // 2)].values

        # Normalize
        if self.normalize:
            print("Normalizing...")
            df_train_x = pd.DataFrame(normalize(df_train_x))
            df_dev_x = pd.DataFrame(normalize(df_dev_x))

            # df_train_x = df_train_x.div(df_train_x.sum(axis=1), axis=0)
            # df_dev_x = df_dev_x.div(df_dev_x.sum(axis=1), axis=0)

        # Gaussian filter to smooth out data
        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.filters.gaussian_filter(df_train_x, sigma=10)
            df_dev_x = ndimage.filters.gaussian_filter(df_dev_x, sigma=10)

        if self.standardize:
            # Standardize X data
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_dev_x = std_scaler.transform(df_dev_x)

        print("Finished Processing!")
        return df_train_x, df_dev_x


In [8]:
# import keras
# from keras.models import Sequential
# from keras.layers import Dense, Activation, Dropout

# # from keras.layers.normalization import BatchNormalization
# from keras import metrics

# from keras.callbacks import ModelCheckpoint

from imblearn.over_sampling import SMOTE

from pathlib import Path

from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    confusion_matrix,
)
import matplotlib.pyplot as plt
import math
import time

from sklearn.metrics import classification_report

from scipy import ndimage, fft
from sklearn.preprocessing import normalize

# from .preprocess_data import LightFluxProcessor
import tensorflow as tf

np.random.seed(1)

LOAD_MODEL = True  # continue training previous weights or start fresh
RENDER_PLOT = False  # render loss and accuracy plots


def build_network(shape):
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Input(shape),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(1, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )
    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])
    return model


def np_X_Y_from_df(df):
    df = shuffle(df)
    df_X = df.drop(["LABEL"], axis=1)
    X = np.array(df_X)
    Y_raw = np.array(df["LABEL"]).reshape((len(df["LABEL"]), 1))
    Y = Y_raw == 2
    return X, Y


if __name__ == "__main__":
    train_dataset_path = "/kepler/data_no_injection/exoTrain.csv"
    dev_dataset_path = "kepler/data_no_injection/exoTest.csv"

    print("Loading datasets...")
    df_train = pd.read_csv(train_dataset_path, encoding="ISO-8859-1")
    df_dev = pd.read_csv(dev_dataset_path, encoding="ISO-8859-1")

    # Generate X and Y dataframe sets
    df_train_x = df_train.drop("LABEL", axis=1)
    df_dev_x = df_dev.drop("LABEL", axis=1)
    df_train_y = df_train.LABEL
    df_dev_y = df_dev.LABEL

    # Process dataset
    LFP = LightFluxProcessor(
        fourier=True, normalize=True, gaussian=True, standardize=True
    )
    df_train_x, df_dev_x = LFP.process(df_train_x, df_dev_x)

    # Rejoin X and Y
    df_train_processed = pd.DataFrame(df_train_x).join(pd.DataFrame(df_train_y))
    df_dev_processed = pd.DataFrame(df_dev_x).join(pd.DataFrame(df_dev_y))

    # Load X and Y numpy arrays
    X_train, Y_train = np_X_Y_from_df(df_train_processed)
    X_dev, Y_dev = np_X_Y_from_df(df_dev_processed)

    # Print data set stats
    (num_examples, n_x) = (
        X_train.shape
    )  # (n_x: input size, m : number of examples in the train set)
    n_y = Y_train.shape[1]  # n_y : output size
    print("X_train.shape: ", X_train.shape)
    print("Y_train.shape: ", Y_train.shape)
    print("X_dev.shape: ", X_dev.shape)
    print("Y_dev.shape: ", Y_dev.shape)
    print("n_x: ", n_x)
    print("num_examples: ", num_examples)
    print("n_y: ", n_y)

    # Build model
    model = build_network(X_train.shape[1:])

    # Load weights
    load_path = ""
    my_file = Path(load_path)
    if LOAD_MODEL and my_file.is_file():
        model.load_weights(load_path)
        print("------------")
        print("Loaded saved weights")
        print("------------")

    sm = SMOTE()
    X_train_sm, Y_train_sm = sm.fit_resample(X_train, Y_train)
    # X_train_sm, Y_train_sm = X_train, Y_train

    # Train
    # checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # callbacks_list = [checkpoint]
    print("Training...")
    history = model.fit(X_train_sm, Y_train_sm, epochs=50, batch_size=32)

    # Metrics
    train_outputs = model.predict(X_train, batch_size=32)
    dev_outputs = model.predict(X_dev, batch_size=32)
    train_outputs = np.rint(train_outputs)
    dev_outputs = np.rint(dev_outputs)
    accuracy_train = accuracy_score(Y_train, train_outputs)
    accuracy_dev = accuracy_score(Y_dev, dev_outputs)
    precision_train = precision_score(Y_train, train_outputs)
    precision_dev = precision_score(Y_dev, dev_outputs)
    recall_train = recall_score(Y_train, train_outputs)
    recall_dev = recall_score(Y_dev, dev_outputs)
    confusion_matrix_train = confusion_matrix(Y_train, train_outputs)
    confusion_matrix_dev = confusion_matrix(Y_dev, dev_outputs)

    # # Save model
    # print("Saving model...")
    # save_weights_path = "checkpoints_v2/weights-recall-{}-{}.weights.h5".format(
    #     recall_train, recall_dev
    # )  # load_path
    # model.save_weights(save_weights_path)
    # save_path = "models_v2/model-recall-{}-{}.weights.h5".format(
    #     recall_train, recall_dev
    # )  # load_path
    # # model.save(save_path)

    print("train set error", 1.0 - accuracy_train)
    print("dev set error", 1.0 - accuracy_dev)
    print("------------")
    print("precision_train", precision_train)
    print("precision_dev", precision_dev)
    print("------------")
    print("recall_train", recall_train)
    print("recall_dev", recall_dev)
    print("------------")
    print("confusion_matrix_train")
    print(confusion_matrix_train)
    print("confusion_matrix_dev")
    print(confusion_matrix_dev)
    print("------------")
    print("Train Set Positive Predictions", np.count_nonzero(train_outputs))
    print("Dev Set Positive Predictions", np.count_nonzero(dev_outputs))
    #  Predicting 0's will give you error:
    print("------------")
    print("All 0's error train set", 37 / 5087)
    print("All 0's error dev set", 5 / 570)

    print("------------")
    print("------------")

    if RENDER_PLOT:
        # list all data in history
        print(history.history.keys())
        # summarize history for accuracy
        plt.plot(history.history["accuracy"])
        # plt.plot(history.history['val_acc'])
        plt.title("model accuracy")
        plt.ylabel("accuracy")
        plt.xlabel("epoch")
        plt.legend(["train", "test"], loc="upper left")
        plt.show()

        # summarize history for loss
        plt.plot(history.history["loss"])
        # plt.plot(history.history['val_loss'])
        plt.title("model loss")
        plt.ylabel("loss")
        plt.xlabel("epoch")
        plt.legend(["train", "test"], loc="upper left")
        plt.show()


Loading datasets...
Applying Fourier...
Normalizing...
Applying Gaussian Filter...


  df_train_x = ndimage.filters.gaussian_filter(df_train_x, sigma=10)
  df_dev_x = ndimage.filters.gaussian_filter(df_dev_x, sigma=10)


Standardizing...
Finished Processing!
X_train.shape:  (5087, 1598)
Y_train.shape:  (5087, 1)
X_dev.shape:  (570, 1598)
Y_dev.shape:  (570, 1)
n_x:  1598
num_examples:  5087
n_y:  1


2025-02-16 16:25:40.925962: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Training...
Epoch 1/50


  output, from_logits = _get_logits(


[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.7482 - loss: 0.5448
Epoch 2/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7590 - loss: 0.4895
Epoch 3/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7737 - loss: 0.4665
Epoch 4/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7744 - loss: 0.4598
Epoch 5/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7711 - loss: 0.4563
Epoch 6/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7702 - loss: 0.4558
Epoch 7/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7745 - loss: 0.4502
Epoch 8/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7812 - loss: 0.4456
Epoch 9/50
[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━