In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Load data
file_path = "/Users/carmenshero/Desktop/Datasets2/PT4_Training.csv"
df = pd.read_csv(file_path)

# Remove missing ec_numbers
df = df[df["ec_numbers"].notna() & (df["ec_numbers"] != "MISSING")].copy()

# Extract EC prefix and store in new column
df["EC_Prefix"] = df["ec_numbers"].str.extract(r"^(\d+\.\d+)")

# Remove EC_Prefixes with fewer than 2 entries
prefix_counts = df["EC_Prefix"].value_counts()
valid_prefixes = prefix_counts[prefix_counts >= 2].index
df = df[df["EC_Prefix"].isin(valid_prefixes)].reset_index(drop=True)

print("✔ Cleaned dataset loaded with shape:", df.shape)

✔ Cleaned dataset loaded with shape: (12954, 33)


In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

X_combined = df.iloc[:, 3:33]

# Re-encode filtered EC prefixes into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df["EC_Prefix"])
y_categorical = to_categorical(y_encoded)

print("✔ Features and labels prepared.")
print(f"Number of classes: {y_categorical.shape[1]}")

✔ Features and labels prepared.
🔢 Number of classes: 61


In [5]:
from sklearn.model_selection import train_test_split

# Split while stratifying by encoded EC labels
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded
)

print("✔ Train-test split complete.")
print(f"Training samples: {len(X_train)} | Test samples: {len(X_test)}")

✔ Train-test split complete.
📊 Training samples: 10363 | Test samples: 2591


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

model = Sequential([
    Dense(512, activation="relu", input_shape=(X_combined.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(256, activation="relu"),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation="relu"),
    Dropout(0.3),

    Dense(64, activation="relu"),
    Dropout(0.2),

    Dense(y_categorical.shape[1], activation="softmax")  # Output layer
])

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

print("Training complete with increased capacity.")


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.0731 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 2/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0401 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 3/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0380 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 4/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0383 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 5/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0430 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 6/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0397 - loss: nan - val_accuracy: 0.0357 - val_loss: nan
Epoch 7/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0

In [None]:
# Evaluate on the held-out test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

In [7]:
# Load the full dataset again
full_df = pd.read_csv("/Users/carmenshero/Desktop/Datasets/Partial_Testing.csv")

# Extract MISSING rows
df_missing = full_df[full_df["ec_numbers"] == "MISSING"].copy()

# Get features for prediction (must match training input)
X_missing = df_missing.iloc[:, 5:28]

# Predict probabilities and convert to EC number predictions
y_missing_pred = model.predict(X_missing)
y_pred_labels = label_encoder.inverse_transform(np.argmax(y_missing_pred, axis=1))

# Store results
df_missing["Predicted_EC"] = y_pred_labels

# Save predictions
df_missing.to_csv("/Users/carmenshero/Desktop/Datasets/NN_EC_Predictions.csv", index=False)
print(" Saved predicted EC values for MISSING rows.")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/carmenshero/Desktop/Datasets/Partial_Testing.csv'