In [13]:
import pandas as pd
import numpy as np
import sklearn

In [14]:
file_path = 'data/cosmicclassifierTraining.csv'
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.59585,-3.188678,-0.609434,-0.199828,Category_9,Category_9,,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,,Category_8,-0.677182,4.0
3,-3.122,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0


In [15]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [16]:
num_missing_target = df['Prediction'].isna().sum()
print(f"Missing values in 'Prediction': {num_missing_target}")
# As the target is critical for training, drop rows where 'Prediction' is NaN
df = df.dropna(subset=['Prediction'])

# -----------------------------------
# Step 3: Handle Categorical Features and Their Missing Values
# -----------------------------------
# For columns like 'Magnetic Field Strength' and 'Radiation Levels', fill missing entries with a placeholder
df['Magnetic Field Strength'] = df['Magnetic Field Strength'].fillna('missing')
df['Radiation Levels'] = df['Radiation Levels'].fillna('missing')

# Encode these categorical variables into numeric values
magnetic_encoder = LabelEncoder()
radiation_encoder = LabelEncoder()

df['Magnetic_Field_encoded'] = magnetic_encoder.fit_transform(df['Magnetic Field Strength'])
df['Radiation_Levels_encoded'] = radiation_encoder.fit_transform(df['Radiation Levels'])

Missing values in 'Prediction': 3039


In [17]:
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
# Extract the features DataFrame
X = df[features]

# Apply KNN imputation (here using 5 neighbors by default) on the feature set
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=features)

# Define the target variable. Assume it is already numeric (if not, encode accordingly)
y = df['Prediction'].values


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.20, random_state=42, stratify=y
)

# -----------------------------------
# Step 6: Scale the Features
# -----------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------
# Step 7: Build the Neural Network Model
# -----------------------------------
model = Sequential()
# First hidden layer: 128 neurons + Dropout
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.2))
# Second hidden layer: 64 neurons + Dropout
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
# Third hidden layer: 32 neurons + Dropout (an extra hidden layer)
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
# Output layer: 10 neurons (for 10 classes) with softmax activation
model.add(Dense(10, activation='softmax'))

# Compile the model with the Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# -----------------------------------
# Step 8: Train the Model with Early Stopping
# -----------------------------------
# Configure early stopping to monitor validation loss with a patience of 10 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model for up to 200 epochs (early stopping will halt training if no improvement is seen)
history = model.fit(X_train_scaled, y_train, 
                    epochs=200, 
                    batch_size=32, 
                    validation_split=0.2, 
                    callbacks=[early_stop],
                    verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5886 - loss: 1.2135 - val_accuracy: 0.8336 - val_loss: 0.5107
Epoch 2/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 919us/step - accuracy: 0.8029 - loss: 0.6098 - val_accuracy: 0.8521 - val_loss: 0.4489
Epoch 3/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 899us/step - accuracy: 0.8248 - loss: 0.5586 - val_accuracy: 0.8602 - val_loss: 0.4235
Epoch 4/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 883us/step - accuracy: 0.8364 - loss: 0.5216 - val_accuracy: 0.8671 - val_loss: 0.4083
Epoch 5/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 890us/step - accuracy: 0.8449 - loss: 0.5031 - val_accuracy: 0.8717 - val_loss: 0.3989
Epoch 6/200
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 897us/step - accuracy: 0.8447 - loss: 0.4909 - val_accuracy: 0.8761 - val_loss: 0.3913


In [19]:
# Step 9: Evaluate the Model on the Test Set
# -----------------------------------
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print("Test Accuracy:", test_acc)

# Generate predictions on the test set and output a detailed classification report
y_pred_proba = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_proba, axis=1)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.8950232863426208
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399us/step
Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94      1127
         1.0       0.98      0.96      0.97      1279
         2.0       0.89      0.93      0.91      1129
         3.0       0.89      0.81      0.85      1163
         4.0       0.87      0.87      0.87      1111
         5.0       0.86      0.88      0.87      1026
         6.0       0.94      0.95      0.94      1128
         7.0       0.93      0.92      0.92      1186
         8.0       0.83      0.85      0.84      1114
         9.0       0.81      0.83      0.82      1130

    accuracy                           0.90     11393
   macro avg       0.89      0.89      0.89     11393
weighted avg       0.90      0.90      0.89     11393



In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Define feature list
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]

# Extract features dataframe
X = df[features]

# Define the target variable
y = df['Prediction'].values

# Create a pipeline with feature scaling and the tuned KNN classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(
        algorithm='auto',
        metric='manhattan',
        n_neighbors=4,
        p=1,
        weights='distance'
    ))
])

# Optionally, split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.20, random_state=42, stratify=y
)

# Train the final model
pipeline.fit(X_train, y_train)

# Evaluate model performance on the test set
y_pred = pipeline.predict(X_test)
print("Final KNN Classifier Performance:")
print(classification_report(y_test, y_pred))

Final KNN Classifier Performance:
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95      1127
         1.0       0.96      0.97      0.97      1279
         2.0       0.93      0.93      0.93      1129
         3.0       0.86      0.81      0.83      1163
         4.0       0.87      0.84      0.86      1111
         5.0       0.87      0.85      0.86      1026
         6.0       0.94      0.95      0.95      1128
         7.0       0.92      0.92      0.92      1186
         8.0       0.82      0.83      0.82      1114
         9.0       0.78      0.83      0.80      1130

    accuracy                           0.89     11393
   macro avg       0.89      0.89      0.89     11393
weighted avg       0.89      0.89      0.89     11393

