In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# -----------------------------------
# Step 1: Load the CSV File into a DataFrame
# -----------------------------------
# Replace the file_path with your actual CSV file path
file_path = 'data/cosmicclassifierTraining.csv'
df = pd.read_csv(file_path)

# -----------------------------------
# Step 2: Handle the Target Column (Prediction)
# -----------------------------------
# Check for missing values in the 'Prediction' column and drop rows where the target is missing
print("Missing values in 'Prediction':", df['Prediction'].isna().sum())
df = df.dropna(subset=['Prediction'])

# -----------------------------------
# Step 3: Handle and Encode Categorical Features
# -----------------------------------
# For categorical columns, fill missing values with a placeholder
df['Magnetic Field Strength'] = df['Magnetic Field Strength'].fillna('missing')
df['Radiation Levels'] = df['Radiation Levels'].fillna('missing')

# Encode categorical columns into numeric values
magnetic_encoder = LabelEncoder()
radiation_encoder = LabelEncoder()
df['Magnetic_Field_encoded'] = magnetic_encoder.fit_transform(df['Magnetic Field Strength'])
df['Radiation_Levels_encoded'] = radiation_encoder.fit_transform(df['Radiation Levels'])

# -----------------------------------
# Step 4: Define Features and Apply KNN Imputation
# -----------------------------------
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features]

# Impute missing numerical values using KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
X_imputed = knn_imputer.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=features)

# Define the target variable
y = df['Prediction'].values

# -----------------------------------
# Step 5: Create a Pipeline with Scaling and Tuned KNN Classifier
# -----------------------------------
# Best hyperparameters: algorithm='auto', metric='manhattan', n_neighbors=4, p=1, weights='distance'
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(
        algorithm='auto',
        metric='manhattan',
        n_neighbors=4,
        p=1,
        weights='distance'
    ))
])

# -----------------------------------
# Option 1: Evaluate with Stratified K-Fold Cross-Validation
# -----------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X_imputed, y, cv=cv, scoring='accuracy')
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy:", cv_scores.mean())

# -----------------------------------
# Option 2: Evaluate with Hold-Out Train/Test Split
# -----------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y, test_size=0.20, random_state=42, stratify=y
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print("\nClassification Report (Hold-out Test Set):")
print(classification_report(y_test, y_pred))


Missing values in 'Prediction': 3039
Cross-validation scores: [0.88677258 0.88983497 0.88816713 0.89220506 0.88728933]
Average CV accuracy: 0.8888538136836438

Classification Report (Hold-out Test Set):
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95      1127
         1.0       0.96      0.97      0.97      1279
         2.0       0.93      0.93      0.93      1129
         3.0       0.86      0.81      0.83      1163
         4.0       0.87      0.84      0.86      1111
         5.0       0.87      0.85      0.86      1026
         6.0       0.94      0.95      0.95      1128
         7.0       0.92      0.92      0.92      1186
         8.0       0.82      0.83      0.82      1114
         9.0       0.78      0.83      0.80      1130

    accuracy                           0.89     11393
   macro avg       0.89      0.89      0.89     11393
weighted avg       0.89      0.89      0.89     11393

