In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
file_path = 'data/cosmicclassifierTraining.csv'
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.59585,-3.188678,-0.609434,-0.199828,Category_9,Category_9,,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,,Category_8,-0.677182,4.0
3,-3.122,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0


In [4]:
print(df.shape)
print(list(df.columns))  


(60000, 11)
['Atmospheric Density', 'Surface Temperature', 'Gravity', 'Water Content', 'Mineral Abundance', 'Orbital Period', 'Proximity to Star', 'Magnetic Field Strength', 'Radiation Levels', 'Atmospheric Composition Index', 'Prediction']


In [5]:
missing_values = df.isna().sum()
print("\nMissing values per column:")
print(missing_values)


Missing values per column:
Atmospheric Density              2984
Surface Temperature              3032
Gravity                          2984
Water Content                    3077
Mineral Abundance                2921
Orbital Period                   2997
Proximity to Star                2945
Magnetic Field Strength          3058
Radiation Levels                 3021
Atmospheric Composition Index    2942
Prediction                       3039
dtype: int64


In [6]:
print("\nDescriptive statistics for numeric columns:")
df.describe()


Descriptive statistics for numeric columns:


Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Atmospheric Composition Index,Prediction
count,57016.0,56968.0,57016.0,56923.0,57079.0,57003.0,57055.0,57058.0,56961.0
mean,-0.000202,-0.000288,-0.000469,0.001938,-0.00073,-0.001043,0.000188,0.000211,4.454381
std,2.263527,1.936598,1.804605,1.689267,1.605524,1.511685,1.316682,1.120303,2.890055
min,-4.364843,-5.503527,-5.553877,-5.816755,-5.077363,-4.801046,-4.537187,-4.007504,0.0
25%,-1.55581,-1.426786,-1.279002,-1.21887,-1.078449,-1.04892,-0.937097,-0.709852,2.0
50%,-0.18867,-0.330037,0.046231,-0.004676,0.040008,0.036651,-0.062001,0.049292,4.0
75%,1.308113,1.503646,1.255432,1.063391,1.095483,1.0588,0.888149,0.789511,7.0
max,9.324018,5.638094,6.03029,6.287045,5.584059,5.111014,4.942699,3.852567,9.0


In [7]:
# Drop rows with NaN values
df = df.dropna().reset_index(drop=True)

In [8]:
print("\nDescriptive statistics for numeric columns:")
df.describe()


Descriptive statistics for numeric columns:


Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Atmospheric Composition Index,Prediction
count,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0
mean,-0.011847,-0.002704,0.00397,0.003003,0.005906,-0.001933,-0.005676,0.004431,4.457999
std,2.257107,1.936163,1.803941,1.688711,1.602094,1.514356,1.317587,1.125303,2.890949
min,-4.283309,-5.426189,-5.553877,-5.816755,-5.077363,-4.801046,-4.537187,-4.007504,0.0
25%,-1.560969,-1.423844,-1.276465,-1.218254,-1.069377,-1.054219,-0.942822,-0.713259,2.0
50%,-0.199246,-0.329629,0.053229,0.003468,0.051432,0.047483,-0.068085,0.050758,4.0
75%,1.291944,1.498558,1.263103,1.062735,1.101213,1.064815,0.879585,0.799353,7.0
max,9.324018,5.638094,6.03029,6.287045,5.335537,5.111014,4.731871,3.852567,9.0


In [9]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile.to_file("your_report.html")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 101/101 [00:05<00:00, 19.97it/s, Completed]                                                         
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 139.88it/s]


In [10]:
print("Unique values in 'Prediction':", df['Prediction'].unique())
print("Unique values in 'Magnetic Field Strength':", df['Magnetic Field Strength'].unique())
print("Unique values in 'Radiation Levels':", df['Radiation Levels'].unique())


Unique values in 'Prediction': [9. 1. 3. 7. 2. 6. 0. 5. 8. 4.]
Unique values in 'Magnetic Field Strength': ['Category_6' 'Category_8' 'Category_14' 'Category_13' 'Category_7'
 'Category_10' 'Category_9' 'Category_12' 'Category_11' 'Category_15'
 'Category_4' 'Category_5' 'Category_16' 'Category_17' 'Category_3'
 'Category_18' 'Category_2' 'Category_19' 'Category_1' 'Category_20']
Unique values in 'Radiation Levels': ['Category_10' 'Category_7' 'Category_8' 'Category_12' 'Category_9'
 'Category_5' 'Category_6' 'Category_11' 'Category_13' 'Category_14'
 'Category_3' 'Category_4' 'Category_15' 'Category_16' 'Category_2'
 'Category_1' 'Category_17' 'Category_18' 'Category_19' 'Category_20']


In [None]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders
magnetic_encoder = LabelEncoder()
radiation_encoder = LabelEncoder()

# Encode "Magnetic Field Strength"
df['Magnetic_Field_encoded'] = magnetic_encoder.fit_transform(df['Magnetic Field Strength'])

# Encode "Radiation Levels"
df['Radiation_Levels_encoded'] = radiation_encoder.fit_transform(df['Radiation Levels'])

# Print Encoded Mapping
print("Magnetic Field Encoding:", dict(zip(magnetic_encoder.classes_, magnetic_encoder.transform(magnetic_encoder.classes_))))
print("Radiation Levels Encoding:", dict(zip(radiation_encoder.classes_, radiation_encoder.transform(radiation_encoder.classes_))))


Magnetic Field Encoding: {'Category_1': np.int64(0), 'Category_10': np.int64(1), 'Category_11': np.int64(2), 'Category_12': np.int64(3), 'Category_13': np.int64(4), 'Category_14': np.int64(5), 'Category_15': np.int64(6), 'Category_16': np.int64(7), 'Category_17': np.int64(8), 'Category_18': np.int64(9), 'Category_19': np.int64(10), 'Category_2': np.int64(11), 'Category_20': np.int64(12), 'Category_3': np.int64(13), 'Category_4': np.int64(14), 'Category_5': np.int64(15), 'Category_6': np.int64(16), 'Category_7': np.int64(17), 'Category_8': np.int64(18), 'Category_9': np.int64(19)}
Radiation Levels Encoding: {'Category_1': np.int64(0), 'Category_10': np.int64(1), 'Category_11': np.int64(2), 'Category_12': np.int64(3), 'Category_13': np.int64(4), 'Category_14': np.int64(5), 'Category_15': np.int64(6), 'Category_16': np.int64(7), 'Category_17': np.int64(8), 'Category_18': np.int64(9), 'Category_19': np.int64(10), 'Category_2': np.int64(11), 'Category_20': np.int64(12), 'Category_3': np.int

In [12]:
df.head()

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction,Magnetic_Field_encoded,Radiation_Levels_encoded
0,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0,16,1
1,-2.971646,-0.648251,-0.915859,0.255504,-0.537165,-2.072251,1.355523,Category_8,Category_7,1.876232,1.0,18,17
2,-3.306354,-0.316716,-0.431264,0.389815,-1.961216,-1.510182,0.538593,Category_8,Category_7,0.934055,1.0,18,17
3,-0.752712,-2.492542,-1.072433,-2.561734,1.158838,-1.262638,1.447444,Category_14,Category_8,0.726009,3.0,5,18
4,1.129258,-3.333453,-4.423914,-1.020409,0.71129,-0.606784,0.047264,Category_13,Category_12,-1.630504,3.0,4,3


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Define the features to use (including encoded columns)
features = ['Atmospheric Density', 'Surface Temperature', 'Gravity', 'Water Content', 
            'Mineral Abundance', 'Orbital Period', 'Proximity to Star', 
            'Magnetic_Field_encoded', 'Radiation_Levels_encoded', 'Atmospheric Composition Index']

# Assume 'prediction' column exists in df with the class labels
X = df[features]
y = df['Prediction']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the classifier's performance
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

         0.0       0.95      0.94      0.94       658
         1.0       0.97      0.97      0.97       768
         2.0       0.92      0.89      0.91       719
         3.0       0.84      0.85      0.84       703
         4.0       0.86      0.86      0.86       627
         5.0       0.87      0.87      0.87       594
         6.0       0.92      0.95      0.94       660
         7.0       0.93      0.91      0.92       711
         8.0       0.87      0.84      0.86       672
         9.0       0.82      0.85      0.83       700

    accuracy                           0.90      6812
   macro avg       0.89      0.89      0.89      6812
weighted avg       0.90      0.90      0.90      6812



In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report

features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]

# Use 'Prediction' as the target column
X = df[features]
y = df['Prediction']

# ---------------------------
# Step 4: Set Up Stratified K-Fold Cross Validation
# ---------------------------
# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Set up StratifiedKFold cross-validation with 5 splits
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the classifier using accuracy as the metric
cv_scores = cross_val_score(rf_classifier, X, y, cv=cv, scoring='accuracy')

# Print out the cross-validation scores and the average accuracy
print("Cross-validation scores:", cv_scores)
print("Average CV accuracy:", cv_scores.mean())

# ---------------------------
# Step 5: Optional - Hold-out Train/Test Split for Additional Evaluation
# ---------------------------
# Split the data into training and testing sets (20% for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)

# Train the Random Forest classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the classifier’s performance on the test set
print("\nClassification Report (Hold-out Test Set):")
print(classification_report(y_test, predictions))


Cross-validation scores: [0.89547857 0.89004698 0.89856136 0.89445097 0.89237997]
Average CV accuracy: 0.8941835695823128

Classification Report (Hold-out Test Set):
              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       663
         1.0       0.96      0.96      0.96       771
         2.0       0.91      0.91      0.91       682
         3.0       0.84      0.86      0.85       700
         4.0       0.85      0.85      0.85       664
         5.0       0.87      0.87      0.87       607
         6.0       0.94      0.95      0.94       676
         7.0       0.94      0.92      0.93       697
         8.0       0.87      0.84      0.86       666
         9.0       0.81      0.81      0.81       686

    accuracy                           0.89      6812
   macro avg       0.89      0.89      0.89      6812
weighted avg       0.89      0.89      0.89      6812



In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Define the hyperparameter grid for the Random Forest classifier
param_grid = {
    'n_estimators': [100, 200, 300],            # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],            # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],            # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],              # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]                  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with StratifiedKFold cross-validation (5 splits)
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',                         # Use accuracy as the evaluation metric
    cv=5,                                       # 5-fold cross-validation
    verbose=3,                                  # Print progress during search
    n_jobs=-1                                   # Use all available cores for parallel processing
)

# Fit GridSearchCV on your dataset to find the best hyperparameters
grid_search.fit(X, y)

# Print out the best parameters found and the corresponding cross-validation score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Optional: Use the best model for further evaluation or testing
best_rf_model = grid_search.best_estimator_

X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.20, random_state=42, stratify=y
)

#Fit the best model on the training data
best_rf_model.fit(X_train, y_train)

#Predict on the test set
predictions = best_rf_model.predict(X_test)

#Display the classification report
print("\nClassification Report (Hold-out Test Set):")
print(classification_report(y_test, predictions))


cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

#--- Visualize the Feature Importances ---
importances = best_rf_model.feature_importances_
feature_names = X.columns

#Sorting feature importances in descending order
indices = np.argsort(importances)[::-1]

print("Feature Importances:")
for i in range(len(feature_names)):
    print(f"{i+1}. {feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

#Plotting the feature importances
plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices])
plt.title("Feature Importances from Best Random Forest Model")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV 1/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.895 total time=  17.8s
[CV 5/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.899 total time=  18.0s
[CV 4/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.888 total time=  18.2s
[CV 3/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.890 total time=  18.3s
[CV 2/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.889 total time=  18.3s
[CV 1/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.896 total time=  37.4s
[CV 2/5] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.893 tot

  plt.show()


Feature Importances:
1. Atmospheric Density: 0.1712
2. Surface Temperature: 0.1552
3. Water Content: 0.1300
4. Mineral Abundance: 0.1139
5. Orbital Period: 0.1060
6. Proximity to Star: 0.0905
7. Gravity: 0.0817
8. Atmospheric Composition Index: 0.0641
9. Magnetic_Field_encoded: 0.0632
10. Radiation_Levels_encoded: 0.0241


  plt.show()


In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix

# Assuming the best model from grid search is stored in best_rf_model
# and the hold-out test split (X_test, y_test) has already been defined.

# Make predictions on the test set using the best estimator
predictions = best_rf_model.predict(X_test)

# Plot the Confusion Matrix
cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=sorted(y.unique()), yticklabels=sorted(y.unique()))
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Visualize the Feature Importances
importances = best_rf_model.feature_importances_
feature_names = X.columns

# Sorting the feature importances in descending order
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
sns.barplot(x=importances[indices], y=feature_names[indices])
plt.title("Feature Importances from Best Random Forest Model")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

# Optionally, print the feature importances in sorted order for reference
print("Feature Importances:")
for idx in indices:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")


Feature Importances:
Atmospheric Density: 0.1712
Surface Temperature: 0.1552
Water Content: 0.1300
Mineral Abundance: 0.1139
Orbital Period: 0.1060
Proximity to Star: 0.0905
Gravity: 0.0817
Atmospheric Composition Index: 0.0641
Magnetic_Field_encoded: 0.0632
Radiation_Levels_encoded: 0.0241


  plt.show()
  plt.show()


Gradient Boosting

In [19]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

# Define the features and target variable
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features]
y = df['Prediction']

# Split the dataset into training and testing sets (20% test size)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Initialize the Gradient Boosting classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Train the classifier on the training subset
gb_classifier.fit(X_train, y_train)

# Evaluate the model using 5-fold cross-validation on the entire dataset
cv_scores = cross_val_score(gb_classifier, X, y, cv=5, scoring='accuracy')
print("Gradient Boosting - CV Scores:", cv_scores)
print("Gradient Boosting - Average CV Score:", cv_scores.mean())

# Predict on the test set
predictions = gb_classifier.predict(X_test)

# Generate and display a classification report on the test set
print("Classification Report for Gradient Boosting Classifier:")
print(classification_report(y_test, predictions))


Gradient Boosting - CV Scores: [0.86964181 0.86641221 0.86376982 0.86376982 0.87197181]
Gradient Boosting - Average CV Score: 0.8671130937114008
Classification Report for Gradient Boosting Classifier:
              precision    recall  f1-score   support

         0.0       0.95      0.91      0.93       663
         1.0       0.95      0.96      0.95       771
         2.0       0.88      0.87      0.87       682
         3.0       0.83      0.83      0.83       700
         4.0       0.81      0.82      0.82       664
         5.0       0.81      0.81      0.81       607
         6.0       0.92      0.91      0.92       676
         7.0       0.90      0.90      0.90       697
         8.0       0.81      0.85      0.83       666
         9.0       0.76      0.77      0.76       686

    accuracy                           0.86      6812
   macro avg       0.86      0.86      0.86      6812
weighted avg       0.86      0.86      0.86      6812



KNN Classifier

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Define the features and target variable
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features]
y = df['Prediction']

# Split the dataset into training and testing sets (20% test size)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Initialize the KNN classifier with n_neighbors set to 5 (default)
knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Train the classifier on the training subset
knn_classifier.fit(X_train, y_train)

# Evaluate the model using 5-fold cross-validation on the entire dataset
cv_scores = cross_val_score(knn_classifier, X, y, cv=5, scoring='accuracy')
print("KNN - Cross-validation scores:", cv_scores)
print("KNN - Average CV Score:", cv_scores.mean())

# Predict on the test set
predictions = knn_classifier.predict(X_test)

# Generate and display a classification report on the test set
print("Classification Report for KNN Classifier:")
print(classification_report(y_test, predictions))


KNN - Cross-validation scores: [0.89929536 0.90267176 0.89283617 0.89650617 0.90779621]
KNN - Average CV Score: 0.8998211331829715
Classification Report for KNN Classifier:
              precision    recall  f1-score   support

         0.0       0.95      0.96      0.95       663
         1.0       0.95      0.98      0.96       771
         2.0       0.94      0.93      0.94       682
         3.0       0.88      0.85      0.86       700
         4.0       0.84      0.88      0.86       664
         5.0       0.87      0.87      0.87       607
         6.0       0.95      0.96      0.95       676
         7.0       0.92      0.91      0.92       697
         8.0       0.87      0.85      0.86       666
         9.0       0.80      0.79      0.80       686

    accuracy                           0.90      6812
   macro avg       0.90      0.90      0.90      6812
weighted avg       0.90      0.90      0.90      6812



In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Define your feature set and target variable
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features]
y = df['Prediction']

# Create a pipeline with scaling and the KNN classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Define the hyperparameter grid to search
param_grid = {
    'knn__n_neighbors': range(1, 31),                   # Test k values from 1 to 30
    'knn__weights': ['uniform', 'distance'],            # Weighting schemes
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm types
    'knn__p': [1, 2]                                    # p=1 for Manhattan, p=2 for Euclidean (with minkowski)
}

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=3,
    n_jobs=-1  # Use all available cores
)

# Optionally, split the data into training and testing sets to later evaluate the tuned model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

# Output the best parameters and the best cross-validation accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

# Retrieve the best tuned model from GridSearchCV
best_knn_model = grid_search.best_estimator_

# Evaluate on the held-out test set
predictions = best_knn_model.predict(X_test)
print("\nClassification Report for Tuned KNN:")
print(classification_report(y_test, predictions))


Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
[CV 2/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=distance;, score=0.888 total time=   1.0s
[CV 3/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=distance;, score=0.888 total time=   1.0s
[CV 1/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=distance;, score=0.890 total time=   1.2s
[CV 5/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=uniform;, score=0.883 total time=   1.3s
[CV 3/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=uniform;, score=0.888 total time=   1.4s
[CV 2/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1, knn__weights=uniform;, score=0.888 total time=   1.4s
[CV 4/5] END knn__algorithm=auto, knn__metric=euclidean, knn__n_neighbors=1, knn__p=1,

Neural Networks

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the features and target variable
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features].values
y = df['Prediction'].values

# Split the dataset into training and testing sets (using stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Scale the features to standardize the input distribution
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the MLP neural network model
model = Sequential()
# First hidden layer with 128 neurons and dropout for regularization
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.2))
# Second hidden layer with 64 neurons
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
# Output layer with 10 neurons (one per class) and softmax activation for probability distribution
model.add(Dense(10, activation='softmax'))

# Compile the model using the Adam optimizer and sparse categorical crossentropy loss,
# which is well-suited for multi-class problems where the target variable is encoded as integers.
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Train the model with a validation split to monitor overfitting
history = model.fit(X_train_scaled, y_train, 
                    epochs=50, 
                    batch_size=32, 
                    validation_split=0.2, 
                    verbose=1)

# Evaluate the model on the held-out test set
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print("Test Accuracy:", test_acc)

# Generate a detailed classification report for further analysis
y_pred_proba = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_proba, axis=1)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6219 - loss: 1.2074 - val_accuracy: 0.8286 - val_loss: 0.5189
Epoch 2/50
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8199 - loss: 0.5621 - val_accuracy: 0.8558 - val_loss: 0.4380
Epoch 3/50
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8385 - loss: 0.4869 - val_accuracy: 0.8640 - val_loss: 0.3975
Epoch 4/50
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8528 - loss: 0.4521 - val_accuracy: 0.8694 - val_loss: 0.3842
Epoch 5/50
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 949us/step - accuracy: 0.8535 - loss: 0.4316 - val_accuracy: 0.8789 - val_loss: 0.3581
Epoch 6/50
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8646 - loss: 0.4045 - val_accuracy: 0.8804 - val_loss: 0.3500
Epoch 7/50
[1m682/682[0m [32m━━━━━

In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Define your feature set and target variable
features = [
    'Atmospheric Density', 
    'Surface Temperature', 
    'Gravity', 
    'Water Content', 
    'Mineral Abundance', 
    'Orbital Period', 
    'Proximity to Star', 
    'Magnetic_Field_encoded', 
    'Radiation_Levels_encoded', 
    'Atmospheric Composition Index'
]
X = df[features].values
y = df['Prediction'].values

# Split the dataset into training and testing sets (using stratification)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Scale features to standardize the input distribution (important for NN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the neural network model with an additional hidden layer
model = Sequential()
# First hidden layer with 128 neurons and dropout for regularization
model.add(Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.2))
# Second hidden layer with 64 neurons and dropout
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
# Third hidden layer (additional) with 32 neurons and dropout
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
# Output layer with 10 neurons (one per class) and softmax activation
model.add(Dense(10, activation='softmax'))

# Compile the model using Adam optimizer and sparse categorical crossentropy loss 
model.compile(optimizer=Adam(learning_rate=0.001), 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Define EarlyStopping callback: monitor validation loss with a patience of 10 epochs
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model: allow up to 200 epochs, but early stopping will halt training if no improvement
history = model.fit(X_train_scaled, y_train, 
                    epochs=200, 
                    batch_size=32, 
                    validation_split=0.2, 
                    callbacks=[early_stop],
                    verbose=1)

# Evaluate the trained model on the hold-out test set
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
print("Test Accuracy:", test_acc)

# Generate predictions and display a classification report
y_pred_proba = model.predict(X_test_scaled)
y_pred = np.argmax(y_pred_proba, axis=1)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5466 - loss: 1.3570 - val_accuracy: 0.8279 - val_loss: 0.5201
Epoch 2/200
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 939us/step - accuracy: 0.7917 - loss: 0.6301 - val_accuracy: 0.8510 - val_loss: 0.4515
Epoch 3/200
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8169 - loss: 0.5521 - val_accuracy: 0.8637 - val_loss: 0.4084
Epoch 4/200
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 933us/step - accuracy: 0.8431 - loss: 0.4926 - val_accuracy: 0.8690 - val_loss: 0.3915
Epoch 5/200
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 937us/step - accuracy: 0.8441 - loss: 0.4877 - val_accuracy: 0.8710 - val_loss: 0.3800
Epoch 6/200
[1m682/682[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 933us/step - accuracy: 0.8510 - loss: 0.4559 - val_accuracy: 0.8741 - val_loss: 0.3683
Epoch 7/200
[1m682/682[0