# In [None]:
# 🚀 Asteroid Classification - Planet Hunt Hackathon

# 📦 Import Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter

warnings.filterwarnings("ignore")

# 📅 Load Dataset
def load_data(path):
    """Read the CSV at *path* into a DataFrame and print a short summary.

    Prints a success banner, the frame's structure (via ``DataFrame.info``)
    and the per-column count of missing values.

    Args:
        path: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(path)
    print("✅ Data Loaded Successfully")
    # DataFrame.info() writes its report straight to stdout and returns None,
    # so it is called on its own; passing it to print() would emit the report
    # *before* the header and then print a stray "None".
    print("\nStructure:")
    df.info()
    print("\nMissing Values:\n", df.isnull().sum())
    return df

# Location of the asteroid CSV (Google Drive mount path)
data_path = "/content/drive/MyDrive/asteroid_data_analysis/asteroid_data.csv"

# 🔍 Identifier / non-numeric columns that could cause errors downstream
drop_cols = ['name', 'id', 'designation', 'full_name', 'orbit_id']

# Load the data and discard the identifier columns in one step;
# errors='ignore' skips any listed column the CSV does not contain.
df = load_data(data_path).drop(columns=drop_cols, errors='ignore')

# 🔐 Encode string labels in 'class' column
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

# The encoded value of a label is its position in `classes_`, so the mapping
# can be read off directly with enumerate(). This avoids a redundant
# transform() round-trip and stores plain Python ints, which print cleanly
# (NumPy scalars would show up as `np.int64(0)` under NumPy 2).
label_map = {name: code for code, name in enumerate(label_encoder.classes_)}
print("\u2705 Class label mapping (original ➔ encoded):")
print(label_map)

# 🧶 Feature Engineering
# Element-wise ratio of the 'moid_ld' column to the 'a' column
# (presumably MOID over semi-major axis, going by the feature name — confirm).
df['MOID_SMA_Ratio'] = df['moid_ld'].div(df['a'])

# 📊 Descriptive Statistics
def analyze_features(data, cols):
    """Print summary statistics and save a histogram for each column in *cols*.

    For every column name the function prints the range, mean, median and
    standard deviation, then writes a KDE-overlaid histogram to
    ``<column>_distribution.png`` in the current working directory.

    Args:
        data: DataFrame holding the columns to analyse.
        cols: Iterable of column names present in *data*.
    """
    for col in cols:
        print(f"\n🤪 Stats for {col}:")
        values = data[col]
        spread = values.max() - values.min()
        print(f"Range: {spread}")
        print(f"Mean: {values.mean()} | Median: {values.median()} | Std: {values.std()}")

        sns.histplot(values, kde=True)
        plt.title(f"{col} Distribution")
        plt.savefig(f"{col}_distribution.png")
        # Clear the current figure so the next histogram starts from a blank canvas.
        plt.clf()

main_features = ['H', 'diameter', 'albedo']
analyze_features(df, main_features)

# 🔄 Normalization Preview
# Standardise the main features and show the first rows of the result.
# The scaled values are only displayed here; the model below uses `df` as-is.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[main_features])
scaled_preview = pd.DataFrame(scaled_features, columns=main_features).head()
print("\n📉 Scaled Features Preview:\n", scaled_preview)

# 📈 Correlation Plot (Only numeric features)
def correlation_plot(data):
    """Save an annotated correlation heatmap of the numeric columns of *data*.

    Non-numeric columns are ignored. The figure is written to
    ``correlation_heatmap.png`` in the current working directory.

    Args:
        data: DataFrame whose numeric columns are correlated pairwise.
    """
    numeric_only = data.select_dtypes(include=[np.number])
    plt.figure(figsize=(10, 7))
    corr_matrix = numeric_only.corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.savefig("correlation_heatmap.png")
    # Clear the figure so later plots do not draw on top of the heatmap.
    plt.clf()

correlation_plot(df)

# 📅 Optional: Detection Methods Heatmap
# Only drawn when the dataset actually carries both columns.
if {'year', 'detection_method'}.issubset(df.columns):
    # Count of rows per (detection_method, year) pair; absent pairs become 0.
    pivot = df.pivot_table(
        index='detection_method',
        columns='year',
        aggfunc='size',
        fill_value=0,
    )
    sns.heatmap(pivot, cmap="YlOrBr")
    plt.title("Detection Method Over Years")
    plt.savefig("detection_heatmap.png")
    plt.clf()

# 📌 Class Distribution Before Balancing
sns.countplot(data=df, x='class')
plt.title("Asteroid Class Distribution")
plt.savefig("class_countplot.png")
plt.clf()

# 🧼 Prepare Feature Matrix and Target
target = 'class'
X = df.drop(columns=[target])
X = X.select_dtypes(include=[np.number])  # Keep only numeric columns
y = df[target]

# ⚠️ Class Imbalance - Before
print("\n🌟 Class Distribution Before SMOTE:\n", y.value_counts())

# 📌 Drop classes with < 2 samples (required for stratify)
# This happens *before* any resampling so that a single singleton class can
# no longer switch SMOTE off for every other class.
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= 2].index
mask = y.isin(valid_classes)
X_clean = X[mask]
y_clean = y[mask]
print("\n📅 Class distribution after cleaning:\n", y_clean.value_counts())

# 🔁 Re-encode target classes after cleaning to ensure XGBoost sees correct label space
y_clean = pd.Series(LabelEncoder().fit_transform(y_clean), index=y_clean.index)

# 🔀 Train-Test Split
# Split BEFORE oversampling: running SMOTE on the full dataset would let
# synthetic points interpolated from test rows leak into the training set and
# would score the model on synthetic samples, inflating every metric.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, stratify=y_clean, random_state=42
)

# ⚖️ Balance the TRAINING split only; the test split keeps its real distribution.
# NOTE(review): SMOTE is expected to reject NaN/inf in X — confirm the
# features are complete, or impute on the training split, before this step.
min_class_count = min(Counter(y_train).values())

if min_class_count < 2:
    print("❌ SMOTE cannot be applied. A class has less than 2 samples.")
    print("✅ Proceeding without SMOTE. Consider duplicating minority samples manually or dropping rare classes.")
    X_res, y_res = X_train.copy(), y_train.copy()  # Just clone data without SMOTE
else:
    # Safe SMOTE: k_neighbors must be smaller than the smallest class size
    k_neighbors_value = min(5, min_class_count - 1)
    print(f"✅ Applying SMOTE with k_neighbors = {k_neighbors_value}")
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors_value)
    X_res, y_res = smote.fit_resample(X_train, y_train)

print("\n📊 Class distribution after SMOTE or fallback:\n", pd.Series(y_res).value_counts())

# X_res / y_res now hold the (possibly oversampled) training data only.
X_train, y_train = X_res, y_res

# 🤖 Model Training - XGBoost
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    verbosity=0,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 📈 Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("\n📉 Accuracy Score:", accuracy)
print("\n📔 Classification Report:\n", report)

# 🗃️ Confusion Matrix Plot
# Rows are true labels, columns are predicted labels; fmt='d' prints integer counts.
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.savefig("confusion_matrix.png")
plt.clf()

# 🔍 Feature Importance
# Rank the features from most to least important according to the fitted model.
importances = model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
feat_names = X.columns[sorted_indices]
ranked_importances = importances[sorted_indices]

plt.figure(figsize=(10, 6))
sns.barplot(y=feat_names, x=ranked_importances)
plt.title("Feature Importance")
plt.savefig("feature_importance.png")
plt.clf()

print("\n📦 All plots saved successfully. Use them in your final PDF report.")

