# In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Install a blanket "ignore" filter so no warnings are shown from here on.
# NOTE(review): this also hides warnings that may point at real problems
# (e.g. deprecations in the plotting/ML libraries) — confirm that is intended.
warnings.filterwarnings("ignore")

# 📥 2. Load Dataset
def load_data(path):
    """Read a CSV file into a DataFrame and print a short overview.

    Prints a success message, the frame's structure summary
    (``DataFrame.info``) and the per-column count of missing values.

    Args:
        path: Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(path)
    print("✅ Data Loaded Successfully")
    # DataFrame.info() writes its summary to stdout itself and returns None.
    # Passing it to print() would show the summary *before* the header and
    # then print a stray "None", so emit the header first and call it alone.
    print("\nStructure:")
    df.info()
    print("\nMissing Values:\n", df.isnull().sum())
    return df

# Raw string literal: in a normal string the backslashes of a Windows path
# start escape sequences ("\U" is an incomplete unicode escape and fails to
# compile, "\a" silently turns into the BEL control character).
data_path = r"C:\Users\singh\OneDrive\Documents\Hackathon\planet hunt\asteroid_data_analysis\asteroid_data.csv"
df = load_data(data_path)

# 📊 3. Descriptive Stats and Feature Insights
def analyze_features(data, cols):
    """Print summary statistics and save a histogram for each listed column.

    For every name in ``cols`` the range (max - min), mean, median and
    standard deviation of ``data[name]`` are printed, and a histogram with a
    KDE overlay is written to ``<name>_distribution.png`` in the current
    working directory.

    Args:
        data: DataFrame holding the columns to inspect.
        cols: Iterable of column names present in ``data``.
    """
    for col in cols:
        print(f"\n🧪 Stats for {col}:")

        series = data[col]
        value_range = series.max() - series.min()
        print(f"Range: {value_range}")

        mean, median, std = series.mean(), series.median(), series.std()
        print(f"Mean: {mean} | Median: {median} | Std: {std}")

        sns.histplot(series, kde=True)
        plt.title(f"{col} Distribution")
        plt.savefig(f"{col}_distribution.png")
        # Wipe the current figure so the next column is drawn on a blank canvas.
        plt.clf()

# Columns used for the descriptive statistics and for the scaling check below.
main_features = ['H', 'diameter', 'albedo']
analyze_features(df, main_features)

# 🔄 Normalization Check
# Fit a StandardScaler on the selected columns and show the first rows of
# the transformed values under their original column names.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[main_features])
scaled_preview = pd.DataFrame(scaled_features, columns=main_features).head()
print("\n📉 Scaled Features Preview:\n", scaled_preview)

# 📈 4. Visual Exploration
def correlation_plot(data):
    """Save a heatmap of pairwise correlations between the numeric columns.

    The figure is written to ``correlation_heatmap.png`` in the current
    working directory and the figure is cleared afterwards.

    Args:
        data: DataFrame to analyse. Columns that are neither numeric nor
            boolean are left out of the correlation matrix.
    """
    # DataFrame.corr() raises on string/object columns in pandas >= 2.0
    # (numeric_only defaults to False there). Selecting numeric and boolean
    # dtypes up front gives the same matrix older pandas produced implicitly
    # and works across pandas versions.
    numeric_data = data.select_dtypes(include=["number", "bool"])

    plt.figure(figsize=(10, 7))
    sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm")
    plt.title("Feature Correlation Heatmap")
    plt.savefig("correlation_heatmap.png")
    plt.clf()

correlation_plot(df)

# Optional: detection-method-by-year heatmap, drawn only when both columns exist
if all(col in df.columns for col in ('year', 'detection_method')):
    # Count rows for every (detection_method, year) pair; absent pairs become 0.
    pivot = df.pivot_table(
        index='detection_method',
        columns='year',
        aggfunc='size',
        fill_value=0,
    )
    sns.heatmap(pivot, cmap="YlOrBr")
    plt.title("Detection Method Over Years")
    plt.savefig("detection_heatmap.png")
    plt.clf()

# Class distribution: one bar per distinct value of the 'class' column
sns.countplot(data=df, x='class')
plt.title("Asteroid Class Distribution")
plt.savefig("class_countplot.png")
plt.clf()

# 🛠️ 5. Feature Engineering
# Derived feature: 'moid_ld' divided element-wise by 'a'
df['MOID_SMA_Ratio'] = df['moid_ld'].div(df['a'])

# Remove identifier columns in place; names that are not present are skipped
identifier_cols = ['name', 'id']
df.drop(columns=identifier_cols, errors='ignore', inplace=True)

# 🎯 6. Handle Class Imbalance with SMOTE
target = 'class'
X = df.drop(target, axis=1)
y = df[target]

print("\n🎯 Class Distribution Before SMOTE:\n", y.value_counts())

# 🔀 7. Split the Data
# Split BEFORE oversampling. Running SMOTE on the full dataset first lets
# synthetic points interpolated from future test rows land in the training
# set (and synthetic rows land in the test set), which leaks information and
# inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Oversample the training portion only; the test split keeps the original,
# un-resampled class distribution.
# NOTE(review): SMOTE expects numeric features without NaN/inf — confirm X
# satisfies that after the column drops above.
oversampler = SMOTE(random_state=42)
X_res, y_res = oversampler.fit_resample(X_train, y_train)

print("\n📊 After SMOTE:\n", pd.Series(y_res).value_counts())

# Downstream code trains on X_train / y_train, so point them at the balanced data.
X_train, y_train = X_res, y_res

# ⚙️ 8. XGBoost Classifier
# Hyper-parameters gathered in one mapping and unpacked into the constructor
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,
    eval_metric='mlogloss',
    verbosity=0,
)
model = XGBClassifier(**xgb_params)

# Train on the training split, then predict labels for the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 📏 9. Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Accuracy Score:", accuracy)

report = classification_report(y_test, y_pred)
print("\n📄 Classification Report:\n", report)

# Confusion matrix rendered as an annotated heatmap with integer cell labels
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, cmap='Blues', annot=True, fmt='d')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.savefig("confusion_matrix.png")
plt.clf()

# 🔍 Feature Importance
# Rank features from most to least important according to the fitted model
importances = model.feature_importances_
sorted_indices = np.flip(np.argsort(importances))
feat_names = X.columns[sorted_indices]
ranked_importances = importances[sorted_indices]

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis
plt.figure(figsize=(10, 6))
sns.barplot(x=ranked_importances, y=feat_names)
plt.title("Feature Importance")
plt.savefig("feature_importance.png")
plt.clf()

print("\n All plots and analysis completed. Use them in your final PDF report.")
