---
---
---

# NASA Meteorite Landings Prediction

---
---
---

<br>

## Dependencies

### Packages

In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from kagglehub import KaggleDatasetAdapter
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import kagglehub
import numpy as np

### Utilities

In [2]:
from lib.data_profiling import profile_data

### Dataset

In [3]:
# Importing data
initial_df = kagglehub.dataset_load(adapter=KaggleDatasetAdapter.PANDAS,
                                    handle='nasa/meteorite-landings',
                                    path='meteorite-landings.csv')

# -------------------------

initial_df.head()

Unnamed: 0,name,id,nametype,recclass,mass,fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775000, 6.083330)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.183330, 10.233330)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.216670, -113.000000)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.883330, -99.900000)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.166670, -64.950000)"


## Data Profiling

In [4]:
# # Profiling the data
# profile_data(initial_df,
#              '../res/data-profiling',
#              file_name='initial-df',
#              report_title='Initial Dataset')

## Feature Engineering

In [5]:
# Selecting only necessary columns
final_df = initial_df[['nametype', 'recclass', 'mass', 'fall', 'reclat', 'reclong']]

In [6]:
# Removing all 'Relic' entries, then 'nametype' feature will be no longer necessary
final_df = final_df[final_df['nametype'] != 'Relic']
final_df = final_df.drop(columns=['nametype'])

In [7]:
# Keeping only the top 10 classes and group the rest as 'Others'
top_10_classes = final_df['recclass'].value_counts().nlargest(10).index
final_df['recclass'] = final_df['recclass'].apply(lambda x: x if x in top_10_classes else 'Others')

In [8]:
# Imputing missing values in 'mean' with the median
final_df['mass'] = final_df['mass'].fillna(final_df['mass'].median())

# Transforming the 'mass' feature using log1p
final_df['mass'] = np.log1p(final_df['mass'])

In [9]:
# Imputing missing values in 'reclat' and 'reclong' with the median
final_df['reclat'] = final_df['reclat'].fillna(final_df['reclat'].median())
final_df['reclong'] = final_df['reclong'].fillna(final_df['reclong'].median())

In [10]:
# # Profiling the data
# profile_data(final_df,
#              '../res/data-profiling',
#              file_name='final-df',
#              report_title='Final Dataset')

## Machine Learning

In [11]:
# Target
final_df['fall'] = final_df['fall'].map({'Fell': 1, 'Found': 0})

# Attributes
num_attr = ['mass', 'reclat', 'reclong']
cat_attr = ['recclass']

# -------------------------

# Implementing the pipeline
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_attr),
        ('cat', cat_transformer, cat_attr)
    ]
)
model = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced'))
])

# -------------------------

# Splitting train and test sets
X = final_df[num_attr + cat_attr]
y = final_df['fall']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------

# Fitting the model, setting the threshold for positive class
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.8).astype(int)

# -------------------------

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8916
           1       0.72      0.71      0.72       228

    accuracy                           0.99      9144
   macro avg       0.86      0.85      0.85      9144
weighted avg       0.99      0.99      0.99      9144

