Importing Librarires

In [1]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import time
import os

Data Preprocessing

In [3]:
# Loading the dataset
data = pd.read_csv('dataset.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4534 entries, 0 to 4533
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          4534 non-null   int64  
 1   Epoch Date Close Approach     3280 non-null   float64
 2   Relative Velocity km per sec  3184 non-null   object 
 3   Relative Velocity km per hr   3033 non-null   float64
 4   Miles per hour                3668 non-null   float64
 5   Miss Dist.(Astronomical)      3933 non-null   float64
 6   Miss Dist.(lunar)             3417 non-null   float64
 7   Miss Dist.(kilometers)        3166 non-null   float64
 8   Miss Dist.(miles)             3882 non-null   float64
 9   Jupiter Tisserand Invariant   2802 non-null   float64
 10  Epoch Osculation              3007 non-null   float64
 11  Semi Major Axis               3346 non-null   float64
 12  Asc Node Longitude            3438 non-null   float64
 13  Per

In [5]:
data.head()

Unnamed: 0,Name,Epoch Date Close Approach,Relative Velocity km per sec,Relative Velocity km per hr,Miles per hour,Miss Dist.(Astronomical),Miss Dist.(lunar),Miss Dist.(kilometers),Miss Dist.(miles),Jupiter Tisserand Invariant,...,Aphelion Dist,Perihelion Time,Mean Anomaly,Mean Motion,approach_year,approach_month,approach_day,Orbital Period,Orbit Uncertainity,Hazardous
0,3703080,788947200000.0,Very Slow,22017.003799,13680.509944,0.419483,163.178711,62753692.0,38993336.0,,...,2.005764,,264.837533,0.590551,,1.0,1.0,Low,Medium,True
1,3723955,,Slow,65210.346095,40519.173105,,,57298148.0,35603420.0,,...,1.497352,2457795.0,173.741112,0.84533,1995.0,1.0,1.0,Low,,False
2,2446862,789552000000.0,,27326.560182,16979.661798,0.050956,19.82189,7622911.5,4736657.5,4.557,...,1.966857,2458120.0,,,1995.0,,8.0,Medium,,True
3,3092506,790156800000.0,Very Slow,40225.948191,24994.839864,,,,26522368.0,5.093,...,1.527904,2457902.0,68.741007,0.700277,1995.0,,15.0,Low,Medium,False
4,3514799,790156800000.0,Very Slow,35426.991794,,0.407832,158.646713,61010824.0,,5.154,...,,2457814.0,,,,,15.0,,Low,True


In [6]:
# Categorize columns as numerical or categorical
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

print("\nNumerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)


Numerical columns: ['Name', 'Epoch Date Close Approach', 'Relative Velocity km per hr', 'Miles per hour', 'Miss Dist.(Astronomical)', 'Miss Dist.(lunar)', 'Miss Dist.(kilometers)', 'Miss Dist.(miles)', 'Jupiter Tisserand Invariant', 'Epoch Osculation', 'Semi Major Axis', 'Asc Node Longitude', 'Perihelion Arg', 'Aphelion Dist', 'Perihelion Time', 'Mean Anomaly', 'Mean Motion', 'approach_year', 'approach_month', 'approach_day']
Categorical columns: ['Relative Velocity km per sec', 'Orbital Period', 'Orbit Uncertainity']


In [7]:
# Missing Values
data.isnull().sum()

Name                               0
Epoch Date Close Approach       1254
Relative Velocity km per sec    1350
Relative Velocity km per hr     1501
Miles per hour                   866
Miss Dist.(Astronomical)         601
Miss Dist.(lunar)               1117
Miss Dist.(kilometers)          1368
Miss Dist.(miles)                652
Jupiter Tisserand Invariant     1732
Epoch Osculation                1527
Semi Major Axis                 1188
Asc Node Longitude              1096
Perihelion Arg                  1134
Aphelion Dist                    815
Perihelion Time                 1564
Mean Anomaly                     918
Mean Motion                     1508
approach_year                    819
approach_month                  1528
approach_day                     543
Orbital Period                   530
Orbit Uncertainity              1767
Hazardous                          0
dtype: int64

In [8]:
#Total Null values
Misiing_values=data.isnull().sum().sum()
print("Total misiing values :",Misiing_values)

Total misiing values : 25378


In [9]:
# Type of values in categorical columns
for col in categorical_cols:
    if len(data[col].unique()) <= 10: 
        print(f"\nUnique values in '{col}':", data[col].unique())


Unique values in 'Relative Velocity km per sec': ['Very Slow' 'Slow' nan 'Fast' 'Very Fast']

Unique values in 'Orbital Period': ['Low' 'Medium' nan 'High']

Unique values in 'Orbit Uncertainity': ['Medium' nan 'Low' 'High']


In [10]:
# Correlation matrix for numerical columns
print("\nCorrelation matrix for numerical columns:")
print(data[numerical_cols].corr())


Correlation matrix for numerical columns:
                                 Name  Epoch Date Close Approach  \
Name                         1.000000                   0.188469   
Epoch Date Close Approach    0.188469                   1.000000   
Relative Velocity km per hr -0.173408                  -0.087520   
Miles per hour              -0.172888                  -0.083833   
Miss Dist.(Astronomical)    -0.155269                  -0.139330   
Miss Dist.(lunar)           -0.140484                  -0.142746   
Miss Dist.(kilometers)      -0.142508                  -0.145067   
Miss Dist.(miles)           -0.153537                  -0.147898   
Jupiter Tisserand Invariant  0.005657                  -0.143766   
Epoch Osculation             0.001506                  -0.032676   
Semi Major Axis              0.022772                   0.114847   
Asc Node Longitude          -0.007010                  -0.000859   
Perihelion Arg              -0.004971                   0.004541   
Aphel

In [11]:
# Impute missing values using KNN imputation
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

In [12]:
# Scale numerical features (important for KNN)
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [13]:
# Scale numerical features (important for KNN)
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [14]:
# Create a KNN imputer object 
imputer = KNNImputer(n_neighbors=10)

In [15]:
# Impute missing values for numerical columns
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

In [16]:
# Impute missing values for categorical columns (using KNN for categorical features is less common)
for col in categorical_cols:
    if data[col].isnull().any():
        mapping = {category: i for i, category in enumerate(data[col].unique())}
        data[col] = data[col].map(mapping)
        data[col] = imputer.fit_transform(data[col].values.reshape(-1, 1))
        reverse_mapping = {i: category for category, i in mapping.items()}
        data[col] = data[col].map(reverse_mapping)

In [17]:
# Inverse transform numerical features to get the original scale
data[numerical_cols] = scaler.inverse_transform(data[numerical_cols])

In [18]:
# Apply floor function to 'approach_year', 'approach_month', and 'approach_day'
data['approach_year'] = data['approach_year'].apply(np.floor).astype(int)
data['approach_month'] = data['approach_month'].apply(np.floor).astype(int)
data['approach_day'] = data['approach_day'].apply(np.floor).astype(int)

In [19]:
# Calculate statistics for "Relative Velocity km per hr"
min_velocity = data['Relative Velocity km per hr'].min()
max_velocity = data['Relative Velocity km per hr'].max()
mean_velocity = data['Relative Velocity km per hr'].mean()
std_velocity = data['Relative Velocity km per hr'].std()

In [20]:
# Define bins based on statistics
bins = [min_velocity, 
        mean_velocity - std_velocity, 
        mean_velocity, 
        mean_velocity + std_velocity, 
        max_velocity]
labels = ['Very Slow', 'Slow', 'Medium', 'Fast']

In [21]:
# Create a new column with binned velocity categories
data['binned_velocity'] = pd.cut(data['Relative Velocity km per hr'], bins=bins, labels=labels, include_lowest=True)

# Impute missing values in "Relative Velocity km per sec" using the binned categories
data['Relative Velocity km per sec'].fillna(data['binned_velocity'], inplace=True)

# Drop the temporary "binned_velocity" column
data.drop('binned_velocity', axis=1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Relative Velocity km per sec'].fillna(data['binned_velocity'], inplace=True)


In [22]:
# Print the bins and labels being used
print("Bins for 'Relative Velocity km per hr':")
for i in range(len(bins) - 1):
    print(f"- {labels[i]}: [{bins[i]:.2f}, {bins[i+1]:.2f})")

Bins for 'Relative Velocity km per hr':
- Very Slow: [-1.86, -0.91)
- Slow: [-0.91, -0.03)
- Medium: [-0.03, 0.84)
- Fast: [0.84, 4.15)


In [23]:
# Print the "Relative Velocity km per sec" column after imputation
print("\n'Relative Velocity km per sec' column after imputation:")
print(data['Relative Velocity km per sec'])


'Relative Velocity km per sec' column after imputation:
0       Very Slow
1            Slow
2            Slow
3       Very Slow
4       Very Slow
          ...    
4529         Slow
4530    Very Slow
4531    Very Slow
4532         Slow
4533    Very Fast
Name: Relative Velocity km per sec, Length: 4534, dtype: object


In [24]:
# Filling missing values in 'Orbital Period' and 'Orbit Uncertainty' with random categories
categories = ['Low', 'Medium', 'High']
data['Orbital Period'] = data['Orbital Period'].apply(lambda x: random.choice(categories) if pd.isnull(x) else x)
data['Orbit Uncertainity'] = data['Orbit Uncertainity'].apply(lambda x: random.choice(categories) if pd.isnull(x) else x)

In [25]:
data = pd.read_csv('Preprocessed_dataset.csv')

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in tqdm(data.select_dtypes(include=['object']).columns, desc="Encoding columns"):
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Separate the features and the target variable
X = data.drop('Hazardous', axis=1)
y = data['Hazardous']


Encoding columns: 100%|██████████| 3/3 [00:00<00:00, 996.19it/s]


Test and Train Split

In [34]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Create ensemble models
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42,criterion='gini', max_depth=100, min_samples_split=3, min_samples_leaf=3, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None, monotonic_cst=None)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42,learning_rate=0.1)


In [46]:
# Create a VotingClassifier ensemble model
ensemble_clf = VotingClassifier(estimators=[('rf', rf_clf), ('gb', gb_clf)], voting='hard')

In [47]:
# Train the ensemble model
for _ in tqdm(range(100), desc="Training progress"):
    time.sleep(0.01)
ensemble_clf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = ensemble_clf.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the ensemble model: {accuracy:.2f}')

Training progress: 100%|██████████| 100/100 [00:01<00:00, 96.13it/s]


Accuracy of the ensemble model: 0.86
