In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
file_path = 'data/cosmicclassifierTraining.csv'
df = pd.read_csv(file_path)
df.head(5)

Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Magnetic Field Strength,Radiation Levels,Atmospheric Composition Index,Prediction
0,0.472806,,-0.313872,-2.089299,-0.152201,-0.885649,0.900105,,Category_6,0.692907,5.0
1,4.180154,-1.157515,2.430956,-1.59585,-3.188678,-0.609434,-0.199828,Category_9,Category_9,,0.0
2,-0.129008,1.621592,-0.785741,2.081196,-1.413796,-0.095152,-3.502577,,Category_8,-0.677182,4.0
3,-3.122,-2.299818,1.072092,0.353524,-0.192529,2.917067,-1.972329,,Category_11,0.109429,1.0
4,-1.459426,2.890268,0.148757,-0.804439,0.494875,0.04491,-0.438796,Category_6,Category_10,0.407941,9.0


In [4]:
print(df.shape)
print(list(df.columns))  


(60000, 11)
['Atmospheric Density', 'Surface Temperature', 'Gravity', 'Water Content', 'Mineral Abundance', 'Orbital Period', 'Proximity to Star', 'Magnetic Field Strength', 'Radiation Levels', 'Atmospheric Composition Index', 'Prediction']


In [5]:
missing_values = df.isna().sum()
print("\nMissing values per column:")
print(missing_values)


Missing values per column:
Atmospheric Density              2984
Surface Temperature              3032
Gravity                          2984
Water Content                    3077
Mineral Abundance                2921
Orbital Period                   2997
Proximity to Star                2945
Magnetic Field Strength          3058
Radiation Levels                 3021
Atmospheric Composition Index    2942
Prediction                       3039
dtype: int64


In [6]:
print("\nDescriptive statistics for numeric columns:")
df.describe()


Descriptive statistics for numeric columns:


Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Atmospheric Composition Index,Prediction
count,57016.0,56968.0,57016.0,56923.0,57079.0,57003.0,57055.0,57058.0,56961.0
mean,-0.000202,-0.000288,-0.000469,0.001938,-0.00073,-0.001043,0.000188,0.000211,4.454381
std,2.263527,1.936598,1.804605,1.689267,1.605524,1.511685,1.316682,1.120303,2.890055
min,-4.364843,-5.503527,-5.553877,-5.816755,-5.077363,-4.801046,-4.537187,-4.007504,0.0
25%,-1.55581,-1.426786,-1.279002,-1.21887,-1.078449,-1.04892,-0.937097,-0.709852,2.0
50%,-0.18867,-0.330037,0.046231,-0.004676,0.040008,0.036651,-0.062001,0.049292,4.0
75%,1.308113,1.503646,1.255432,1.063391,1.095483,1.0588,0.888149,0.789511,7.0
max,9.324018,5.638094,6.03029,6.287045,5.584059,5.111014,4.942699,3.852567,9.0


In [7]:
# Drop rows with NaN values
df = df.dropna().reset_index(drop=True)

In [8]:
print("\nDescriptive statistics for numeric columns:")
df.describe()


Descriptive statistics for numeric columns:


Unnamed: 0,Atmospheric Density,Surface Temperature,Gravity,Water Content,Mineral Abundance,Orbital Period,Proximity to Star,Atmospheric Composition Index,Prediction
count,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0,34059.0
mean,-0.011847,-0.002704,0.00397,0.003003,0.005906,-0.001933,-0.005676,0.004431,4.457999
std,2.257107,1.936163,1.803941,1.688711,1.602094,1.514356,1.317587,1.125303,2.890949
min,-4.283309,-5.426189,-5.553877,-5.816755,-5.077363,-4.801046,-4.537187,-4.007504,0.0
25%,-1.560969,-1.423844,-1.276465,-1.218254,-1.069377,-1.054219,-0.942822,-0.713259,2.0
50%,-0.199246,-0.329629,0.053229,0.003468,0.051432,0.047483,-0.068085,0.050758,4.0
75%,1.291944,1.498558,1.263103,1.062735,1.101213,1.064815,0.879585,0.799353,7.0
max,9.324018,5.638094,6.03029,6.287045,5.335537,5.111014,4.731871,3.852567,9.0


In [9]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile.to_file("your_report.html")

  from .autonotebook import tqdm as notebook_tqdm
Summarize dataset: 100%|██████████| 101/101 [00:05<00:00, 18.82it/s, Completed]                                                         
Generate report structure: 100%|██████████| 1/1 [00:01<00:00,  1.81s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 64.76it/s]


In [10]:
print("Unique values in 'Prediction':", df['Prediction'].unique())
print("Unique values in 'Magnetic Field Strength':", df['Magnetic Field Strength'].unique())
print("Unique values in 'Radiation Levels':", df['Radiation Levels'].unique())


Unique values in 'Prediction': [9. 1. 3. 7. 2. 6. 0. 5. 8. 4.]
Unique values in 'Magnetic Field Strength': ['Category_6' 'Category_8' 'Category_14' 'Category_13' 'Category_7'
 'Category_10' 'Category_9' 'Category_12' 'Category_11' 'Category_15'
 'Category_4' 'Category_5' 'Category_16' 'Category_17' 'Category_3'
 'Category_18' 'Category_2' 'Category_19' 'Category_1' 'Category_20']
Unique values in 'Radiation Levels': ['Category_10' 'Category_7' 'Category_8' 'Category_12' 'Category_9'
 'Category_5' 'Category_6' 'Category_11' 'Category_13' 'Category_14'
 'Category_3' 'Category_4' 'Category_15' 'Category_16' 'Category_2'
 'Category_1' 'Category_17' 'Category_18' 'Category_19' 'Category_20']


In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoders
magnetic_encoder = LabelEncoder()
radiation_encoder = LabelEncoder()

# Encode "Magnetic Field Strength"
df['Magnetic_Field_encoded'] = magnetic_encoder.fit_transform(df['Magnetic Field Strength'])

# Encode "Radiation Levels"
df['Radiation_Levels_encoded'] = radiation_encoder.fit_transform(df['Radiation Levels'])

# Print Encoded Mapping
print("Magnetic Field Encoding:", dict(zip(magnetic_encoder.classes_, magnetic_encoder.transform(magnetic_encoder.classes_))))
print("Radiation Levels Encoding:", dict(zip(radiation_encoder.classes_, radiation_encoder.transform(radiation_encoder.classes_))))


Magnetic Field Encoding: {'Category_1': np.int64(0), 'Category_10': np.int64(1), 'Category_11': np.int64(2), 'Category_12': np.int64(3), 'Category_13': np.int64(4), 'Category_14': np.int64(5), 'Category_15': np.int64(6), 'Category_16': np.int64(7), 'Category_17': np.int64(8), 'Category_18': np.int64(9), 'Category_19': np.int64(10), 'Category_2': np.int64(11), 'Category_20': np.int64(12), 'Category_3': np.int64(13), 'Category_4': np.int64(14), 'Category_5': np.int64(15), 'Category_6': np.int64(16), 'Category_7': np.int64(17), 'Category_8': np.int64(18), 'Category_9': np.int64(19)}
Radiation Levels Encoding: {'Category_1': np.int64(0), 'Category_10': np.int64(1), 'Category_11': np.int64(2), 'Category_12': np.int64(3), 'Category_13': np.int64(4), 'Category_14': np.int64(5), 'Category_15': np.int64(6), 'Category_16': np.int64(7), 'Category_17': np.int64(8), 'Category_18': np.int64(9), 'Category_19': np.int64(10), 'Category_2': np.int64(11), 'Category_20': np.int64(12), 'Category_3': np.int

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
import pandas as pd

# Define features (X) and target (y)
X = df[['Atmospheric Density', 'Surface Temperature', 'Gravity', 'Water Content', 
        'Mineral Abundance', 'Orbital Period', 'Proximity to Star', 
        'Magnetic_Field_encoded', 'Radiation_Levels_encoded', 'Atmospheric Composition Index']]

y = df['Prediction']  # Target variable

# Train a RandomForest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importance scores
feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Print results
print("Feature Importance (Random Forest):\n", feature_importance)

# Calculate Mutual Information
mutual_info = mutual_info_classif(X, y)
mutual_info_series = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False)

print("\nMutual Information Scores:\n", mutual_info_series)


Feature Importance (Random Forest):
 Atmospheric Density              0.169299
Surface Temperature              0.155090
Water Content                    0.130819
Mineral Abundance                0.113398
Orbital Period                   0.105740
Proximity to Star                0.092436
Gravity                          0.081731
Atmospheric Composition Index    0.064001
Magnetic_Field_encoded           0.063570
Radiation_Levels_encoded         0.023916
dtype: float64

Mutual Information Scores:
 Surface Temperature              0.520912
Atmospheric Density              0.481845
Water Content                    0.357254
Mineral Abundance                0.314030
Orbital Period                   0.237313
Magnetic_Field_encoded           0.211651
Proximity to Star                0.181322
Gravity                          0.178893
Atmospheric Composition Index    0.113620
Radiation_Levels_encoded         0.059524
dtype: float64


In [13]:
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(X, y)

plt.figure(figsize=(12, 6))
plot_tree(dt, feature_names=X.columns, class_names=[str(i) for i in range(10)], filled=True)
matplotlib.use("TkAgg")
plt.show()


NameError: name 'matplotlib' is not defined