1. Machine learning and data science projects focused on star classification and prediction
2. Astrophysics research and celestial body analysis
3. Educational tools for astronomy and space science courses
4. Visualization projects exploring the diversity of stellar objects

Whether you're training models to classify stars, studying stellar evolution, or simply exploring the wonders of the universe, this dataset provides a solid foundation for your cosmic endeavors. Dive into the data and let the stars guide your discoveries!

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Load the cleaned star dataset.
# A raw string keeps the Windows backslashes literal; in a normal string
# "\L", "\p" and "\c" are invalid escape sequences that Python warns about.
# NOTE(review): absolute, machine-specific path — consider a path relative to the notebook.
data = pd.read_csv(r"K:\Learning Progress\project\cleaned_star_data.csv")

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Count duplicated rows to see whether any need to be removed
data.duplicated().sum()

0

In [5]:
# Check for missing values: number of nulls in each column
data.isnull().sum()

Temperature (K)           1
Luminosity(L/Lo)          1
Radius(R/Ro)              1
Absolute magnitude(Mv)    1
Star type                 1
Star color                1
Spectral Class            1
dtype: int64

In [6]:
# Remove every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [7]:
# Show the distinct values of each label column to spot odd entries.
for label, column in (
    ("Star type: ", 'Star type'),
    ("Star color: ", 'Star color'),
    ("Spectral class :", 'Spectral Class'),
):
    print(label, data[column].unique())

Star type:  [0. 1. 2. 3. 4. 5.]
Star color:  ['Red' 'Blue-White' 'White' ' ' 'Yellow-White' 'Blue']
Spectral class : ['M' 'B' 'A' 'F' 'O' ' ' 'K' 'G']


In [8]:
# Oops: some missing values are disguised as blank strings such as ' '.
# Treat any empty or whitespace-only entry as missing (not only a single
# space), so variants like '' or '  ' do not survive as bogus categories.
blank_pattern = r'^\s*$'
data['Star color'] = data['Star color'].replace(blank_pattern, np.nan, regex=True)
data['Spectral Class'] = data['Spectral Class'].replace(blank_pattern, np.nan, regex=True)

In [9]:
# Recount missing values per column now that blank strings are marked as NaN
data.isna().sum()

Temperature (K)           0
Luminosity(L/Lo)          0
Radius(R/Ro)              0
Absolute magnitude(Mv)    0
Star type                 0
Star color                6
Spectral Class            2
dtype: int64

In [10]:
# Coerce the four measurement columns to a numeric dtype; values that cannot
# be parsed become NaN instead of raising.
numeric_cols = ['Temperature (K)', 'Luminosity(L/Lo)', 'Radius(R/Ro)', 'Absolute magnitude(Mv)']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Discard every row that still holds a missing value in any column.
data = data.dropna()

In [11]:
# Inspect the frame after the cleaning steps
data

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
1,3042.0,0.000500,0.1542,16.60,0.0,Red,M
2,2600.0,0.000300,0.1020,18.70,0.0,Red,M
4,1939.0,0.000138,0.1030,20.06,0.0,Red,M
6,2637.0,0.000730,0.1270,17.22,0.0,Red,M
7,2600.0,0.000400,0.0960,17.40,0.0,Red,M
...,...,...,...,...,...,...,...
235,38940.0,374830.000000,1356.0000,-9.93,5.0,Blue,O
236,30839.0,834042.000000,1194.0000,-10.63,5.0,Blue,O
237,8829.0,537493.000000,1423.0000,-10.73,5.0,White,A
238,9235.0,404940.000000,1112.0000,-11.23,5.0,White,A


In [12]:
# Column groups: numeric measurements and categorical labels.
num = [
    'Temperature (K)',
    'Luminosity(L/Lo)',
    'Radius(R/Ro)',
    'Absolute magnitude(Mv)',
]
cats = [
    'Star type',
    'Star color',
    'Spectral Class',
]

In [13]:
# Summarise dtypes and non-null counts per column
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215 entries, 1 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         215 non-null    float64
 1   Luminosity(L/Lo)        215 non-null    float64
 2   Radius(R/Ro)            215 non-null    float64
 3   Absolute magnitude(Mv)  215 non-null    float64
 4   Star type               215 non-null    float64
 5   Star color              215 non-null    object 
 6   Spectral Class          215 non-null    object 
dtypes: float64(5), object(2)
memory usage: 13.4+ KB


In [14]:
# Make sure each measurement column is numeric; unparseable values become NaN.
for column in (
    'Temperature (K)',
    'Luminosity(L/Lo)',
    'Radius(R/Ro)',
    'Absolute magnitude(Mv)',
):
    data[column] = pd.to_numeric(data[column], errors='coerce')

In [15]:
# Names of all columns with a numeric dtype.
numerical_columns = data.select_dtypes(include=['number']).columns
# Replace any remaining NaN in those columns with that column's mean.
column_means = data[numerical_columns].mean()
data[numerical_columns] = data[numerical_columns].fillna(column_means)

In [16]:
# Re-check dtypes and non-null counts after the numeric conversion and mean fill
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215 entries, 1 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         215 non-null    float64
 1   Luminosity(L/Lo)        215 non-null    float64
 2   Radius(R/Ro)            215 non-null    float64
 3   Absolute magnitude(Mv)  215 non-null    float64
 4   Star type               215 non-null    float64
 5   Star color              215 non-null    object 
 6   Spectral Class          215 non-null    object 
dtypes: float64(5), object(2)
memory usage: 13.4+ KB


In [17]:
# Summary statistics for the numeric measurement columns listed in `num`
data[num].describe()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv)
count,215.0,215.0,215.0,215.0
mean,10506.595349,108604.722141,232.865406,4.379716
std,9453.871649,183781.510616,514.646696,10.556794
min,1939.0,8e-05,0.0084,-11.92
25%,3355.0,0.000815,0.1009,-6.227
50%,5936.0,0.085,0.795,6.228
75%,15129.0,198000.0,40.5,13.88
max,40000.0,849420.0,1948.5,20.06


In [18]:
# Prepare the features for predicting the star type.

# Text-valued columns that must be turned into indicator columns.
categorical_columns = ['Star color', 'Spectral Class']

# One-hot encode them; drop_first=True omits the first level of each column.
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)

In [19]:
# Verify the columns and dtypes produced by the one-hot encoding
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 215 entries, 1 to 239
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Temperature (K)          215 non-null    float64
 1   Luminosity(L/Lo)         215 non-null    float64
 2   Radius(R/Ro)             215 non-null    float64
 3   Absolute magnitude(Mv)   215 non-null    float64
 4   Star type                215 non-null    float64
 5   Star color_Blue-White    215 non-null    uint8  
 6   Star color_Red           215 non-null    uint8  
 7   Star color_White         215 non-null    uint8  
 8   Star color_Yellow-White  215 non-null    uint8  
 9   Spectral Class_B         215 non-null    uint8  
 10  Spectral Class_F         215 non-null    uint8  
 11  Spectral Class_G         215 non-null    uint8  
 12  Spectral Class_K         215 non-null    uint8  
 13  Spectral Class_M         215 non-null    uint8  
 14  Spectral Class_O         215 non-null    uint8  

In [20]:
# Inspect the encoded frame
data

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color_Blue-White,Star color_Red,Star color_White,Star color_Yellow-White,Spectral Class_B,Spectral Class_F,Spectral Class_G,Spectral Class_K,Spectral Class_M,Spectral Class_O
1,3042.0,0.000500,0.1542,16.60,0.0,0,1,0,0,0,0,0,0,1,0
2,2600.0,0.000300,0.1020,18.70,0.0,0,1,0,0,0,0,0,0,1,0
4,1939.0,0.000138,0.1030,20.06,0.0,0,1,0,0,0,0,0,0,1,0
6,2637.0,0.000730,0.1270,17.22,0.0,0,1,0,0,0,0,0,0,1,0
7,2600.0,0.000400,0.0960,17.40,0.0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,38940.0,374830.000000,1356.0000,-9.93,5.0,0,0,0,0,0,0,0,0,0,1
236,30839.0,834042.000000,1194.0000,-10.63,5.0,0,0,0,0,0,0,0,0,0,1
237,8829.0,537493.000000,1423.0000,-10.73,5.0,0,0,1,0,0,0,0,0,0,0
238,9235.0,404940.000000,1112.0000,-11.23,5.0,0,0,1,0,0,0,0,0,0,0


In [22]:
# Target is the star type; every other column is a feature.
y = data['Star type']
X = data.drop(columns=['Star type'])
print(X.shape, y.shape)

(215, 14) (215,)


In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [24]:
from sklearn.linear_model import LinearRegression
# Initialize the model
# NOTE(review): 'Star type' is a discrete label (0-5), yet a linear regressor
# is fitted here, so predictions are continuous values rather than classes.
# The classifiers imported earlier may be the intended choice — confirm.
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [25]:
# Report the fitted intercept of the linear model.
print(f'nilai intercept adalah : {model.intercept_}')

nilai intercept adalah : 3.9094829678825973


In [26]:
# Predict the target for the held-out test features
y_pred = model.predict(X_test)

In [27]:
# Side-by-side table of the true test labels and the model's predictions.
# NOTE(review): this rebinds `data`, replacing the encoded star frame used by
# the earlier cells; re-running those cells afterwards will no longer find the
# star columns. A distinct name (e.g. `comparison`) would avoid that — left
# unchanged here in case later cells rely on this binding.
data = pd.DataFrame({'actual': y_test, 'predicted':y_pred})
data

Unnamed: 0,actual,predicted
222,4.0,4.374544
46,4.0,3.534582
106,4.0,3.806192
189,0.0,0.43426
193,1.0,0.901119
90,3.0,2.946009
113,5.0,4.944992
156,3.0,3.328922
8,0.0,0.31308
99,3.0,3.22787


In [28]:
from sklearn import metrics

# Error metrics on the held-out split; the MSE is computed once and reused
# for the RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

print('mean absolute error:', mae)
print('mean squared error:', mse)
print('Root mean squared error:', np.sqrt(mse))
print('model R^2 Square Value :', r2)

mean absolute error: 0.2887581502935353
mean squared error: 0.11843258814257428
Root mean squared error: 0.3441403611065902
model R^2 Square Value : 0.9577906986361565
