In [12]:
from sklearn.model_selection import train_test_split
import pandas as pd
from decimal import Decimal

# Location of the raw dataset; the three split files are written next to it.
raw_data_dir = "../data/raw/"
raw_data_filename = "star_classification.csv"
df = pd.read_csv(raw_data_dir + raw_data_filename)

# Fraction of the full dataset assigned to each split. Decimal keeps the
# sums and ratios below exact before they are converted to float for sklearn.
anl_size = Decimal("0.1")
test_size = Decimal("0.2")
train_size = Decimal("0.7")

# The first split derives the analysis share implicitly as
# 1 - (train_size + test_size). Without this check a set of fractions that
# does not sum to 1 would silently ignore anl_size.
if min(anl_size, test_size, train_size) <= 0:
    raise ValueError("anl_size, test_size and train_size must all be positive")
if anl_size + test_size + train_size != 1:
    raise ValueError(
        "anl_size + test_size + train_size must equal 1, got "
        f"{anl_size + test_size + train_size}"
    )

# Share of the data that remains after the analysis set is carved off.
temp_size = train_size + test_size

# 1) Split off the analysis set. train_test_split returns the complement
#    first and the `test_size` portion second, so df_anl receives anl_size
#    of the rows and df_temp receives temp_size of them.
df_anl, df_temp = train_test_split(
    df,
    test_size=float(temp_size),
    stratify=df['class'],        # keep class proportions
    random_state=42              # fixed seed for reproducibility
)

# 2) Split the remainder into train and test. df_temp holds only temp_size
#    of the original rows, so the test fraction is rescaled to
#    test_size / temp_size to end up with test_size of the original data.
df_train, df_test = train_test_split(
    df_temp,
    test_size=float(test_size / temp_size),
    stratify=df_temp['class'],   # still stratify on the same column
    random_state=42
)

# Sanity check: the sizes should be roughly anl_size, train_size and
# test_size times len(df), and the class proportions of every split should
# match those of df.
print(len(df_anl), len(df_train), len(df_test))
for split in (df_anl, df_train, df_test):
    print(split['class'].value_counts(normalize=True))


# Persist the three splits as CSV files without the index column.
df_anl.to_csv(raw_data_dir + 'analysis.csv', index=False)
df_train.to_csv(raw_data_dir + 'train.csv', index=False)
df_test.to_csv(raw_data_dir + 'test.csv', index=False)

10000 70000 20000
class
GALAXY    0.5945
STAR      0.2159
QSO       0.1896
Name: proportion, dtype: float64
class
GALAXY    0.594443
STAR      0.215943
QSO       0.189614
Name: proportion, dtype: float64
class
GALAXY    0.59445
STAR      0.21595
QSO       0.18960
Name: proportion, dtype: float64
