In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

df = pd.read_csv("../dataset/abalone.csv",index_col=0)

In [15]:
# 데이터 확인
df

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [17]:
# 단변량 시퀀스를 여러 샘플로 분할하는 함수
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        end_ix = i + n_steps
        if end_ix > len(sequence)-1:
            break
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [19]:
# 결측치 확인
(df == 0).sum()

Sex               0
Length            0
Diameter          0
Height            2
Whole_weight      0
Shucked_weight    0
Viscera_weight    0
Shell_weight      0
Rings             0
dtype: int64

In [21]:
# 결측치 제거
# Outcome 열을 제외한 나머지 열에서 0이 포함된 행을 제거
df = df[(df!= 0).all(axis=1)]

# 결과 출력
print(df.shape)  # 0 값이 제거된 후의 데이터 크기 확인

(4175, 9)


In [23]:
# Sex 값을 F -> 0, M -> 1로 변환
df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1, 'I':2})

  df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1, 'I':2})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = df['Sex'].replace({'F': 0, 'M': 1, 'I':2})


In [25]:
X = df.drop('Rings', axis=1).values
y = df['Rings'].values
y_adjusted = y - 1 # Rings 값이 1부터 시작하므로 0으로 조정

In [27]:
# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 시퀀스 길이 설정
n_steps = 3  # 시퀀스 길이 (조정 가능)

# 시퀀스 데이터로 분할
X_seq, y_seq = split_sequence(y_adjusted, n_steps)

In [75]:
# 학습용 데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# LSTM 입력에 맞게 차원 확장
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]

num_classes = len(np.unique(y_train)) +1

# LSTM 모델 구축
model = Sequential()
model.add(LSTM(64, input_shape=(n_steps, 1), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(32))
model.add(Dropout(0.3))
model.add(Dense(1))  # 출력 레이어, 회귀이므로 1

# 모델 컴파일
model.compile(optimizer='adam', loss='mean_squared_error')

  super().__init__(**kwargs)


In [95]:
# 모델 학습
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

Epoch 1/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.5320 - val_loss: 7.5883
Epoch 2/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.8869 - val_loss: 7.6290
Epoch 3/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.4164 - val_loss: 7.5903
Epoch 4/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.3128 - val_loss: 7.6411
Epoch 5/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.0379 - val_loss: 7.5935
Epoch 6/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 8.1327 - val_loss: 7.8561
Epoch 7/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.5445 - val_loss: 7.5603
Epoch 8/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 7.4639 - val_loss: 7.7142
Epoch 9/30
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [97]:
# 모델 평가
loss = model.evaluate(X_test, y_test)
print(f'Test Loss (MSE): {loss}')

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - loss: 6.5536
Test Loss (MSE): 6.9541096687316895
