### 보스턴 주택 가격 데이터 세트를 Perceptron 기반에서 학습 및 테스트하기 위한 데이터 로드
* 보스턴 주택 가격 데이터 csv파일을 다운로드하고 이를 DataFrame으로 생성

In [1]:
import pandas as pd
import numpy as np

# Load the Boston house-price CSV (path relative to the working directory) into a DataFrame.
boston_df = pd.read_csv('data/boston_house_price.csv')
# Last expression of the cell: displays the first rows as a preview.
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [2]:
# Print the (rows, columns) shape of the loaded DataFrame.
print(boston_df.shape)

(506, 14)


### Weight와 Bias의 Update 값을 계산하는 함수 생성.
* w1은 RM(방의 개수) 피처의 Weight 값
* w2는 LSTAT(하위계층 비율) 피처의 Weight 값
* bias는 Bias
* N은 입력 데이터 건수
![](https://raw.githubusercontent.com/chulminkw/CNN_PG_Torch/main/image/gradient_descent.png)


In [3]:
import torch

# Helper called repeatedly by gradient_descent() to compute the weight/bias updates.
def get_update_weights_value(bias, w1, w2, rm, lstat, target, learning_rate=0.01):
    """Compute one full-batch gradient-descent step for the two-feature linear model.

    The prediction is ``w1 * rm + w2 * lstat + bias`` and the loss is the mean
    squared error against ``target``.

    Args:
        bias, w1, w2: current bias and weights.
        rm, lstat: RM and LSTAT feature tensors; they are combined with the
            residuals through a matrix product, which acts as a dot product
            for 1-D inputs.
        target: tensor of true PRICE values; its first dimension is the sample count.
        learning_rate: step size folded into the returned update values.

    Returns:
        Tuple ``(bias_update, w1_update, w2_update, mse_loss)``. The updates
        already include the learning rate and are meant to be subtracted from
        the corresponding parameter by the caller.
    """
    sample_count = target.shape[0]

    # Residuals: actual value minus predicted value.
    residual = target - (w1 * rm + w2 * lstat + bias)

    # Common factor of every update: -(2/N) * learning_rate.
    step_scale = -(2/sample_count) * learning_rate

    bias_step = step_scale * residual.sum()
    w1_step = step_scale * (rm @ residual)
    w2_step = step_scale * (lstat @ residual)

    # Mean squared error of the current parameters.
    loss_value = residual.pow(2).mean()

    return bias_step, w1_step, w2_step, loss_value

### Gradient Descent 를 적용하는 함수 생성
* iter_epochs 수만큼 반복적으로 get_update_weights_value()를 호출하여 update될 weight/bias값을 구한 뒤 Weight/Bias를 Update적용. 

In [4]:
# Full-batch gradient descent over the RM / LSTAT features and the PRICE target.
def gradient_descent(features, target, iter_epochs=1000, learning_rate=0.01, verbose=True):
    """Fit ``w1 * RM + w2 * LSTAT + bias`` with full-batch gradient descent.

    Args:
        features: 2-D tensor; column 0 is used as RM and column 1 as LSTAT.
        target: tensor of PRICE values.
        iter_epochs: number of update iterations.
        learning_rate: step size forwarded to get_update_weights_value().
            (Previously this argument was ignored and 0.01 was always used.)
        verbose: when True, print the parameters and loss every 10 iterations.

    Returns:
        Tuple ``(w1, w2, bias)`` of one-element float32 tensors.
    """
    # Weights start at 0 and the bias at 1, each as a one-element 1-D tensor.
    w1 = torch.zeros(1, dtype=torch.float32)
    w2 = torch.zeros(1, dtype=torch.float32)
    bias = torch.ones(1, dtype=torch.float32)
    print('최초 w1, w2, bias:', w1.item(), w2.item(), bias.item())

    # Split the 2-D feature tensor into its RM and LSTAT columns.
    rm = features[:, 0]
    lstat = features[:, 1]

    for i in range(1, iter_epochs+1):
        # Compute the update values (learning rate already applied) and the
        # loss measured before this iteration's update.
        bias_update, w1_update, w2_update, loss = get_update_weights_value(
            bias, w1, w2, rm, lstat, target, learning_rate=learning_rate)

        # Apply the updates.
        w1 = w1 - w1_update
        w2 = w2 - w2_update
        bias = bias - bias_update

        # Report progress every 10 iterations.
        if verbose and i % 10 == 0:
            print(f'Epoch: {i}/{iter_epochs}')
            print(f'w1: {w1.item()}, w2: {w2.item()}, bias: {bias.item()}, loss: {loss.item()}')

    return w1, w2, bias

### Gradient Descent 적용
* 신경망은 데이터를 정규화/표준화 작업을 미리 선행해 주어야 함. 
* 이를 위해 사이킷런의 MinMaxScaler를 이용하여 개별 feature값은 0~1사이 값으로 변환후 학습 적용.

In [5]:
import torch
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the RM and LSTAT columns before training.
scaler = MinMaxScaler()
scaled_features_np = scaler.fit_transform(boston_df[['RM', 'LSTAT']])

# Wrap the NumPy arrays as tensors.
scaled_features_ts = torch.from_numpy(scaled_features_np)
targets_ts = torch.from_numpy(boston_df['PRICE'].values)

# Run full-batch gradient descent for 5000 iterations on the whole data set.
w1, w2, bias = gradient_descent(scaled_features_ts, targets_ts, iter_epochs=5000, verbose=True)
print('##### 최종 w1, w2, bias #######')
print(w1.item(), w2.item(), bias.item())

최초 w1, w2, bias: 0.0 0.0 1.0
Epoch: 10/5000
w1: 2.1600239276885986, w2: 0.8827961683273315, bias: 4.814126491546631, loss: 364.92802077239907
Epoch: 20/5000
w1: 3.83414626121521, w2: 1.488028645515442, bias: 7.705874919891357, loss: 244.00112523756874
Epoch: 30/5000
w1: 5.139560699462891, w2: 1.8834487199783325, bias: 9.898355484008789, loss: 174.09917235653666
Epoch: 40/5000
w1: 6.165116786956787, w2: 2.1204135417938232, bias: 11.560710906982422, loss: 133.53020121296407
Epoch: 50/5000
w1: 6.978174686431885, w2: 2.23785138130188, bias: 12.8211669921875, loss: 109.82715995177246
Epoch: 60/5000
w1: 7.629802703857422, w2: 2.265268325805664, bias: 13.776935577392578, loss: 95.82475012805263
Epoch: 70/5000
w1: 8.158720016479492, w2: 2.225027322769165, bias: 14.501710891723633, loss: 87.40472070696126
Epoch: 80/5000
w1: 8.594283103942871, w2: 2.134077310562134, bias: 15.05136489868164, loss: 82.2000806840636
Epoch: 90/5000
w1: 8.958751678466797, w2: 2.0052602291107178, bias: 15.468253135681

### 계산된 Weight와 Bias를 이용하여 Price 예측
* 예측 feature 역시 0~1사이의 scaled값을 이용하고 Weight와 bias를 적용하여 예측값 계산. 

In [6]:
# Predict a price for every row from the scaled features and the learned w1, w2, bias.
predicted = scaled_features_ts[:, 0]*w1 + scaled_features_ts[:, 1]*w2 + bias
# Store the predictions as a new DataFrame column (converted to a NumPy array).
boston_df['PREDICTED_PRICE'] = predicted.cpu().numpy()
boston_df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,PREDICTED_PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,28.948606
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,25.489461
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,32.538213
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,32.337287
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,31.506543
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7,28.092617
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9,21.354932
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1,17.757951
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5,8.10284
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9,18.27474


In [7]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual and predicted prices over all rows.
total_mse = mean_squared_error(boston_df['PRICE'], boston_df['PREDICTED_PRICE'])
print(total_mse)

30.529068392718102


In [8]:
from sklearn.model_selection import train_test_split

# Split the scaled features and PRICE values into train/test sets (30% test, fixed random_state).
tr_features, test_features, tr_target, test_target = train_test_split(scaled_features_np, boston_df['PRICE'].values, 
                                                                      test_size=0.3, random_state=2025)
# Show the shapes of the four resulting arrays.
print(tr_features.shape, tr_target.shape, test_features.shape, test_target.shape)

(354, 2) (354,) (152, 2) (152,)


In [9]:
# Convert the training arrays to tensors.
tr_features_ts = torch.from_numpy(tr_features)
tr_targets_ts = torch.from_numpy(tr_target)

# Fit on the training split only with full-batch gradient descent.
w1, w2, bias = gradient_descent(tr_features_ts, tr_targets_ts, iter_epochs=5000, verbose=True)
print('##### 최종 w1, w2, bias #######')
print(w1.item(), w2.item(), bias.item())

최초 w1, w2, bias: 0.0 0.0 1.0
Epoch: 10/5000
w1: 2.1782474517822266, w2: 0.8989012241363525, bias: 4.82615327835083, loss: 369.1777488693663
Epoch: 20/5000
w1: 3.865514039993286, w2: 1.5150399208068848, bias: 7.724394798278809, loss: 247.23309861532792
Epoch: 30/5000
w1: 5.180546760559082, w2: 1.9176135063171387, bias: 9.919800758361816, loss: 176.86106703093324
Epoch: 40/5000
w1: 6.213293075561523, w2: 2.159029245376587, bias: 11.58285140991211, loss: 136.08428868980482
Epoch: 50/5000
w1: 7.03188419342041, w2: 2.2789769172668457, bias: 12.842673301696777, loss: 112.29406054215411
Epoch: 60/5000
w1: 7.687928199768066, w2: 2.3075127601623535, bias: 13.797069549560547, loss: 98.25655334337485
Epoch: 70/5000
w1: 8.220519065856934, w2: 2.2673964500427246, bias: 14.520127296447754, loss: 89.8214789681167
Epoch: 80/5000
w1: 8.659270286560059, w2: 2.1758623123168945, bias: 15.067954063415527, loss: 84.60770563979638
Epoch: 90/5000
w1: 9.02661418914795, w2: 2.0459582805633545, bias: 15.48305225

In [10]:
# Predict on the held-out test features with the learned w1, w2, bias.
test_features_ts = torch.from_numpy(test_features)
test_predicted_ts = test_features_ts[:, 0]*w1 + test_features_ts[:, 1]*w2 + bias

# Collect the scaled features, actual prices and predictions side by side.
boston_test_df = pd.DataFrame({
    'RM': test_features[:, 0],
    'LSTAT': test_features[:, 1],
    'PRICE': test_target,
    'PREDICTED_PRICE': test_predicted_ts.cpu().numpy()
})

boston_test_df.head(20)

Unnamed: 0,RM,LSTAT,PRICE,PREDICTED_PRICE
0,0.504311,0.546082,11.0,16.276481
1,0.727534,0.082781,31.5,33.224345
2,0.442422,0.348786,22.0,19.469608
3,0.44338,0.197296,50.0,23.164367
4,0.51964,0.139349,24.1,26.52354
5,0.511401,0.309051,20.1,22.200871
6,0.425752,0.450607,22.5,16.575347
7,0.612569,0.049669,32.4,31.078924
8,0.623683,0.061258,31.6,31.083084
9,0.571757,0.53394,10.9,18.299919


In [11]:
# Mean squared error on the test split.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED_PRICE'])
print(test_total_mse)

28.44255102479557


### Stochastic Gradient Descent와 Mini Batch Gradient Descent 구현
* SGD 는 전체 데이터에서 한건만 임의로 선택하여 Gradient Descent 로 Weight/Bias Update 계산한 뒤 Weight/Bias 적용
* Mini Batch GD는 전체 데이터에서 Batch 건수만큼 데이터를 선택하여 Gradient Descent로 Weight/Bias Update 계산한 뒤 Weight/Bias 적용

### SGD 기반으로 Weight/Bias update 값 구하기

In [12]:
import torch

# Nearly identical to get_update_weights_value(), but it returns no loss.
def get_update_weights_value_sgd(bias, w1, w2, rm_sgd, lstat_sgd, target_sgd, learning_rate=0.01):
    """Compute the weight/bias updates for a single stochastically chosen sample.

    Args:
        bias, w1, w2: current bias and weights.
        rm_sgd, lstat_sgd, target_sgd: tensors for the selected sample; the
            caller in this file passes tensors holding exactly one element.
        learning_rate: step size folded into the returned update values.

    Returns:
        Tuple ``(bias_update, w1_update, w2_update)``; each value is meant to
        be subtracted from the corresponding parameter.
    """
    sample_count = target_sgd.shape[0]

    # Residual: actual value minus predicted value.
    residual = target_sgd - (w1 * rm_sgd + w2 * lstat_sgd + bias)

    # Common factor of every update: -(2/N) * learning_rate.
    step_scale = -(2/sample_count) * learning_rate

    bias_step = step_scale * residual.sum()
    w1_step = step_scale * (rm_sgd @ residual)
    w2_step = step_scale * (lstat_sgd @ residual)

    return bias_step, w1_step, w2_step

### SGD 수행하기

In [13]:
# Stochastic gradient descent over the RM / LSTAT features and the PRICE target.
def st_gradient_descent(features, target, iter_epochs=1000, learning_rate=0.01, verbose=True):
    """Fit ``w1 * RM + w2 * LSTAT + bias`` using one random sample per iteration.

    Args:
        features: 2-D tensor; column 0 is used as RM and column 1 as LSTAT.
        target: tensor of PRICE values.
        iter_epochs: number of single-sample update iterations.
        learning_rate: step size forwarded to get_update_weights_value_sgd().
            (Previously this argument was ignored and 0.01 was always used.)
        verbose: when True, print the parameters and the loss over the full
            training data every 100 iterations.

    Returns:
        Tuple ``(w1, w2, bias)`` of one-element float32 tensors.
    """
    # Fix the global torch seed so the sampled indices are reproducible.
    torch.manual_seed(2025)
    # Weights start at 0 and the bias at 1, each as a one-element 1-D tensor.
    w1 = torch.zeros(1, dtype=torch.float32)
    w2 = torch.zeros(1, dtype=torch.float32)
    bias = torch.ones(1, dtype=torch.float32)
    print('최초 w1, w2, bias:', w1.item(), w2.item(), bias.item())

    # Split the 2-D feature tensor into its RM and LSTAT columns.
    rm = features[:, 0]
    lstat = features[:, 1]

    for i in range(1, iter_epochs+1):
        # Pick the index of the single sample for this iteration at random.
        stochastic_index = torch.randint(0, target.shape[0], size=(1,))
        rm_sgd = rm[stochastic_index]
        lstat_sgd = lstat[stochastic_index]
        target_sgd = target[stochastic_index]

        # Compute the update values for that sample (no loss is returned).
        bias_update, w1_update, w2_update = get_update_weights_value_sgd(
            bias, w1, w2, rm_sgd, lstat_sgd, target_sgd, learning_rate=learning_rate)

        # Apply the updates.
        w1 = w1 - w1_update
        w2 = w2 - w2_update
        bias = bias - bias_update

        # Report progress every 100 iterations.
        if verbose and i % 100 == 0:
            print(f'Epoch: {i}/{iter_epochs}')
            # The loss is evaluated on the full training data, not on the single sample.
            predicted = w1 * rm + w2*lstat + bias
            diff = target - predicted
            loss = torch.mean(diff ** 2)
            print(f'w1: {w1.item()}, w2: {w2.item()}, bias: {bias.item()}, loss: {loss.item()}')

    return w1, w2, bias

In [14]:
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Build scaled train/test feature and target tensors from the DataFrame.
def get_scaled_train_test_feature_target_ts(data_df):
    """Scale RM/LSTAT, split into train/test sets, and return the four tensors.

    Args:
        data_df: DataFrame containing the 'RM', 'LSTAT' and 'PRICE' columns.

    Returns:
        Tuple ``(tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts)`` created with
        ``torch.from_numpy``.
    """
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling parameters — confirm this is intended.
    scaled = MinMaxScaler().fit_transform(data_df[['RM', 'LSTAT']])

    # 70/30 split with a fixed random_state.
    split_arrays = train_test_split(scaled, data_df['PRICE'].values,
                                    test_size=0.3, random_state=2025)
    train_x, test_x, train_y, test_y = split_arrays

    # Convert in the order the callers unpack: train features, train target,
    # test features, test target.
    return tuple(torch.from_numpy(arr) for arr in (train_x, train_y, test_x, test_y))

# Build the scaled train/test tensors from the full DataFrame.
tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts = get_scaled_train_test_feature_target_ts(data_df=boston_df)

# Show the shapes of the resulting tensors.
print(f"tr_ftr_ts shape:{tr_ftr_ts.shape} tr_tgt_ts shape:{tr_tgt_ts.shape}")
print(f"test_ftr_ts shape:{test_ftr_ts.shape} test_tgt_ts shape: {test_tgt_ts.shape}")

tr_ftr_ts shape:torch.Size([354, 2]) tr_tgt_ts shape:torch.Size([354])
test_ftr_ts shape:torch.Size([152, 2]) test_tgt_ts shape: torch.Size([152])


In [15]:
# Run stochastic gradient descent on the training features and target.
w1, w2, bias = st_gradient_descent(tr_ftr_ts, tr_tgt_ts, iter_epochs=5000, verbose=True)
print('##### 최종 w1, w2, bias #######')
print(w1, w2, bias)

최초 w1, w2, bias: 0.0 0.0 1.0
Epoch: 100/5000
w1: 9.10193920135498, w2: 2.380099058151245, bias: 16.32122802734375, loss: 79.20577943326704
Epoch: 200/5000
w1: 11.38884162902832, w2: 0.6377108693122864, bias: 17.6075439453125, loss: 72.33473168920669
Epoch: 300/5000
w1: 11.43453311920166, w2: -1.9443591833114624, bias: 15.634873390197754, loss: 67.57221355061512
Epoch: 400/5000
w1: 13.544651985168457, w2: -3.477250814437866, bias: 16.884212493896484, loss: 59.43184224469005
Epoch: 500/5000
w1: 14.300559997558594, w2: -5.690975189208984, bias: 16.270288467407227, loss: 54.669107955151254
Epoch: 600/5000
w1: 15.722941398620605, w2: -7.305695056915283, bias: 16.813228607177734, loss: 50.19557187179603
Epoch: 700/5000
w1: 16.91908073425293, w2: -8.649571418762207, bias: 16.845500946044922, loss: 47.19992843074546
Epoch: 800/5000
w1: 17.07891273498535, w2: -10.086063385009766, bias: 16.271669387817383, loss: 45.02960586768405
Epoch: 900/5000
w1: 17.974334716796875, w2: -10.912721633911133, b

In [16]:
# Predict on the test data and collect the results in a DataFrame.
test_predicted_ts = test_ftr_ts[:, 0]*w1 + test_ftr_ts[:, 1]*w2 + bias

# Both feature columns are read from test_ftr_ts, so this cell does not depend
# on the `test_features` array left over from an earlier cell.
boston_test_df = pd.DataFrame({
    'RM': test_ftr_ts[:, 0],
    'LSTAT': test_ftr_ts[:, 1],
    'PRICE': test_tgt_ts,
    'PREDICTED_PRICE_SGD': test_predicted_ts.cpu().numpy()
})

boston_test_df.head(20)

Unnamed: 0,RM,LSTAT,PRICE,PREDICTED_PRICE_SGD
0,0.504311,0.546082,11.0,16.638467
1,0.727534,0.082781,31.5,33.931732
2,0.442422,0.348786,22.0,19.718124
3,0.44338,0.197296,50.0,23.406238
4,0.51964,0.139349,24.1,26.888865
5,0.511401,0.309051,20.1,22.561742
6,0.425752,0.450607,22.5,16.801722
7,0.612569,0.049669,32.4,31.593653
8,0.623683,0.061258,31.6,31.616892
9,0.571757,0.53394,10.9,18.773215


In [17]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the SGD-trained parameters on the test split.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED_PRICE_SGD'])
print(test_total_mse)

28.984664879873662


### iteration시마다 일정한 batch 크기만큼의 데이터를 random하게 가져와서 GD를 수행하는 Mini-Batch GD 수행

In [18]:
def get_update_weights_value_batch(bias, w1, w2, rm_batch, lstat_batch, target_batch, learning_rate=0.01):
    """Compute the weight/bias updates for one mini-batch.

    Args:
        bias, w1, w2: current bias and weights.
        rm_batch, lstat_batch: RM and LSTAT feature tensors for the batch; they
            are combined with the residuals through a matrix product, which
            acts as a dot product for 1-D inputs.
        target_batch: PRICE tensor for the batch; its first dimension is the
            batch's sample count.
        learning_rate: step size folded into the returned update values.

    Returns:
        Tuple ``(bias_update, w1_update, w2_update)``; each value is meant to
        be subtracted from the corresponding parameter.
    """
    batch_count = target_batch.shape[0]

    # Residuals: actual value minus predicted value.
    residual = target_batch - (w1 * rm_batch + w2 * lstat_batch + bias)

    # Common factor of every update: -(2/N) * learning_rate.
    step_scale = -(2/batch_count) * learning_rate

    bias_step = step_scale * residual.sum()
    w1_step = step_scale * (rm_batch @ residual)
    w2_step = step_scale * (lstat_batch @ residual)

    return bias_step, w1_step, w2_step

In [19]:
# Demo: draw 30 random integer indices from [0, 300); the same index can appear more than once.
batch_indexes = torch.randint(0, 300, size=(30,))
print(batch_indexes)

# Select column 0 (RM) of the training features at the sampled row indices.
tr_ftr_ts[batch_indexes, 0]

tensor([213, 127,  30, 201, 134, 201,  72, 284, 128,  42, 247,  18, 118, 208,
        276, 236, 271, 166,  12, 136, 290, 224,  88,  49,  22,  56,  25, 178,
        202,   3])


tensor([0.5965, 0.6541, 0.4208, 0.4397, 0.4532, 0.4397, 0.6256, 0.7394, 0.6978,
        0.3531, 0.6206, 0.5882, 0.5281, 0.6066, 0.3631, 0.4110, 0.4369, 0.4681,
        0.7586, 0.5557, 0.3915, 0.4451, 0.6902, 0.4800, 0.4509, 0.5392, 0.2686,
        0.4645, 0.2826, 0.4263], dtype=torch.float64)

In [20]:
# batch_random_gradient_descent() takes batch_size (the mini-batch size) as an argument.
def batch_random_gradient_descent(features, target, iter_epochs=5000, batch_size=30, verbose=True,
                                  learning_rate=0.01):
    """Fit ``w1 * RM + w2 * LSTAT + bias`` with randomly sampled mini-batches.

    Args:
        features: 2-D tensor; column 0 is used as RM and column 1 as LSTAT.
        target: tensor of PRICE values.
        iter_epochs: number of mini-batch update iterations.
        batch_size: number of indices drawn per iteration with torch.randint,
            so a sample may be drawn more than once.
        verbose: when True, print the parameters and the loss over the full
            training data every 100 iterations.
        learning_rate: step size forwarded to get_update_weights_value_batch().
            Added as a trailing parameter; the default matches the value that
            was previously hard-coded in the body.

    Returns:
        Tuple ``(w1, w2, bias)`` of one-element float32 tensors.
    """
    # Fix the global torch seed so the sampled batches are reproducible.
    torch.manual_seed(2025)
    # Weights start at 0 and the bias at 1, each as a one-element 1-D tensor.
    w1 = torch.zeros(1, dtype=torch.float32)
    w2 = torch.zeros(1, dtype=torch.float32)
    bias = torch.ones(1, dtype=torch.float32)
    print('최초 w1, w2, bias:', w1.item(), w2.item(), bias.item())

    # Split the 2-D feature tensor into its RM and LSTAT columns.
    rm = features[:, 0]
    lstat = features[:, 1]

    for i in range(1, iter_epochs+1):
        # Randomly choose batch_size row indices for this iteration.
        batch_indexes = torch.randint(0, target.shape[0], size=(batch_size,))
        rm_batch = rm[batch_indexes]
        lstat_batch = lstat[batch_indexes]
        target_batch = target[batch_indexes]

        # Compute the update values from the mini-batch.
        bias_update, w1_update, w2_update = get_update_weights_value_batch(
            bias, w1, w2, rm_batch, lstat_batch, target_batch, learning_rate)

        # Apply the updates.
        w1 = w1 - w1_update
        w2 = w2 - w2_update
        bias = bias - bias_update

        # Report progress every 100 iterations.
        if verbose and i % 100 == 0:
            print(f'Epoch: {i}/{iter_epochs}')
            # The loss is evaluated on the full training data, not on the batch.
            predicted = w1 * rm + w2*lstat + bias
            diff = target - predicted
            loss = torch.mean(diff ** 2)
            print(f'w1: {w1.item()}, w2: {w2.item()}, bias: {bias.item()}, loss: {loss.item()}')

    return w1, w2, bias

In [21]:
# Rebuild the scaled train/test tensors.
tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts = get_scaled_train_test_feature_target_ts(data_df=boston_df)

# Run random mini-batch gradient descent on the training features and target.
w1, w2, bias = batch_random_gradient_descent(tr_ftr_ts, tr_tgt_ts, iter_epochs=5000, batch_size=30, verbose=True)
print('##### 최종 w1, w2, bias #######')
print(w1, w2, bias)

최초 w1, w2, bias: 0.0 0.0 1.0
Epoch: 100/5000
w1: 9.532495498657227, w2: 1.938775897026062, bias: 15.881488800048828, loss: 78.11859371095791
Epoch: 200/5000
w1: 11.555778503417969, w2: -0.21187061071395874, bias: 16.998315811157227, loss: 68.93165545311753
Epoch: 300/5000
w1: 12.718533515930176, w2: -2.56358003616333, bias: 16.81089210510254, loss: 62.173577384915774
Epoch: 400/5000
w1: 13.90304183959961, w2: -4.585747241973877, bias: 16.800025939941406, loss: 56.79182986963833
Epoch: 500/5000
w1: 14.967144012451172, w2: -6.40493106842041, bias: 16.75732421875, loss: 52.39103174204248
Epoch: 600/5000
w1: 15.9356050491333, w2: -8.12226390838623, bias: 16.799663543701172, loss: 48.65957528994758
Epoch: 700/5000
w1: 16.72344398498535, w2: -9.702632904052734, bias: 16.679140090942383, loss: 45.64700262630224
Epoch: 800/5000
w1: 17.431224822998047, w2: -11.052210807800293, bias: 16.579404830932617, loss: 43.32511744180553
Epoch: 900/5000
w1: 18.30120849609375, w2: -12.218840599060059, bias:

In [22]:
# Predict on the test data and collect the results in a DataFrame.
test_predicted_ts = test_ftr_ts[:, 0]*w1 + test_ftr_ts[:, 1]*w2 + bias

# Both feature columns are read from test_ftr_ts, so this cell does not depend
# on the `test_features` array left over from an earlier cell.
boston_test_df = pd.DataFrame({
    'RM': test_ftr_ts[:, 0],
    'LSTAT': test_ftr_ts[:, 1],
    'PRICE': test_tgt_ts,
    'PREDICTED_PRICE_RANDOM_BATCH': test_predicted_ts.cpu().numpy()
})

# Mean squared error on the test split.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED_PRICE_RANDOM_BATCH'])
print("test 데이터 세트의 MSE:", test_total_mse)

boston_test_df.head(20)

test 데이터 세트의 MSE: 28.42650203860407


Unnamed: 0,RM,LSTAT,PRICE,PREDICTED_PRICE_RANDOM_BATCH
0,0.504311,0.546082,11.0,16.242677
1,0.727534,0.082781,31.5,33.231256
2,0.442422,0.348786,22.0,19.422623
3,0.44338,0.197296,50.0,23.116687
4,0.51964,0.139349,24.1,26.490353
5,0.511401,0.309051,20.1,22.167069
6,0.425752,0.450607,22.5,16.525711
7,0.612569,0.049669,32.4,31.063288
8,0.623683,0.061258,31.6,31.069676
9,0.571757,0.53394,10.9,18.27916


### iteration 시에 순차적으로 일정한 batch 크기만큼의 데이터를 전체 학습데이터에 걸쳐서 가져오는 Mini-Batch GD 수행

In [23]:
# Demo: start offsets of consecutive batches of 30 over 506 rows.
for batch_step in range(0, 506, 30):
    print(batch_step)

0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480


In [24]:
# batch_gradient_descent() takes batch_size (the mini-batch size) as an argument.
def batch_gradient_descent(features, target, epochs=300, batch_size=30, verbose=True,
                           learning_rate=0.01):
    """Fit ``w1 * RM + w2 * LSTAT + bias`` with sequential mini-batches.

    Every epoch walks the data in order in slices of ``batch_size`` rows and
    applies one update per slice; the final slice may be shorter.

    Args:
        features: 2-D tensor; column 0 is used as RM and column 1 as LSTAT.
        target: tensor of PRICE values.
        epochs: number of full passes over the data.
        batch_size: number of consecutive rows per update.
        verbose: when True, print the parameters and the loss over the full
            training data after every epoch.
        learning_rate: step size forwarded to get_update_weights_value_batch().
            Added as a trailing parameter; the default matches the value that
            was previously hard-coded in the body.

    Returns:
        Tuple ``(w1, w2, bias)`` of one-element float32 tensors.
    """
    # Kept from the original: resets the global torch seed, although this
    # function draws no random numbers itself.
    torch.manual_seed(2025)
    # Weights start at 0 and the bias at 1, each as a one-element 1-D tensor.
    w1 = torch.zeros(1, dtype=torch.float32)
    w2 = torch.zeros(1, dtype=torch.float32)
    bias = torch.ones(1, dtype=torch.float32)
    print('최초 w1, w2, bias:', w1.item(), w2.item(), bias.item())

    # Split the 2-D feature tensor into its RM and LSTAT columns.
    rm = features[:, 0]
    lstat = features[:, 1]

    for i in range(1, epochs+1):
        # One pass over the data: update once per consecutive slice.
        for batch_step in range(0, target.shape[0], batch_size):
            batch_end = batch_step + batch_size
            rm_batch = rm[batch_step:batch_end]
            lstat_batch = lstat[batch_step:batch_end]
            target_batch = target[batch_step:batch_end]

            bias_update, w1_update, w2_update = get_update_weights_value_batch(
                bias, w1, w2, rm_batch, lstat_batch, target_batch, learning_rate)

            # Apply the updates computed from this mini-batch.
            w1 = w1 - w1_update
            w2 = w2 - w2_update
            bias = bias - bias_update

        if verbose:
            print(f'Epoch: {i}/{epochs}')
            # The loss is evaluated on the full training data, not on a batch.
            predicted = w1 * rm + w2*lstat + bias
            diff = target - predicted
            loss = torch.mean(diff ** 2)
            print(f'w1: {w1.item()}, w2: {w2.item()}, bias: {bias.item()}, loss: {loss.item()}')

    return w1, w2, bias

In [25]:
# Rebuild the scaled train/test tensors.
tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts = get_scaled_train_test_feature_target_ts(data_df=boston_df)

# Run sequential mini-batch gradient descent on the training features and target.
w1, w2, bias = batch_gradient_descent(tr_ftr_ts, tr_tgt_ts, epochs=300, batch_size=30, verbose=True)
print('##### 최종 w1, w2, bias #######')
print(w1, w2, bias)

최초 w1, w2, bias: 0.0 0.0 1.0
Epoch: 1/300
w1: 2.548649311065674, w2: 1.0510945320129395, bias: 5.483735084533691, loss: 324.92663814145334
Epoch: 2/300
w1: 4.426955699920654, w2: 1.7139111757278442, bias: 8.697149276733398, loss: 206.7792344029648
Epoch: 3/300
w1: 5.824299335479736, w2: 2.0995075702667236, bias: 11.000228881835938, loss: 145.6548750690044
Epoch: 4/300
w1: 6.876403331756592, w2: 2.2874581813812256, bias: 12.650941848754883, loss: 113.7959164674643
Epoch: 5/300
w1: 7.680532455444336, w2: 2.3347744941711426, bias: 13.834155082702637, loss: 96.95631171099302
Epoch: 6/300
w1: 8.306395530700684, w2: 2.2823007106781006, bias: 14.682345390319824, loss: 87.82619786802941
Epoch: 7/300
w1: 8.803953170776367, w2: 2.1592955589294434, bias: 15.290448188781738, loss: 82.65587677962309
Epoch: 8/300
w1: 9.209012985229492, w2: 1.9867151975631714, bias: 15.726494789123535, loss: 79.52265426532097
Epoch: 9/300
w1: 9.547245025634766, w2: 1.7795655727386475, bias: 16.03923797607422, loss: 7

In [26]:
# Predict on the test data and collect the results in a DataFrame.
test_predicted_ts = test_ftr_ts[:, 0]*w1 + test_ftr_ts[:, 1]*w2 + bias

# Both feature columns are read from test_ftr_ts, so this cell does not depend
# on the `test_features` array left over from an earlier cell.
boston_test_df = pd.DataFrame({
    'RM': test_ftr_ts[:, 0],
    'LSTAT': test_ftr_ts[:, 1],
    'PRICE': test_tgt_ts,
    'PREDICTED_PRICE_BATCH': test_predicted_ts.cpu().numpy()
})

# Mean squared error on the test split.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED_PRICE_BATCH'])
print("test 데이터 세트의 MSE:", test_total_mse)

boston_test_df.head(20)

test 데이터 세트의 MSE: 28.330330879942494


Unnamed: 0,RM,LSTAT,PRICE,PREDICTED_PRICE_BATCH
0,0.504311,0.546082,11.0,16.511732
1,0.727534,0.082781,31.5,32.899692
2,0.442422,0.348786,22.0,19.593937
3,0.44338,0.197296,50.0,23.16414
4,0.51964,0.139349,24.1,26.41401
5,0.511401,0.309051,20.1,22.236692
6,0.425752,0.450607,22.5,16.796413
7,0.612569,0.049669,32.4,30.820619
8,0.623683,0.061258,31.6,30.825219
9,0.571757,0.53394,10.9,18.470453


### Pytorch를 이용하여 Simple Regression 모델 구축하기
* Pytorch 모델은 torch.nn.Module 클래스를 상속하여 생성함
* nn.Parameter()는 학습 파라미터(Learnable Parameter) tensor를 생성
* Pytorch의 train 로직은 model의 출력값(feed forward)을 오차 역전파(Backpropagation)로 weight 값 update 수행
* 손실 함수는 nn.MSELoss()로, Optimizer는 Adam으로 생성하고 모델 학습 수행.

In [27]:
import torch
import torch.nn as nn

class SimpleRegression_01(nn.Module):
    """Two-feature linear regression with hand-declared learnable parameters.

    ``weights`` holds two values initialised to 0 and ``bias`` holds one value
    initialised to 1; both are registered as ``nn.Parameter``.
    """

    def __init__(self):
        super().__init__()
        # Learnable weight vector (one weight per input feature).
        initial_weights = torch.zeros(size=(2, ), dtype=torch.float32)
        self.weights = nn.Parameter(data=initial_weights, requires_grad=True)
        # Learnable bias term.
        initial_bias = torch.ones(size=(1,), dtype=torch.float32)
        self.bias = nn.Parameter(data=initial_bias)

    def forward(self, x):
        """Return w1*x1 + w2*x2 + b for each row of the 2-D input ``x``."""
        weighted_sum = torch.matmul(self.weights, x.t())
        return weighted_sum + self.bias

simple_model_01 = SimpleRegression_01()
# Print the model's learnable parameters.
print(list(simple_model_01.parameters()))

[Parameter containing:
tensor([0., 0.], requires_grad=True), Parameter containing:
tensor([1.], requires_grad=True)]


In [28]:
def get_scaled_train_test_feature_target_ts_01(data_df):
    """Scale RM/LSTAT, split into train/test sets, and return float32 tensors.

    Args:
        data_df: DataFrame containing the 'RM', 'LSTAT' and 'PRICE' columns.

    Returns:
        Tuple ``(tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts)`` of tensors
        created with ``dtype=torch.float32``.
    """
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling parameters — confirm this is intended.
    scaled = MinMaxScaler().fit_transform(data_df[['RM', 'LSTAT']])

    # 70/30 split with a fixed random_state.
    split_arrays = train_test_split(scaled, data_df['PRICE'].values,
                                    test_size=0.3, random_state=2025)
    train_x, test_x, train_y, test_y = split_arrays

    # Convert in the order the callers unpack: train features, train target,
    # test features, test target.
    ordered = (train_x, train_y, test_x, test_y)
    return tuple(torch.tensor(arr, dtype=torch.float32) for arr in ordered)

In [29]:
# Build the float32 train/test tensors and print their dtypes.
tr_ftr_ts, tr_tgt_ts, test_ftr_ts, test_tgt_ts = get_scaled_train_test_feature_target_ts_01(data_df=boston_df)
print(tr_ftr_ts.dtype, tr_tgt_ts.dtype, test_ftr_ts.dtype, test_tgt_ts.dtype)


torch.float32 torch.float32 torch.float32 torch.float32


In [30]:
# Create the MSE loss (averaged over elements).
loss_fn = nn.MSELoss(reduction='mean')
# Create an Adam optimizer; it needs the model's parameters and a learning rate.
optimizer = torch.optim.Adam(simple_model_01.parameters(), lr=0.01)

# Mini-batch training loop.
def train_loop(model, tr_ftr_ts, tr_tgt_ts, loss_fn, optimizer, epochs=300, batch_size=30, verbose=True):
    """Train ``model`` on sequential mini-batches and return it.

    Every epoch walks the training data in order in slices of ``batch_size``
    rows; each slice does a forward pass, a loss computation, a backward pass
    and one optimizer step.

    Args:
        model: the nn.Module to train; it is updated in place.
        tr_ftr_ts: training feature tensor, sliced along its first dimension.
        tr_tgt_ts: training target tensor; its first dimension is the sample count.
        loss_fn: callable taking (output, target) and returning a scalar loss tensor.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of full passes over the data.
        batch_size: number of consecutive rows per optimizer step.
        verbose: when True, print the loss of the last mini-batch of every epoch.

    Returns:
        The same ``model`` object after training.
    """
    # Put the module in training mode before updating its parameters.
    model.train()
    sample_count = tr_tgt_ts.shape[0]

    for i in range(1, epochs+1):
        # One pass over the data: update once per consecutive slice.
        for batch_step in range(0, sample_count, batch_size):
            ftr_batch = tr_ftr_ts[batch_step:batch_step + batch_size]
            target_batch = tr_tgt_ts[batch_step:batch_step + batch_size]

            # Forward pass; drop a trailing size-1 dimension so the output
            # lines up with the target.
            output = model(ftr_batch).squeeze(-1)

            # MSE (or whichever loss_fn was supplied) for this mini-batch.
            loss = loss_fn(output, target_batch)

            # Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Log on the last mini-batch of the epoch. This replaces the
            # hard-coded `batch_step == 330` check, which only matched
            # 354 samples with batch_size=30.
            if verbose and batch_step + batch_size >= sample_count:
                print(f'Epoch: {i}/{epochs}, batch step:{batch_step}, loss: {loss.item()}')

    return model

In [31]:
# Create a fresh model, loss function and Adam optimizer bound to that model's parameters.
simple_model_01 = SimpleRegression_01()
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(simple_model_01.parameters(), lr=0.01)

# Train for 300 epochs with a batch size of 30.
trained_model = train_loop(simple_model_01, tr_ftr_ts, tr_tgt_ts, loss_fn, optimizer, 
                         epochs=300, batch_size=30, verbose=True) 

Epoch: 1/300, batch step:330, loss: 515.7574462890625
Epoch: 2/300, batch step:330, loss: 506.6460876464844
Epoch: 3/300, batch step:330, loss: 497.666259765625
Epoch: 4/300, batch step:330, loss: 488.8221740722656
Epoch: 5/300, batch step:330, loss: 480.1151123046875
Epoch: 6/300, batch step:330, loss: 471.54443359375
Epoch: 7/300, batch step:330, loss: 463.1086730957031
Epoch: 8/300, batch step:330, loss: 454.8063049316406
Epoch: 9/300, batch step:330, loss: 446.6358642578125
Epoch: 10/300, batch step:330, loss: 438.5956115722656
Epoch: 11/300, batch step:330, loss: 430.6841125488281
Epoch: 12/300, batch step:330, loss: 422.8999938964844
Epoch: 13/300, batch step:330, loss: 415.24169921875
Epoch: 14/300, batch step:330, loss: 407.7079772949219
Epoch: 15/300, batch step:330, loss: 400.29736328125
Epoch: 16/300, batch step:330, loss: 393.0084533691406
Epoch: 17/300, batch step:330, loss: 385.8399963378906
Epoch: 18/300, batch step:330, loss: 378.79052734375
Epoch: 19/300, batch step:33

In [32]:
from sklearn.metrics import mean_squared_error

# Predict on the test features with the trained model.
test_predicted_ts = trained_model(test_ftr_ts)
print(test_predicted_ts.requires_grad, test_predicted_ts.shape)

# Both feature columns are read from test_ftr_ts, so this cell does not depend
# on the `test_features` array left over from an earlier cell. The prediction
# is detached from the autograd graph before the NumPy conversion.
boston_test_df = pd.DataFrame({
    'RM': test_ftr_ts[:, 0],
    'LSTAT': test_ftr_ts[:, 1],
    'PRICE': test_tgt_ts,
    'PREDICTED': test_predicted_ts.squeeze(-1).detach().numpy()
})

# Mean squared error on the test split.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED'])
print("test 데이터 세트의 MSE:", test_total_mse)

boston_test_df.head(20)

True torch.Size([152])
test 데이터 세트의 MSE: 45.09147644042969


Unnamed: 0,RM,LSTAT,PRICE,PREDICTED
0,0.504311,0.546082,11.0,20.649502
1,0.727534,0.082781,31.5,27.032528
2,0.442422,0.348786,22.0,20.653357
3,0.44338,0.197296,50.0,21.494732
4,0.51964,0.139349,24.1,23.129108
5,0.511401,0.309051,20.1,22.062653
6,0.425752,0.450607,22.5,19.81068
7,0.612569,0.049669,32.400002,25.224556
8,0.623683,0.061258,31.6,25.353655
9,0.571757,0.53394,10.9,21.88205


In [33]:
import torch
import torch.nn as nn

class SimpleRegression_02(nn.Module):
    """Linear regression model: two input features mapped to one output.

    The output keeps the trailing size-1 dimension produced by nn.Linear,
    i.e. an input of shape (N, 2) yields an output of shape (N, 1).
    """

    def __init__(self):
        # nn.Module's initializer must run before sub-modules are assigned.
        super().__init__()
        # Single affine layer (2 -> 1); the bias term is enabled by default.
        self.linear = nn.Linear(2, 1)

    def forward(self, x):
        """Apply the linear layer to x and return the result unchanged."""
        prediction = self.linear(x)
        return prediction

# Build the model, then show its learnable parameters followed by its layer layout.
simple_model_02 = SimpleRegression_02()
print([*simple_model_02.parameters()])
print(simple_model_02)

[Parameter containing:
tensor([[0.2616, 0.6159]], requires_grad=True), Parameter containing:
tensor([-0.2969], requires_grad=True)]
SimpleRegression_02(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


In [34]:
# Mean-reduced MSE as the loss; Adam (lr=0.01) updates simple_model_02's parameters.
loss_fn = nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(params=simple_model_02.parameters(), lr=0.01)

# NOTE(review): the helper presumably returns train/test feature and target
# tensors in this order - confirm against its definition.
(tr_ftr_ts, tr_tgt_ts,
 test_ftr_ts, test_tgt_ts) = get_scaled_train_test_feature_target_ts_01(data_df=boston_df)

# Train for 300 epochs with batches of 30 rows, logging progress.
trained_model = train_loop(
    simple_model_02, tr_ftr_ts, tr_tgt_ts, loss_fn, optimizer,
    epochs=300, batch_size=30, verbose=True,
)

Epoch: 1/300, batch step:330, loss: 561.0238037109375
Epoch: 2/300, batch step:330, loss: 551.493896484375
Epoch: 3/300, batch step:330, loss: 542.0956420898438
Epoch: 4/300, batch step:330, loss: 532.8333129882812
Epoch: 5/300, batch step:330, loss: 523.7081909179688
Epoch: 6/300, batch step:330, loss: 514.719482421875
Epoch: 7/300, batch step:330, loss: 505.8660888671875
Epoch: 8/300, batch step:330, loss: 497.1463928222656
Epoch: 9/300, batch step:330, loss: 488.5589599609375
Epoch: 10/300, batch step:330, loss: 480.1021423339844
Epoch: 11/300, batch step:330, loss: 471.7745666503906
Epoch: 12/300, batch step:330, loss: 463.5748596191406
Epoch: 13/300, batch step:330, loss: 455.5016174316406
Epoch: 14/300, batch step:330, loss: 447.5534973144531
Epoch: 15/300, batch step:330, loss: 439.7292175292969
Epoch: 16/300, batch step:330, loss: 432.02734375
Epoch: 17/300, batch step:330, loss: 424.4466857910156
Epoch: 18/300, batch step:330, loss: 416.9859313964844
Epoch: 19/300, batch step:

In [35]:
from sklearn.metrics import mean_squared_error

# Run the trained model on the held-out features. The call is made with
# autograd enabled, so the printed flag shows whether the result tracks gradients.
test_predicted_ts = trained_model(test_ftr_ts)
print(test_predicted_ts.requires_grad, test_predicted_ts.shape)

# Put features, actual prices and predictions side by side.
# Both feature columns are sliced from test_ftr_ts (the tensor the model was
# just evaluated on) instead of mixing in a separate global array, so every
# column is guaranteed to describe the same rows.
# NOTE(review): assumes column 0 of test_ftr_ts is RM and column 1 is LSTAT,
# matching the original column labels - confirm against the data-prep helper.
boston_test_df = pd.DataFrame({
    'RM': test_ftr_ts[:, 0],
    'LSTAT': test_ftr_ts[:, 1],
    'PRICE': test_tgt_ts,
    # squeeze(-1) removes the trailing size-1 dimension of the model output so
    # the column is one-dimensional; detach() leaves the autograd graph so the
    # tensor can be converted to NumPy.
    'PREDICTED': test_predicted_ts.squeeze(-1).detach().numpy()
})

# Mean squared error between actual and predicted prices over the test rows.
test_total_mse = mean_squared_error(boston_test_df['PRICE'], boston_test_df['PREDICTED'])
print("test 데이터 세트의 MSE:", test_total_mse)

# Last expression of the cell: the notebook renders the first rows.
boston_test_df.head(20)

True torch.Size([152, 1])
test 데이터 세트의 MSE: 47.48511505126953


Unnamed: 0,RM,LSTAT,PRICE,PREDICTED
0,0.504311,0.546082,11.0,21.008846
1,0.727534,0.082781,31.5,26.716106
2,0.442422,0.348786,22.0,20.642385
3,0.44338,0.197296,50.0,21.224915
4,0.51964,0.139349,24.1,22.800194
5,0.511401,0.309051,20.1,22.019934
6,0.425752,0.450607,22.5,19.96526
7,0.612569,0.049669,32.400002,24.790985
8,0.623683,0.061258,31.6,24.945768
9,0.571757,0.53394,10.9,22.256081


### 활성화 함수(Activation Function)
* function 기반으로 또는 Layer 기반으로 적용 가능
* relu()/ReLU()는 inplace=False(기본값)일 때 원본 입력 tensor는 그대로 두고 relu가 적용된 새로운 tensor를 반환함. inplace=True일 때는 원본 입력 tensor 자체에 relu를 적용하므로 원본 입력 tensor의 값이 변경됨(새 tensor를 만들지 않아 메모리가 절감됨)
* softmax()/Softmax()의 경우는 dim=-1 을 보통 적용함

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Sample values: zero plus large positive and large negative inputs.
input_tensor = torch.tensor(
    [0.0, 9.0, 10.0, -9.0, -10.0],
    dtype=torch.float32,
)

# Functional form of sigmoid; torch.sigmoid(input_tensor) is an equivalent spelling.
sigmoid_output = F.sigmoid(input_tensor)

print('sigmoid_output:', sigmoid_output)

sigmoid_output: tensor([5.0000e-01, 9.9988e-01, 9.9995e-01, 1.2339e-04, 4.5398e-05])


In [37]:
# Layer (module) form of sigmoid: create an nn.Sigmoid instance, then call it
# on input_tensor like a function.
sigmoid_layer = nn.Sigmoid()
sigmoid_output = sigmoid_layer(input_tensor)
print('sigmoid_output_02:', sigmoid_output)

sigmoid_output_02: tensor([5.0000e-01, 9.9988e-01, 9.9995e-01, 1.2339e-04, 4.5398e-05])


In [38]:
# Functional ReLU on a tensor holding zero, positive and negative values.
input_tensor = torch.tensor(
    [0.0, 9.0, 10.0, -9.0, -10.0],
    dtype=torch.float32,
)
# inplace=False: the result is returned as a separate tensor and input_tensor
# keeps its original values.
# inplace=True: ReLU is applied directly to input_tensor, so its values are
# overwritten (this saves memory).
relu_output = F.relu(input_tensor, inplace=False)
print('relu_output:', relu_output)
# Printed to show that input_tensor is unchanged when inplace=False.
print('input_tensor:', input_tensor)

relu_output: tensor([ 0.,  9., 10.,  0.,  0.])
input_tensor: tensor([  0.,   9.,  10.,  -9., -10.])


In [39]:
# Layer (module) form of ReLU.
input_tensor = torch.tensor(
    [0.0, 9.0, 10.0, -9.0, -10.0],
    dtype=torch.float32,
)
# Non-inplace layer (the default); nn.ReLU(inplace=True) would overwrite input_tensor.
relu_layer = nn.ReLU(inplace=False)
relu_output = relu_layer(input_tensor)
print('relu_output:', relu_output)
print('input_tensor:', input_tensor)

relu_output: tensor([ 0.,  9., 10.,  0.,  0.])
input_tensor: tensor([  0.,   9.,  10.,  -9., -10.])


In [40]:
# Functional softmax on a 2 x 3 tensor.
input_tensor = torch.tensor(
    [
        [1.0, -1.0, 0.5],
        [0.5, 0.5, 1.5],
    ],
    dtype=torch.float32,
)
print('input_tensor shape:', input_tensor.shape)
# dim=-1 normalises along the last dimension, so each row sums to 1.
# torch.softmax(input_tensor, dim=-1) is an equivalent spelling.
softmax_output = F.softmax(input_tensor, dim=-1)
print('softmax_output:', softmax_output)

input_tensor shape: torch.Size([2, 3])
softmax_output: tensor([[0.5741, 0.0777, 0.3482],
        [0.2119, 0.2119, 0.5761]])


#### Custom Model에서 Activation Function 사용

In [41]:
import torch
import torch.nn as nn
#from torch import nn

# Custom model that places a ReLU activation between two linear layers.
class LinearModel(nn.Module):
    """Two-layer network: Linear(10 -> 5), ReLU, Linear(5 -> 3)."""

    def __init__(self):
        # nn.Module's initializer must run before sub-modules are assigned.
        super().__init__()
        # Sub-modules, created in the order forward() applies them.
        self.linear_01 = nn.Linear(10, 5)
        self.relu_01 = nn.ReLU()
        self.linear_02 = nn.Linear(5, 3)

    def forward(self, x):
        """Forward pass: return the raw 3-value output for each input row."""
        hidden = self.relu_01(self.linear_01(x))
        return self.linear_02(hidden)

In [42]:
# Random input: 4 rows of 10 values each, matching in_features=10 of LinearModel's first layer.
input_tensor = torch.randn(4, 10)
print(input_tensor)

linear_model = LinearModel()

# An nn.Module instance is callable: passing the input to it like a function
# argument runs the model's forward() method.
output_tensor = linear_model(input_tensor)
print(output_tensor)

tensor([[-0.9990, -0.3729, -0.1594,  0.2836, -1.1234,  0.5575,  0.4875,  2.2787,
          0.1437,  1.6156],
        [ 0.1340,  0.2344,  1.3478, -0.8199, -1.3315,  0.7726,  0.9116,  0.4670,
          1.1802,  0.3709],
        [-2.0193, -1.1871, -0.2822, -0.2651,  2.1293,  1.7256,  0.1499,  1.2593,
         -0.5026, -0.8897],
        [-0.0430, -0.0234,  0.9300,  1.1032,  0.6404,  0.2513, -1.2607, -0.7816,
         -0.0619, -0.6913]])
tensor([[-0.4969,  0.4712,  0.1203],
        [-0.3782,  0.3505,  0.1125],
        [-0.1365,  0.5857, -0.2523],
        [-0.2776,  0.3470, -0.0123]], grad_fn=<AddmmBackward0>)


In [43]:
# Turn each row of raw model outputs into probabilities along the last dimension.
# torch.softmax(output_tensor, dim=-1) is an equivalent spelling.
softmax_output = F.softmax(output_tensor, dim=-1)
print('softmax_output:', softmax_output)
# Index of the largest probability in each row.
print('predicted class:', torch.argmax(softmax_output, dim=-1))

softmax_output: tensor([[0.1823, 0.4799, 0.3379],
        [0.2125, 0.4404, 0.3471],
        [0.2532, 0.5213, 0.2255],
        [0.2397, 0.4477, 0.3126]], grad_fn=<SoftmaxBackward0>)
predicted class: tensor([1, 1, 1, 1])
