<a href="https://colab.research.google.com/github/saltfish0211/harry-machine-learning/blob/main/simple_LR_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 製造假資料

In [None]:
import numpy as np

# 製造假資料

# Fix the global NumPy seed so that every run produces the same fake dataset.
np.random.seed(0)

# Number of synthetic samples to generate.
num_samples = 1000

# Gender is encoded as a random 0/1 flag per sample.
gender = np.random.randint(0, 2, size=num_samples)

# Heights (in cm) start from a normal distribution (mean 175, std 10); samples
# with gender == 1 are then lowered by a random integer offset drawn from [3, 10).
# The two draws stay in this order so the seeded random stream is consumed
# exactly as before.
base_height = np.random.normal(loc=175, scale=10, size=num_samples)
gender_offset = np.random.randint(3, 10, size=num_samples)
height = base_height - gender * gender_offset

# Weight is a linear function of height (cm) and gender plus Gaussian noise (std 10).
noise = np.random.normal(loc=0, scale=10, size=num_samples)
weight = noise + 0.7 * height - 5 * gender - 52.5

# Convert height from centimetres to metres.
height = height / 100

# Data preparation

In [None]:
# Stack the three 1-D arrays side by side into one (num_samples, 3) matrix
# whose columns are: height, gender, weight.
data = np.stack([height, gender, weight], axis=1)

In [None]:
# Show the first 20 rows as a sanity check that the generated data looks reasonable
data[:20]

array([[ 1.65144893,  0.        , 68.87841957],
       [ 1.5228165 ,  1.        , 56.38044137],
       [ 1.83481349,  1.        , 70.49827295],
       [ 1.76642278,  0.        , 52.64039509],
       [ 1.75672903,  1.        , 60.53255717],
       [ 1.67773249,  1.        , 54.96812363],
       [ 1.62465683,  1.        , 72.44210824],
       [ 1.49835258,  1.        , 43.02407222],
       [ 1.67081626,  1.        , 69.52583654],
       [ 1.61385078,  1.        , 56.161772  ],
       [ 1.79579239,  1.        , 87.93998523],
       [ 1.86411019,  0.        , 90.35672349],
       [ 1.89665787,  0.        , 65.90431924],
       [ 1.80525519,  1.        , 70.38099303],
       [ 1.69013461,  0.        , 66.99319001],
       [ 1.6384103 ,  0.        , 56.14953622],
       [ 1.82666632,  0.        , 82.54967638],
       [ 1.78562928,  0.        , 79.61798933],
       [ 1.57314615,  0.        , 56.8120266 ],
       [ 1.72554818,  1.        , 59.20875713]])

In [None]:
# The label (y) is the last of the three columns: weight.
y = data[:, 2]

# The features (X) are the first two columns (height, gender) plus a constant
# column of ones, so the bias term can be learned as an ordinary weight.
bias_column = np.ones((data.shape[0], 1))
X = np.hstack((data[:, :2], bias_column))

In [None]:
# Peek at the first five rows of the feature matrix X
X[:5]

array([[1.65144893, 0.        , 1.        ],
       [1.5228165 , 1.        , 1.        ],
       [1.83481349, 1.        , 1.        ],
       [1.76642278, 0.        , 1.        ],
       [1.75672903, 1.        , 1.        ]])

In [None]:
# Peek at the first five target values in y
y[:5]

array([68.87841957, 56.38044137, 70.49827295, 52.64039509, 60.53255717])

In [None]:
# Hold-out split: the first 800 rows are used for training and the remaining
# rows for testing. Rows are taken in order; no shuffling happens here.
train_data, test_data = X[:800, :], X[800:, :]
train_target, test_target = y[:800], y[800:]

In [None]:
# Turn the 1-D target vectors into column vectors of shape (n, 1) so that they
# line up with the (n, 1) result of `<data>.dot(w)` and subtract element-wise
# instead of broadcasting. Passing -1 lets NumPy infer the row count, so this
# keeps working if the sample count or the train/test split changes
# (previously the shapes were hard-coded as (200, 1) and (800, 1)).
test_target = np.reshape(test_target, (-1, 1))
train_target = np.reshape(train_target, (-1, 1))

## 梯度下降

In [None]:
# Weight column vector w with three entries, initialised to zeros.
w = np.zeros(shape=(3, 1))

# Learning rate (step size of each gradient-descent update).
alpha = 1e-2

# Number of gradient-descent iterations.
num_iterations = 200_000

In [None]:
# Batch gradient descent on the mean-squared-error cost.

# 1/m is the same on every iteration, so compute it once up front.
inv_n = 1 / len(train_data)

for i in range(num_iterations):
    # Current predictions (y_pred) and their residuals on the training set.
    predictions = train_data.dot(w)
    errors = predictions - train_target

    # Every 10000 iterations, print the current cost. It is measured with the
    # weights as they were before this iteration's update.
    if i % 10000 == 0:
        cost = inv_n * np.sum(errors**2)
        print(f"Iteration {i}: cost = {cost}")

    # Gradient of the cost with respect to w: (1/m) * X^T (Xw - y).
    gradients = inv_n * train_data.T.dot(errors)

    # Take one step against the gradient.
    w = w - alpha * gradients

print(f"Final weights: {w}")

Iteration 0: cost = 4399.792963956563
Iteration 10000: cost = 102.25487878054997
Iteration 20000: cost = 99.59340765783216
Iteration 30000: cost = 97.9346489477709
Iteration 40000: cost = 96.90082955566183
Iteration 50000: cost = 96.25650284978347
Iteration 60000: cost = 95.8549269970989
Iteration 70000: cost = 95.60464532479182
Iteration 80000: cost = 95.4486575703637
Iteration 90000: cost = 95.35143838804309
Iteration 100000: cost = 95.2908466483534
Iteration 110000: cost = 95.25308291869645
Iteration 120000: cost = 95.22954671912696
Iteration 130000: cost = 95.21487781075349
Iteration 140000: cost = 95.20573543106588
Iteration 150000: cost = 95.2000374538051
Iteration 160000: cost = 95.19648619625792
Iteration 170000: cost = 95.19427287921768
Iteration 180000: cost = 95.19289343200623
Iteration 190000: cost = 95.19203369317322
Final weights: [[ 64.31418794]
 [ -5.37382596]
 [-42.46981186]]


In [None]:
# Predict on the held-out test set with the weights learned by gradient descent.
y_pred = test_data.dot(w)

# Evaluate the model: mean squared error between predictions and true targets.
residuals = y_pred - test_target
mse = np.mean(residuals ** 2)
print("Mean Squared Error:", mse)

Mean Squared Error: 92.68733661637805


# Moore-Penrose 僞逆矩陣

In [None]:
# Moore-Penrose pseudo-inverse of the training matrix (NumPy computes it via SVD).
X_pinv = np.linalg.pinv(train_data)

# Closed-form least-squares weights: w = pinv(X) . y
w = X_pinv.dot(train_target)

In [None]:
# The final weights from the pseudo-inverse; very close to the gradient-descent result!

w

array([[ 64.61811225],
       [ -5.35758015],
       [-43.0011546 ]])

In [None]:
# Predict on the held-out test set with the pseudo-inverse weights.
y_pred = test_data.dot(w)

# Evaluate the model: mean squared error between predictions and true targets.
residuals = y_pred - test_target
mse = np.mean(residuals ** 2)
print("Mean Squared Error:", mse)

Mean Squared Error: 92.66477527219573


In [None]:
# Assemble a results table: the feature columns with the constant bias column
# (index 2) removed, followed by the predicted and the actual target values.
features_only = np.delete(test_data, 2, axis=1)
final_data = np.column_stack((features_only, y_pred, test_target))
final_data[:5]

array([[ 1.72509981,  1.        , 63.11395852, 76.80412353],
       [ 1.85547579,  0.        , 76.89618844, 79.06928953],
       [ 1.77600477,  1.        , 66.4033411 , 66.61281951],
       [ 1.70835009,  0.        , 67.38920339, 81.93468537],
       [ 1.7223177 ,  0.        , 68.29176391, 68.04010644]])

In [None]:
# Export the results table to an Excel file. Click the folder icon in the
# left-hand toolbar to download the file afterwards.

import pandas as pd

# Column headers, in the same order as the columns of final_data.
column_names = ['Height', 'Gender', 'Predicted_Weight','Actual_weight']
df = pd.DataFrame(final_data, columns=column_names)

# index=False keeps the DataFrame's row index out of the spreadsheet.
df.to_excel("output.xlsx", index=False)

## 使用Sklearn套件

In [None]:
from sklearn.linear_model import LinearRegression

# Create a LinearRegression object and train it on the training data in one
# expression (`fit` returns the fitted estimator itself).
model = LinearRegression().fit(train_data, train_target)

# Predictions for the held-out test set.
y_pred = model.predict(test_data)

# Print the learned parameters; they match the pseudo-inverse solution.
# NOTE: train_data still carries the constant column of ones, while
# LinearRegression fits its own intercept by default. That is why the bias
# shows up in `intercept_` and the coefficient of the ones column is 0.
print("Intercept: ", model.intercept_)
print("Coefficients: ", model.coef_)

Intercept:  [-43.0011546]
Coefficients:  [[64.61811225 -5.35758015  0.        ]]
