# 通过正态分布初始化模型参数的探究

 - https://zhuanlan.zhihu.com/p/474988236
 - https://www.zhihu.com/question/519668254/answer/2371885202


In [None]:
import torch

In [None]:
import torch.nn as nn

## 方阵

In [None]:
# Square case: draw an m x n matrix whose entries are i.i.d. N(0, 1/n).
m, n = 4096, 4096
# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: despite its name, `std` holds sqrt(n), i.e. the divisor; every entry
# of A ends up with standard deviation 1 / sqrt(n). float(n) avoids relying
# on implicit integer-to-float promotion inside torch.sqrt.
std = torch.sqrt(torch.tensor(float(n)))
A = torch.randn((m, n), device=device) / std

In [None]:
# Euclidean norm of the first row of A.
# torch.norm is deprecated; torch.linalg.vector_norm (default ord=2) is the
# maintained equivalent for vectors.
torch.linalg.vector_norm(A[0])

tensor(0.9846, device='cuda:0')

In [None]:
# Inner product of the first two rows of A (0 would mean they are orthogonal).
A[0].dot(A[1])

tensor(-0.0145, device='cuda:0')

In [None]:
# Euclidean norm of the first column of A.
# torch.norm is deprecated; torch.linalg.vector_norm (default ord=2) is the
# maintained equivalent for vectors.
torch.linalg.vector_norm(A[:, 0])

tensor(1.0096, device='cuda:0')

In [None]:
# Inner product of the first two columns of A.
A[:, 0].dot(A[:, 1])

tensor(0.0049, device='cuda:0')

In [None]:
# Gram matrix of the rows of A: B[i, j] is the inner product of rows i and j.
B = torch.matmul(A, A.T)

In [None]:
# Frobenius distance between the row Gram matrix B and the identity, scaled
# by 1 / m**2. torch.norm is deprecated; torch.linalg.matrix_norm (default
# ord='fro') returns the same value.
# NOTE(review): the per-entry RMS deviation is ||B - I||_F / m; dividing by
# m**2 shrinks it by a further factor of m, so this number understates how
# far B is from I. Confirm the intended normalization.
torch.linalg.matrix_norm(B - torch.eye(m, device=B.device)) / (m ** 2)

tensor(3.8150e-06, device='cuda:0')

In [None]:
# Singular values of A, in descending order.
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(A)
S

tensor([1.9951e+00, 1.9925e+00, 1.9893e+00,  ..., 1.1717e-03, 5.4775e-04,
        1.0965e-04], device='cuda:0')

## 行数大于列数

In [None]:
# Tall case (more rows than columns): m x n matrix with i.i.d. N(0, 1/n)
# entries.
m, n = 8192, 4096
# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: despite its name, `std` holds sqrt(n), i.e. the divisor; every entry
# of A ends up with standard deviation 1 / sqrt(n). float(n) avoids relying
# on implicit integer-to-float promotion inside torch.sqrt.
std = torch.sqrt(torch.tensor(float(n)))
A = torch.randn((m, n), device=device) / std

In [None]:
# Euclidean norm of the first row of A (a row has n entries).
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(A[0])

tensor(1.0168, device='cuda:0')

In [None]:
# Inner product of the first two rows of A.
A[0].dot(A[1])

tensor(-0.0087, device='cuda:0')

In [None]:
# Euclidean norm of the first column of A (a column has m entries).
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(A[:, 0])

tensor(1.4054, device='cuda:0')

In [None]:
# Inner product of the first two columns of A.
A[:, 0].dot(A[:, 1])

tensor(-0.0283, device='cuda:0')

In [None]:
# Gram matrix of the rows of A (m x m). Its rank is at most n, the number of
# columns of A.
B = torch.matmul(A, A.T)

In [None]:
# Display the row Gram matrix B = A @ A.T to inspect how close it is to the
# identity.
B

tensor([[ 1.0339e+00, -8.7324e-03,  2.8929e-03,  ..., -2.0385e-02,
          3.2770e-03,  2.8342e-02],
        [-8.7324e-03,  1.0134e+00,  2.2873e-03,  ..., -1.4398e-02,
         -7.2349e-03,  5.4106e-03],
        [ 2.8929e-03,  2.2873e-03,  9.9706e-01,  ...,  6.3395e-03,
         -2.0819e-03,  8.4754e-04],
        ...,
        [-2.0385e-02, -1.4398e-02,  6.3395e-03,  ...,  1.0033e+00,
         -7.1462e-04,  1.4631e-02],
        [ 3.2770e-03, -7.2349e-03, -2.0819e-03,  ..., -7.1462e-04,
          1.0030e+00,  6.1640e-03],
        [ 2.8342e-02,  5.4106e-03,  8.4754e-04,  ...,  1.4631e-02,
          6.1640e-03,  1.0216e+00]], device='cuda:0')

In [None]:
# Frobenius distance between the row Gram matrix B and the identity, scaled
# by 1 / m**2. torch.norm is deprecated; torch.linalg.matrix_norm (default
# ord='fro') returns the same value.
# NOTE(review): the per-entry RMS deviation is ||B - I||_F / m; dividing by
# m**2 shrinks it by a further factor of m, so this number understates how
# far B is from I. Confirm the intended normalization.
torch.linalg.matrix_norm(B - torch.eye(m, device=B.device)) / (m ** 2)

tensor(1.9067e-06, device='cuda:0')

In [None]:
# Singular values of A, in descending order.
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(A)
S

tensor([2.4185, 2.4107, 2.4054,  ..., 0.4217, 0.4186, 0.4174], device='cuda:0')

## 行数小于列数

In [None]:
# Wide case (fewer rows than columns): m x n matrix with i.i.d. N(0, 1/n)
# entries.
m, n = 4096, 8192
# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE: despite its name, `std` holds sqrt(n), i.e. the divisor; every entry
# of A ends up with standard deviation 1 / sqrt(n). float(n) avoids relying
# on implicit integer-to-float promotion inside torch.sqrt.
std = torch.sqrt(torch.tensor(float(n)))
A = torch.randn((m, n), device=device) / std

In [None]:
# Euclidean norm of the first row of A (a row has n entries).
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(A[0])

tensor(0.9898, device='cuda:0')

In [None]:
# Inner product of the first two rows of A.
A[0].dot(A[1])

tensor(0.0065, device='cuda:0')

In [None]:
# Euclidean norm of the first column of A (a column has m entries).
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(A[:, 0])

tensor(0.7043, device='cuda:0')

In [None]:
# Inner product of the first two columns of A.
A[:, 0].dot(A[:, 1])

tensor(-0.0015, device='cuda:0')

In [None]:
# Gram matrix of the rows of A (m x m): B[i, j] is the inner product of
# rows i and j.
B = torch.matmul(A, A.T)

In [None]:
# Display the row Gram matrix B = A @ A.T to inspect how close it is to the
# identity.
B

tensor([[ 0.9797,  0.0065,  0.0037,  ..., -0.0123,  0.0227, -0.0084],
        [ 0.0065,  1.0072,  0.0059,  ...,  0.0135,  0.0029, -0.0136],
        [ 0.0037,  0.0059,  1.0029,  ..., -0.0023, -0.0023, -0.0039],
        ...,
        [-0.0123,  0.0135, -0.0023,  ...,  0.9733, -0.0081, -0.0147],
        [ 0.0227,  0.0029, -0.0023,  ..., -0.0081,  1.0180, -0.0119],
        [-0.0084, -0.0136, -0.0039,  ..., -0.0147, -0.0119,  0.9942]],
       device='cuda:0')

In [None]:
# Frobenius distance between the row Gram matrix B and the identity, scaled
# by 1 / m**2. torch.norm is deprecated; torch.linalg.matrix_norm (default
# ord='fro') returns the same value.
# NOTE(review): the per-entry RMS deviation is ||B - I||_F / m; dividing by
# m**2 shrinks it by a further factor of m, so this number understates how
# far B is from I. Confirm the intended normalization.
torch.linalg.matrix_norm(B - torch.eye(m, device=B.device)) / (m ** 2)

tensor(2.6986e-06, device='cuda:0')

In [None]:
# Singular values of A, in descending order.
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(A)
S

tensor([1.7051, 1.7013, 1.6997,  ..., 0.2995, 0.2973, 0.2936], device='cuda:0')

# QR分解

## 方阵

In [None]:
# Matrix size (m x n).
m, n = 1024, 1024

# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Draw a random standard-normal matrix.
A = torch.randn(m, n, device=device)

# Reduced QR factorization. torch.linalg.qr replaces the deprecated torch.qr;
# its default mode='reduced' corresponds to torch.qr's default some=True.
Q, _ = torch.linalg.qr(A)

# Conjugate-transpose Q. This does not make Q "more" orthogonal: it only
# turns the orthonormal columns produced by QR into orthonormal rows (for a
# real tensor it is a plain transpose).
Q = Q.conj().T

# Check orthogonality. Q is square here, so Q^T Q should be the identity.
print("Q shape:", Q.shape)
print("Q matrix:\n", Q)
print("Q^T * Q:\n", torch.mm(Q.T, Q))

Q shape: torch.Size([1024, 1024])
Q matrix:
 tensor([[-0.0225, -0.0554,  0.0238,  ..., -0.0065,  0.0304,  0.0182],
        [-0.0268, -0.0422, -0.0174,  ..., -0.0063,  0.0155, -0.0461],
        [ 0.0087,  0.0264, -0.0156,  ..., -0.0812,  0.0171,  0.0135],
        ...,
        [ 0.0065,  0.0251,  0.0473,  ..., -0.0071,  0.0236, -0.0290],
        [ 0.0112, -0.0542, -0.0181,  ..., -0.0931,  0.0058,  0.0254],
        [-0.0090, -0.0227, -0.0003,  ...,  0.0092,  0.0117,  0.0205]],
       device='cuda:0')
Q^T * Q:
 tensor([[ 1.0000e+00,  2.7940e-08, -1.0245e-08,  ..., -5.5181e-08,
          2.3283e-09, -4.7497e-08],
        [ 2.7940e-08,  1.0000e+00, -3.9116e-08,  ..., -1.6764e-08,
          3.9116e-08,  1.3039e-08],
        [-1.0245e-08, -3.9116e-08,  1.0000e+00,  ..., -1.3970e-08,
          1.4901e-08,  1.8626e-09],
        ...,
        [-5.5181e-08, -1.6764e-08, -1.3970e-08,  ...,  1.0000e+00,
          2.7940e-09,  1.4901e-08],
        [ 2.3283e-09,  3.9116e-08,  1.4901e-08,  ...,  2.7940e

The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2416.)
  Q, _ = torch.qr(A)


In [None]:
# Singular values of Q, in descending order (all 1 for an orthogonal matrix).
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(Q)
S

tensor([1.0005, 1.0005, 1.0005,  ..., 1.0002, 1.0002, 1.0002], device='cuda:0')

In [None]:
# Inner product of the first two rows of Q.
Q[0].dot(Q[1])

tensor(-1.9558e-08, device='cuda:0')

In [None]:
# Inner product of the first two columns of Q.
Q[:, 0].dot(Q[:, 1])

tensor(2.5146e-08, device='cuda:0')

In [None]:
# Euclidean norm of the first row of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[0])

tensor(1., device='cuda:0')

In [None]:
# Euclidean norm of the first column of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[:, 0])

tensor(1., device='cuda:0')

## 行数大于列数

In [None]:
# Matrix size (m x n): more rows than columns.
m, n = 8192, 1024

# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Draw a random standard-normal matrix.
A = torch.randn(m, n, device=device)

# Reduced QR factorization: Q is m x n with orthonormal columns.
# torch.linalg.qr replaces the deprecated torch.qr; its default
# mode='reduced' corresponds to torch.qr's default some=True.
Q, _ = torch.linalg.qr(A)

# Conjugate-transpose Q so that it becomes n x m with orthonormal rows. The
# transpose does not create orthogonality; it only moves it from the columns
# to the rows (for a real tensor it is a plain transpose).
Q = Q.conj().T

# Check orthogonality. Q is now n x m with n < m, so the identity to expect
# is the n x n product Q Q^T. The m x m product Q^T Q is only a rank-n
# projection and cannot be the identity, so it is the wrong check here.
print("Q shape:", Q.shape)
print("Q matrix:\n", Q)
print("Q * Q^T:\n", torch.mm(Q, Q.T))

Q shape: torch.Size([1024, 8192])
Q matrix:
 tensor([[-1.7934e-03, -4.2859e-03,  6.1266e-03,  ...,  1.3964e-03,
          2.6025e-02,  5.0162e-03],
        [ 3.3993e-03, -6.4243e-03,  1.7593e-02,  ..., -1.0766e-02,
          6.3614e-03, -7.2805e-03],
        [-1.4938e-02, -1.5316e-02, -1.0364e-02,  ..., -2.6297e-03,
         -1.7472e-02, -3.2588e-03],
        ...,
        [-1.3105e-05,  1.4949e-02,  3.9246e-03,  ...,  4.3051e-03,
         -8.5429e-03,  2.3750e-02],
        [-1.0177e-02,  8.7232e-03, -1.2767e-02,  ...,  1.0789e-03,
         -7.4314e-03, -8.8782e-04],
        [-5.8803e-03,  3.7844e-03,  2.3275e-03,  ...,  1.4252e-02,
         -1.7017e-02, -2.3851e-02]], device='cuda:0')
Q^T * Q:
 tensor([[ 0.1220,  0.0003, -0.0006,  ...,  0.0006,  0.0045,  0.0016],
        [ 0.0003,  0.1283,  0.0007,  ...,  0.0021, -0.0023, -0.0041],
        [-0.0006,  0.0007,  0.1190,  ..., -0.0011,  0.0032,  0.0022],
        ...,
        [ 0.0006,  0.0021, -0.0011,  ...,  0.1322, -0.0011, -0.0048],
   

In [None]:
# Singular values of Q, in descending order.
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(Q)
S

tensor([1.0002, 1.0002, 1.0002,  ..., 1.0000, 1.0000, 1.0000], device='cuda:0')

In [None]:
# Inner product of the first two rows of Q.
Q[0].dot(Q[1])

tensor(-7.7998e-09, device='cuda:0')

In [None]:
# Inner product of the first two columns of Q.
Q[:, 0].dot(Q[:, 1])

tensor(0.0003, device='cuda:0')

In [None]:
# Euclidean norm of the first row of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[0])

tensor(1.0000, device='cuda:0')

In [None]:
# Euclidean norm of the first column of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[:, 0])

tensor(0.3493, device='cuda:0')

## 行数小于列数

In [None]:
# Matrix size (m x n): fewer rows than columns.
m, n = 1024, 8192

# Fall back to the CPU when no GPU is present so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Draw a random standard-normal matrix.
A = torch.randn(m, n, device=device)

# Reduced QR factorization. torch.linalg.qr replaces the deprecated torch.qr;
# its default mode='reduced' corresponds to torch.qr's default some=True.
# NOTE(review): with m < n the reduced Q is square (m x m), so this cell does
# not yield an m x n matrix with orthonormal rows. If that is the goal,
# factorize A.T instead and transpose the resulting Q -- confirm the intent.
Q, _ = torch.linalg.qr(A)

# Conjugate-transpose Q. This does not make Q "more" orthogonal; for a real
# tensor it is a plain transpose of an already orthogonal square matrix.
Q = Q.conj().T

# Check orthogonality. Q is square here, so Q^T Q should be the identity.
print("Q shape:", Q.shape)
print("Q matrix:\n", Q)
print("Q^T * Q:\n", torch.mm(Q.T, Q))

Q shape: torch.Size([1024, 1024])
Q matrix:
 tensor([[-0.0092,  0.0026, -0.0276,  ..., -0.0356,  0.0119, -0.0034],
        [ 0.0176, -0.0163, -0.0362,  ..., -0.0197,  0.0433,  0.0550],
        [ 0.0220,  0.0515, -0.0179,  ..., -0.0684,  0.0321,  0.0377],
        ...,
        [ 0.0153, -0.0602, -0.0049,  ...,  0.0074, -0.0618,  0.0156],
        [ 0.0142,  0.0006, -0.0010,  ...,  0.0149, -0.0084,  0.0506],
        [ 0.0113,  0.0266,  0.0025,  ..., -0.0470, -0.0057, -0.0388]],
       device='cuda:0')
Q^T * Q:
 tensor([[ 1.0000e+00,  3.9116e-08,  4.3772e-08,  ...,  2.8871e-08,
         -1.4901e-08,  9.3132e-10],
        [ 3.9116e-08,  1.0000e+00, -2.9802e-08,  ..., -3.3528e-08,
          9.3132e-09,  2.1886e-08],
        [ 4.3772e-08, -2.9802e-08,  1.0000e+00,  ...,  5.1223e-09,
          4.1910e-09, -2.9802e-08],
        ...,
        [ 2.8871e-08, -3.3528e-08,  5.1223e-09,  ...,  1.0000e+00,
          2.3749e-08, -4.1910e-08],
        [-1.4901e-08,  9.3132e-09,  4.1910e-09,  ...,  2.3749e

In [None]:
# Singular values of Q, in descending order (all 1 for an orthogonal matrix).
# torch.svd is deprecated; torch.linalg.svdvals returns only the singular
# values, so the unused U and V factors are never computed.
S = torch.linalg.svdvals(Q)
S

tensor([1.0005, 1.0005, 1.0005,  ..., 1.0002, 1.0002, 1.0002], device='cuda:0')

In [None]:
# Inner product of the first two rows of Q.
Q[0].dot(Q[1])

tensor(-9.3132e-10, device='cuda:0')

In [None]:
# Inner product of the first two columns of Q.
Q[:, 0].dot(Q[:, 1])

tensor(4.6566e-08, device='cuda:0')

In [None]:
# Euclidean norm of the first row of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[0])

tensor(1., device='cuda:0')

In [None]:
# Euclidean norm of the first column of Q.
# torch.norm is deprecated; torch.linalg.vector_norm is its replacement.
torch.linalg.vector_norm(Q[:, 0])

tensor(1., device='cuda:0')