# Exercise 1 - NumPy (SOLUTION)

Let's practice working with NumPy functions.  You may find NumPy's [reference documentation](https://numpy.org/doc/stable/reference/arrays.html) useful.

In [1]:
import numpy as np

**1. Create the input data array with the numbers `1` to `500_000_000`.** 

In [2]:
# np.arange excludes its stop value, so stop one past the last number wanted.
LAST_VALUE = 500_000_000
arr = np.arange(1, LAST_VALUE + 1)
arr

array([        1,         2,         3, ..., 499999998, 499999999,
       500000000], shape=(500000000,))

**2. Calculate how large the array is in GB with `nbytes`.** _Hint: 1 GB is `1e9` bytes._

In [3]:
BYTES_PER_GB = 1e9  # decimal gigabyte, as given in the hint
arr.nbytes / BYTES_PER_GB  # nbytes is the total number of bytes taken by the array's elements

4.0

**3. How many dimensions does the array have?**

In [4]:
# Number of axes of the array; arr.ndim and len(arr.shape) give the same answer
np.ndim(arr)

1

**4. How many elements does the array have?**

In [5]:
# Total element count is the product of every axis length (same value as arr.size);
# for a 1D array that is simply arr.shape[0]
np.size(arr)

500000000

**5. What is the shape of the array?**

In [6]:
# Tuple holding the length of each axis (same value as arr.shape)
np.shape(arr)

(500000000,)

**6. Create a new array with `5_000_000` elements containing equally spaced values from `0` to `1000` (inclusive).**

In [7]:
# linspace includes the stop value by default (endpoint=True), so 1000 is the last sample.
arr = np.linspace(start=0, stop=1000, num=5_000_000)
arr

array([0.0000000e+00, 2.0000004e-04, 4.0000008e-04, ..., 9.9999960e+02,
       9.9999980e+02, 1.0000000e+03], shape=(5000000,))

**7. Create a random array that is `10_000` by `5_000`.**

In [8]:
# Use a seeded Generator so that Restart & Run All reproduces the same "random" array
# (the legacy np.random.rand draws from unseeded global state).
SEED = 42
rng = np.random.default_rng(SEED)
# Uniform samples in [0, 1); note that Generator.random takes the shape as a single tuple.
arr = rng.random((10_000, 5_000))
arr

array([[0.17659865, 0.38648474, 0.63742629, ..., 0.86629327, 0.91934898,
        0.82434493],
       [0.97551894, 0.50853995, 0.55525147, ..., 0.78958867, 0.93456844,
        0.95708043],
       [0.16804187, 0.85185444, 0.55105745, ..., 0.29746959, 0.85406659,
        0.83115339],
       ...,
       [0.17329175, 0.63280969, 0.92239834, ..., 0.59682226, 0.71442996,
        0.36697754],
       [0.00230553, 0.90562461, 0.16170203, ..., 0.30949731, 0.36002029,
        0.99357106],
       [0.04347126, 0.20688976, 0.2916649 , ..., 0.66340966, 0.93985517,
        0.66640657]], shape=(10000, 5000))

**8. Sort that array.**

In [9]:
# np.sort returns a sorted copy and works along the last axis by default,
# so each row is sorted independently rather than the array as a whole.
arr = np.sort(arr, axis=-1)
arr

array([[4.06280759e-05, 8.52379296e-05, 8.65952159e-05, ...,
        9.98868260e-01, 9.99672593e-01, 9.99787619e-01],
       [1.31693984e-05, 4.21413391e-05, 1.17697952e-04, ...,
        9.99787390e-01, 9.99833504e-01, 9.99835725e-01],
       [3.93602191e-05, 4.95429678e-05, 3.17867547e-04, ...,
        9.99723825e-01, 9.99752935e-01, 9.99768837e-01],
       ...,
       [1.86931672e-04, 5.58848933e-04, 5.80630871e-04, ...,
        9.99242630e-01, 9.99531669e-01, 9.99705199e-01],
       [1.16845958e-04, 2.89016878e-04, 6.59209124e-04, ...,
        9.99027411e-01, 9.99148800e-01, 9.99232332e-01],
       [1.96891537e-04, 4.28666361e-04, 6.92506173e-04, ...,
        9.99689969e-01, 9.99725993e-01, 9.99981385e-01]],
      shape=(10000, 5000))

**9. Reshape the array so that its last dimension has length `5`.**

In [10]:
LAST_DIM = 5
# -1 asks NumPy to infer that dimension from the total element count,
# so this is equivalent to arr.reshape((10_000_000, 5)).
arr = arr.reshape(-1, LAST_DIM)
arr

array([[4.06280759e-05, 8.52379296e-05, 8.65952159e-05, 2.34144996e-04,
        4.46441980e-04],
       [1.13515948e-03, 1.24167905e-03, 1.50160141e-03, 1.50230035e-03,
        1.58054283e-03],
       [1.72709475e-03, 2.17918485e-03, 2.19080665e-03, 2.40222725e-03,
        2.52510764e-03],
       ...,
       [9.96267828e-01, 9.96329699e-01, 9.96330732e-01, 9.96984723e-01,
        9.97255448e-01],
       [9.97501828e-01, 9.97884805e-01, 9.98206395e-01, 9.98474393e-01,
        9.98821604e-01],
       [9.98978774e-01, 9.99538140e-01, 9.99689969e-01, 9.99725993e-01,
        9.99981385e-01]], shape=(10000000, 5))

**10. Find the sum of each row (axis 0 is the row index, so each row is summed along axis 1).**

In [17]:
# Summing along axis 1 collapses the columns, leaving one total per row.
# np.sum(arr, axis=1) is an equally valid spelling.
arr_sum = arr.sum(axis=1)
arr_sum

array([8.93048197e-04, 6.96128312e-03, 1.10244211e-02, ...,
       4.98316843e+00, 4.99088903e+00, 4.99791426e+00], shape=(10000000,))

**11. Normalize each row of the array by dividing it by the row sum you just computed, using broadcasting.**

In [21]:
# Turn the 1-D row sums into a single-column 2-D array so broadcasting stretches
# each row's total across every entry of that row.
row_totals = arr_sum.reshape(-1, 1)
arr_normalized = arr / row_totals
arr_normalized

array([[0.04549371, 0.09544606, 0.09696589, 0.26218629, 0.49990805],
       [0.16306756, 0.17836928, 0.21570756, 0.21580797, 0.22704763],
       [0.15666081, 0.19766887, 0.19872305, 0.21790053, 0.22904673],
       ...,
       [0.19992658, 0.199939  , 0.1999392 , 0.20007044, 0.20012477],
       [0.19986456, 0.19994129, 0.20000573, 0.20005943, 0.20012899],
       [0.19987913, 0.19999105, 0.20002143, 0.20002864, 0.20007974]],
      shape=(10000000, 5))

**Extra credit: Prove that your normalized array is actually normalized.  (Hint: Does each row sum to 1 now?)**

In [None]:
# Floating-point division and summation leave tiny rounding errors, so a row can sum to
# 0.9999999999999999 instead of exactly 1.0. Compare within a tolerance
# (assert_allclose defaults to rtol=1e-7) rather than demanding exact equality.
np.testing.assert_allclose(np.sum(arr_normalized, axis=1), 1.0)