# Getting started with NumPy

In [1]:
import numpy as np

## Getting used to vectorization and broadcasting

First, let's generate some test data. Generate a two-dimensional array with 10 rows and 4 columns, from a normal distribution with mean = 100 and standard deviation 20.

In [2]:
# Draw from a locally seeded generator so that the array is identical on every
# "Restart & Run All". Outputs saved from earlier, unseeded runs will differ
# until the notebook is re-executed. A RandomState instance is used instead of
# np.random.seed so the global random state is left untouched.
data = np.random.RandomState(42).normal(loc=100, scale=20, size=(10, 4))
data

array([[  92.30655148,  129.9311227 ,   70.64847028,   72.44558008],
       [ 104.30756145,  132.9652361 ,   91.08058179,  110.95906757],
       [  85.84430616,  114.53768115,  130.03054066,  142.91838379],
       [ 128.51354202,  118.19484504,   98.75239905,   66.12932731],
       [  50.82601197,   74.90358652,  129.22237526,  105.53799185],
       [  88.97867955,   65.18964801,  112.96644632,   92.29403921],
       [ 147.51887021,   92.71083474,   78.34825529,   91.0673518 ],
       [  96.73200958,   72.20153736,  116.26752552,   90.20263798],
       [ 133.63173702,  102.79199187,  100.11504546,  101.83080595],
       [ 104.33245915,   91.3606158 ,  120.61046555,  115.86601472]])

First, round the data to one decimal place to keep the output readable. Then display the last 2 columns of rows 1-5 (let's agree that we start counting from 1 when using normal language).

In [3]:
# Keep one decimal place so the printed arrays are easier to read.
data = np.round(data, decimals=1)
data

array([[  92.3,  129.9,   70.6,   72.4],
       [ 104.3,  133. ,   91.1,  111. ],
       [  85.8,  114.5,  130. ,  142.9],
       [ 128.5,  118.2,   98.8,   66.1],
       [  50.8,   74.9,  129.2,  105.5],
       [  89. ,   65.2,  113. ,   92.3],
       [ 147.5,   92.7,   78.3,   91.1],
       [  96.7,   72.2,  116.3,   90.2],
       [ 133.6,  102.8,  100.1,  101.8],
       [ 104.3,   91.4,  120.6,  115.9]])

In [4]:
# Rows 1-5 in 1-based terms are indices 0-4 (`:5`); `-2:` selects the last two columns.
data[:5, -2:]

array([[  70.6,   72.4],
       [  91.1,  111. ],
       [ 130. ,  142.9],
       [  98.8,   66.1],
       [ 129.2,  105.5]])

Now, set all data points > 130 to 0.

In [5]:
# np.where yields 0 wherever the value exceeds 130 and the original value
# elsewhere. The comparison is strict, so a value of exactly 130.0 is kept.
data = np.where(data > 130, 0, data)
data

array([[  92.3,  129.9,   70.6,   72.4],
       [ 104.3,    0. ,   91.1,  111. ],
       [  85.8,  114.5,  130. ,    0. ],
       [ 128.5,  118.2,   98.8,   66.1],
       [  50.8,   74.9,  129.2,  105.5],
       [  89. ,   65.2,  113. ,   92.3],
       [   0. ,   92.7,   78.3,   91.1],
       [  96.7,   72.2,  116.3,   90.2],
       [   0. ,  102.8,  100.1,  101.8],
       [ 104.3,   91.4,  120.6,  115.9]])

Take the square root of all data points.

In [6]:
# Element-wise square root; the entries set to 0 in the previous step remain 0.
data = np.sqrt(data)
data

array([[  9.6072889 ,  11.39736812,   8.40238062,   8.50881895],
       [ 10.21273715,   0.        ,   9.544632  ,  10.53565375],
       [  9.26282894,  10.70046728,  11.40175425,   0.        ],
       [ 11.33578405,  10.87198234,   9.93981891,   8.13019065],
       [  7.12741187,   8.65447861,  11.36661779,  10.27131929],
       [  9.43398113,   8.0746517 ,  10.63014581,   9.6072889 ],
       [  0.        ,   9.62808392,   8.84872872,   9.544632  ],
       [  9.83361582,   8.49705831,  10.78424777,   9.49736806],
       [  0.        ,  10.13903348,  10.00499875,  10.0895986 ],
       [ 10.21273715,   9.56033472,  10.98180313,  10.76568623]])

Now, compute the mean and the standard deviation over all data points.

In [7]:
# No axis argument: both statistics are taken over all elements of the array.
mean = np.mean(data)
sd = np.std(data)
mean, sd

(8.8351381919661058, 3.1040832992788499)

Then, compute the row means and standard deviations.

In [8]:
# axis=1 reduces across the columns, leaving one value per row.
row_ms = np.mean(data, axis=1)
row_sds = np.std(data, axis=1)
row_ms, row_sds

(array([  9.47896415,   7.57325572,   7.84126262,  10.06944399,
          9.35495689,   9.43651688,   7.00536116,   9.65307249,
          7.55840771,  10.38014031]),
 array([ 1.20384331,  4.38700327,  4.59234151,  1.22731324,  1.60772559,
         0.90947737,  4.05584948,  0.81742982,  4.36411192,  0.55017016]))

Finally, compute column means and standard deviations.

In [9]:
# axis=0 reduces across the rows, leaving one value per column.
col_ms = np.mean(data, axis=0)
col_sds = np.std(data, axis=0)
col_ms, col_sds

(array([  7.7026385 ,   8.75234585,  10.19051278,   8.69505564]),
 array([ 3.97986936,  3.09458271,  0.97644732,  3.00433143]))

Now, compute the column means (and standard deviations) without using NumPy's mean and std functions. We work through the second column as an example.

In [10]:
# Show the current array again as a reference for the manual calculation below.
data

array([[  9.6072889 ,  11.39736812,   8.40238062,   8.50881895],
       [ 10.21273715,   0.        ,   9.544632  ,  10.53565375],
       [  9.26282894,  10.70046728,  11.40175425,   0.        ],
       [ 11.33578405,  10.87198234,   9.93981891,   8.13019065],
       [  7.12741187,   8.65447861,  11.36661779,  10.27131929],
       [  9.43398113,   8.0746517 ,  10.63014581,   9.6072889 ],
       [  0.        ,   9.62808392,   8.84872872,   9.544632  ],
       [  9.83361582,   8.49705831,  10.78424777,   9.49736806],
       [  0.        ,  10.13903348,  10.00499875,  10.0895986 ],
       [ 10.21273715,   9.56033472,  10.98180313,  10.76568623]])

In [11]:
# Second column: index 1, because indexing is zero-based.
col2 = data[:,1]
col2

array([ 11.39736812,   0.        ,  10.70046728,  10.87198234,
         8.65447861,   8.0746517 ,   9.62808392,   8.49705831,
        10.13903348,   9.56033472])

In [12]:
# Mean by hand: the total of the entries divided by the number of entries.
mean_col2 = np.sum(col2) / col2.size
mean_col2

8.7523458484056107

In [13]:
# Variance by hand: the mean of the squared deviations from the column mean,
# dividing by n (not n - 1).
var_col2 = np.sum((col2 - mean_col2) ** 2) / col2.size
var_col2

9.5764421498970513

In [14]:
# The standard deviation is the square root of the variance computed above.
sd_col2 = np.sqrt(var_col2)
sd_col2

3.0945827101399392

Now double-check the results using NumPy's mean and std functions.

In [15]:
# Reference value from NumPy; it should equal mean_col2 computed by hand.
np.mean(col2)

8.7523458484056107

In [16]:
# np.std divides by n by default (ddof=0), so it should equal the manual sd_col2.
np.std(col2)

3.0945827101399392

In [17]:
# Sum of squared deviations, i.e. the numerator of var_col2 before dividing by n.
((col2 - mean_col2)**2).sum()

95.76442149897052

## Simple linear regression

In [None]:
For practice with ndarrays, we are going to compute linear regression ourselves,