### Before diving into pandas, let's take a look at a very useful data structure known as a dictionary (known as a map in Java)

In [46]:
id_list = [1, 2, 3, 4]
name_list = ['a', 'b', 'c', 'd']
# Associate each id with the name at the same position (id -> name).
# zip walks both lists in lockstep, so no manual indexing is needed.
test_map = dict(zip(id_list, name_list))
print(test_map)

{1: 'a', 2: 'b', 3: 'c', 4: 'd'}


In [47]:
# Look up the value stored under the key 1
print(test_map[1])

a


In [48]:
# Now reverse the mapping: each name becomes the key and its id the value (name -> id)
test_map = dict(zip(name_list, id_list))
print(test_map)

{'d': 4, 'c': 3, 'a': 1, 'b': 2}


In [49]:
# With the reversed mapping a name is the key and the lookup returns its id
print(test_map['d'])

4


In [59]:
# dictionaries can even be created dynamically
# numpy is imported in this cell because it runs before the notebook's
# main `import numpy as np` cell
import numpy as np

nodes_in_each_layer = [5,6,2,3,4]
# one weight matrix per pair of consecutive layers; derived from the list so
# the two values can never get out of sync
number_of_layers = len(nodes_in_each_layer) - 1
param_dict = {}

# W<k> maps the n_in nodes of one layer to the n_out nodes of the next one,
# so its shape is (n_out, n_in)
layer_sizes = zip(nodes_in_each_layer[:-1], nodes_in_each_layer[1:])
for layer, (n_in, n_out) in enumerate(layer_sizes, start=1):
    param_dict['W'+str(layer)] = np.random.randn(n_out, n_in)*0.01

print('layer1\n',param_dict['W1'])
print(param_dict['W1'].shape)

print('\nlayer2\n',param_dict['W2'])
print(param_dict['W2'].shape)



layer1
 [[ -7.91449307e-03  -4.41762641e-04   5.40548181e-03   1.69230609e-02
    6.73687632e-03]
 [  3.52720347e-04   1.33281295e-02   9.09199554e-03   9.12405530e-06
    6.05758758e-03]
 [  1.95896618e-02  -1.86888759e-02  -7.67204019e-03  -2.33989125e-03
    5.20037973e-03]
 [  8.97858731e-03  -7.96697188e-03   2.00172154e-03  -1.19107666e-02
    8.96499167e-03]
 [  1.04924477e-02  -7.65905264e-03   6.03135822e-03   2.14456134e-03
   -1.02894787e-03]
 [  1.26178863e-02  -9.12364325e-03   1.38621310e-02  -1.61664669e-02
   -5.98071066e-03]]
(6, 5)

layer2
 [[-0.01557193 -0.00725576 -0.00951629  0.00868157  0.00261935 -0.00225334]
 [-0.00275036 -0.00553528  0.01576735 -0.01720711 -0.00541849  0.01089524]]
(2, 6)


What we have done above is actually **weight initialization** for a **neural network (do not worry if you don't know about it)**

### Numpy array vs python list

Numpy arrays are not like regular python or java arrays. They are usually treated as **vectors** or **matrices**. Let's see what that means.



In [50]:
import numpy as np

In [4]:
# NOTE: the name `list` rebinds python's built-in `list` type for the rest of
# the session; the cells below refer to this variable by that name
list = [1,2,3]
np_array = np.array(list)

In [6]:
# `+` on python lists concatenates them
list + list

[1, 2, 3, 1, 2, 3]

In [7]:
# `+` on numpy arrays adds them element by element
np_array + np_array

array([2, 4, 6])

In [9]:
# Multiplying two python lists is not defined, so python raises a TypeError.
# The error is caught and printed so that the remaining cells still run on "Run All".
try:
    list * list
except TypeError as err:
    print('TypeError:', err)

TypeError: can't multiply sequence by non-int of type 'list'

In [10]:
# note: `*` on numpy arrays multiplies element by element
# (this is not matrix multiplication, which we will come to in a short while)
np_array * np_array

array([1, 4, 9])

In [11]:
# square root of every element
np.sqrt(np_array)

array([ 1.        ,  1.41421356,  1.73205081])

In [13]:
# natural logarithm of every element
np.log(np_array)

array([ 0.        ,  0.69314718,  1.09861229])

In [15]:
# multiplying by a scalar scales every element
4*np_array

array([ 4,  8, 12])



### A Practical use case where numpy arrays are a much better option


Numpy arrays are **highly optimized for speed**. For the purpose of demonstration, we will compute the same vector multiplication once with python lists and once with numpy arrays and compare their speed. These types of operations are very common in machine learning (e.g. Logistic Regression, Neural Networks, Deep Learning etc.).

In [21]:
import time
import numpy as np

n_features = 10000000
np_x = np.random.uniform(0,1,(n_features,1))   # column vector, shape (n_features, 1)
np_W = np.random.randn(1,n_features)           # row vector, shape (1, n_features)
b = 0

# numpy array to list conversion
# .tolist() is used instead of list(...): it yields plain python floats, and
# the built-in name `list` was rebound to a variable earlier in this notebook
x = np_x[:,0].tolist()
W = np_W[0,:].tolist()

# python list: accumulate the dot product one element at a time
# perf_counter is a monotonic, high-resolution clock intended for timing code
start = time.perf_counter()
z = 0

for i in range(n_features) :
    z = z + x[i]*W[i]

z = z + b

list_result = z

end = time.perf_counter()
t1 = (end - start)*1000

# numpy array
start = time.perf_counter()
#note this is a matrix multiplication: (1, n_features) x (n_features, 1) -> (1, 1)
z = np.dot(np_W,np_x) + b
end = time.perf_counter()
t2 = (end - start)*1000

numpy_result = z[0,0]

# The two sums are accumulated in a different order, so the results can differ
# in the last few digits; compare with a floating point tolerance and report
# a mismatch instead of staying silent
if np.isclose(list_result, numpy_result) :
    print('Both of the methods output : ',list_result)
else :
    print('The two methods disagree : ',list_result,' vs ',numpy_result)

print('But list method took ',t1,'milliseconds while numpy array method took ',t2)

Both of the methods output :  -526.0482278063494
But list method took  2426.1364936828613 milliseconds while numpy array method took  10.003805160522461


Here we have tried to simulate the *Wx + b* formula which is used in logistic regression, neural networks etc. Using numpy arrays has reduced the computation time by a factor of almost 250, which is significant for experimentation purposes.

### Adding/Removing Elements from numpy array

In [42]:
# `array` is a 2x3 matrix of random normal values; `values` is a single 1x3 row
# that the cells below append to / insert into it
array = np.random.randn(2,3)
values = np.random.randn(1,3) 
print(array)


[[ 0.97885806 -0.63627034 -1.86576788]
 [-1.71888712  0.63494951 -0.94567484]]


In [43]:
# the 1x3 row that will be added to `array`
print(values)

[[-0.24827296 -1.63613672  1.71570443]]


In [44]:
# Appends `values` to the end of `array` as a new last row (axis=0)
# the result is returned as a new array; `array` itself is not modified

np.append(array,values,axis=0) 

array([[ 0.97885806, -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484],
       [-0.24827296, -1.63613672,  1.71570443]])

In [45]:
# Inserts values into array at index 0 (as a new first row)
# please note that the append statement above did not change 'array':
# np.append and np.insert return a new array, so the result has to be
# assigned (e.g. array = np.insert(...)) if we want to keep it
np.insert(array,0,values,axis=0)

array([[-0.24827296, -1.63613672,  1.71570443],
       [ 0.97885806, -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [46]:
# Deletes row at index 1 of array (axis=0 selects rows)
# the result is returned as a new array
np.delete(array,1,axis=0) 

array([[ 0.97885806, -0.63627034, -1.86576788]])

In [47]:
# `array` itself still has both rows
array

array([[ 0.97885806, -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [48]:
#Deletes column at index 0 of array (axis=1 selects columns)
np.delete(array,0,axis=1)

array([[-0.63627034, -1.86576788],
       [ 0.63494951, -0.94567484]])

In [49]:
# again, `array` itself is unchanged
array

array([[ 0.97885806, -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [50]:
# plain assignment does not copy any data:
# `array_copy` becomes a second name for the same array object
array_copy = array
array_copy

array([[ 0.97885806, -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [51]:
# overwrite the element in row 0, column 0 through the second name
array_copy[0,0] = 9.99

In [52]:
# the new value shows up in array_copy ...
array_copy

array([[ 9.99      , -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [53]:
# note here that `array_copy = array` did not copy anything: both names
# refer to the same array object (like object references in java)
# modifying array_copy has also modified array
array

array([[ 9.99      , -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [54]:
# np.copy creates a new array with its own copy of the data
array_copy = np.copy(array)

In [55]:
# this write only affects the copy
array_copy[0,0] =  4

In [56]:
# the copy now holds 4 in row 0, column 0
array_copy

array([[ 4.        , -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

In [57]:
# notice that modifying array_copy no longer modifies array
# (row 0, column 0 of array is still 9.99)
array

array([[ 9.99      , -0.63627034, -1.86576788],
       [-1.71888712,  0.63494951, -0.94567484]])

One important thing to note here is the axis argument. Axis **0 refers to rows while axis 1 refers to columns**.
If we are appending or inserting along axis 0, both arrays must have an equal number of columns (all axes other than axis 0
must be equal).

### INDEXING/SLICING/SUBSETTING

In [3]:

# Three arrays of random floats drawn uniformly from [2, 6):
# a 1-D array with 3 elements and two 2-D arrays with 3 rows and 4 columns.
# No seed is set, so the values change on every run.
array_1 = np.random.uniform(2, 6, 3)
array_2 = np.random.uniform(2, 6, (3, 4))
array_3 = np.random.uniform(2, 6, (3, 4))


In [4]:
# 1-D array with 3 elements
print(array_1)

[ 3.09843285  2.30931032  3.6728476 ]


In [5]:
# 2-D array with 3 rows and 4 columns
print(array_2)

[[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]]


In [6]:
# second 2-D array with 3 rows and 4 columns
print(array_3)

[[ 4.87154386  4.59143738  2.05173751  2.86300843]
 [ 4.30821845  2.27923643  5.05196281  3.62478338]
 [ 3.75443465  3.60330234  5.70475247  2.51296106]]


In [7]:
# Returns the elements at indices 0,1
# note that the end index of the slice 0:2 is not included
# (half-open interval [0, 2))
array_1[0:2] 

array([ 3.09843285,  2.30931032])

In [32]:
# On a 2D array a single slice selects rows:
# 0:2 returns rows 0,1 with all of their columns
print('array_2\n',array_2)
print('\nrows 0 and 1 of array_2\n\n',array_2[0:2]) 

array_2
 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]]

rows 0 and 1 of array_2

 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]]


In [28]:
# Returns the elements at index 1 on all rows (full column 1)
# `:` selects every row; the result is a 1-D array
print('array_2\n',array_2)
print('array_2 column one\n',array_2[:,1]) 

array_2
 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]]
array_2 column one
 [ 2.76634952  4.20222226  2.85274692]


In [27]:
# Returns an array of the same shape with boolean values
# indicating whether the corresponding elements in the array satisfy the condition or not
print('array_1\n',array_1)
print('bool_array\n',array_1<3.5) 

array_1
 [ 3.09843285  2.30931032  3.6728476 ]
bool_array
 [ True  True False]


In [24]:
# performs element-wise 'and' between 2 boolean arrays of the same size
# the parentheses around each comparison are required because `&` binds
# more tightly than `<` and `>`
print('array_2\n',array_2,'\n\n')
print('array_3\n',array_3,'\n\n')
print('array_2 condition\n',array_2<3.5,'\n\n')
print('array_3 condition\n',array_3>4.5,'\n\n')
(array_2<3.5) & (array_3>4.5)

array_2
 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]] 


array_3
 [[ 4.87154386  4.59143738  2.05173751  2.86300843]
 [ 4.30821845  2.27923643  5.05196281  3.62478338]
 [ 3.75443465  3.60330234  5.70475247  2.51296106]] 


array_2 condition
 [[False  True  True False]
 [False False  True False]
 [False  True  True False]] 


array_3 condition
 [[ True  True False False]
 [False False  True False]
 [False False  True False]] 




array([[False,  True, False, False],
       [False, False,  True, False],
       [False, False,  True, False]], dtype=bool)

In [25]:
# Inverts a boolean array with the `~` operator (True <-> False, element by element)
print('array_1\n',array_1)
print('bool_array\n',(array_1<3.5))
print('inverted bool array\n',~(array_1<3.5))

array_1
 [ 3.09843285  2.30931032  3.6728476 ]
bool_array
 [ True  True False]
inverted bool array
 [False False  True]


In [12]:
# Inverts boolean arrays and performs element-wise 'and' afterwards
# (`~` binds more tightly than `&`, so both inversions happen first)
~(array_2<3.5) & ~(array_3>4.5) 

array([[ True,  True, False, False],
       [False, False,  True, False],
       [False, False, False,  True]], dtype=bool)

In [29]:
# boolean arrays can be used as array index
# Returns array elements greater than 3.5
# the selected elements come back as a flat 1-D array
print('array_2\n',array_2)
print('bool_array\n',array_2>3.5)
print('conditional selection\n',array_2[array_2>3.5]) 

array_2
 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]]
bool_array
 [[ True False False  True]
 [ True  True False  True]
 [ True False False  True]]
conditional selection
 [ 4.64328967  3.69346793  5.18280557  4.20222226  4.6821338   3.83320964
  4.61107613]


In [36]:
# Return the array indices where some condition is satisfied
# for a 2-D array np.where gives one index array per dimension:
# (rows[k], columns[k]) is the position of the k-th matching element
print('array_2\n',array_2)
rows,columns = np.where(array_2>3.5)
print('row_indiecs\n',rows,'\ncolumn_indices\n',columns)

array_2
 [[ 4.64328967  2.76634952  2.60850011  3.69346793]
 [ 5.18280557  4.20222226  2.51751563  4.6821338 ]
 [ 3.83320964  2.85274692  3.09265041  4.61107613]]
row_indiecs
 [0 0 1 1 1 2 2] 
column_indices
 [0 3 0 1 3 0 3]


In [39]:
# Check which elements of one array are contained in another array
# `element` is the 2x2 array of the even numbers 0, 2, 4, 6
element = np.arange(0, 8, 2).reshape(2, 2)
test_elements = [1, 2, 4, 8, 9, 2, 3]
print('element\n',element)
print('test_elements\n',test_elements)


element
 [[0 2]
 [4 6]]
test_elements
 [1, 2, 4, 8, 9, 2, 3]


In [42]:
# For every entry of test_elements, check whether that value occurs anywhere in `element`
# the mask has one boolean per entry of test_elements
mask = np.isin(test_elements, element)
print(mask)


[False  True  True False False  True False]


### SIMPLE STATISTICS

In [16]:
#Returns mean along specific axis
# axis=0 averages over the rows, giving one value per column
np.mean(array_2,axis=0)

array([ 4.68403964,  4.09487421,  3.88499147,  3.37187408])

In [17]:
#Returns sum of array along specific axis
# axis=0 adds up the rows, giving one value per column
np.sum(array_2,axis=0)

array([ 14.05211893,  12.28462262,  11.65497441,  10.11562223])

In [18]:
#Returns minimum of array along specific axis
# axis=1 reduces over the columns, giving one value per row
np.min(array_2,axis=1) 

array([ 3.34076318,  2.04650584,  2.53752509])

In [19]:
#Returns index of minimum element of array along specific axis
# axis=1: for every row, the column index of its smallest element
np.argmin(array_2,axis=1)

array([3, 1, 2], dtype=int64)

In [20]:
#Returns index of maximum element of array along specific axis
# axis=1: for every row, the column index of its largest element
np.argmax(array_2,axis=1)

array([1, 2, 0], dtype=int64)

In [21]:
#Returns variance along specific axis
# axis=1: one variance per row
np.var(array_2,axis=1)

array([ 0.98712518,  1.73315015,  1.38551553])

In [22]:
#Returns standard deviation along specific axis
# axis=1: one standard deviation per row
np.std(array_2,axis=1)

array([ 0.99354174,  1.31649161,  1.17707924])

For convenience, the axis argument can be thought of as the **REDUCTION AXIS**: the result is reduced along the specified axis, so that axis disappears from the shape of the output.