# The NumPy array object

# NumPy Arrays

**Python objects:**

1. high-level number objects: integers, floating point
2. containers: lists (costless insertion and append), dictionaries (fast lookup)

**NumPy provides:**

1. an extension package to Python for multi-dimensional arrays
2. closer to hardware (efficiency)
3. designed for scientific computation (convenience)
4. also known as array-oriented computing

In [1]:
marks = [12, 15, 14, 10, 18]

len(marks)

5

In [2]:

import numpy as np

# 1-D data: sales figures for 10 days.
sales = np.array([200, 220, 250, 275, 300, 320, 350, 400, 450, 500])

# 2-D data: 4 rows x 3 columns (this literal holds 4 rows of sales for
# 3 products, not 10).
sales_2d = np.array(
    [
        [200, 220, 250],
        [275, 300, 320],
        [350, 400, 450],
        [500, 550, 600],
    ]
)

# Raise every sales figure by 10%; the scalar is applied to each element.
increased_sales = 1.1 * sales
increased_sales

array([220. , 242. , 275. , 302.5, 330. , 352. , 385. , 440. , 495. ,
       550. ])

In [12]:
# numpy attributes
# Every line below is a bare expression; in a notebook only the value of the
# last one (sales_2d.nbytes) is displayed as the cell output.
sales.ndim  # number of dimensions
sales.shape  # dimensions of the array
sales.size  # total number of elements
sales.dtype  # data type of the elements
sales.itemsize  # size of each element in bytes
sales.nbytes  # total size of the array in bytes
sales_2d.ndim  # number of dimensions
sales_2d.shape  # dimensions of the array
sales_2d.size  # total number of elements
sales_2d.dtype  # data type of the elements
sales_2d.itemsize  # size of each element in bytes
sales_2d.nbytes  # total size of the array in bytes (size * itemsize)


96

In [14]:
# accessing elements
# Bare expressions: only the last one (first column of sales_2d) is displayed.
sales[0]  # first element
sales[-1]  # last element
sales[1:3]  # elements at index 1 and 2 (the stop index 3 is exclusive)
sales_2d[0, 1]  # element at row 0, column 1
sales_2d[:, 0]  # all rows, first column

array([200, 275, 350, 500])

In [15]:
# aggregate functions in numpy
# Each statistic of the 1-D sales array is printed on its own line.
print(np.mean(sales))  # mean
print(np.median(sales))  # median
print(np.std(sales))  # standard deviation (population form, default ddof=0)

print(np.min(sales))  # minimum
print(np.max(sales))  # maximum
print(np.sum(sales))  # sum
print(np.argmax(sales))  # index of maximum

326.5
310.0
93.75633311942185
200
500
3265
9


In [22]:
# operations on 1d array
# Arithmetic between arrays of equal length is applied element by element.
# let's create a new array with sales of 10 days
print(sales)
sales_new = np.array([190,230,260,280,310,330,360,410,460,510])
sales_combined = sales + sales_new
print(sales_combined)
sales_diff = sales_new - sales
print(sales_diff)


# let's compute the % change in sales relative to the original figures
# (a negative value means sales went down)
sales_percentage_increase = (sales_diff / sales) * 100
print("Sales percentage increase:", sales_percentage_increase)



[200 220 250 275 300 320 350 400 450 500]
[ 390  450  510  555  610  650  710  810  910 1010]
[-10  10  10   5  10  10  10  10  10  10]
Sales percentage increase: [-5.          4.54545455  4.          1.81818182  3.33333333  3.125
  2.85714286  2.5         2.22222222  2.        ]


In [None]:
# 2-D data: 10 rows (days) x 3 columns (products).
sales_2d = np.array(
    [
        [200, 220, 250],
        [275, 300, 320],
        [350, 400, 450],
        [500, 550, 600],
        [600, 650, 700],
        [700, 750, 800],
        [800, 850, 900],
        [900, 950, 1000],
        [1000, 1100, 1200],
        [1100, 1200, 1300],
    ]
)

# Apply a 10% discount to every entry: the scalar factor is applied to
# each element of the 2-D array.
discounted_sales = 0.9 * sales_2d

(10, 3)

In [None]:
# Generate random numbers.
# Integers drawn from [5000, 10000) (upper bound exclusive), shape (10, 3).
income = np.random.randint(5000, 10000, size=(10, 3))

print('datatype of income array:', income.dtype)
# Rebind income to floats: rand() samples uniformly from [0, 1), scaled by 10000.
income = np.random.rand(10, 3) * 10000
income.dtype    # bare expression mid-cell: evaluated but not displayed

# choice function: draw 10 values at random from the given list
choices = np.random.choice([100, 200, 300, 400, 500], size=10)
print(choices)





datatype of income array: int64
[100 100 400 300 400 400 100 500 300 200]


In [37]:
# sampling: build a population of 500 random incomes, then draw samples from it
# NOTE: size=(500) is just the integer 500 (parentheses without a comma do not make a tuple)
population_income = np.random.randint(5000, 10000, size=(500))
print(population_income)
# three samples of 30 values each (np.random.choice samples with replacement by default)
sample_1 = np.random.choice(population_income,size=30)
sample_2 = np.random.choice(population_income,size=30)
sample_3 = np.random.choice(population_income,size=30)

print('sample_1:',sample_1)
# TODO: compute the average of the sample means, e.g.
# avg_sample_mean = np.mean([sample_1.mean(), sample_2.mean(), sample_3.mean()])

[7818 5452 9380 7487 6944 9253 9573 8890 9946 8046 9921 9545 9075 8643
 9940 8410 9019 9783 7697 6853 8639 8166 9302 6881 9486 9184 5110 8116
 8256 6913 6115 6679 6628 8906 9423 6357 6881 9515 8570 6942 8502 6423
 5133 7268 7463 9101 8109 7926 6032 8158 9941 8583 6974 7463 6130 7921
 8365 6937 8022 7084 9720 8977 9774 9141 5027 6265 9689 9180 7195 7310
 8569 9244 7468 6926 8426 7855 5612 5816 5107 7415 5459 6787 7857 9983
 7589 6006 7386 6243 9923 8415 7027 5453 7129 9221 8220 8468 6955 6424
 7261 8894 5839 8062 5782 8142 8132 7047 7776 6708 7842 7240 9339 8072
 8920 5174 7775 7989 7641 7068 6370 6226 6868 6331 5752 5136 9998 8337
 7186 6091 5809 6588 8296 9653 9763 5235 9024 5615 7686 6518 9218 5580
 6162 6324 9296 7609 9323 6098 8218 7594 8603 7791 5714 8855 7419 8875
 5806 7705 8330 9854 8295 6276 9605 6910 8903 8147 8881 9978 8451 9145
 6390 8555 8209 6062 7263 6699 6726 7728 5721 5995 8099 7757 7316 5481
 6688 6183 5519 8269 9693 5924 9797 9442 7334 7665 8974 6200 6762 5471
 9425 

In [40]:
# Two 2x2 arrays to be joined side by side.
a1 = np.array([[1, 1],
               [2, 2]])

a2 = np.array([[3, 3],
               [4, 4]])


# hstack joins the arrays horizontally: rows stay aligned and the columns of
# a2 are appended after the columns of a1.
np.hstack((a1, a2))

array([[1, 1, 3, 3],
       [2, 2, 4, 4]])

In [43]:
# Element-wise addition of two equal-length integer arrays.
data = np.array([1, 2])
ones = np.ones(data.shape, dtype=int)  # integer 1s, same shape as data
print(ones)
final_result = np.add(data, ones)

[1 1]


In [45]:
sales_2d.T

array([[ 200,  275,  350,  500,  600,  700,  800,  900, 1000, 1100],
       [ 220,  300,  400,  550,  650,  750,  850,  950, 1100, 1200],
       [ 250,  320,  450,  600,  700,  800,  900, 1000, 1200, 1300]])

In [None]:
import pandas as pd
# Wrap the 2-D sales array in a DataFrame with one named column per product.
df = pd.DataFrame(sales_2d, columns=['Product_A', 'Product_B', 'Product_C'])
# index=False leaves the row index out of the written file.
# NOTE(review): the CSV preview recorded below this cell contains an
# 'Unnamed: 0' index column, which this call would not write -- the preview
# presumably comes from a different export; confirm.
df.to_csv('sales_data.csv', index=False)

Unnamed: 0,Product_A,Product_B,Product_C
0,200,220,250
1,275,300,320
2,350,400,450
3,500,550,600
4,600,650,700
5,700,750,800
6,800,850,900
7,900,950,1000
8,1000,1100,1200
9,1100,1200,1300


In [None]:
# accessing elements


In [5]:
import numpy as np

# A small array of five marks.
marks = np.array([85, 90, 78, 92, 88])

# Summary statistics: mean, median and the 75th percentile.
average_marks = marks.mean()
median_marks = np.median(marks)
percentile_75 = np.percentile(marks, 75)

# Print the three statistics, one per line, in the order computed above.
for statistic in (average_marks, median_marks, percentile_75):
    print(statistic)

86.6
88.0
90.0


In [14]:
# Build a 3x3 matrix and a single-row (1x3) matrix, and print their shapes.
matrix = np.array([[1, 2, 3],
                   [4, 5, 6],
                   [7, 8, 9]])
print(matrix.shape)
matrix_1x3 = np.array([[1, 2, 3]])
print(matrix_1x3.shape)

# Matrix product of `matrix` with itself.
# NOTE: matrix_1x3 is not involved in this product; the (1, 3) x (3, 3)
# product would be matrix_1x3.dot(matrix).
result = np.dot(matrix, matrix)
print(result)

(3, 3)
(1, 3)
[[ 30  36  42]
 [ 66  81  96]
 [102 126 150]]


In [23]:
# Build a 3x3 matrix and immediately collapse it to one dimension.
# flatten() returns a 1-D array, so `matrix` ends up holding the 9 elements
# in row order rather than the original 3x3 layout.
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).flatten()
matrix

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
matrix


array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [11]:
matrix_1x3

array([[1, 2, 3]])

In [24]:
import numpy as np
a = np.array([[0, 1, 2, 3],[0, 1, 2, 3]])
print(a)
a.ndim
# print(np.arange(10))

[[0 1 2 3]
 [0 1 2 3]]


2

**Why it is useful:** Memory-efficient container that provides fast numerical operations.

In [25]:
# Pure-Python baseline: square 1000 integers with a list comprehension.
# Note that L is a range object rather than a list.
L = range(1000)

%timeit [i**2 for i in L]

20.3 μs ± 67.9 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [26]:
# NumPy version: a single vectorized expression squares all 1000 elements.
a = np.arange(1000)
%timeit a**2

453 ns ± 3.01 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


# 1. Creating arrays

**1.1 Manual construction of arrays**

In [27]:
#1-D

a = np.array([0, 1, 2, 3])

a

array([0, 1, 2, 3])

In [28]:
#print dimensions

a.ndim

1

In [29]:
#shape

a.shape

(4,)

In [30]:
len(a)

4

In [31]:
# 2-D, 3-D....

b = np.array([[0, 1, 2], [3, 4, 5]])

b

array([[0, 1, 2],
       [3, 4, 5]])

In [32]:
b.ndim

2

In [33]:
b.shape

(2, 3)

In [34]:
len(b)  # returns the size of the first dimension

2

In [35]:
c = np.array([[[0, 1], [2, 3]], [[4, 5], [6, 7]]])

c

array([[[0, 1],
        [2, 3]],

       [[4, 5],
        [6, 7]]])

In [36]:
c.shape

(2, 2, 2)

In [37]:
c.ndim

3

In [38]:
c.shape

(2, 2, 2)

**1.2 Functions for creating arrays**

In [39]:
# using the arange function

# arange is an array-valued version of the built-in Python range function

a = np.arange(10)  # 0 ... n-1
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [41]:
b = np.arange(1, 10, 2) #start, end (exclusive), step

b

array([1, 3, 5, 7, 9])

In [42]:
#using linspace

a = np.linspace(0, 3, 6) #start, end, number of points

a

array([0. , 0.6, 1.2, 1.8, 2.4, 3. ])

In [43]:
#common arrays

a = np.ones((3, 3))

a

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [44]:
b = np.zeros((3, 3))

b

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [45]:
c = np.eye(2)  #Return a 2-D array with ones on the diagonal and zeros elsewhere.

c

array([[1., 0.],
       [0., 1.]])

In [46]:
d = np.eye(3, 2) #3 is number of rows, 2 is number of columns, index of diagonal start with 0

d

array([[1., 0.],
       [0., 1.],
       [0., 0.]])

In [47]:
#create array using diag function

a = np.diag([1, 2, 3, 4]) #construct a diagonal array.

a

array([[1, 0, 0, 0],
       [0, 2, 0, 0],
       [0, 0, 3, 0],
       [0, 0, 0, 4]])

In [48]:
np.diag(a)   #Extract diagonal

array([1, 2, 3, 4])

In [50]:
#create array using random

#Create an array of the given shape and populate it with random samples from a uniform distribution over [0, 1).
a = np.random.rand(15) 

a

array([0.88290352, 0.50420509, 0.02496344, 0.14617317, 0.12758945,
       0.4673385 , 0.53000486, 0.70339422, 0.22844506, 0.09911315,
       0.72974706, 0.51529339, 0.58556909, 0.28428368, 0.52316068])

In [51]:
a = np.random.randn(4)  # Return a sample (or samples) from the "standard normal" (Gaussian) distribution.

a

array([ 2.63189646,  0.32566963, -1.80168582,  0.44005034])

**Note:**

For random samples from $N(\mu, \sigma^2)$, use:

`sigma * np.random.randn(...) + mu`



# 2. Basic DataTypes

You may have noticed that, in some instances, array elements are displayed with a **trailing dot (e.g. 2. vs 2)**. This is due to a difference in the **data-type** used:

In [52]:
a = np.arange(10)

a.dtype

dtype('int64')

In [None]:
#You can explicitly specify which data-type you want:

a = np.arange(10, dtype='float64')
a

In [53]:
#The default data type is float for zeros and ones function

a = np.zeros((3, 3))

print(a)

a.dtype

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


dtype('float64')

**other datatypes**

In [54]:
d = np.array([1+2j, 2+4j])   #Complex datatype

print(d.dtype)

complex128


In [56]:
b = np.array([True, False, True, False])  #Boolean datatype

print(b.dtype)

bool


In [57]:
s = np.array(['Ram', 'Robert', 'Rahim'])

s.dtype

dtype('<U6')

**Each built-in data type has a character code that uniquely identifies it.**

'b' − boolean

'i' − (signed) integer

'u' − unsigned integer

'f' − floating-point

'c' − complex-floating point

'm' − timedelta

'M' − datetime

'O' − (Python) objects

'S', 'a' − (byte-)string

'U' − Unicode

'V' − raw data (void)

**For more details**

**https://numpy.org/doc/stable/user/basics.types.html**

# 3. Indexing and Slicing

**3.1 Indexing**

The items of an array can be accessed and assigned in the same way as the items of other **Python sequences (e.g. lists)**:

In [58]:
a = np.arange(10)
print(a)
print(a[5])  #indices begin at 0, like other Python sequences (and C/C++)

[0 1 2 3 4 5 6 7 8 9]
5


In [59]:
# For multidimensional arrays, indexes are tuples of integers:

a = np.diag([1, 2, 3])
print(a)
print(a[0, 1])

[[1 0 0]
 [0 2 0]
 [0 0 3]]
0


In [60]:
a[0, 1] = 5 #assigning value

a

array([[1, 5, 0],
       [0, 2, 0],
       [0, 0, 3]])

**3.2 Slicing**

In [61]:
a = np.arange(10)

a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [62]:
a[1:8:2] # [startindex: endindex(exclusive) : step]

array([1, 3, 5, 7])

In [63]:
#we can also combine assignment and slicing:

a = np.arange(10)
a[5:] = 10
a

array([ 0,  1,  2,  3,  4, 10, 10, 10, 10, 10])

In [65]:
# NOTE: relies on b = np.arange(5) from the cell shown after this one (In [64]);
# the cells are listed out of execution order.
b[::-1]  # b with its elements in reverse order

array([4, 3, 2, 1, 0])

In [64]:
b = np.arange(5)
print(b)
# Overwrite a[5:] with b reversed; assumes a is the 10-element array from the
# previous cell, so the slice and b[::-1] both have length 5.
a[5:] = b[::-1]  # assigning

a

[0 1 2 3 4]


array([0, 1, 2, 3, 4, 4, 3, 2, 1, 0])

# 4. Copies and Views

A slicing operation creates a view on the original array, which is just a way of accessing array data. Thus the original array is not copied in memory. You can use **np.may_share_memory()** (a quick bounds check that can report false positives) or **np.shares_memory()** (an exact check, used below) to test whether two arrays share the same memory block.

**When modifying the view, the original array is modified as well:**

In [66]:
a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [70]:
b = a[::2]
b

array([0, 2, 4, 6, 8])

In [71]:
np.shares_memory(a, b)

True

In [72]:
b[0] = 10
b

array([10,  2,  4,  6,  8])

In [73]:
a  # even though we modified b, 'a' was updated too because both share the same memory

array([10,  1,  2,  3,  4,  5,  6,  7,  8,  9])

In [75]:


a = np.arange(10)

c = a[::2].copy()     #force a copy
c

array([0, 2, 4, 6, 8])

In [76]:
np.shares_memory(a, c)

False

In [None]:
c[0] = 10  # c was created with .copy(), so this write does not reach a

a  # unchanged: a and c do not share memory

# 5. Fancy Indexing

NumPy arrays can be indexed with slices, but also with boolean or integer arrays **(masks)**. This method is called **fancy indexing**. It creates copies, not views.

**Using Boolean Mask**

In [77]:
# 15 random integers drawn from [0, 20) (upper bound exclusive)
a = np.random.randint(0, 20, 15)
len(a)  # bare expression mid-cell: evaluated but not displayed
a

array([11,  3,  7,  4,  3, 16,  3,  1,  2, 17,  1, 14,  9,  8,  2])

In [78]:
mask = (a % 2 == 0)
mask

array([False, False, False,  True, False,  True, False, False,  True,
       False, False,  True, False,  True,  True])

In [80]:
extract_from_a = a[~mask]

extract_from_a

array([11,  3,  7,  3,  3,  1, 17,  1,  9])

**Indexing with a mask can be very useful to assign a new value to a sub-array:**

In [81]:
a[mask] = -1
a

array([11,  3,  7, -1,  3, -1,  3,  1, -1, 17,  1, -1,  9, -1, -1])

**Indexing with an array of integers**

In [82]:
a = np.arange(0, 100, 10)

a

array([ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90])

In [83]:
# Indexing can be done with an array of integers, where the same index may be repeated several times:

a[[2, 3, 2, 4, 2]]

array([20, 30, 20, 40, 20])

In [84]:
# New values can be assigned with this kind of indexing:
# the elements at positions 9 and 7 both receive -200.

a[[9, 7]] = -200

a

array([   0,   10,   20,   30,   40,   50,   60, -200,   80, -200])