# Working with `Basic Statistics`
------------

In [3]:
import numpy as np
# Even numbers from 2 up to and including 20 (the stop value 21 is exclusive).
np_list = np.arange(2, 21, 2)

In [None]:
np_list

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])

## mean
- https://www.mathsisfun.com/mean.html
- https://www.mathsisfun.com/numbers/geometric-mean.html
- https://www.mathsisfun.com/numbers/harmonic-mean.html

In [None]:
np.mean(np_list)

11.0

## median
- https://www.mathsisfun.com/median.html

- Step 1: organize the data in ascending order
    - case 1: if the number of items is odd, the median is the item at position `(no of items + 1)/2`
        - 1, 20,**30**,40,60

    - case 2: if the number of items is even, the two middle items are at positions `no of items/2` and `(no of items/2) + 1`
        - 1, 20,**30,40**,60,70

            - the average of these two middle items is the required median: (30+40)/2 = 35



In [None]:
np.median([1,20,30,40,60])

30.0

In [None]:
np.median([1,20,30,40,60,70])

35.0

In [None]:
np.median(np_list)

11.0

## Mode
- https://www.mathsisfun.com/mode.html

In [2]:
import numpy as np

In [12]:
# Sample data in which 6 occurs most often (four times), used for the mode demo.
a_np_array = np.array([6, 3, 9, 6, 6, 3, 5, 9, 3, 6])

In [13]:
from scipy import stats

In [14]:
stats.mode(a_np_array)

ModeResult(mode=array([6]), count=array([4]))

## Standard Deviation

- https://www.mathsisfun.com/data/standard-deviation.html

    $\sigma = \sqrt{ \frac{1}{N}\sum^N_{i=1}(x_{i}-\mu)^2} $

- It is useful to find out `outliers`
- Square root of `variance`$ (\sigma^2) $ is `standard deviation`$(\sigma)$

In [4]:
np_list

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])

In [5]:
np.std(np_list)

5.744562646538029

In [6]:
# Population standard deviation (divides by N) of the five sample values.
std = np.array([600, 470, 170, 430, 300]).std()

In [7]:
std

147.32277488562318

In [8]:
# Arithmetic mean of the five sample values.
mean = np.array([600, 470, 170, 430, 300]).mean()

In [9]:
mean

394.0

In [10]:
upper = mean + std

In [11]:
lower = mean -std

In [12]:
upper

541.3227748856232

In [13]:
lower

246.67722511437682

### outliers

In [14]:
# The sample values as an ndarray so they can be filtered with a boolean mask.
np_1d_array = np.array([600, 470, 170, 430, 300])

In [15]:
# Keep only the values lying above `upper` or below `lower`.
np_1d_array[(np_1d_array > upper) | (np_1d_array < lower)]

array([600, 170])

`Home Work: `

Refer to the mean, median and mode calculations at https://acadgild.com/blog/python-mean-median-mode


# random numbers
Read: https://www.statisticshowto.datasciencecentral.com/probability-distribution/

## random_sample

In [2]:
import numpy as np

In [12]:
#np.random?

In [15]:
# 1D array
np.random.random_sample(10)

array([0.54425024, 0.93831908, 0.86574256, 0.45964292, 0.06939916,
       0.35448628, 0.13093508, 0.03838408, 0.45356118, 0.40477955])

In [5]:
# 2D array
np.random.random_sample((5,2)) # n_samples 5 and n_features = 2

array([[0.8429302 , 0.01431914],
       [0.32096613, 0.29350777],
       [0.81908028, 0.28892861],
       [0.67564181, 0.54671407],
       [0.36617063, 0.10275393]])

In [16]:
# np.random.random is an alias of np.random.random_sample, so both calls behave the same
np.random.random((5,2)) # 5 rows (n_samples) and 2 columns (n_features)

array([[0.31098232, 0.32518332],
       [0.72960618, 0.63755747],
       [0.88721274, 0.47221493],
       [0.11959425, 0.71324479],
       [0.76078505, 0.5612772 ]])

## seed

### Working with random numbers without a seed (results change on every run)

In [1]:
# generate 5 random integers between 10(Inclusive) and 50(Exclusive)
import numpy as np

In [2]:
print(np.random.randint(10,50,5))

[45 13 35 48 23]


In [3]:
print(np.random.randint(10,50,5))

[43 21 44 34 42]


### Working with random numbers with seed
- seed is useful for reproducibility
- Seed must be between 0 and 2**32 - 1

In [2]:
#np.random.randint?

In [1]:
import numpy as np
# Seeding the global generator first makes the draw below repeatable:
# 5 integers from 10 (inclusive) to 50 (exclusive).
np.random.seed(42)
print(np.random.randint(low=10, high=50, size=5))

[48 38 24 17 30]


In [6]:
np.random.seed(42)
print(np.random.randint(10,50,5))

[48 38 24 17 30]


In [7]:
np.random.seed(42)
print(np.random.randint(10,50,5))

[48 38 24 17 30]


In [8]:
np.random.seed(4)
print(np.random.randint(10,50,5))

[15 11 33 18 19]


In [9]:
np.random.seed(4)
print(np.random.randint(10,50,5))

[15 11 33 18 19]


In [18]:
# 2**32 is one past the largest accepted seed, so seeding with it raises
# ValueError. The error is caught and printed so the demonstration stays
# visible without halting "Restart Kernel -> Run All" at this cell.
invalid_seed = 2**32
try:
    np.random.seed(invalid_seed)
except ValueError as err:
    print("ValueError:", err)

ValueError: Seed must be between 0 and 2**32 - 1

In [3]:
#np.random.random?

In [11]:
np.random.random(20)

array([0.2160895 , 0.97627445, 0.00623026, 0.25298236, 0.43479153,
       0.77938292, 0.19768507, 0.86299324, 0.98340068, 0.16384224,
       0.59733394, 0.0089861 , 0.38657128, 0.04416006, 0.95665297,
       0.43614665, 0.94897731, 0.78630599, 0.8662893 , 0.17316542])

> **For a given seed we will always get the same random numbers**

In [12]:
np.random.seed(8)
print(np.random.randint(10,50,5))

[13 30 15 36 18]


In [13]:
np.random.seed(8)
print(np.random.randint(10,50,5))

[13 30 15 36 18]


In [14]:
np.random.seed(8)
print(np.random.randint(10,50,5))

[13 30 15 36 18]


# Working with linear algebra

In [1]:
# Build a 4x4 matrix holding the first 16 odd numbers (1, 3, ..., 31)
import numpy as np
a = (2 * np.arange(16) + 1).reshape(4, 4)
print(a)

[[ 1  3  5  7]
 [ 9 11 13 15]
 [17 19 21 23]
 [25 27 29 31]]


In [2]:
# Build a 4x4 matrix holding the first 16 even numbers (2, 4, ..., 32)
b = (2 * np.arange(1, 17)).reshape(4, -1)  # -1 lets NumPy work out the column count
print(b)

[[ 2  4  6  8]
 [10 12 14 16]
 [18 20 22 24]
 [26 28 30 32]]


## Add two matrices

In [10]:
# Add these two matrices
print(a+b)

[[ 3  7 11 15]
 [19 23 27 31]
 [35 39 43 47]
 [51 55 59 63]]


## subtract two matrices

In [11]:
# subtract these two matrices
print(b-a)

[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


## Divide two matrices

In [13]:
# divide these two matrices
b/a

array([[2.        , 1.33333333, 1.2       , 1.14285714],
       [1.11111111, 1.09090909, 1.07692308, 1.06666667],
       [1.05882353, 1.05263158, 1.04761905, 1.04347826],
       [1.04      , 1.03703704, 1.03448276, 1.03225806]])

## Multiply two matrices
* The number of columns in the first matrix must be `equal` to the number of rows in the second matrix

![](https://github.com/rritec/Data-Analysis/blob/master/images/Matrix_multiplication.png?raw=true)

In [14]:
# A (2 x 3) matrix times a (3 x 2) matrix gives a (2 x 2) product.
a = np.array([[1, 2, 3],
              [4, 5, 6]])
b = np.array([[7, 8],
              [9, 10],
              [11, 12]])
a @ b

array([[ 58,  64],
       [139, 154]])

## Transpose a matrix
* Transpose of a matrix is obtained by changing `rows to columns` and `columns to rows`

In [15]:
m = np.arange(1,7).reshape(3,2) 
m

array([[1, 2],
       [3, 4],
       [5, 6]])

In [16]:
m.T

array([[1, 3, 5],
       [2, 4, 6]])

## Inverse of a matrix

![](https://github.com/rritec/Data-Analysis/blob/master/images/Matrix_inverse.png?raw=true)

In [33]:
a=np.array([[4,7],[2,6]])

In [34]:
a

array([[4, 7],
       [2, 6]])

In [35]:
np.linalg.inv(a)

array([[ 0.6, -0.7],
       [-0.2,  0.4]])

# sort

In [17]:
import numpy as np

In [18]:
# Fix the seed so the ten standard-normal draws are reproducible.
np.random.seed(1)
x = np.random.standard_normal(10)

In [19]:
x

array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763,
       -2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038])

In [23]:
# ndarray.sort() sorts in place (ascending), so x itself is reordered by this cell
x.sort()

In [24]:
x

array([-2.3015387 , -1.07296862, -0.7612069 , -0.61175641, -0.52817175,
       -0.24937038,  0.3190391 ,  0.86540763,  1.62434536,  1.74481176])

# unique

In [25]:
import numpy as np
array = np.array([10, 20, 10, 40, 20, 10, 20, 40, 20])
# return_counts=True yields a (unique values, occurrence counts) pair
values_and_counts = np.unique(array, return_counts=True)
print(values_and_counts)

(array([10, 20, 40]), array([3, 4, 2], dtype=int64))


# Set Operations

In [3]:
# Two small string arrays that share 'chair' and 'bulb'
s1 = np.array(['desk', 'chair', 'bulb'])
s2 = np.array(['lamp', 'bulb', 'chair'])
print(s1, s2, sep='\n')

['desk' 'chair' 'bulb']
['lamp' 'bulb' 'chair']


## union

In [30]:
print( np.union1d(s1, s2) )

['bulb' 'chair' 'desk' 'lamp']


## intersect

In [31]:
print( np.intersect1d(s1, s2) )

['bulb' 'chair']


## diff

In [32]:
print( np.setdiff1d(s1, s2) )# elements in s1 that are not in s2

['desk']


In [4]:
print( np.setdiff1d(s2, s1) )# elements in s2 that are not in s1

['lamp']


# Broadcasting
- The term broadcasting describes how numpy treats arrays with `different shapes` during arithmetic operations
- For more details, [please refer](https://docs.scipy.org/doc/numpy-1.10.1/user/basics.broadcasting.html)

In [5]:
import numpy as np
# 4 x 3 array of zeros used as the base for the broadcasting examples
start = np.zeros(shape=(4, 3))
print(start)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [34]:
start.shape

(4, 3)

In [35]:
# create a rank 1 ndarray with 3 values
add_rows = np.array([1, 0, 2])
print(add_rows)

[1 0 2]


In [36]:
add_rows.shape

(3,)

In [39]:
y = start + add_rows  # add to each row of 'start' using broadcasting
print(y)

[[1. 0. 2.]
 [1. 0. 2.]
 [1. 0. 2.]
 [1. 0. 2.]]


In [40]:
# Column vector of shape (4, 1) that can be broadcast across the columns
add_cols = np.arange(4).reshape(4, 1)
print(add_cols)

[[0]
 [1]
 [2]
 [3]]


In [41]:
# add to each column of 'start' using broadcasting
y = start + add_cols 
print(y)

[[0. 0. 0.]
 [1. 1. 1.]
 [2. 2. 2.]
 [3. 3. 3.]]


In [42]:
# this will just broadcast in both dimensions
add_scalar = np.array([1])  
print(start+add_scalar)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


# Read or Write to Disk

## Binary Format:

In [19]:
x = np.array([ 23.23, 24.24] )

In [20]:
x

array([23.23, 24.24])

In [21]:
import os
os.getcwd()

'C:\\Users\\ramreddymyla\\RRITEC_TRAINING_ASSETS\\Data-Analysis'

In [22]:
np.save('an_array_20200320', x) # binary write; look for an_array_20200320.npy in the current working directory

In [23]:
np.load('an_array_20200320.npy')

array([23.23, 24.24])

## Text Format:

In [81]:
np.savetxt('array.txt', X=x, delimiter=',') # Writing to disk

In [82]:
np.loadtxt('array.txt', delimiter=',') # Reading from disk

array([23.23, 24.24])

# home Work

https://docs.scipy.org/doc/numpy/user/quickstart.html