<a href="https://colab.research.google.com/github/priyankconnect/MyProjects/blob/main/numpy/Practice_in_np_Arrays_and_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Create Arrays

In [None]:
import numpy as np
import pandas as pd

In [None]:
data1 = [1, 2, 3, 4, 5] # list
arr1 = np.array(data1) # 1d array
data2 = [range(1, 5), range(5, 9)] # list of two ranges (each range becomes one row)
arr2 = np.array(data2) # 2d array
arr2.tolist() # convert array back to a (nested) list

[[1, 2, 3, 4], [5, 6, 7, 8]]

In [None]:
data1 = [1,2,3,4,5]
arr1 = np.array(data1)
arr1

array([1, 2, 3, 4, 5])

In [None]:
data2 = [range(1,5),range(5,9)] # list of two ranges (each range becomes one row)
arr2 = np.array(data2)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [None]:
arr2.tolist()

[[1, 2, 3, 4], [5, 6, 7, 8]]

## Create Special Arrays

In [None]:
np.zeros((3, 6))
np.ones(10)
np.linspace(0, 1, 5) # 0 to 1 (inclusive) with 5 points
np.logspace(0, 3, 4) # 10^0 to 10^3 (inclusive) with 4 points

In [None]:
np.zeros((3,6))

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [None]:
np.linspace(0, 1, 5) # 0 to 1 (inclusive) with 5 points

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [None]:
np.logspace(0, 3, 4) # 10^0 to 10^3 (inclusive) with 4 points

array([   1.,   10.,  100., 1000.])

## arange is like range, except it returns an array (not a list)

In [None]:
int_array = np.arange(5)
float_array = int_array.astype(float)

In [None]:
int_array = np.arange(5)
int_array

array([0, 1, 2, 3, 4])

In [None]:
float_array = int_array.astype(float)
float_array

array([0., 1., 2., 3., 4.])

## 4.2 Examining arrays

In [None]:
arr1.dtype # int64 here: arr1 was built from Python ints (not float64)
arr2.dtype # int64 here (the default integer dtype is platform dependent)
arr2.ndim # 2
arr2.shape # (2, 4) - axis 0 is rows, axis 1 is columns
arr2.size # 8 - total number of elements
len(arr2) # 2 - size of first dimension (aka axis)

In [None]:
arr1.dtype # int64 here: arr1 was built from Python ints (not float64)

dtype('int64')

In [None]:
arr2.dtype # int64 here (the default integer dtype is platform dependent)

dtype('int64')

In [None]:
arr2.ndim # 2

2

In [None]:
arr2.shape # (2, 4) - axis 0 is rows, axis 1 is columns

(2, 4)

In [None]:
arr2.size # 8 - total number of elements

8

In [None]:
len(arr2) # 2 - size of first dimension (aka axis)

2

## 4.3 Reshaping

In [None]:
# Ten floats 0.0 .. 9.0 laid out as 2 rows x 5 columns.
arr = np.reshape(np.arange(10, dtype=float), (2, 5))
print(np.shape(arr))
# The same ten values arranged as 5 rows x 2 columns; `arr` keeps its (2, 5) shape.
print(np.reshape(arr, (5, 2)))

(2, 5)
[[0. 1.]
 [2. 3.]
 [4. 5.]
 [6. 7.]
 [8. 9.]]


## Add an axis

In [None]:
a = np.array([0, 1])
a_col = a[:, np.newaxis] # insert a new trailing axis: the flat array becomes a column
print(a_col)
# or, equivalently (indexing with None also inserts a new axis)
a_col = a[:, None]

[[0]
 [1]]


So instead of the flat array [0, 1] we now have the column [[0], [1]]: a second axis of length 1 was added.

## Transpose

In [None]:
print(a_col.T)

[[0 1]]


## Flatten: always returns a flat copy of the original array (a deep copy, so modifying it leaves the original unchanged)

In [None]:
arr_flt = arr.flatten()
arr_flt[0] = 33
print(arr_flt)
print(arr)

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[0. 1. 2. 3. 4.]
 [5. 6. 7. 8. 9.]]


In [None]:
arr_flt=arr.flatten()
arr_flt

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

Overwriting the first element of arr_flt with 33 (no element is added; the length stays the same)

In [None]:
arr_flt[0] = 33
print(arr_flt)

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]


## Ravel: returns a view of the original array whenever possible. Returns a shallow copy of the original array

In [None]:
arr_flt = arr.ravel()
arr_flt[0] = 33
print(arr_flt)
print(arr)

[33.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
[[33.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


##4.4 Stack arrays
###Stack flat arrays in columns

In [None]:
# Two flat arrays that should end up as the two columns of a 2x2 array.
a, b = np.array([0, 1]), np.array([2, 3])
# Stack them as rows first, then transpose so each input becomes a column.
rows = np.stack((a, b))
ab = rows.T
print(ab)
# Alternative: turn each input into a column vector and join them side by side.
np.hstack((a[:, np.newaxis], b[:, np.newaxis]))

[[0 2]
 [1 3]]


array([[0, 2],
       [1, 3]])

In [None]:
a = np.array([0, 1])
b = np.array([2, 3])
# Named `stacked` rather than `int`: binding the name `int` would shadow the
# builtin type for every later cell run in the same kernel.
stacked = np.stack((a, b)) # a and b become the rows: [[0, 1], [2, 3]]
stacked.T # transpose so that a and b become the columns

array([[0, 2],
       [1, 3]])

In [None]:
np.hstack((a[:, None], b[:, None]))

array([[0, 2],
       [1, 3]])

##4.5 Selection
###Single item

In [None]:
arr = np.arange(10, dtype=float).reshape((2, 5))
arr[0] # 0th element along the first axis, i.e. row 0 (indexes like a list)
arr[0, 3] # row 0, column 3: returns 3.0
arr[0][3] # alternative syntax

In [None]:
arr = np.arange(10, dtype=float).reshape((2, 5))
arr

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]])

In [None]:
arr[0]

array([0., 1., 2., 3., 4.])

In [None]:
arr[0,3]

3.0

In [None]:
arr[0][3]

3.0

## 4.5.1 Slicing
###Syntax: start:stop:step with start (default 0) stop (default last) step (default 1)

In [None]:
arr[0, :] # row 0: returns 1d array ([0., 1., 2., 3., 4.])
arr[:, 0] # column 0: returns 1d array ([0., 5.])
arr[:, :2] # columns strictly before index 2 (2 first columns)
arr[:, 2:] # columns after index 2 included
arr2 = arr[:, 1:4] # columns between index 1 (included) and 4 (excluded)
print(arr2)

[[1. 2. 3.]
 [6. 7. 8.]]


In [None]:
arr

array([[0., 1., 2., 3., 4.],
       [5., 6., 7., 8., 9.]])

In [None]:
arr[0, :]

array([0., 1., 2., 3., 4.])

In [None]:
arr[:, 0] # column 0: returns 1d array ([0., 5.])

array([0., 5.])

In [None]:
arr[:, :2] # columns strictly before index 2 (2 first columns)

array([[0., 1.],
       [5., 6.]])

In [None]:
arr[:, 2:] # columns after index 2 included

array([[2., 3., 4.],
       [7., 8., 9.]])

In [None]:
arr2 = arr[:, 1:4] # columns between index 1 (included) and 4 (excluded)
arr2

array([[1., 2., 3.],
       [6., 7., 8.]])

### Slicing returns a view (not a copy)

In [None]:
arr2[0, 0] = 33
print(arr2)
print(arr)

[[33.  2.  3.]
 [ 6.  7.  8.]]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


## Row 0: reverse order

In [None]:
print(arr[0, ::-1]) # a step of -1 walks row 0 from the last column to the first
# The rule of thumb here can be: in the context of lvalue indexing (i.e. the indices are
# placed in the left hand side value of an assignment), no view or copy of the array is
# created (because there is no need to). However, with regular values, the above rules
# for creating views do apply.

[ 4.  3.  2. 33.  0.]


In [None]:
print(arr[1, ::-1])

[9. 8. 7. 6. 5.]


##4.5.2 Fancy indexing: Integer or boolean array indexing
Fancy indexing returns a copy not a view.
Integer array indexing

In [None]:
arr2 = arr[:, [1,2,3]] # return a copy
print(arr2)
arr2[0, 0] = 44
print(arr2)
print(arr)

[[33.  2.  3.]
 [ 6.  7.  8.]]
[[44.  2.  3.]
 [ 6.  7.  8.]]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


## Boolean arrays indexing

In [None]:
arr

array([[ 0., 33.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.]])

In [None]:
arr2 = arr[arr > 5] # return a copy
print(arr2)
arr2[0] = 44
print(arr2)
print(arr)

[33.  6.  7.  8.  9.]
[44.  6.  7.  8.  9.]
[[ 0. 33.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]]


## However, in the context of lvalue indexing (left-hand side of an assignment), fancy indexing modifies the original array in place

In [None]:
arr[arr > 5] = 0
print(arr)

[[0. 0. 2. 3. 4.]
 [5. 0. 0. 0. 0.]]


### Boolean arrays indexing continues

In [None]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob'])
names == 'Bob' # returns a boolean array
names[names != 'Bob'] # logical selection
(names == 'Bob') | (names == 'Will') # keywords "and/or" don't work with boolean arrays; use & and |
names[names != 'Bob'] = 'Joe' # assign based on a logical selection (modifies names in place)
np.unique(names) # set function: the distinct values

array(['Bob', 'Joe'], dtype='<U4')

In [None]:
names = np.array(['Bob', 'Joe', 'Will', 'Bob'])
names == 'Bob' # returns a boolean array

array([ True, False, False,  True])

In [None]:
names[names != 'Bob'] # logical selection

array(['Joe', 'Will'], dtype='<U4')

In [None]:
(names == 'Bob') | (names == 'Will') # keywords "and/or" don't work with boolean arrays

array([ True, False,  True,  True])

In [None]:
names[names != 'Bob'] = 'Joe' # assign based on a logical selection
names

array(['Bob', 'Joe', 'Joe', 'Bob'], dtype='<U4')

### 4.6. Vectorized operations

In [None]:
# math and stats
# NOTE: rnd is drawn before np.random.seed() is called further down in this cell, so
# unless a seed was set earlier its values (and the statistics below) change between runs.
rnd = np.random.randn(4, 2) # random normals in 4x2 array
rnd.mean()
rnd.std()
rnd.argmin() # index of minimum element (position in the flattened array)
rnd.sum()
rnd.sum(axis=0) # sum of columns
rnd.sum(axis=1) # sum of rows
# methods for boolean arrays
(rnd > 0).sum() # counts number of positive values
(rnd > 0).any() # checks if any value is True
(rnd > 0).all() # checks if all values are True
# random numbers
np.random.seed(12234) # Set the seed
np.random.rand(2, 3) # 2 x 3 matrix of uniform samples in [0, 1)
np.random.randn(10) # random normals (mean 0, sd 1)
np.random.randint(0, 2, 10) # 10 randomly picked 0 or 1

array([0, 0, 0, 1, 1, 0, 1, 1, 1, 1])

In [None]:
rnd = np.random.randn(4, 2) # random normals in 4x2 array
rnd

array([[-1.06254652,  0.72437607],
       [ 0.89954397, -1.1673215 ],
       [ 1.0130447 , -1.86335047],
       [ 1.38646439, -0.06839843]])

In [None]:
rnd.mean()

-0.017273473820827695

In [None]:
rnd.std()

1.1318377786970633

In [None]:
rnd.argmin() # index of minimum element (position in the flattened array)

5

In [None]:
rnd.argmin(axis =0) # row index of the minimum element in each column

array([0, 2])

In [None]:
rnd.argmin(axis =1) # column index of the minimum element in each row

array([0, 1, 1, 1])

In [None]:
rnd.sum()

-0.13818779056662156

In [None]:
rnd.sum(axis=0)

array([ 2.23650654, -2.37469433])

In [None]:
rnd.sum(axis=1)

array([-0.33817045, -0.26777753, -0.85030577,  1.31806596])

In [None]:
(rnd > 0).sum() # counts number of positive values

4

In [None]:
(rnd > 0).any() # checks if any value is True

True

In [None]:
(rnd > 0).all() # checks if all values are True

False

In [None]:
np.random.seed(12234) # Set the seed

In [None]:
np.random.rand(2, 3) # 2 x 3 matrix in [0, 1]


array([[0.00630595, 0.20303476, 0.76478993],
       [0.55513384, 0.74358546, 0.93777808]])

In [None]:
np.random.randn(10) # random normals (mean 0, sd 1)

array([-2.79962074e-01,  1.31281104e+00, -9.27155784e-01, -4.01302169e-01,
       -2.31085929e+00, -2.08460156e+00,  4.59241643e-01,  1.62191344e+00,
        1.94515120e-01, -2.08631547e-03])

In [None]:
np.random.randint(0, 2, 10) # 10 randomly picked 0 or 1

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 0])

### Broadcasting
4.7.1 Rules
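Starting with the trailing axis and working backward, NumPy compares the arrays' dimensions:
- If the two dimensions are equal, it continues with the next axis.
- If one of the operands has dimension 1, it is stretched to match the other one.
- When one of the shapes runs out of dimensions (because it has fewer dimensions than
  the other shape), NumPy uses 1 in the comparison until the other shape's dimensions run out as well.
- If two dimensions differ and neither of them is 1, the arrays cannot be broadcast together and an error is raised.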

In [None]:
# Each row of `a` repeats one value; `b` is a single row of offsets.
a = np.array([
    [0, 0, 0],
    [10, 10, 10],
    [20, 20, 20],
    [30, 30, 30],
])
b = np.array([0, 1, 2])
# Shapes (4, 3) and (3,): b is broadcast across every row of a.
print(a + b)

[[ 0  1  2]
 [10 11 12]
 [20 21 22]
 [30 31 32]]


## Create Dataframe

In [None]:
columns = ['name', 'age', 'gender', 'job']

# user1 / user2: built from a list of rows plus explicit column names.
user1 = pd.DataFrame(
    [['alice', 19, "F", "student"], ['john', 26, "M", "student"]],
    columns=columns,
)
user2 = pd.DataFrame(
    [['eric', 22, "M", "student"], ['paul', 58, "F", "manager"]],
    columns=columns,
)

# user3: built from a mapping of column name -> column values.
user3 = pd.DataFrame({
    'name': ['peter', 'julie'],
    'age': [33, 44],
    'gender': ['M', 'F'],
    'job': ['engineer', 'scientist'],
})
print(user3)

    name  age gender        job
0  peter   33      M   engineer
1  julie   44      F  scientist


In [None]:
user1

Unnamed: 0,name,age,gender,job
0,alice,19,F,student
1,john,26,M,student


In [None]:
user2

Unnamed: 0,name,age,gender,job
0,eric,22,M,student
1,paul,58,F,manager


##5.2 Combining DataFrames
5.2.1 Concatenate DataFrame

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported way to stack frames row-wise (the original row labels are kept, as before).
pd.concat([user1, user2])

Unnamed: 0,name,age,gender,job
0,alice,19,F,student
1,john,26,M,student
0,eric,22,M,student
1,paul,58,F,manager


In [None]:
users = pd.concat([user1, user2, user3])
users

Unnamed: 0,name,age,gender,job
0,alice,19,F,student
1,john,26,M,student
0,eric,22,M,student
1,paul,58,F,manager
0,peter,33,M,engineer
1,julie,44,F,scientist


## 5.2.2 Join DataFrame

In [None]:
user4 = pd.DataFrame(dict(name=['alice', 'john', 'eric', 'julie'],
height=[165, 180, 175, 171]))
print(user4)

    name  height
0  alice     165
1   john     180
2   eric     175
3  julie     171


## Use intersection of keys from both frames

In [None]:
merge_inter = pd.merge(users, user4, on="name")
print(merge_inter)

    name  age gender        job  height
0  alice   19      F    student     165
1   john   26      M    student     180
2   eric   22      M    student     175
3  julie   44      F  scientist     171


Use union of keys from both frames

In [None]:
# how='outer' keeps every name found in either frame; rows without a match get NaN.
# NOTE: this rebinds `users`, so the cell is not idempotent - running it a second time
# merges `height` into a frame that already has a `height` column.
users = pd.merge(users, user4, on="name", how='outer')
print(users)

    name  age gender        job  height
0  alice   19      F    student   165.0
1   john   26      M    student   180.0
2   eric   22      M    student   175.0
3   paul   58      F    manager     NaN
4  peter   33      M   engineer     NaN
5  julie   44      F  scientist   171.0


## 5.2.3 Reshaping by pivoting
`pd.melt` “unpivots” a DataFrame from wide format to long (stacked) format.

In [None]:
users

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


In [None]:
staked = pd.melt(users, id_vars="name", var_name="variable", value_name="value")
print(staked)

     name variable      value
0   alice      age         19
1    john      age         26
2    eric      age         22
3    paul      age         58
4   peter      age         33
5   julie      age         44
6   alice   gender          F
7    john   gender          M
8    eric   gender          M
9    paul   gender          F
10  peter   gender          M
11  julie   gender          F
12  alice      job    student
13   john      job    student
14   eric      job    student
15   paul      job    manager
16  peter      job   engineer
17  julie      job  scientist
18  alice   height      165.0
19   john   height      180.0
20   eric   height      175.0
21   paul   height        NaN
22  peter   height        NaN
23  julie   height      171.0


## `pivot` “pivots” a DataFrame from long (stacked) format back to wide format

In [None]:
print(staked.pivot(index='name', columns='variable', values='value'))

variable age gender height        job
name                                 
alice     19      F  165.0    student
eric      22      M  175.0    student
john      26      M  180.0    student
julie     44      F  171.0  scientist
paul      58      F    NaN    manager
peter     33      M    NaN   engineer


## 5.3 Summarizing

In [None]:
users # print the first 30 and last 30 rows
type(users) # DataFrame
users.head() # print the first 5 rows
users.tail() # print the last 5 rows
users.index # "the index" (aka "the labels")
users.columns # column names (which is "an index")
users.dtypes # data types of each column
users.shape # number of rows and columns
users.values # underlying numpy array
users.info() # concise summary (includes memory usage as of pandas 0.15.0)

In [None]:
users # print the first 30 and last 30 rows

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


In [None]:
type(users) # DataFrame

pandas.core.frame.DataFrame

In [None]:
users.head() # print the first 5 rows


Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,


In [None]:
users.tail() # print the last 5 rows

Unnamed: 0,name,age,gender,job,height
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


In [None]:
users.index # "the index" (aka "the labels")

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [None]:
users.columns # column names (which is "an index")

Index(['name', 'age', 'gender', 'job', 'height'], dtype='object')

In [None]:
users.dtypes # data types of each column

name       object
age         int64
gender     object
job        object
height    float64
dtype: object

In [None]:
users.shape # number of rows and columns

(6, 5)

In [None]:
users.values # underlying numpy array

array([['alice', 19, 'F', 'student', 165.0],
       ['john', 26, 'M', 'student', 180.0],
       ['eric', 22, 'M', 'student', 175.0],
       ['paul', 58, 'F', 'manager', nan],
       ['peter', 33, 'M', 'engineer', nan],
       ['julie', 44, 'F', 'scientist', 171.0]], dtype=object)

In [None]:
users.info() # concise summary (includes memory usage as of pandas 0.15.0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    6 non-null      object 
 1   age     6 non-null      int64  
 2   gender  6 non-null      object 
 3   job     6 non-null      object 
 4   height  4 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 288.0+ bytes


## 5.4 Columns selection

In [None]:
users['gender'] # select one column


0    F
1    M
2    M
3    F
4    M
5    F
Name: gender, dtype: object

In [None]:
type(users['gender']) # Series


pandas.core.series.Series

In [None]:
users.gender # select one column using the DataFrame


0    F
1    M
2    M
3    F
4    M
5    F
Name: gender, dtype: object

In [None]:
# select multiple columns
users[['age', 'gender']] # select two columns


Unnamed: 0,age,gender
0,19,F
1,26,M
2,22,M
3,58,F
4,33,M
5,44,F


In [None]:
my_cols = ['age', 'gender'] # or, create a list...
users[my_cols] # ...and use that list to select columns

Unnamed: 0,age,gender
0,19,F
1,26,M
2,22,M
3,58,F
4,33,M
5,44,F


In [None]:
type(users[my_cols]) # DataFrame

pandas.core.frame.DataFrame

##5.5 Rows selection (basic)
iloc is strictly integer position based

In [None]:
df = users.copy()
df.iloc[0] # first row


name        alice
age            19
gender          F
job       student
height      165.0
Name: 0, dtype: object

In [None]:
df.iloc[0, 0] # first item of first row


'alice'

In [None]:
df.iloc[0, 0] = 55
df

Unnamed: 0,name,age,gender,job,height
0,55,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


In [None]:
# Each df.iloc[i] hands back the row as a separate Series, so scaling its age changes
# only that temporary object and never writes back into df (pandas may emit a
# SettingWithCopyWarning here, which is the point of the demonstration).
for i in range(df.shape[0]):
  row = df.iloc[i]
  row.age *= 100 # setting a copy, and not the original frame data.
print(df) # df is not modified

    name  age gender        job  height
0     55   19      F    student   165.0
1   john   26      M    student   180.0
2   eric   22      M    student   175.0
3   paul   58      F    manager     NaN
4  peter   33      M   engineer     NaN
5  julie   44      F  scientist   171.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


## loc is label based (the older `.ix` accessor, which supported mixed integer and label based access, has been removed from pandas)

In [None]:
df = users.copy()
df.loc[0] # first row

name        alice
age            19
gender          F
job       student
height      165.0
Name: 0, dtype: object

In [None]:
df.loc[0, "age"] # first item of first row


19

In [None]:
df.loc[0, "age"] = 55

In [None]:
for i in range(df.shape[0]):
  df.loc[i, "age"] *= 10
print(df) # df is modified

    name  age gender        job  height
0  alice  550      F    student   165.0
1   john  260      M    student   180.0
2   eric  220      M    student   175.0
3   paul  580      F    manager     NaN
4  peter  330      M   engineer     NaN
5  julie  440      F  scientist   171.0


### 5.6 Rows selection (filtering)
simple logical filtering

In [None]:
users[users.age < 20] # only show users with age < 20


Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0


In [None]:
young_bool = users.age < 20 # or, create a Series of booleans...
young = users[young_bool] # ...and use that Series to filter rows
young

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0


## Rows selection (basic)

In [None]:
users[users.age < 20].job # select one column from the filtered results (not displayed: it is not the cell's last expression)
print(young) # `young` holds the rows of users with age < 20

    name  age gender      job  height
0  alice   19      F  student   165.0


## Advanced logical filtering

In [None]:
users[users.age < 20][['age', 'job']] # select multiple columns

Unnamed: 0,age,job
0,19,student


In [None]:
users[(users.age > 20) & (users.gender == 'M')] # use multiple conditions

Unnamed: 0,name,age,gender,job,height
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
4,peter,33,M,engineer,


In [None]:
users[users.job.isin(['student', 'engineer'])] # filter specific values

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
4,peter,33,M,engineer,


## 5.7 Sorting

In [None]:
df = users.copy()
df.age.sort_values() # only works for a Series


0    19
2    22
1    26
4    33
5    44
3    58
Name: age, dtype: int64

In [None]:
df.sort_values(by='age') # sort rows by a specific column


Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
2,eric,22,M,student,175.0
1,john,26,M,student,180.0
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0
3,paul,58,F,manager,


In [None]:
df.sort_values(by='age', ascending=False) # use descending order instead


Unnamed: 0,name,age,gender,job,height
3,paul,58,F,manager,
5,julie,44,F,scientist,171.0
4,peter,33,M,engineer,
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
0,alice,19,F,student,165.0


In [None]:
df.sort_values(by=['job', 'age']) # sort by multiple columns


Unnamed: 0,name,age,gender,job,height
4,peter,33,M,engineer,
3,paul,58,F,manager,
5,julie,44,F,scientist,171.0
0,alice,19,F,student,165.0
2,eric,22,M,student,175.0
1,john,26,M,student,180.0


In [None]:
df.sort_values(by=['job', 'age'], inplace=True) # modify df
print(df)

    name  age gender        job  height
4  peter   33      M   engineer     NaN
3   paul   58      F    manager     NaN
5  julie   44      F  scientist   171.0
0  alice   19      F    student   165.0
2   eric   22      M    student   175.0
1   john   26      M    student   180.0


## 5.8 Descriptive statistics
Summarize all numeric columns

In [None]:
print(df.describe())

             age      height
count   6.000000    4.000000
mean   33.666667  172.750000
std    14.895189    6.344289
min    19.000000  165.000000
25%    23.000000  169.500000
50%    29.500000  173.000000
75%    41.250000  176.250000
max    58.000000  180.000000


### Summarize all columns

In [None]:
print(df.describe(include='all'))

         name        age gender      job      height
count       6   6.000000      6        6    4.000000
unique      6        NaN      2        4         NaN
top     peter        NaN      M  student         NaN
freq        1        NaN      3        3         NaN
mean      NaN  33.666667    NaN      NaN  172.750000
std       NaN  14.895189    NaN      NaN    6.344289
min       NaN  19.000000    NaN      NaN  165.000000
25%       NaN  23.000000    NaN      NaN  169.500000
50%       NaN  29.500000    NaN      NaN  173.000000
75%       NaN  41.250000    NaN      NaN  176.250000
max       NaN  58.000000    NaN      NaN  180.000000


In [None]:
print(df.describe(include=['object'])) # limit to one (or more) types

         name gender      job
count       6      6        6
unique      6      2        4
top     peter      M  student
freq        1      3        3


### Statistics per group (groupby)

In [None]:
# numeric_only=True restricts the mean to the numeric columns (age, height). Since
# pandas 2.0 text columns such as name/gender are no longer dropped silently and a
# plain .mean() raises a TypeError instead.
print(df.groupby("job").mean(numeric_only=True))

                 age      height
job                             
engineer   33.000000         NaN
manager    58.000000         NaN
scientist  44.000000  171.000000
student    22.333333  173.333333


In [None]:
print(df.groupby("job")["age"].mean())

job
engineer     33.000000
manager      58.000000
scientist    44.000000
student      22.333333
Name: age, dtype: float64


In [None]:
print(df.groupby("job").describe(include='all'))

           name                                                   ... height  \
          count unique    top freq mean  std  min  25%  50%  75%  ... unique   
job                                                               ...          
engineer      1      1  peter    1  NaN  NaN  NaN  NaN  NaN  NaN  ...    NaN   
manager       1      1   paul    1  NaN  NaN  NaN  NaN  NaN  NaN  ...    NaN   
scientist     1      1  julie    1  NaN  NaN  NaN  NaN  NaN  NaN  ...    NaN   
student       3      3  alice    1  NaN  NaN  NaN  NaN  NaN  NaN  ...    NaN   

                                                                             
          top freq        mean       std    min    25%    50%    75%    max  
job                                                                          
engineer  NaN  NaN         NaN       NaN    NaN    NaN    NaN    NaN    NaN  
manager   NaN  NaN         NaN       NaN    NaN    NaN    NaN    NaN    NaN  
scientist NaN  NaN  171.000000       NaN  171.0  

## Groupby in a loop

In [None]:
for grp, data in df.groupby("job"):
  print(grp, data)

engineer     name  age gender       job  height
4  peter   33      M  engineer     NaN
manager    name  age gender      job  height
3  paul   58      F  manager     NaN
scientist     name  age gender        job  height
5  julie   44      F  scientist   171.0
student     name  age gender      job  height
0  alice   19      F  student   165.0
2   eric   22      M  student   175.0
1   john   26      M  student   180.0


## 5.9 Quality check
5.9.1 Remove duplicate data

In [None]:
# Re-add the first row of `users` to create a known duplicate. DataFrame.append was
# removed in pandas 2.0, so use pd.concat; taking the row from `users` itself (as a
# one-row frame, users.iloc[[0]]) keeps the column dtypes and does not depend on how
# `df` was sorted by earlier cells.
df = pd.concat([users, users.iloc[[0]]], ignore_index=True)
df

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0
6,alice,19,F,student,165.0


In [None]:
print(df.duplicated()) # Series of booleans
# (True if a row is identical to a previous row)

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool


In [None]:
df.duplicated().sum() # count of duplicates

1

In [None]:
df[df.duplicated()] # only show duplicates

Unnamed: 0,name,age,gender,job,height
6,alice,19,F,student,165.0


In [None]:
df.age.duplicated() # check a single column for duplicates

0    False
1    False
2    False
3    False
4    False
5    False
6     True
Name: age, dtype: bool

In [None]:
df.duplicated(['age', 'gender']).sum() # specify columns for finding duplicates

1

In [None]:
df = df.drop_duplicates() # drop duplicate rows
df

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


## 5.9.2 Missing data

In [None]:
# Missing values are often just excluded
df = users.copy()

In [None]:
df.describe(include='all') # excludes missing values

Unnamed: 0,name,age,gender,job,height
count,6,6.0,6,6,4.0
unique,6,,2,4,
top,alice,,F,student,
freq,1,,3,3,
mean,,33.666667,,,172.75
std,,14.895189,,,6.344289
min,,19.0,,,165.0
25%,,23.0,,,169.5
50%,,29.5,,,173.0
75%,,41.25,,,176.25


In [None]:
# find missing values in a Series
df.height.isnull() # True if NaN, False otherwise

0    False
1    False
2    False
3     True
4     True
5    False
Name: height, dtype: bool

In [None]:
df.height.notnull() # False if NaN, True otherwise

0     True
1     True
2     True
3    False
4    False
5     True
Name: height, dtype: bool

In [None]:
df[df.height.notnull()] # only show rows where height is not NaN

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
5,julie,44,F,scientist,171.0


In [None]:
df.height.isnull().sum() # count the missing values

2

In [None]:
# find missing values in a DataFrame
df.isnull() # DataFrame of booleans

Unnamed: 0,name,age,gender,job,height
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,True
4,False,False,False,False,True
5,False,False,False,False,False


In [None]:
df.isnull().sum() # calculate the sum of each column

name      0
age       0
gender    0
job       0
height    2
dtype: int64

Strategy 1: drop missing values

In [None]:
df.dropna() # drop a row if ANY values are missing

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
5,julie,44,F,scientist,171.0


In [None]:
df.dropna(how='all') # drop a row only if ALL values are missing

Unnamed: 0,name,age,gender,job,height
0,alice,19,F,student,165.0
1,john,26,M,student,180.0
2,eric,22,M,student,175.0
3,paul,58,F,manager,
4,peter,33,M,engineer,
5,julie,44,F,scientist,171.0


Strategy 2: fill in missing values

In [None]:
df.height.mean()

172.75

In [None]:
df = users.copy()
# Impute: rows whose height is missing receive the mean of the observed heights.
mean_height = df["height"].mean()
missing_height = df.height.isnull()
df.loc[missing_height, "height"] = mean_height
print(df)

    name  age gender        job  height
0  alice   19      F    student  165.00
1   john   26      M    student  180.00
2   eric   22      M    student  175.00
3   paul   58      F    manager  172.75
4  peter   33      M   engineer  172.75
5  julie   44      F  scientist  171.00
