**6) Useful methods and operations**

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Demo data: two integer columns plus a string column whose last two
# entries are missing (NaN). Rows are labelled 1..5 instead of 0..4.
data_dict = {
    'col_1': [1, 2, 3, 4, 5],
    'col_2': [111, 222, 333, 111, 555],
    'col_3': ['alpha', 'bravo', 'charlie', np.nan, np.nan],
}
df = pd.DataFrame(data=data_dict, index=[1, 2, 3, 4, 5])
df

Unnamed: 0,col_1,col_2,col_3
1,1,111,alpha
2,2,222,bravo
3,3,333,charlie
4,4,111,
5,5,555,


In [4]:
# let's start with info(), which prints a concise summary of df:
# index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col_1   5 non-null      int64 
 1   col_2   5 non-null      int64 
 2   col_3   3 non-null      object
dtypes: int64(2), object(1)
memory usage: 160.0+ bytes


In [6]:
# head(n) - returns the first n rows (n defaults to 5);
# df has only five rows, so the whole frame is shown here
df.head()

Unnamed: 0,col_1,col_2,col_3
1,1,111,alpha
2,2,222,bravo
3,3,333,charlie
4,4,111,
5,5,555,


In [7]:
# isnull() - returns a boolean object of the same size as df,
# with True marking the values that are null (NaN)
df.isnull()

Unnamed: 0,col_1,col_2,col_3
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,True
5,False,False,True


In [9]:
# dropna(axis=<n>) - returns a copy with missing data removed
#   axis=0 (default): drop every row that contains a NaN
#   axis=1:           drop every column that contains a NaN
print(df.dropna())
print(df.dropna(axis=1))

   col_1  col_2    col_3
1      1    111    alpha
2      2    222    bravo
3      3    333  charlie
   col_1  col_2
1      1    111
2      2    222
3      3    333
4      4    111
5      5    555


In [10]:
# fillna(value=..., axis=<n>, inplace=<boolean>)
# Returns a copy of df in which NA/NaN values are replaced by `value`
# (df itself is only modified when inplace=True).
# The older method='ffill'/'bfill' argument is deprecated in recent pandas
# releases; call df.ffill() / df.bfill() for forward/backward filling.
# The frame is left as the last expression (no print) so that the
# notebook renders it as a table.
df.fillna(value='foxtrot')

Unnamed: 0,col_1,col_2,col_3
1,1,111,alpha
2,2,222,bravo
3,3,333,charlie
4,4,111,foxtrot
5,5,555,foxtrot


In [11]:
# unique() - returns an array of the distinct values in the column
print('unique values in col_1', df['col_1'].unique())

unique values in col_1 [1 2 3 4 5]


In [12]:
# nunique() - returns the number of distinct values in the column
# (col_2 contains 111 twice, so 5 rows give 4 distinct values)
print('num unique values in col_2', df['col_2'].nunique())

num unique values in col_2 4


In [13]:
# value_counts() - returns how many times each value occurs in the column;
# NaN entries are not counted by default
print('value_counts of col_3', df['col_3'].value_counts())

value_counts of col_3 alpha      1
bravo      1
charlie    1
Name: col_3, dtype: int64


In [15]:
# sort_values(by=...) - returns the rows sorted by the values of the given
# column (ascending by default); each row keeps its original index label
print(df.sort_values(by = 'col_2'))

   col_1  col_2    col_3
1      1    111    alpha
4      4    111      NaN
2      2    222    bravo
3      3    333  charlie
5      5    555      NaN


In [16]:
# apply() - we can broadcast our customized functions on our data
def square(value):
    """Return ``value`` raised to the power of two."""
    return pow(value, 2)

# apply() calls square on each element of col_1 and returns a new Series
df['col_1'].apply(square)

1     1
2     4
3     9
4    16
5    25
Name: col_1, dtype: int64

In [17]:
# apply() also accepts an anonymous function; this lambda squares each value
df['col_1'].apply(lambda value: value ** 2)

1     1
2     4
3     9
4    16
5    25
Name: col_1, dtype: int64

In [23]:
# len() raises TypeError on NaN (a float), so drop the missing entries first.
# dropna() is used instead of a label slice such as .loc[0:3], which only
# skips the NaNs while they happen to sit at the end of a sorted index.
df['col_3'].dropna().apply(len)

1    5
2    5
3    7
Name: col_3, dtype: int64

**Data Selection**
Let's revisit boolean selection.
Suppose we have the following steps:
Task 1: df['col_1'] > 2 - returns a boolean Series that is True where the condition holds (not the data itself)
Task 2: df['col_2'] == 111 - likewise returns a boolean Series that is True where the condition holds
Task 3: Let's combine these two conditions with &, wrapping each condition in ()
Task 4: wrap the combined condition in df[] and see the return value!

In [24]:
# Task 1: element-wise comparison; yields a boolean Series, one entry per row
df['col_1'] > 2

1    False
2    False
3     True
4     True
5     True
Name: col_1, dtype: bool

In [25]:
# Task 2: element-wise equality test; yields a boolean Series, one entry per row
df['col_2'] == 111

1     True
2    False
3    False
4     True
5    False
Name: col_2, dtype: bool

In [27]:
# Task 3: combine both conditions with the element-wise AND operator (&).
# The parentheses are required because & binds more tightly than > and ==.
bool_ser = (df['col_1'] > 2) & (df['col_2'] == 111)
bool_ser

1    False
2    False
3    False
4     True
5    False
dtype: bool

In [28]:
# Task 4: index df with the boolean Series to keep only the rows marked True
result = df[bool_ser]
result

Unnamed: 0,col_1,col_2,col_3
4,4,111,


**Good to know**

In [29]:
# index - get the index (row labels) of the dataframe
df.index

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [30]:
# columns - get the column labels of the dataframe
df.columns

Index(['col_1', 'col_2', 'col_3'], dtype='object')

In [31]:
# drop() - returns a new frame without the given row(s) or column(s);
# columns='col_1' is the keyword spelling of ('col_1', axis=1)
df.drop(columns='col_1')

Unnamed: 0,col_2,col_3
1,111,alpha
2,222,bravo
3,333,charlie
4,111,
5,555,


In [32]:
# copy() - returns an independent copy of df, so deleting a column
# from newdf in place leaves the original df untouched
newdf = df.copy()
del newdf['col_1']
newdf

Unnamed: 0,col_2,col_3
1,111,alpha
2,222,bravo
3,333,charlie
4,111,
5,555,


**pivot_table()**
Create a spreadsheet-style pivot table as a DataFrame
Takes 3 main arguments
values - default is None,
index - default is None
columns - default is None

In [35]:
# let's create one from our dataframe df
print(df)
# one row per col_1 value, one column per col_3 value; each cell holds the
# aggregated col_2 value (mean is the default aggfunc, hence the floats)
df.pivot_table(values = 'col_2', index='col_1', columns=['col_3'])

   col_1  col_2    col_3
1      1    111    alpha
2      2    222    bravo
3      3    333  charlie
4      4    111      NaN
5      5    555      NaN


col_3,alpha,bravo,charlie
col_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,111.0,,
2,,222.0,
3,,,333.0


Notice that NaN is not used as a column name in the pivot table: rows whose col_3 value is NaN cannot be assigned to any column, so the rows with index 4 and 5 are left out.