### Segment 1 - Filtering and selecting Data

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

#### Selecting and Retrieving Data
We can write the index values in 2 forms:
- Label Index
- Integer Index

In [11]:
# Series holding the integers 0..7, labelled 'row 1' through 'row 8'.
series_obj = Series(
    np.arange(8),
    index=[f'row {n}' for n in range(1, 9)],
)
print(series_obj)

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int32


In [4]:
# Label index: select a single value by its index label.
# The label must match the index exactly, including the space ('row 5', not 'row5');
# a label that is not present in the index raises KeyError.
series_obj['row 5']

4

In [6]:
# Integer index: select values by 0-based position using .iloc.
# Plain series_obj[[1, 4]] on a string-labelled Series relies on a deprecated
# positional fallback, so the positional intent is stated explicitly here.
series_obj.iloc[[1, 4]]

row2    1
row5    4
dtype: int32

In [7]:
# Build a 6x6 DataFrame of random numbers with labelled rows and columns.
# The fixed seed makes the generated values repeatable.
np.random.seed(25)
DF_obj = pd.DataFrame(
    np.random.rand(36).reshape((6, 6)),
    index=[f'row {n}' for n in range(1, 7)],
    columns=[f'col {n}' for n in range(1, 7)],
)
print(DF_obj)

          col 1     col 2     col 3     col 4     col 5     col 6
row 1  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
row 2  0.684969  0.437611  0.556229  0.367080  0.402366  0.113041
row 3  0.447031  0.585445  0.161985  0.520719  0.326051  0.699186
row 4  0.366395  0.836375  0.481343  0.516502  0.383048  0.997541
row 5  0.514244  0.559053  0.034450  0.719930  0.421004  0.436935
row 6  0.281701  0.900274  0.669612  0.456069  0.289804  0.525819


In [9]:
# .loc selects by label: the first list names the rows, the second names the columns.
# The result keeps the labels in the order they are listed.
DF_obj.loc[
    ['row 2', 'row 5'],
    ['col 5', 'col 2'],
]

Unnamed: 0,col 5,col 2
row 2,0.402366,0.437611
row 5,0.421004,0.559053


#### Data Slicing
We can use slicing to select and return a slice of several values from a data set.

In slicing, we pass two index values separated by a colon. The indexer then returns the values at those two labels and everything between them.

In [12]:
# Label-based slice: both endpoints ('row 2' and 'row 6') are included in the result.
series_obj.loc['row 2':'row 6']

row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
dtype: int32

#### Comparing with Scalars
We can use comparison operators such as greater than (>) or less than (<) to return True/False for every record, indicating how each element compares to a scalar value (a single numeric value).

In [16]:
# Element-wise comparison: True wherever the value is below 0.3, False elsewhere.
DF_obj.lt(0.3)

Unnamed: 0,col 1,col 2,col 3,col 4,col 5,col 6
row 1,False,False,True,True,False,True
row 2,False,False,False,False,False,True
row 3,False,False,True,False,False,False
row 4,False,False,False,False,False,False
row 5,False,False,True,False,False,False
row 6,True,False,False,False,True,False


#### Filtering with Scalars

In [20]:
# Keep only the entries whose value is greater than 4.
series_obj.loc[series_obj.gt(4)]

row 6    5
row 7    6
row 8    7
dtype: int32

In [22]:
# Keep values below 0.3; every other cell is shown as NaN so the frame keeps its shape.
DF_obj.where(DF_obj.lt(0.3))

Unnamed: 0,col 1,col 2,col 3,col 4,col 5,col 6
row 1,,,0.278839,0.185911,,0.117376
row 2,,,,,,0.113041
row 3,,,0.161985,,,
row 4,,,,,,
row 5,,,0.03445,,,
row 6,0.281701,,,,0.289804,


#### Setting values with scalars

In [30]:
# Assign the scalar 8 to the entries labelled 'row 1' and 'row 2'.
# This modifies series_obj in place.
series_obj.loc[['row 1', 'row 2']] = 8
series_obj

row 1    8
row 2    8
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int32

### Segment 2 - Treating Missing Values

In [2]:
import pandas as pd
import numpy as np


#### Figuring Out what data is missing

In [4]:
# np.nan serves as the placeholder for a missing entry.
missing = np.nan

# Positions 2 and 6 are deliberately left missing.
series_obj = pd.Series([
    'row 1', 'row 2', missing, 'row 4',
    'row 5', 'row 6', missing, 'row 8',
])
print(series_obj)

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object


In [6]:
# Boolean mask: True at every position whose value is missing (NaN), False otherwise.
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

#### Filling in for Missing Values

In [19]:
# 6x6 DataFrame of seeded random values with default integer row/column labels.
np.random.seed(25)  # fixed seed -> repeatable values
DF_obj = pd.DataFrame(data=np.random.rand(36).reshape((6, 6)))
print(DF_obj)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366  0.113041
2  0.447031  0.585445  0.161985  0.520719  0.326051  0.699186
3  0.366395  0.836375  0.481343  0.516502  0.383048  0.997541
4  0.514244  0.559053  0.034450  0.719930  0.421004  0.436935
5  0.281701  0.900274  0.669612  0.456069  0.289804  0.525819


In [20]:
# Blank out two ranges of cells. .loc slices by label and includes both endpoints.
DF_obj.loc[1:4, 5] = missing  # rows 1-4 of column 5
DF_obj.loc[3:5, 0] = missing  # rows 3-5 of column 0
print(DF_obj)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366       NaN
2  0.447031  0.585445  0.161985  0.520719  0.326051       NaN
3       NaN  0.836375  0.481343  0.516502  0.383048       NaN
4       NaN  0.559053  0.034450  0.719930  0.421004       NaN
5       NaN  0.900274  0.669612  0.456069  0.289804  0.525819


In [15]:
# Replace every NaN with 0; the result is a new frame bound to filled_df.
filled_df = DF_obj.fillna(value=0)
print(filled_df)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366  0.000000
2  0.447031  0.585445  0.161985  0.520719  0.326051  0.000000
3  0.000000  0.836375  0.481343  0.516502  0.383048  0.000000
4  0.000000  0.559053  0.034450  0.719930  0.421004  0.000000
5  0.000000  0.900274  0.669612  0.456069  0.289804  0.525819


In [16]:
# Per-column fill values, keyed by column label:
# NaNs in column 0 become 0.1 and NaNs in column 5 become 1.25.
filled_df = DF_obj.fillna(value={0: 0.1, 5: 1.25})
print(filled_df)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366  1.250000
2  0.447031  0.585445  0.161985  0.520719  0.326051  1.250000
3  0.100000  0.836375  0.481343  0.516502  0.383048  1.250000
4  0.100000  0.559053  0.034450  0.719930  0.421004  1.250000
5  0.100000  0.900274  0.669612  0.456069  0.289804  0.525819


In [17]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
filled_df = DF_obj.ffill()
print(filled_df)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366  0.117376
2  0.447031  0.585445  0.161985  0.520719  0.326051  0.117376
3  0.447031  0.836375  0.481343  0.516502  0.383048  0.117376
4  0.447031  0.559053  0.034450  0.719930  0.421004  0.117376
5  0.447031  0.900274  0.669612  0.456069  0.289804  0.525819


#### Counting Missing values

In [22]:
# Rebuild the seeded 6x6 frame, then blank out two ranges of cells.
np.random.seed(25)  # fixed seed -> repeatable values
DF_obj = pd.DataFrame(data=np.random.rand(36).reshape((6, 6)))
DF_obj.loc[1:4, 5] = missing  # rows 1-4 of column 5
DF_obj.loc[3:5, 0] = missing  # rows 3-5 of column 0
print(DF_obj)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366       NaN
2  0.447031  0.585445  0.161985  0.520719  0.326051       NaN
3       NaN  0.836375  0.481343  0.516502  0.383048       NaN
4       NaN  0.559053  0.034450  0.719930  0.421004       NaN
5       NaN  0.900274  0.669612  0.456069  0.289804  0.525819


In [23]:
# Count the missing values per column: isna() marks them True, sum() totals each column.
DF_obj.isna().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

#### Filtering out Missing Values

In [24]:
# Drop every row that contains at least one missing value.
df_nonan = DF_obj.dropna(axis=0, how='any')
print(df_nonan)

          0         1         2         3       4         5
0  0.870124  0.582277  0.278839  0.185911  0.4111  0.117376


In [25]:
# Drop every column that contains at least one missing value.
df_nonan = DF_obj.dropna(axis='columns')
print(df_nonan)

          1         2         3         4
0  0.582277  0.278839  0.185911  0.411100
1  0.437611  0.556229  0.367080  0.402366
2  0.585445  0.161985  0.520719  0.326051
3  0.836375  0.481343  0.516502  0.383048
4  0.559053  0.034450  0.719930  0.421004
5  0.900274  0.669612  0.456069  0.289804


### Segment 3 - Removing Duplicates