# Operating on Null Values

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Object-dtype Series mixing a number, a string, and both null markers (NaN and None).
raw_values = [1, np.nan, 'hello', None]
data = pd.Series(raw_values)

In [3]:
# Detecting null values
# isnull() returns a Boolean mask aligned with the data: True where the entry is NaN or None.
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [4]:
# Use the mask as a Boolean index to pull out just the null entries.
data[data.isnull()]

1     NaN
3    None
dtype: object

In [5]:
# Dropping null values
# dropna() returns a new Series without the null entries; `data` itself is not modified.

data.dropna()

0        1
2    hello
dtype: object

In [7]:
# The original Series still contains its null entries, since the dropna() result was not assigned.
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [9]:
# 3x3 frame with one NaN in row 0 (column 1) and one NaN in row 2 (column 0).
rows = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows)

df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [10]:
# With no arguments, dropna() removes every row that contains a null; only row 1 survives.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [11]:
# axis='columns' drops columns instead of rows: every column containing a null goes, leaving column 2.
df.dropna(axis = 'columns')

Unnamed: 0,2
0,2
1,5
2,6


In [12]:
# Add an all-NaN column 3 (this mutates df in place) so that how='all' and thresh
# have something to act on in the cells that follow.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [13]:
# how='all' drops only the columns whose values are all null; here that is just column 3.
df.dropna(axis = 'columns', how = 'all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [14]:
# thresh=3 keeps only rows with at least three non-null values; row 1 is the only one that qualifies.
df.dropna(axis = 'rows', thresh = 3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [15]:
# Filling null values

# Rebind `data` to a labelled Series so the direction of each fill is easy to follow.
labels = ['a', 'b', 'c', 'd', 'e']
data = pd.Series([1, np.nan, 'hello', None, 3], index=labels)

data

a        1
b      NaN
c    hello
d     None
e        3
dtype: object

In [16]:
# Forward-fill: each null takes the last valid value that precedes it.
# Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated
# in recent pandas releases.
data.ffill()

a        1
b        1
c    hello
d    hello
e        3
dtype: object

In [17]:
# Backward-fill: each null takes the next valid value that follows it.
# Series.bfill() is the supported spelling; fillna(method='bfill') is deprecated
# in recent pandas releases.
data.bfill()

a        1
b    hello
c    hello
d        3
e        3
dtype: object

In [20]:
# Reminder of the frame's current state: column 3 is entirely NaN.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [21]:
# Forward-fill along each row (left to right). A null in the first column has
# nothing to its left, so it stays null.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis='columns')

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [22]:
# Forward-fill down each column (top to bottom). A null in the first row has
# nothing above it, so it stays null.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis='rows')

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,


In [23]:
# Fill each column's nulls with that column's own mean. df.mean() always returns
# the per-column means as a Series; np.mean(df) forwards axis=None, which newer
# pandas versions reduce to a single scalar over the whole frame.
# Column 3 is all-NaN, so its mean is NaN and it stays unfilled.
df.fillna(df.mean())

Unnamed: 0,0,1,2,3
0,1.0,3.5,2,
1,2.0,3.0,5,
2,1.5,4.0,6,


In [24]:
# None of the fill results above were assigned back, so df still holds its nulls.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


## Combining Datasets: Concat and Append

In [26]:
# Two Series with disjoint integer labels, stacked end to end by pd.concat.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])

pd.concat((ser1, ser2))

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [29]:
# Two frames with the same columns and disjoint row labels.
df1 = pd.DataFrame({'A': ['A1', 'A2'], 'B': ['B1', 'B2']}, index=[1, 2])
df2 = pd.DataFrame({'A': ['A3', 'A4'], 'B': ['B3', 'B4']}, index=[3, 4])

# Show both inputs, then their row-wise concatenation (the default axis).
stacked = pd.concat([df1, df2])
print(df1, df2, stacked, sep='\n\n')

    A   B
1  A1  B1
2  A2  B2

    A   B
3  A3  B3
4  A4  B4

    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [30]:
# Two frames that share row labels but have different columns.
df3 = pd.DataFrame([['A1', 'B1'], ['A2', 'B2']],
                   columns = ['A', 'B'],
                   index = [1,2])

df4 = pd.DataFrame([['A3', 'B3'], ['A4', 'B4']],
                   columns = ['C', 'D'],
                   index = [1,2])

print(df3)
print()
print(df4)
print()
# Column-wise concatenation must be requested explicitly with axis='columns';
# the default is row-wise.
print(pd.concat([df3,df4], axis = 'columns'))

    A   B
1  A1  B1
2  A2  B2

    C   D
1  A3  B3
2  A4  B4

    A   B   C   D
1  A1  B1  A3  B3
2  A2  B2  A4  B4


In [33]:
# Build two frames, then make y reuse x's row labels so that the concatenated
# result carries duplicate index values.
x = pd.DataFrame({'A': ['A1', 'A2'], 'B': ['B1', 'B2']}, index=[1, 2])
y = pd.DataFrame({'A': ['A3', 'A4'], 'B': ['B3', 'B4']}, index=[3, 4])

y.index = x.index  # make duplicate indices!

# Row-wise concat (the default) keeps both sets of labels; p is reused in later cells.
p = pd.concat([x, y])
print(x, y, p, sep='\n\n')

    A   B
1  A1  B1
2  A2  B2

    A   B
1  A3  B3
2  A4  B4

    A   B
1  A1  B1
2  A2  B2
1  A3  B3
2  A4  B4


In [36]:
# Label 1 now matches two rows, so .loc returns a two-row DataFrame rather than a single row.
p.loc[1]

Unnamed: 0,A,B
1,A1,B1
1,A3,B3


In [37]:
# verify_integrity=True makes concat raise ValueError when index labels overlap.
# NOTE: this cell raises on purpose, so it will halt a "Restart & Run All".
pd.concat([x,y], verify_integrity= True)

ValueError: Indexes have overlapping values: Int64Index([1, 2], dtype='int64')

In [38]:
# Same overlapping-index concat, but the ValueError is caught and reported
# so the notebook can keep running.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("Value Error: ", err)

Value Error:  Indexes have overlapping values: Int64Index([1, 2], dtype='int64')


In [39]:
# Ignoring the index: ignore_index=True discards the input labels and
# numbers the result rows from 0.
renumbered = pd.concat([x, y], ignore_index=True)
print(x, y, renumbered, sep='\n\n')

    A   B
1  A1  B1
2  A2  B2

    A   B
1  A3  B3
2  A4  B4

    A   B
0  A1  B1
1  A2  B2
2  A3  B3
3  A4  B4


## Concatenation with Joins

In [40]:
# Two frames whose columns only partly overlap (B and C are shared).
x = pd.DataFrame({'A': ['A1', 'A2'], 'B': ['B1', 'B2'], 'C': ['C1', 'C2']},
                 index=[1, 2])
y = pd.DataFrame({'B': ['B3', 'B4'], 'C': ['C3', 'C4'], 'D': ['D3', 'D4']},
                 index=[3, 4])

# Outer join (the default): the result has the union of the columns,
# with NaN wherever a frame lacks a column.
outer = pd.concat([x, y])
print(x, y, outer, sep='\n\n')

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [41]:
# Inner join: only the columns present in both frames (B and C) are kept.
inner = pd.concat([x, y], join='inner')
print(x, y, inner, sep='\n\n')

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


In [42]:
print(x)
print()
print(y)
print()
# Keep exactly x's columns in the result. The join_axes argument was removed from
# pd.concat in pandas 1.0; reindexing the concatenated frame onto x.columns
# produces the same table (A is NaN for the rows that came from y, D is dropped).
print(pd.concat([x, y]).reindex(columns=x.columns))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

     A   B   C
1   A1  B1  C1
2   A2  B2  C2
3  NaN  B3  C3
4  NaN  B4  C4


  """


In [44]:
print(x)
print()
print(y)
print()
# Append y's rows to x. DataFrame.append() was removed in pandas 2.0;
# pd.concat([x, y]) is the equivalent call and, like append, returns a new frame.
print(pd.concat([x, y]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2

    B   C   D
3  B3  C3  D3
4  B4  C4  D4

     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


## Combining Datasets: Merge and Join:

- **Relational algebra**: a formal set of rules for manipulating relational data, which forms the conceptual
  foundation of the operations available in most databases
  
- **pd.merge()** implements a subset of relational algebra
  


### Categories of Joins

- One-to-one join : Column-wise concatenation

In [46]:
# Two tables keyed by the same four employees: one with their group, one with their hire year.
employees = ['Bob', 'Jake', 'Lisa', 'Sue']

df1 = pd.DataFrame({
    'employee': employees,
    'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
})
df2 = pd.DataFrame({
    'employee': employees,
    'hire_date': [2004, 2008, 2012, 2014],
})

print(df1, df2, sep='\n\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR

  employee  hire_date
0      Bob       2004
1     Jake       2008
2     Lisa       2012
3      Sue       2014


In [47]:
# No key is given: merge joins on the column the two frames share ('employee').
# The result gets a fresh 0..3 index.
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004
1,Jake,Engineering,2008
2,Lisa,Engineering,2012
3,Sue,HR,2014


- The pd.merge() function recognizes that each DataFrame has an 'employee' column,
  and automatically joins them using this column as a **key**
  
- The merge in general discards the index, except in cases of merges by index

#### Many-to-one joins
- Joins in which one of the two key columns contains duplicate entries

In [48]:
# df3 is the "many" side of the next join: its 'group' column repeats 'Engineering'.
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004
1,Jake,Engineering,2008
2,Lisa,Engineering,2012
3,Sue,HR,2014


In [49]:
# One supervisor per group: the "one" side of the many-to-one join.
supervisor_rows = [
    ('Accounting', 'Max'),
    ('Engineering', 'John'),
    ('HR', 'Steve'),
]
df4 = pd.DataFrame(supervisor_rows, columns=['group', 'supervisor'])

In [50]:
# Many-to-one merge on 'group': the supervisor is repeated for every employee in that group.
with_supervisor = pd.merge(df3, df4)
print(df3, df4, with_supervisor, sep='\n\n')

  employee        group  hire_date
0      Bob   Accounting       2004
1     Jake  Engineering       2008
2     Lisa  Engineering       2012
3      Sue           HR       2014

         group supervisor
0   Accounting        Max
1  Engineering       John
2           HR      Steve

  employee        group  hire_date supervisor
0      Bob   Accounting       2004        Max
1     Jake  Engineering       2008       John
2     Lisa  Engineering       2012       John
3      Sue           HR       2014      Steve


#### Many-to-many joins
- Joins in which both of the left and right key columns contain duplicate entries

In [51]:
# Two skills per group, so 'group' holds duplicate keys on this side of the join.
skill_groups = ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR']
skill_names = ['math', 'excel', 'coding', 'linux', 'excel', 'organization']
df5 = pd.DataFrame({'group': skill_groups, 'skills': skill_names})

In [52]:
# Every group appears twice, once per skill.
df5

Unnamed: 0,group,skills
0,Accounting,math
1,Accounting,excel
2,Engineering,coding
3,Engineering,linux
4,HR,excel
5,HR,organization


In [53]:
# Many-to-many merge on 'group': each employee row is paired with every skill row of the same group.
employee_skills = pd.merge(df1, df5)
print(df1, df5, employee_skills, sep='\n\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR

         group        skills
0   Accounting          math
1   Accounting         excel
2  Engineering        coding
3  Engineering         linux
4           HR         excel
5           HR  organization

  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting         excel
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR         excel
7      Sue           HR  organization


## Specification of the Merge Key:

In [56]:
# The `on` keyword names the key explicitly.
# - It only works when both the left and the right frame contain that column.
# - It accepts a single column name or a list of column names.
merged_on_employee = pd.merge(df1, df2, on='employee')
print(df1, df2, merged_on_employee, sep='\n\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR

  employee  hire_date
0      Bob       2004
1     Jake       2008
2     Lisa       2012
3      Sue       2014

  employee        group  hire_date
0      Bob   Accounting       2004
1     Jake  Engineering       2008
2     Lisa  Engineering       2012
3      Sue           HR       2014


In [59]:
# The left_on and right_on keywords: use them when the key column is named
# differently in each frame ('employee' on the left, 'name' on the right).
# Both key columns are kept in the result.
df3 = pd.DataFrame({
    'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
    'salary': [70000, 80000, 120000, 90000],
})

with_salary = pd.merge(df1, df3, left_on='employee', right_on='name')
print(df1, df3, with_salary, sep='\n\n')

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR

   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000

  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [62]:
# Same merge as before, then drop the redundant 'name' key column.
with_salary = pd.merge(df1, df3, left_on='employee', right_on='name')
print(with_salary.drop(columns='name'))

  employee        group  salary
0      Bob   Accounting   70000
1     Jake  Engineering   80000
2     Lisa  Engineering  120000
3      Sue           HR   90000


In [63]:
# The left_index and right_index keywords: first move 'employee' from a column
# into the index of each frame, so the merge can be done on the index.
df1a, df2a = df1.set_index('employee'), df2.set_index('employee')

print(df1a, df2a, sep='\n\n')

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR

          hire_date
employee           
Bob            2004
Jake           2008
Lisa           2012
Sue            2014


In [64]:
# Join on the index of both frames instead of on a column; the shared 'employee' index is kept.
print(pd.merge(df1a, df2a, left_index = True, right_index = True))

                group  hire_date
employee                        
Bob        Accounting       2004
Jake      Engineering       2008
Lisa      Engineering       2012
Sue                HR       2014


In [66]:
# Mixed merge: df1a's index (the employee names) is matched against df3's 'name' column.
# The result carries an integer index, and the employee names survive only in 'name'.
print(pd.merge(df1a, df3, left_index = True, right_on = 'name'))

         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000


In [67]:
# Keep the mixed index/column merge in `p` (this rebinds the earlier `p`) so its columns can be relabelled.
p = pd.merge(df1a, df3, left_index = True, right_on = 'name')

In [73]:
# Change column names
# rename() returns a new frame with a properly rebuilt column Index. Writing into
# p.columns.values instead mutates the Index's backing array in place, which pandas
# does not support and which may leave label lookups out of sync with what is displayed.
p = p.rename(columns={'name': 'employee'})

In [74]:
# Show the relabelled frame: the second column now reads 'employee'.
p

Unnamed: 0,group,employee,salary
0,Accounting,Bob,70000
1,Engineering,Jake,80000
2,Engineering,Lisa,120000
3,HR,Sue,90000
