## Missing Values


In [2]:
import numpy as np
import pandas as pd

In [3]:
data=pd.DataFrame({
    'A':[1,2,3,4,5],
    'B':[2,4,np.nan,6,np.nan],
    'C':[3,6,9,np.nan,np.nan],
    'D':[np.nan,8,12,16,20],
    'E':[np.nan,11,15,np.nan,np.nan],
    'F':[6,12,18,24,30]
    })

In [4]:
df=pd.DataFrame(data)
print(df)

   A    B    C     D     E   F
0  1  2.0  3.0   NaN   NaN   6
1  2  4.0  6.0   8.0  11.0  12
2  3  NaN  9.0  12.0  15.0  18
3  4  6.0  NaN  16.0   NaN  24
4  5  NaN  NaN  20.0   NaN  30


In [5]:
df.isna()

Unnamed: 0,A,B,C,D,E,F
0,False,False,False,True,True,False
1,False,False,False,False,False,False
2,False,True,False,False,False,False
3,False,False,True,False,True,False
4,False,True,True,False,True,False


In [6]:
df.isna().sum()

A    0
B    2
C    2
D    1
E    3
F    0
dtype: int64

In [7]:
df.isna().any()

A    False
B     True
C     True
D     True
E     True
F    False
dtype: bool

In [8]:
#Delete row with any nan value
df.dropna()
df.dropna(axis=0,how='any') #Both will give same output


Unnamed: 0,A,B,C,D,E,F
1,2,4.0,6.0,8.0,11.0,12


In [9]:
#Delete row with aall nan value
df.dropna(axis=0,how='all') 

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,,,6
1,2,4.0,6.0,8.0,11.0,12
2,3,,9.0,12.0,15.0,18
3,4,6.0,,16.0,,24
4,5,,,20.0,,30


In [10]:
#Delete Col with any nan value
df.dropna(axis=1)
df.dropna(axis=1,how='any') #Both will give same output


Unnamed: 0,A,F
0,1,6
1,2,12
2,3,18
3,4,24
4,5,30


In [11]:
#Delete row with aall nan value
df.dropna(axis=1,how='all')

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,,,6
1,2,4.0,6.0,8.0,11.0,12
2,3,,9.0,12.0,15.0,18
3,4,6.0,,16.0,,24
4,5,,,20.0,,30


In [12]:
#Threshold -- At Least
df.dropna(thresh=4) #Atleast 4 Non Nan Calue in row

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,,,6
1,2,4.0,6.0,8.0,11.0,12
2,3,,9.0,12.0,15.0,18
3,4,6.0,,16.0,,24


In [13]:
df.dropna(axis=1,thresh=4) #Atleast 4 Non Nan Calue in column
print(df)

   A    B    C     D     E   F
0  1  2.0  3.0   NaN   NaN   6
1  2  4.0  6.0   8.0  11.0  12
2  3  NaN  9.0  12.0  15.0  18
3  4  6.0  NaN  16.0   NaN  24
4  5  NaN  NaN  20.0   NaN  30


In [14]:
df

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,,,6
1,2,4.0,6.0,8.0,11.0,12
2,3,,9.0,12.0,15.0,18
3,4,6.0,,16.0,,24
4,5,,,20.0,,30


In [15]:
#Fill The Missing Value with 0
df.fillna(0)

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,0.0,0.0,6
1,2,4.0,6.0,8.0,11.0,12
2,3,0.0,9.0,12.0,15.0,18
3,4,6.0,0.0,16.0,0.0,24
4,5,0.0,0.0,20.0,0.0,30


In [16]:
values={'A':100,'B':200,'C':300,'D':400,'E':500,'F':600}
df.fillna(value=values)

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,400.0,500.0,6
1,2,4.0,6.0,8.0,11.0,12
2,3,200.0,9.0,12.0,15.0,18
3,4,6.0,300.0,16.0,500.0,24
4,5,200.0,300.0,20.0,500.0,30


In [17]:
#Fill with mean
df.fillna(df.mean())

Unnamed: 0,A,B,C,D,E,F
0,1,2.0,3.0,14.0,13.0,6
1,2,4.0,6.0,8.0,11.0,12
2,3,4.0,9.0,12.0,15.0,18
3,4,6.0,6.0,16.0,13.0,24
4,5,4.0,6.0,20.0,13.0,30


## Merging, Joining andConcatenation 

✅ merge → When you have common columns

✅ join → When index is already set

✅ concat → When stacking or combining data without logic


In [32]:
Employees = pd.DataFrame({
    "Emp_ID": [101, 102, 103, 104, 105],
    "Name": ["Annu", "Raahi", "Tanish", "Kirti", "Shivani"],
    "Department": ["IT", "HR", "Finance", "IT", "Sales"],
})

# DataFrame 2
Salaries = pd.DataFrame({
    "Emp_ID": [101, 102, 103, 106, 107],
    "Name": ["Tanish", "Arohi", "Priya", "Kirti", "Shivani"],
    "Salary": [50000, 55000, 60000, 45000, 47000]
})

print("Employees Data :")
print(df1)

print("\nSalary Data :")
print(df2)


Employees Data :
   Emp_ID     Name Department  Salary
0     101     Annu         IT   50000
1     102    Raahi         HR   55000
2     103   Tanish    Finance   60000
3     104    Kirti         IT   52000
4     105  Shivani      Sales   48000

Salary Data :
   Emp_ID     Name Department  Salary
0     101   Tanish         IT   50000
1     102    Arohi         HR   55000
2     103    Priya    Finance   60000
3     106    Kirti  Marketing   45000
4     107  Shivani    Support   47000


#### Merging 

Merging usually Combines DataFrames based on common column(s)

In [33]:
pd.merge(Employees,Salaries,how='inner',on='Emp_ID') #how has 'inner','outer','left','right'

Unnamed: 0,Emp_ID,Name_x,Department,Name_y,Salary
0,101,Annu,IT,Tanish,50000
1,102,Raahi,HR,Arohi,55000
2,103,Tanish,Finance,Priya,60000


#### Joining
Joining Works mainly with index, not columns

In [46]:
# Employees_indexed = Employees.set_index("Emp_ID")
# Salaries_indexed = Salaries.set_index("Emp_ID")

# join_result = Employees_indexed.join(Salaries_indexed, how="left", lsuffix="_emp", rsuffix="_sal")
# print(join_result)




#### Concatenation
Stacks DataFrames, does NOT match rows by default

In [40]:
pd.concat([Employees,Salaries],axis=0) #Append rowsa dn if column does not match return NAN

Unnamed: 0,Emp_ID,Name,Department,Salary
0,101,Annu,IT,
1,102,Raahi,HR,
2,103,Tanish,Finance,
3,104,Kirti,IT,
4,105,Shivani,Sales,
0,101,Tanish,,50000.0
1,102,Arohi,,55000.0
2,103,Priya,,60000.0
3,106,Kirti,,45000.0
4,107,Shivani,,47000.0


In [42]:
pd.concat([Employees,Salaries],axis=1) #Side-by-side Index must align

Unnamed: 0,Emp_ID,Name,Department,Emp_ID.1,Name.1,Salary
0,101,Annu,IT,101,Tanish,50000
1,102,Raahi,HR,102,Arohi,55000
2,103,Tanish,Finance,103,Priya,60000
3,104,Kirti,IT,106,Kirti,45000
4,105,Shivani,Sales,107,Shivani,47000
