# Pandas Data Manipulation: Handling Missing Values, Grouping, Merging, and Pivot Tables

In [1]:
##Handling Missing Values

import pandas as pd
import numpy as np

# Three students; one has no recorded age and another has no recorded score.
_student_columns = {
    "Name": ["Riya", "Ishaan", "Zahir"],
    "Age": [24, np.nan, 22],
    "Score": [85, 90, np.nan],
}
df = pd.DataFrame(_student_columns)

# Each entry pairs a heading with the view of `df` it introduces. None of
# these calls modifies `df`; every one returns a new object.
_missing_value_views = (
    ("Original DataFrame:\n", df),
    ("Detect missing values:\n", df.isnull()),
    ("Fill missing values:\n", df.fillna(0)),
    ("Drop rows with missing values:\n", df.dropna()),
)
for _heading, _view in _missing_value_views:
    print(_heading, _view, "\n")


Original DataFrame:
      Name   Age  Score
0    Riya  24.0   85.0
1  Ishaan   NaN   90.0
2   Zahir  22.0    NaN 

Detect missing values:
     Name    Age  Score
0  False  False  False
1  False   True  False
2  False  False   True 

Fill missing values:
      Name   Age  Score
0    Riya  24.0   85.0
1  Ishaan   0.0   90.0
2   Zahir  22.0    0.0 

Drop rows with missing values:
    Name   Age  Score
0  Riya  24.0   85.0 



In [2]:
##Adding DataFrames with Overlaps

# Two score tables whose row labels only partly coincide: "Ishaan" is the
# single label that appears in both.
df1 = pd.DataFrame([85, 90, 88], index=["Riya", "Ishaan", "Zahir"], columns=["Score"])
df2 = pd.DataFrame([92, 80], index=["Ishaan", "Sonali"], columns=["Score"])

for _heading, _frame in (("DataFrame 1:\n", df1), ("DataFrame 2:\n", df2)):
    print(_heading, _frame, "\n")

# Addition aligns the operands on their index; a label that is missing from
# either frame comes out as NaN in the result.
_score_sum = df1.add(df2)
print("Adding df1 and df2:\n", _score_sum, "\n")

# Extend df1 with the "Sonali" label, giving the new row a Score of 0.
_all_names = ["Riya", "Ishaan", "Zahir", "Sonali"]
df3 = df1.reindex(index=_all_names, fill_value=0)
print("Reindexed DataFrame:\n", df3, "\n")


DataFrame 1:
         Score
Riya       85
Ishaan     90
Zahir      88 

DataFrame 2:
         Score
Ishaan     92
Sonali     80 

Adding df1 and df2:
         Score
Ishaan  182.0
Riya      NaN
Sonali    NaN
Zahir     NaN 

Reindexed DataFrame:
         Score
Riya       85
Ishaan     90
Zahir      88
Sonali      0 



In [3]:
##Grouping and Aggregation

# One row per person: (Name, Age, Score).
df = pd.DataFrame(
    [
        ("Riya", 24, 85),
        ("Ishaan", 27, 90),
        ("Zahir", 22, 88),
        ("Sonali", 30, 95),
        ("Aarav", 28, 89),
    ],
    columns=["Name", "Age", "Score"],
)
print("Original DataFrame:\n", df, "\n")

# sort_values returns a sorted copy (ascending by default); df keeps its order.
_by_score = df.sort_values("Score")
print("Sorted by Score:\n", _by_score, "\n")

# Every Age in this sample is distinct, so each group contains a single row
# and the per-group mean is simply that row's Score.
grouped = df["Score"].groupby(df["Age"]).mean()
print("Average score grouped by Age:\n", grouped, "\n")


Original DataFrame:
      Name  Age  Score
0    Riya   24     85
1  Ishaan   27     90
2   Zahir   22     88
3  Sonali   30     95
4   Aarav   28     89 

Sorted by Score:
      Name  Age  Score
0    Riya   24     85
2   Zahir   22     88
4   Aarav   28     89
1  Ishaan   27     90
3  Sonali   30     95 

Average score grouped by Age:
 Age
22    88.0
24    85.0
27    90.0
28    89.0
30    95.0
Name: Score, dtype: float64 



In [4]:
##Grouping a Series

# Salaries and department labels are keyed by the same employee names, so
# the department Series can serve directly as the grouping key.
salary = pd.Series({"Riya": 50000, "Ishaan": 55000, "Zahir": 60000, "Sonali": 62000})
department = pd.Series({"Riya": "HR", "Ishaan": "HR", "Zahir": "IT", "Sonali": "IT"})

grouped_series = salary.groupby(by=department)
# Printing the GroupBy object itself shows only its type and a memory
# address (which changes from run to run), not the grouped values.
print("Grouped Series:\n", grouped_series, "\n")

# get_group pulls out the members of a single group by its key.
_hr_salaries = grouped_series.get_group("HR")
print("HR group:\n", _hr_salaries, "\n")


Grouped Series:
 <pandas.core.groupby.generic.SeriesGroupBy object at 0x7f9bc8cb40d0> 

HR group:
 Riya      50000
Ishaan    55000
dtype: int64 



In [5]:
##Aggregating a Grouped Series

# Relies on `grouped_series` created by the "Grouping a Series" cell.
# Each entry pairs a heading with the bound aggregation method to call.
_department_reports = (
    ("Mean per department:\n", grouped_series.mean),
    ("Sum per department:\n", grouped_series.sum),
    ("Max per department:\n", grouped_series.max),
)
for _heading, _aggregate in _department_reports:
    print(_heading, _aggregate(), "\n")


Mean per department:
 HR    52500.0
IT    61000.0
dtype: float64 

Sum per department:
 HR    105000
IT    122000
dtype: int64 

Max per department:
 HR    55000
IT    62000
dtype: int64 



In [6]:
##MultiIndex Series Grouping

# Level 0 of the index is the department, level 1 the employee name.
_dept_labels = ["HR", "HR", "IT", "IT"]
_employee_labels = ["Riya", "Ishaan", "Zahir", "Sonali"]
arrays = [_dept_labels, _employee_labels]
index = pd.MultiIndex.from_arrays(arrays, names=("Dept", "Employee"))
multi_s = pd.Series(data=[50000, 55000, 60000, 62000], index=index)

# Group on the outer index level (named "Dept", i.e. level 0) and average
# the salaries within each department.
grouped_multi = multi_s.groupby(level="Dept").mean()
print("Mean salary by department (MultiLevel Series):\n", grouped_multi, "\n")


Mean salary by department (MultiLevel Series):
 Dept
HR    52500.0
IT    61000.0
dtype: float64 



In [8]:
##Grouping a DataFrame

# One row per employee: (Department, Employee, Salary).
df = pd.DataFrame(
    [
        ("HR", "Riya", 50000),
        ("HR", "Ishaan", 55000),
        ("IT", "Zahir", 60000),
        ("IT", "Sonali", 62000),
    ],
    columns=["Department", "Employee", "Salary"],
)

grouped_df = df.groupby(by="Department")

# get_group returns the original rows (all columns) for one department.
_hr_rows = grouped_df.get_group("HR")
print("HR group in DataFrame:\n", _hr_rows, "\n")

# Selecting a column from the GroupBy restricts the aggregation to it.
_mean_salary = grouped_df["Salary"].mean()
print("Mean salary per department:\n", _mean_salary, "\n")


HR group in DataFrame:
   Department Employee  Salary
0         HR     Riya   50000
1         HR   Ishaan   55000 

Mean salary per department:
 Department
HR    52500.0
IT    61000.0
Name: Salary, dtype: float64 



In [10]:
## Aggregated DataFrame
# Relies on `grouped_df` created by the "Grouping a DataFrame" cell.
# Selecting the single "Salary" column before aggregating yields a Series
# (one mean per department), despite the variable's name.
_salary_groups = grouped_df["Salary"]
aggregated_df = _salary_groups.mean()
print("Aggregated DataFrame (mean salary per department):\n", aggregated_df, "\n")

# A dict passed to agg maps a column to the list of statistics to compute;
# the result has two-level column labels (column name, statistic).
_salary_stats = {"Salary": ["mean", "sum", "max"]}
multi_agg_df = grouped_df.agg(_salary_stats)
print("Aggregated DataFrame (multiple stats):\n", multi_agg_df, "\n")


Aggregated DataFrame (mean salary per department):
 Department
HR    52500.0
IT    61000.0
Name: Salary, dtype: float64 

Aggregated DataFrame (multiple stats):
              Salary               
               mean     sum    max
Department                        
HR          52500.0  105000  55000
IT          61000.0  122000  62000 



In [11]:
##Grouping, aggregation, filtering
# Six employees across two departments: (Department, Employee, Salary, Bonus).
df = pd.DataFrame(
    [
        ("HR", "Riya", 50000, 5000),
        ("HR", "Ishaan", 55000, 6000),
        ("IT", "Zahir", 60000, 7000),
        ("IT", "Sonali", 62000, 8000),
        ("HR", "Eva", 58000, 5500),
        ("IT", "Aarav", 61000, 7500),
    ],
    columns=["Department", "Employee", "Salary", "Bonus"],
)
grouped = df.groupby("Department")

# Aggregation: one statistic on one column.
mean_salary = grouped["Salary"].mean()

# Aggregation: a different set of statistics for each column.
_stats_by_column = {
    "Salary": ["mean", "max", "min"],
    "Bonus": ["sum", "mean"],
}
agg_stats = grouped.agg(_stats_by_column)


def _mean_salary_above_55000(members):
    """Return True when the group's average Salary is greater than 55000."""
    return members["Salary"].mean() > 55000


# Filtering: keep every original row of the departments that pass the
# predicate and drop all rows of the departments that do not.
high_salary_dept = grouped.filter(_mean_salary_above_55000)

print("Mean Salary per Department:\n", mean_salary, "\n")
print("Aggregated stats per Department:\n", agg_stats, "\n")
print("Departments with mean salary > 55000:\n", high_salary_dept, "\n")


Mean Salary per Department:
 Department
HR    54333.333333
IT    61000.000000
Name: Salary, dtype: float64 

Aggregated stats per Department:
                   Salary                Bonus        
                    mean    max    min    sum    mean
Department                                           
HR          54333.333333  58000  50000  16500  5500.0
IT          61000.000000  62000  60000  22500  7500.0 

Departments with mean salary > 55000:
   Department Employee  Salary  Bonus
2         IT    Zahir   60000   7000
3         IT   Sonali   62000   8000
5         IT    Aarav   61000   7500 



In [12]:
## Grouping without aggregation
# Same employee names index both Series, so `department` can act as the
# grouping key for `salary`.
_employee_names = ["Riya", "Ishaan", "Zahir", "Sonali"]
salary = pd.Series(data=[50000, 55000, 60000, 62000], index=_employee_names)
department = pd.Series(data=["HR", "HR", "IT", "IT"], index=_employee_names)

grouped_series = salary.groupby(by=department)

# Iterating a GroupBy yields (group key, sub-Series) pairs, so the members
# of each group can be inspected before any aggregation is applied.
for dept, values in grouped_series:
    print(f"{dept}:\n{values}\n")

_department_means = grouped_series.mean()
print("Mean salary per department:\n", _department_means)


HR:
Riya      50000
Ishaan    55000
dtype: int64

IT:
Zahir     60000
Sonali    62000
dtype: int64

Mean salary per department:
 HR    52500.0
IT    61000.0
dtype: float64


In [9]:
##Pivot Table 

# One row per employee: (Department, Team, Salary).
df = pd.DataFrame(
    [
        ("HR", "A", 50000),
        ("HR", "B", 55000),
        ("IT", "A", 60000),
        ("IT", "B", 62000),
        ("HR", "A", 58000),
        ("IT", "B", 61000),
    ],
    columns=["Department", "Team", "Salary"],
)

# Rows are departments, columns are teams, and each cell is the mean Salary
# of the rows that share that (Department, Team) combination.
pivot_table = pd.pivot_table(
    df,
    values="Salary",
    index="Department",
    columns="Team",
    aggfunc="mean",
)
print("Pivot Table (Average Salary):\n", pivot_table)


Pivot Table (Average Salary):
 Team              A        B
Department                  
HR          54000.0  55000.0
IT          60000.0  61500.0


In [13]:
## Group DataFrame and iterate
# One row per employee: (Department, Employee, Salary).
_employee_rows = [
    ("HR", "Riya", 50000),
    ("HR", "Ishaan", 55000),
    ("IT", "Zahir", 60000),
    ("IT", "Sonali", 62000),
]
df = pd.DataFrame(_employee_rows, columns=["Department", "Employee", "Salary"])
grouped_df = df.groupby(by="Department")

# Iterating a DataFrame GroupBy yields (group key, sub-DataFrame) pairs; the
# sub-frames keep the original row labels.
for dept, group in grouped_df:
    print(f"{dept}:\n{group}\n")

_salary_means = grouped_df["Salary"].mean()
print("Mean salary per department:\n", _salary_means)


HR:
  Department Employee  Salary
0         HR     Riya   50000
1         HR   Ishaan   55000

IT:
  Department Employee  Salary
2         IT    Zahir   60000
3         IT   Sonali   62000

Mean salary per department:
 Department
HR    52500.0
IT    61000.0
Name: Salary, dtype: float64


In [14]:
##Grouping Series and DataFrame
# --- Part 1: grouping a Series by a second, identically indexed Series ---
_employee_names = ["Riya", "Ishaan", "Zahir", "Sonali"]
salary_series = pd.Series(data=[50000, 55000, 60000, 62000], index=_employee_names)
department_series = pd.Series(data=["HR", "HR", "IT", "IT"], index=_employee_names)

grouped_series = salary_series.groupby(by=department_series)
for dept, group in grouped_series:
    # Only the raw salary values are shown here, without the name index.
    print(f"{dept}: {group.to_numpy()}")
aggregated_series = grouped_series.mean()
print("Aggregated Series (mean salary per department):\n", aggregated_series)

# --- Part 2: the same grouping idea applied to a DataFrame ---
# One row per employee: (Department, Employee, Salary, Bonus).
df = pd.DataFrame(
    [
        ("HR", "Riya", 50000, 5000),
        ("HR", "Ishaan", 55000, 6000),
        ("IT", "Zahir", 60000, 7000),
        ("IT", "Sonali", 62000, 8000),
    ],
    columns=["Department", "Employee", "Salary", "Bonus"],
)
grouped_df = df.groupby(by="Department")
for dept, group in grouped_df:
    print(f"{dept} group:\n{group}\n")

# Mean and sum for both numeric columns; the result carries two-level column
# labels of the form (column name, statistic).
_stats_by_column = {"Salary": ["mean", "sum"], "Bonus": ["mean", "sum"]}
aggregated_df = grouped_df.agg(_stats_by_column)
print("Aggregated DataFrame (Salary and Bonus stats per Department):\n", aggregated_df)


HR: [50000 55000]
IT: [60000 62000]
Aggregated Series (mean salary per department):
 HR    52500.0
IT    61000.0
dtype: float64
HR group:
  Department Employee  Salary  Bonus
0         HR     Riya   50000   5000
1         HR   Ishaan   55000   6000

IT group:
  Department Employee  Salary  Bonus
2         IT    Zahir   60000   7000
3         IT   Sonali   62000   8000

Aggregated DataFrame (Salary and Bonus stats per Department):
              Salary           Bonus       
               mean     sum    mean    sum
Department                                
HR          52500.0  105000  5500.0  11000
IT          61000.0  122000  7500.0  15000


In [15]:
## Merge two DataFrames
# Left table: department per employee. "Sonali" has no salary record.
df1 = pd.DataFrame(
    [("Riya", "HR"), ("Ishaan", "IT"), ("Zahir", "HR"), ("Sonali", "IT")],
    columns=["Employee", "Department"],
)
# Right table: salary per employee. "Eva" has no department record.
df2 = pd.DataFrame(
    [("Riya", 50000), ("Ishaan", 60000), ("Zahir", 55000), ("Eva", 58000)],
    columns=["Employee", "Salary"],
)

# inner keeps only the employees found in both tables; outer keeps every
# employee from either table and leaves NaN where a side has no match.
inner_merge = df1.merge(df2, how="inner", on="Employee")
outer_merge = df1.merge(df2, how="outer", on="Employee")
print("Inner Merge:\n", inner_merge)
print("Outer Merge:\n", outer_merge)


Inner Merge:
   Employee Department  Salary
0     Riya         HR   50000
1   Ishaan         IT   60000
2    Zahir         HR   55000
Outer Merge:
   Employee Department   Salary
0     Riya         HR  50000.0
1   Ishaan         IT  60000.0
2    Zahir         HR  55000.0
3   Sonali         IT      NaN
4      Eva        NaN  58000.0


In [16]:
## Merge on different key columns
# The join key is called "EmpID" on the left and "EmployeeID" on the right.
df1 = pd.DataFrame([(1, "Riya"), (2, "Ishaan"), (3, "Zahir")], columns=["EmpID", "Name"])
df2 = pd.DataFrame([(2, 60000), (3, 55000), (4, 58000)], columns=["EmployeeID", "Salary"])

# left_on / right_on name the key column on each side; both key columns are
# kept in the merged result.
merged_df = df1.merge(df2, how="inner", left_on="EmpID", right_on="EmployeeID")
print("Merge on different keys:\n", merged_df)


Merge on different keys:
    EmpID    Name  EmployeeID  Salary
0      2  Ishaan           2   60000
1      3   Zahir           3   55000


In [17]:
## Merge using indexes
# Both frames are labelled by employee name, so the index is the join key.
_employee_names = ["Riya", "Ishaan"]
df1 = pd.DataFrame([50000, 60000], index=_employee_names, columns=["Salary"])
df2 = pd.DataFrame(["HR", "IT"], index=_employee_names, columns=["Department"])

# left_index / right_index tell merge to match on row labels, not a column.
merged_index_df = df1.merge(df2, left_index=True, right_index=True)
print("Merge using indexes:\n", merged_index_df)


Merge using indexes:
         Salary Department
Riya     50000         HR
Ishaan   60000         IT


In [18]:
# Pivot table showing means
# One row per (Department, Team) combination, each with a single Salary.
df = pd.DataFrame(
    [
        ("HR", "A", 50000),
        ("HR", "B", 55000),
        ("IT", "A", 60000),
        ("IT", "B", 62000),
    ],
    columns=["Department", "Team", "Salary"],
)

# Every (Department, Team) cell holds exactly one row here, so the mean in
# each cell equals that row's Salary.
summary = pd.pivot_table(
    df,
    values="Salary",
    index="Department",
    columns="Team",
    aggfunc="mean",
)
print(summary)


Team              A        B
Department                  
HR          50000.0  55000.0
IT          60000.0  62000.0


In [19]:
# Additional demonstration: inspect, fill, group, and parse dates on a small DataFrame with missing values
# Small frame with exactly one missing entry in each column.
df = pd.DataFrame(
    {
        "Category": ["A", "B", "A", "C", None],
        "Value": [10, 20, None, 40, 50],
        "Date": ["2020-01-01", "2020-01-02", "2020-02-01", None, "2020-02-03"],
    }
)

# Column-presence flags used by the guarded steps below. The steps only add
# columns, never remove them, so checking once up front gives the same
# answers as checking at each step.
_has_category = "Category" in df.columns
_has_value = "Value" in df.columns
_has_date = "Date" in df.columns

print("\n--- df.head() ---")
display(df.head())

print("\ndf.info()")
df.info()  # writes its report directly to stdout

print("\ndf.describe() ")
display(df.describe(include="all"))  # include="all" also summarizes the non-numeric columns

print("\nMissing values per column ")
print(df.isna().sum())

# Fill gaps in Value with the column median, keeping the original column
# untouched and storing the result alongside it.
if _has_value and df["Value"].isna().any():
    _median_value = df["Value"].median()
    df["Value_filled"] = df["Value"].fillna(_median_value)
    display(df[["Value", "Value_filled"]].head())

print("\n Group by Category and compute mean of Value ")
if _has_category and _has_value:
    # dropna=False keeps the rows whose Category is missing as their own group.
    display(df.groupby("Category", dropna=False).agg({"Value": "mean"}))

print("\nConvert Date column to datetime and show min/max ")
if _has_date:
    # errors="coerce" turns unparseable entries into NaT rather than raising.
    df["Date_parsed"] = pd.to_datetime(df["Date"], errors="coerce")
    display(df[["Date", "Date_parsed"]].head())
    _earliest = df["Date_parsed"].min()
    _latest = df["Date_parsed"].max()
    print("Date range:", _earliest, "to", _latest)

print("\nExample value_counts for Category")
if _has_category:
    # dropna=False counts the missing Category as well.
    print(df["Category"].value_counts(dropna=False))



--- df.head() ---


Unnamed: 0,Category,Value,Date
0,A,10.0,2020-01-01
1,B,20.0,2020-01-02
2,A,,2020-02-01
3,C,40.0,
4,,50.0,2020-02-03



df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  4 non-null      object 
 1   Value     4 non-null      float64
 2   Date      4 non-null      object 
dtypes: float64(1), object(2)
memory usage: 248.0+ bytes

df.describe() 


Unnamed: 0,Category,Value,Date
count,4,4.0,4
unique,3,,4
top,A,,2020-02-01
freq,2,,1
mean,,30.0,
std,,18.257419,
min,,10.0,
25%,,17.5,
50%,,30.0,
75%,,42.5,
max,,50.0,



Missing values per column 
Category    1
Value       1
Date        1
dtype: int64


Unnamed: 0,Value,Value_filled
0,10.0,10.0
1,20.0,20.0
2,,30.0
3,40.0,40.0
4,50.0,50.0



 Group by Category and compute mean of Value 


Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
A,10.0
B,20.0
C,40.0
,50.0



Convert Date column to datetime and show min/max 


Unnamed: 0,Date,Date_parsed
0,2020-01-01,2020-01-01
1,2020-01-02,2020-01-02
2,2020-02-01,2020-02-01
3,,NaT
4,2020-02-03,2020-02-03


Date range: 2020-01-01 00:00:00 to 2020-02-03 00:00:00

Example value_counts for Category
A      2
NaN    1
C      1
B      1
Name: Category, dtype: int64
