#Pandas - CRUD
##(C)reate - add, extend
##(R)ead - Query, View, Get, Display, Print
##(U)pdate  - modify, edit, change, replace
##(D)elete  - dropping


#Basic Opertions

In [None]:
#@title imports
import pandas as pd

In [None]:
#@title Create a dataframe
data = {'ID': [1, 2, 3, 4],
        'Name': ['Anna', 'Bindu', 'Charlie', 'David'],
        'Score': [85, 90, 78, 92]}

df = pd.DataFrame(data)

display("Initial DataFrame:\n", df)

'Initial DataFrame:\n'

Unnamed: 0,ID,Name,Score
0,1,Anna,85
1,2,Bindu,90
2,3,Charlie,78
3,4,David,92


#Adding columns

In [None]:
#@title Adding a new column, all rows having same value
display("Initial DataFrame:\n", df)

df['course'] = "python 101"

display("Changed DataFrame:\n", df)

'Initial DataFrame:\n'

Unnamed: 0,ID,Name,Score
0,1,Anna,85
1,2,Bindu,90
2,3,Charlie,78
3,4,David,92


'Changed DataFrame:\n'

Unnamed: 0,ID,Name,Score,course
0,1,Anna,85,python 101
1,2,Bindu,90,python 101
2,3,Charlie,78,python 101
3,4,David,92,python 101


In [None]:
#@title Adding a new column, rows having different values
df['Age'] = [23, 25, 22, 14]
display("\nDataFrame after adding Age column:\n", df)


'\nDataFrame after adding Age column:\n'

Unnamed: 0,ID,Name,Score,course,Age
0,1,Anna,85,python 101,23
1,2,Bindu,90,python 101,25
2,3,Charlie,78,python 101,22
3,4,David,92,python 101,14


In [None]:
#@title Recoding a column to create a new column (mapping function)
# mapping applies only to Series
def map_grade(score):
    if score >= 90:
      return 'A'
    elif score >= 80:
      return 'B'
    else:
      return 'C'

#pass the Score to the map_gradefunction
df['Grade'] = df['Score'].map(map_grade)

display("\nDataFrame after mapping Scores to Grades:\n", df)

'\nDataFrame after mapping Scores to Grades:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade
0,1,Anna,85,python 101,23,B
1,2,Bindu,90,python 101,25,A
2,3,Charlie,78,python 101,22,C
3,4,David,92,python 101,14,A


In [None]:
#@title Recoding a column to create a new column (lambda)
df['Exam_Result'] = df['Score'].apply(lambda x: 'Pass' if x >= 80 else 'Fail')
display("\nDataFrame after adding Pass/Fail column:\n", df)


'\nDataFrame after adding Pass/Fail column:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass


In [None]:
#@title Recoding a column using a custom function through apply()
# Very useful if you want to take decision based on multiple values
# can access any column value and take a decision
# flexible
def apply_grade(row):
    score = row['Score']
    age = row['Age']
    if age > 60:
      score = score + 10

    if score >= 90:
      return 'A'
    elif score >= 80:
      return 'B'
    else:
      return 'C'

df['Grade'] = df.apply(apply_grade, axis=1)
display("DataFrame after applying Scores to Grades using apply() with a custom function:", df)

'DataFrame after applying Scores to Grades using apply() with a custom function:'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass


In [None]:
#@title applymap (for every cell in the dataframe)

# Sample DataFrame
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
display(df1)

# Apply a function element-wise
df_transformed = df1.applymap(lambda x: x * 10 + 2)

display(df_transformed)


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


  df_transformed = df1.applymap(lambda x: x * 10 + 2)


Unnamed: 0,A,B
0,12,42
1,22,52
2,32,62


In [None]:
#@title adding multiple columns through concat
# Creating the original DataFrame with 5 rows
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Score': [90, 85, 88, 92, 88],
    'course': ['Python 101', 'Data Science', 'AI Basics', 'ML Advanced', 'Python 101'],
    'Age': [25, 24, 27, 26, 26],
    'Grade': ['A', 'B', 'A', 'A', 'A'],
    'Exam_Result': ['Pass', 'Pass', 'Fail', 'Pass', 'Pass']
})

print("Original DataFrame:")
print(df1)

# Creating new rows with additional columns (Email and City)
new_rows_df = pd.DataFrame({
    'ID': [6, 7],
    'Name': ['Frank', 'Grace'],
    'Score': [90, 87],
    'course': ['AI Basics', 'Data Science'],
    'Age': [28, 23],
    'Grade': ['B', 'A'],
    'Exam_Result': ['Pass', 'Pass'],
    'Email': ['frank@example.com', 'grace@example.com'],  # New Column
    'City': ['New York', 'Los Angeles']  # New Column
})

# Adding new rows to the original DataFrame using concat()
df2 = pd.concat([df1, new_rows_df], ignore_index=True)

# Filling NaN values for missing columns in old data
df2.fillna({'Email': 'N/A', 'City': 'Unknown'}, inplace=True)

print("\nDataFrame after adding new rows and columns:")
display(df2)


Original DataFrame:
   ID     Name  Score        course  Age Grade Exam_Result
0   1    Alice     90    Python 101   25     A        Pass
1   2      Bob     85  Data Science   24     B        Pass
2   3  Charlie     88     AI Basics   27     A        Fail
3   4    David     92   ML Advanced   26     A        Pass
4   5      Eve     88    Python 101   26     A        Pass

DataFrame after adding new rows and columns:


Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result,Email,City
0,1,Alice,90,Python 101,25,A,Pass,,Unknown
1,2,Bob,85,Data Science,24,B,Pass,,Unknown
2,3,Charlie,88,AI Basics,27,A,Fail,,Unknown
3,4,David,92,ML Advanced,26,A,Pass,,Unknown
4,5,Eve,88,Python 101,26,A,Pass,,Unknown
5,6,Frank,90,AI Basics,28,B,Pass,frank@example.com,New York
6,7,Grace,87,Data Science,23,A,Pass,grace@example.com,Los Angeles


#Adding rows

In [None]:
#@title Adding a single row at the end
display(df.columns)
display("Current dataframe: ", df)
df.loc[len(df)] = [8, 'Henry', 88, 'Python 101', 26, 'A', 'Pass']
df.loc[len(df)] = [8, 'Hobbit', 88, 'Python 101', 26, 'A', 'Pass']
display("\nDataFrame after adding a new row:\n", df)


Index(['ID', 'Name', 'Score', 'course', 'Age', 'Grade', 'Exam_Result'], dtype='object')

'Current dataframe: '

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass


'\nDataFrame after adding a new row:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass
4,8,Henry,88,Python 101,26,A,Pass
5,8,Hobbit,88,Python 101,26,A,Pass


In [None]:
#@title Adding multiple rows through concat
# Creating new rows as a separate DataFrame
new_rows_df = pd.DataFrame({'ID': [5, 6, 7],
                         'Name': ['Isabel', 'Jack', 'Larry'],
                         'Score': [88, 90, 95],
                         'course': ['Python DS', 'Python ML', 'Python AI'],
                         'Age': [12, 13, 14],
                         'Grade': ['A', 'B', 'A'],
                         'Exam_Result': ['Pass', 'Pass', 'Pass']})


display("Original DataFrame:")
display(df)

display("New Rows To be added: ")
display(new_rows_df)

# Adding multiple rows using `concat()`
df = pd.concat([df, new_rows_df], ignore_index=True)

print("\nDataFrame after adding new rows:")
display(df)

'Original DataFrame:'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass
4,8,Henry,88,Python 101,26,A,Pass
5,8,Hobbit,88,Python 101,26,A,Pass


'New Rows To be added: '

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,5,Isabel,88,Python DS,12,A,Pass
1,6,Jack,90,Python ML,13,B,Pass
2,7,Larry,95,Python AI,14,A,Pass



DataFrame after adding new rows:


Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,85,python 101,23,B,Pass
1,2,Bindu,90,python 101,25,A,Pass
2,3,Charlie,78,python 101,22,C,Fail
3,4,David,92,python 101,14,A,Pass
4,8,Henry,88,Python 101,26,A,Pass
5,8,Hobbit,88,Python 101,26,A,Pass
6,5,Isabel,88,Python DS,12,A,Pass
7,6,Jack,90,Python ML,13,B,Pass
8,7,Larry,95,Python AI,14,A,Pass


#Updating the rows, columns and cells

In [None]:
#@title Updating an existing column (value update)
display("Current DataFrame:")
display(df)

df['Score'] = df['Score'] + 5

display("\nDataFrame after updating Score column:\n")
display(df)


'Current DataFrame:'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,90,python 101,23,B,Pass
1,2,Bindu,95,python 101,25,A,Pass
2,3,Charlie,83,python 101,22,C,Fail
3,4,David,97,python 101,14,A,Pass
4,8,Henry,93,Python 101,26,A,Pass
5,8,Hobbit,93,Python 101,26,A,Pass
6,5,Isabel,93,Python DS,12,A,Pass
7,6,Jack,95,Python ML,13,B,Pass
8,7,Larry,100,Python AI,14,A,Pass


'\nDataFrame after updating Score column:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,23,B,Pass
1,2,Bindu,100,python 101,25,A,Pass
2,3,Charlie,88,python 101,22,C,Fail
3,4,David,102,python 101,14,A,Pass
4,8,Henry,98,Python 101,26,A,Pass
5,8,Hobbit,98,Python 101,26,A,Pass
6,5,Isabel,98,Python DS,12,A,Pass
7,6,Jack,100,Python ML,13,B,Pass
8,7,Larry,105,Python AI,14,A,Pass


In [None]:
#@title Updating an existing column values using a custom function
def update_score(x):
    return x + 5 if x < 95 else x

df['Score'] = df['Score'].apply(update_score)
display("DataFrame after updating Score column with a custom function:", df)
df['Score'] = df['Score'] + 5
display("\nDataFrame after updating Score column:\n", df)


'DataFrame after updating Score column with a custom function:'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,23,B,Pass
1,2,Bindu,100,python 101,25,A,Pass
2,3,Charlie,93,python 101,22,C,Fail
3,4,David,102,python 101,14,A,Pass
4,8,Henry,98,Python 101,26,A,Pass
5,8,Hobbit,98,Python 101,26,A,Pass
6,5,Isabel,98,Python DS,12,A,Pass
7,6,Jack,100,Python ML,13,B,Pass
8,7,Larry,105,Python AI,14,A,Pass


'\nDataFrame after updating Score column:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,100,python 101,23,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


In [None]:
#@title Updating the row values
display("\Current DataFrame:\n", df)
df.loc[df['Name'] == 'Anna', ['Score', 'Age']] = [95, 26]
display("\nDataFrame after updating Bob's row:\n", df)

'\\Current DataFrame:\n'

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,100,python 101,23,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


"\nDataFrame after updating Bob's row:\n"

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


In [None]:
#@title Updating a single cell
df.at[2, 'Name'] = 'Charlie Brown'
print("\nDataFrame after updating a cell:")
display(df)

#Hang on to this dataframe for later use
df_master = df


DataFrame after updating a cell:


Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


# Changing the datatype of a column(s)

In [None]:
#@title demo df for changing the data types
import pandas as pd

# Sample DataFrame with mixed datatypes
data = {
    'ID': ['1', '2', '3', '4'],         # Strings (should be integers)
    'Score': ['85.5', '90.3', '78', '92'], # Strings (should be floats)
    'Age': [25, 30, '27', '29'],        # Mixed integers and strings
    'Grade': [90, 85, 88, 'A'],         # Numeric & Non-Numeric values
}

df = pd.DataFrame(data)
print("Data Types of df")
display(df.dtypes)  # Check column types

print("Original DataFrame:")
display(df)



Data Types of df


Unnamed: 0,0
ID,object
Score,object
Age,object
Grade,object


Original DataFrame:


Unnamed: 0,ID,Score,Age,Grade
0,1,85.5,25,90
1,2,90.3,30,85
2,3,78.0,27,88
3,4,92.0,29,A


In [None]:
#@title Convert a column to int
print("\nDataFrame Before Converting 'ID' & 'Age' to int:")
display(df.dtypes)
df['ID'] = df['ID'].astype(int)
df['Age'] = df['Age'].astype(int)
print("\nDataFrame After Converting 'ID' & 'Age' to int:")
display(df.dtypes)



DataFrame Before Converting 'ID' & 'Age' to int:


Unnamed: 0,0
ID,object
Score,object
Age,object
Grade,object



DataFrame After Converting 'ID' & 'Age' to int:


Unnamed: 0,0
ID,int64
Score,object
Age,int64
Grade,object


In [None]:
#@title Changing a column to float
print("\nDataFrame Before Converting 'Score' to float:")
print(df.dtypes)
df['Score'] = df['Score'].astype(float)
print("\nDataFrame After Converting 'Score' to float:")
print(df.dtypes)



DataFrame Before Converting 'Score' to float:
ID        int64
Score    object
Age       int64
Grade    object
dtype: object

DataFrame After Converting 'Score' to float:
ID         int64
Score    float64
Age        int64
Grade     object
dtype: object


In [None]:
#@title Changing a column to a String
print("\nDataFrame Before Converting 'ID' to string:")
print(df.dtypes)
df['ID'] = df['ID'].astype(str)
print("\nDataFrame After Converting 'ID' to string:")
print(df.dtypes)



DataFrame Before Converting 'ID' to string:
ID         int64
Score    float64
Age        int64
Grade     object
dtype: object

DataFrame After Converting 'ID' to string:
ID        object
Score    float64
Age        int64
Grade     object
dtype: object


In [None]:
#@title Changing multiple columns to Numeric

#Before
print("\nDataFrame Before Converting All Numeric Columns:")
display(df.dtypes)
display(df)

# Converts invalid values to NaN
df['Score'] = pd.to_numeric(df['Score'], errors='coerce')
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
# Converts 'A' to NaN
df['Grade'] = pd.to_numeric(df['Grade'], errors='coerce')

print("\nDataFrame After Converting All Numeric Columns:")
display(df.dtypes)
display("Modified DataFrame:")
display(df)


'Current DataFrame:'

Unnamed: 0,ID,Score,Age,Grade
0,1,85.5,25,90
1,2,90.3,30,85
2,3,78.0,27,88
3,4,92.0,29,A



DataFrame Before Converting All Numeric Columns:


Unnamed: 0,0
ID,object
Score,float64
Age,int64
Grade,object


Unnamed: 0,ID,Score,Age,Grade
0,1,85.5,25,90
1,2,90.3,30,85
2,3,78.0,27,88
3,4,92.0,29,A



DataFrame After Converting All Numeric Columns:


Unnamed: 0,0
ID,object
Score,float64
Age,int64
Grade,float64


'Modified DataFrame:'

Unnamed: 0,ID,Score,Age,Grade
0,1,85.5,25,90.0
1,2,90.3,30,85.0
2,3,78.0,27,88.0
3,4,92.0,29,


In [None]:
#@title Convert entire dataframe to numeric
df_numeric = df.apply(pd.to_numeric, errors='coerce')
print("\nDataFrame After Converting All Columns to Numeric:")
print(df_numeric.dtypes)
print(df_numeric)



DataFrame After Converting All Columns to Numeric:
ID         int64
Score    float64
Age        int64
Grade    float64
dtype: object
   ID  Score  Age  Grade
0   1   85.5   25   90.0
1   2   90.3   30   85.0
2   3   78.0   27   88.0
3   4   92.0   29    NaN


#map, apply, applymap

In [None]:
#@title Creating a test DataFrame
data = {
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8]
}
df = pd.DataFrame(data)
print("Initial DataFrame:\n", df)


In [None]:
#@title 1.Using map() with a lambda function on a Series
df['A_mapped'] = df['A'].map(lambda x: x * 10)
print("\nDataFrame after map() with lambda:\n", df)

In [None]:
#@title 2.Using map() with a custom function
def custom_map_function(x):
    return x * 2
df['A_mapped_custom'] = df['A'].map(custom_map_function)
print("\nDataFrame after map() with custom function:\n", df)


In [None]:
#@title 3.Using apply() with a lambda function on a Series
df['B_applied'] = df['B'].apply(lambda x: x + 3)
print("\nDataFrame after apply() with lambda:\n", df)


In [None]:
#@title 4.Using apply() with a custom function on a Series
def custom_apply_function(x):
    return x - 1
df['B_applied_custom'] = df['B'].apply(custom_apply_function)
print("\nDataFrame after apply() with custom function:\n", df)


In [None]:
#@title 5.Using applymap() with a lambda function on entire DataFrame
df_applymap_lambda = df[['A', 'B']].applymap(lambda x: x * 100)
print("\nDataFrame after applymap() with lambda:\n", df_applymap_lambda)


In [None]:
#@title 6.Using applymap() with a custom function
def custom_applymap_function(x):
    return x ** 2

df_applymap_custom = df[['A', 'B']].applymap(custom_applymap_function)

print("\nDataFrame after applymap() with custom function:\n")
display(df_applymap_custom)

#Renaming columns and rows

In [None]:
#@title Renaming just one column
df = df_master
display("Current DataFrame: ", df)
df.rename(columns={'Name': 'Student Name'}, inplace=True)
display("MOdifed DataFrame: ", df)

'Current DataFrame: '

Unnamed: 0,ID,Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


'MOdifed DataFrame: '

Unnamed: 0,ID,Student Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


In [None]:
#@title Renaming multiple columns
df_renamed = df.rename(columns={'Name': 'Student Name', 'Score': 'Final Score'})
print("\nDataFrame after renaming columns:")
display(df_renamed)



DataFrame after renaming columns:


Unnamed: 0,ID,Student Name,Final Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


In [None]:
#@title use of "inplace"
#this doesn't change the original dataframe
display(df)
df.rename(columns={'course':'Course'})
display(df)

df.rename(columns={'course':'Course'}, inplace = True)
display(df)

Unnamed: 0,ID,Student Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


Unnamed: 0,ID,Student Name,Score,course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


#Resetting the index

In [None]:
#@title Resetting the index (new index from 0 to N-1)
display("Current DataFrame: ", df)
df_reset = df.reset_index(drop=True)
print("\nDataFrame after resetting the index:")
display(df_reset)


'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass



DataFrame after resetting the index:


Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


In [None]:
#@title resetting index to a specific column
df_indexed = df.set_index('Student Name')
display("DataFrame after setting 'Student Name' as index:", df_indexed)


df_reset_1 = df.reset_index(drop=True)
display("\nDataFrame after resetting the index:\n", df_reset_1)

# reset the index and do not add it back as a new column
df_reset_2 = df_indexed.reset_index(drop=True)
display("\nDataFrame after resetting the index:\n", df_reset_2)

# reset the index and do not add it back as a new column
df_reset_3 = df_indexed.reset_index(drop=False)
display("\nDataFrame after resetting the index:\n", df_reset_3)

"DataFrame after setting 'Student Name' as index:"

Unnamed: 0_level_0,ID,Score,Course,Age,Grade,Exam_Result
Student Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anna,1,95,python 101,26,B,Pass
Bindu,2,105,python 101,25,A,Pass
Charlie Brown,3,98,python 101,22,C,Fail
David,4,107,python 101,14,A,Pass
Henry,8,103,Python 101,26,A,Pass
Hobbit,8,103,Python 101,26,A,Pass
Isabel,5,103,Python DS,12,A,Pass
Jack,6,105,Python ML,13,B,Pass
Larry,7,110,Python AI,14,A,Pass


'\nDataFrame after resetting the index:\n'

Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass


'\nDataFrame after resetting the index:\n'

Unnamed: 0,ID,Score,Course,Age,Grade,Exam_Result
0,1,95,python 101,26,B,Pass
1,2,105,python 101,25,A,Pass
2,3,98,python 101,22,C,Fail
3,4,107,python 101,14,A,Pass
4,8,103,Python 101,26,A,Pass
5,8,103,Python 101,26,A,Pass
6,5,103,Python DS,12,A,Pass
7,6,105,Python ML,13,B,Pass
8,7,110,Python AI,14,A,Pass


'\nDataFrame after resetting the index:\n'

Unnamed: 0,Student Name,ID,Score,Course,Age,Grade,Exam_Result
0,Anna,1,95,python 101,26,B,Pass
1,Bindu,2,105,python 101,25,A,Pass
2,Charlie Brown,3,98,python 101,22,C,Fail
3,David,4,107,python 101,14,A,Pass
4,Henry,8,103,Python 101,26,A,Pass
5,Hobbit,8,103,Python 101,26,A,Pass
6,Isabel,5,103,Python DS,12,A,Pass
7,Jack,6,105,Python ML,13,B,Pass
8,Larry,7,110,Python AI,14,A,Pass


#Deleting data

In [None]:
#@title adding more data for deletingpurpose
display(df)
df.loc[len(df)] = [101, 'xavier', 78, 'python ds', 18, 'C', 'Pass']
df.loc[len(df)] = [102, 'zack', 88, 'python ds', 18, 'C', 'Pass']
df.loc[len(df)] = [103, 'peter', 98, 'python ds', 18, 'C', 'Pass']
df.loc[len(df)] = [104, 'wanda', 58, 'python ds', 18, 'C', 'Pass']
display(df)


Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass
9,101,xavier,78,python ds,18,C,Pass


Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass
9,101,xavier,78,python ds,18,C,Pass


In [None]:
#@title Deleting a column
display("Current DataFrame: ", df)
df.drop(columns=['Age'], inplace=True)
print("\nDataFrame after deleting 'Age' column:\n")
display(df)

'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
0,1,Anna,95,python 101,26,B,Pass
1,2,Bindu,105,python 101,25,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
3,4,David,107,python 101,14,A,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
8,7,Larry,110,Python AI,14,A,Pass
9,101,xavier,78,python ds,18,C,Pass



DataFrame after deleting 'Age' column:



Unnamed: 0,ID,Student Name,Score,Course,Grade,Exam_Result
0,1,Anna,95,python 101,B,Pass
1,2,Bindu,105,python 101,A,Pass
2,3,Charlie Brown,98,python 101,C,Fail
3,4,David,107,python 101,A,Pass
4,8,Henry,103,Python 101,A,Pass
5,8,Hobbit,103,Python 101,A,Pass
6,5,Isabel,103,Python DS,A,Pass
7,6,Jack,105,Python ML,B,Pass
8,7,Larry,110,Python AI,A,Pass
9,101,xavier,78,python ds,C,Pass


In [None]:
#@title Deleting multiple columns
display("Current DataFrame: ", df)
df.drop(columns=['Exam_Result', 'Grade'], inplace=True)
print("\nDataFrame after deleting multiple columns:\n")
display(df)

'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course,Grade,Exam_Result
0,1,Anna,95,python 101,B,Pass
1,2,Bindu,105,python 101,A,Pass
2,3,Charlie Brown,98,python 101,C,Fail
3,4,David,107,python 101,A,Pass
4,8,Henry,103,Python 101,A,Pass
5,8,Hobbit,103,Python 101,A,Pass
6,5,Isabel,103,Python DS,A,Pass
7,6,Jack,105,Python ML,B,Pass
8,7,Larry,110,Python AI,A,Pass
9,101,xavier,78,python ds,C,Pass



DataFrame after deleting multiple columns:



Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,2,Bindu,105,python 101
2,3,Charlie Brown,98,python 101
3,4,David,107,python 101
4,8,Henry,103,Python 101
5,8,Hobbit,103,Python 101
6,5,Isabel,103,Python DS
7,6,Jack,105,Python ML
8,7,Larry,110,Python AI
9,101,xavier,78,python ds


In [None]:
#@title Deleting a specific row by index
display("Current DataFrame: ", df)
df.drop(index=2, inplace=True)
print("\nDataFrame after deleting row with index 2:\n")
display(df)

df.reset_index(drop=True, inplace=True)
print("\nDataFrame after resetting the index:\n")
display(df)

'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,2,Bindu,105,python 101
3,4,David,107,python 101
4,8,Henry,103,Python 101
5,8,Hobbit,103,Python 101
6,5,Isabel,103,Python DS
7,6,Jack,105,Python ML
8,7,Larry,110,Python AI
9,101,xavier,78,python ds
10,102,zack,78,python ds


KeyError: '[2] not found in axis'

In [None]:
#@title Deleting multiple rows
df.drop(index=[1, 3], inplace=True)
print("\nDataFrame after deleting multiple rows:\n")
display(df)


DataFrame after deleting multiple rows:



Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
4,8,Henry,103,Python 101
5,8,Hobbit,103,Python 101
6,5,Isabel,103,Python DS
7,6,Jack,105,Python ML
8,7,Larry,110,Python AI
9,101,xavier,78,python ds
10,102,zack,78,python ds
11,103,peter,78,python ds
12,104,wanda,78,python ds


In [None]:
#@title reset_index to restore the sequence
df.reset_index(drop=True, inplace=True)
print("\nDataFrame after resetting the index:\n")
display(df)


DataFrame after resetting the index:



Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
3,5,Isabel,103,Python DS
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
6,101,xavier,78,python ds
7,102,zack,78,python ds
8,103,peter,78,python ds
9,104,wanda,78,python ds


In [None]:
#@title Deleting rows matching some criteria on a column
display("Current DataFrame: ", df)
df = df[df['Score'] >= 80]
print("\nDataFrame after deleting rows where Score < 80:\n")
display(df)

'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
3,5,Isabel,103,Python DS
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
6,101,xavier,78,python ds
7,102,zack,78,python ds
8,103,peter,78,python ds
9,104,wanda,78,python ds



DataFrame after deleting rows where Score < 80:



Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
3,5,Isabel,103,Python DS
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
15,102,zack,88,python ds
16,103,peter,98,python ds


In [None]:
#@title Deleting rows matching some criteria on multiple columns
display("Current DataFrame: ", df)
df = df[(df['Score'] >= 85) & (df['ID'] != 5)]
print("\nDataFrame after deleting rows where Score < 85 and ID is 5:\n")
display(df)

'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
3,5,Isabel,103,Python DS
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
15,102,zack,88,python ds
16,103,peter,98,python ds



DataFrame after deleting rows where Score < 85 and ID is 5:



Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
15,102,zack,88,python ds
16,103,peter,98,python ds


#Operations that change the form or structure

In [None]:
#@title Sorting the DataFrame by Score
display("Current DataFrame: ", df)
df_sorted = df.sort_values(by='Score', ascending=False)
print("\nDataFrame after sorting by Score:\n")
display(df_sorted)


'Current DataFrame: '

Unnamed: 0,ID,Student Name,Score,Course
0,1,Anna,95,python 101
1,8,Henry,103,Python 101
2,8,Hobbit,103,Python 101
4,6,Jack,105,Python ML
5,7,Larry,110,Python AI
15,102,zack,88,python ds
16,103,peter,98,python ds



DataFrame after sorting by Score:



Unnamed: 0,ID,Student Name,Score,Course,Age,Grade,Exam_Result
8,7,Larry,110,Python AI,14,A,Pass
3,4,David,107,python 101,14,A,Pass
1,2,Bindu,105,python 101,25,A,Pass
7,6,Jack,105,Python ML,13,B,Pass
4,8,Henry,103,Python 101,26,A,Pass
5,8,Hobbit,103,Python 101,26,A,Pass
6,5,Isabel,103,Python DS,12,A,Pass
2,3,Charlie Brown,98,python 101,22,C,Fail
0,1,Anna,95,python 101,26,B,Pass


In [None]:
#@title Vertical split of the DataFrame
columns_1 = ['ID', 'Student Name', 'Score']
columns_2 = ['Exam_Result', 'Age', 'Grade']
df1, df2 = df[columns_1], df[columns_2]
display("\nFirst split DataFrame:\n")
display(df1)

display("\nSecond split DataFrame:\n")
display(df2)


'\nFirst split DataFrame:\n'

Unnamed: 0,ID,Student Name,Score
0,1,Anna,100
1,2,Bindu,100
2,3,Charlie Brown,93
3,4,David,102
4,8,Henry,103
5,5,Isabel,103
6,6,Jack,100
7,7,Larry,105


'\nSecond split DataFrame:\n'

Unnamed: 0,Exam_Result,Age,Grade
0,Pass,23,B
1,Pass,25,A
2,Fail,22,C
3,Pass,24,A
4,Pass,26,A
5,Pass,12,A
6,Pass,13,B
7,Pass,14,A


In [None]:
#@title Horizontal split of the DataFrame
split_point = 2
df_left = df.iloc[:, :split_point]
df_right = df.iloc[:, split_point:]
display("\nLeft split DataFrame:\n", df_left)
display("\nRight split DataFrame:\n", df_right)


'\nLeft split DataFrame:\n'

Unnamed: 0,ID,Student Name
0,1,Anna
1,2,Bindu
2,3,Charlie Brown
3,4,David
4,8,Henry
5,5,Isabel
6,6,Jack
7,7,Larry


'\nRight split DataFrame:\n'

Unnamed: 0,Score,course,Age,Grade,Exam_Result
0,100,python 101,23,B,Pass
1,100,python 101,25,A,Pass
2,93,python 101,22,C,Fail
3,102,python 101,24,A,Pass
4,103,Python 101,26,A,Pass
5,103,Python DS,12,A,Pass
6,100,Python ML,13,B,Pass
7,105,Python AI,14,A,Pass


In [None]:
#@title Transposing the DataFrame
df_transposed = df.T
print("\nTransposed DataFrame:\n")
display(df_transposed)



Transposed DataFrame:



Unnamed: 0,0,1,2,3,4,5,6,7
ID,1,2,3,4,8,5,6,7
Student Name,Anna,Bindu,Charlie Brown,David,Henry,Isabel,Jack,Larry
Score,100,100,93,102,103,103,100,105
course,python 101,python 101,python 101,python 101,Python 101,Python DS,Python ML,Python AI
Age,23,25,22,24,26,12,13,14
Grade,B,A,C,A,A,A,B,A
Exam_Result,Pass,Pass,Fail,Pass,Pass,Pass,Pass,Pass


#Handling Missing Data (Example 1)

In [None]:
#@title Loading the weather data
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/sjasthi/python_input_files/refs/heads/main/weather_data.csv")

df = pd.read_csv("https://raw.githubusercontent.com/sjasthi/python_input_files/refs/heads/main/weather_data.csv", parse_dates=['day'])


print(type(df.day[0]))
display(df)

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [None]:
#@title fillna  - fill the blank values with 0
new_df = df.fillna(0)
display(new_df)

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [None]:
#@title ffill method
#Fills NaNs with previous row's value
new_df = df.fillna(method="ffill")
new_df


  new_df = df.fillna(method="ffill")


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [None]:
#@title bfill (backward fill)
#Fills NaNs with next row's value
new_df_1 = df.fillna(method="bfill")
display(new_df_1)

new_df_2 = df.bfill()
display(new_df_2)

  new_df_1 = df.fillna(method="bfill")


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32,6,Rain
1,2017-01-02,-99999,7,Sunny
2,2017-01-03,28,-99999,Snow
3,2017-01-04,-99999,7,0
4,2017-01-05,32,-99999,Rain
5,2017-01-06,31,2,Sunny
6,2017-01-06,34,5,0


In [None]:
#@title specifying the axis while filling
# axis is either "index" or "columns"
new_df = df.fillna(method="bfill", axis="columns")
new_df

  new_df = df.fillna(method="bfill", axis="columns")


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01 00:00:00,32,6,Rain
1,2017-01-02 00:00:00,-99999,7,Sunny
2,2017-01-03 00:00:00,28,-99999,Snow
3,2017-01-04 00:00:00,-99999,7,0
4,2017-01-05 00:00:00,32,-99999,Rain
5,2017-01-06 00:00:00,31,2,Sunny
6,2017-01-06 00:00:00,34,5,0


In [None]:
#@title fill based on the interpolation (linear method)
import pandas as pd
import numpy as np

# Creating a DataFrame with missing values
data = {
    'ID': [1, 2, 3, 4, 5],
    'Score': [90, np.nan, np.nan, 85, 80]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)

# Applying interpolation
new_df = df.interpolate()

print("\nDataFrame after Interpolation:")
display(new_df)


Original DataFrame:


Unnamed: 0,ID,Score
0,1,90.0
1,2,
2,3,
3,4,85.0
4,5,80.0



DataFrame after Interpolation:


Unnamed: 0,ID,Score
0,1,90.0
1,2,88.333333
2,3,86.666667
3,4,85.0
4,5,80.0


In [None]:
#@title Polynomial Interpolation (Quadratic)
new_df_poly = df.interpolate(method="polynomial", order=2)
print("\nDataFrame after Polynomial Interpolation:")
display(new_df_poly)



DataFrame after Polynomial Interpolation:


Unnamed: 0,ID,Score
0,1,90.0
1,2,90.0
2,3,88.333333
3,4,85.0
4,5,80.0


In [None]:
#@title drop the null values
new_df = df.dropna()
new_df



Unnamed: 0,ID,Score
0,1,90.0
3,4,85.0
4,5,80.0


In [None]:
#@title specifying "how" to drop
# TODO: What are the other possible values of how
new_df = df.dropna(how='all')
new_df

Unnamed: 0,ID,Score
0,1,90.0
1,2,
2,3,
3,4,85.0
4,5,80.0


In [None]:
#@title Using threshold for decision making
# The thresh parameter in the Pandas dropna() method specifies the minimum number of non-NaN (non-missing) values required for a row (or column) to be kept.
new_df = df.dropna(thresh=1)
new_df



Unnamed: 0,ID,Score
0,1,90.0
1,2,
2,3,
3,4,85.0
4,5,80.0


In [None]:
#@title inserting the missing dates
dt = pd.date_range("01-01-2017","01-11-2017")
idx = pd.DatetimeIndex(dt)
df.reindex(idx)

Unnamed: 0,ID,Score
2017-01-01,,
2017-01-02,,
2017-01-03,,
2017-01-04,,
2017-01-05,,
2017-01-06,,
2017-01-07,,
2017-01-08,,
2017-01-09,,
2017-01-10,,


#Handling Missing Data (Example 2)

In [None]:
#@title dummy dataframe (example 2)
import pandas as pd
import numpy as np

# Creating a dummy DataFrame with missing values
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': [1, np.nan, np.nan, 4, 5]
}


df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)


Original DataFrame:


Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,
2,,3.0,
3,4.0,,4.0
4,5.0,5.0,5.0


In [None]:
#@title Fill missing values with a specified value (e.g., 0)
df_filled = df.fillna(2.0)
print("DataFrame after fillna(0):")
display(df_filled)


DataFrame after fillna(0):


Unnamed: 0,A,B,C
0,1.0,2.0,1.0
1,2.0,2.0,2.0
2,2.0,3.0,2.0
3,4.0,2.0,4.0
4,5.0,5.0,5.0


In [None]:
#@title Fill missing values with column mean
df_filled_mean = df.fillna(df.mean(numeric_only=True))
print("DataFrame after filling missing values with column mean:")
print(df_filled_mean)


DataFrame after filling missing values with column mean:
     A         B         C
0  1.0  3.333333  1.000000
1  2.0  2.000000  3.333333
2  3.0  3.000000  3.333333
3  4.0  3.333333  4.000000
4  5.0  5.000000  5.000000


In [None]:
#@title Drop rows with missing values
df_dropped_rows = df.dropna()
print("DataFrame after dropping rows with missing values:")
print(df_dropped_rows)


DataFrame after dropping rows with missing values:
     A    B    C
4  5.0  5.0  5.0


In [None]:
#@title Drop columns with missing values
df_dropped_cols = df.dropna(axis=1)
print("DataFrame after dropping columns with missing values:")
print(df_dropped_cols)


DataFrame after dropping columns with missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


#Handling Duplicates

In [None]:
#@title Creating a dataframe with duplicate values
import pandas as pd

# Sample DataFrame with duplicate rows
data = {
    'ID': [1, 2, 3, 4, 2, 3, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Bob', 'Charlie', 'Eve'],
    'Score': [90, 85, 88, 92, 85, 88, 90]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)


Original DataFrame:


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
2,3,Charlie,88
3,4,David,92
4,2,Bob,85
5,3,Charlie,88
6,5,Eve,90


In [None]:
#@title Detecting the duplicates
duplicates = df.duplicated()
display("Duplicate Rows:\n", duplicates)


'Duplicate Rows:\n'

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True
5,True
6,False


In [None]:
#@title Counting the number of duplicated rows
duplicate_count = df.duplicated().sum()
print("\nNumber of Duplicate Rows:", duplicate_count)

duplicate_names = df.duplicated(subset=['Score']).sum()
print("\nNumber of Duplicates in 'Score':", duplicate_names)



Number of Duplicate Rows: 2

Number of Duplicates in 'Score': 3


In [None]:
#@title Removing the duplicates
#Question: Which are dropped? Which one is kept (first one? or last one?)
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame After Dropping Duplicates:")
display(df_no_duplicates)



DataFrame After Dropping Duplicates:


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
2,3,Charlie,88
3,4,David,92
6,5,Eve,90


In [None]:
#@title Removing the duplicates based on a specific column
# Sample DataFrame with duplicate rows
data = {
    'ID': [1, 2, 3, 4, 2, 3, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Bob', 'Charlie', 'Eve'],
    'Score': [90, 85, 88, 92, 199, 88, 95]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
display(df)

#Todo: play with this code
#df_unique_name = df.drop_duplicates()
df_unique_name = df.drop_duplicates(subset=['Name'])

print("\nDataFrame After Dropping Duplicates Based on 'Name':")
display(df_unique_name)


Original DataFrame:


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
2,3,Charlie,88
3,4,David,92
4,2,Bob,199
5,3,Charlie,88
6,5,Eve,95



DataFrame After Dropping Duplicates Based on 'Name':


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
2,3,Charlie,88
3,4,David,92
6,5,Eve,95


In [None]:
#@title Keeping the last occurrence instead of first
df_last = df.drop_duplicates(keep='last')
print("\nDataFrame After Keeping Last Occurrence:")
display(df_last)



DataFrame After Keeping Last Occurrence:


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
3,4,David,92
4,2,Bob,199
5,3,Charlie,88
6,5,Eve,95


In [None]:
#@title Removing all duplicates (Keeping None)
df_no_dupes = df.drop_duplicates(keep=False)
print("\nDataFrame After Removing All Duplicates:")
display(df_no_dupes)



DataFrame After Removing All Duplicates:


Unnamed: 0,ID,Name,Score
0,1,Alice,90
1,2,Bob,85
3,4,David,92
4,2,Bob,199
6,5,Eve,95
