In [1]:
# Import libraries
import pandas as pd
from google.colab import files

# Extract
# Read csv into dataframe
df = pd.read_csv("raw_marks.csv")
display(df)

# Transform
# Find missing values
print("Missing:", df.isnull().sum())

# Check string values
# coerce = if conversion fails, turn the value into NaN
for col in ['Physics','Geography','Math']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Check marks are within valid range (0 and 100)
# clip(lower=0, upper=100) = force all numbers to stay between 0 and 100.
for col in ['Physics','Geography','Math']:
    df[col] = df[col].clip(lower=0, upper=100)

# Fill missing marks with median
for col in ['Physics','Geography','Math']:
    df[col].fillna(df[col].median(), inplace=True)

# Normalize Semester column and remove duplicates
df['Semester'] = df['Semester'].astype(str).str.replace('-', '')
df = df.drop_duplicates()

# Add total and average columns
df['Total'] = df[['Physics','Geography','Math']].sum(axis=1)
df['Average'] = df['Total'] / 3

# Round 2 decimal places
df['Average'] = df['Average'].round(2)

# Load
# Read cleaned csv
display(df)

# Save the cleaned dataframe as a csv
cleaned_marks = "cleaned_marks.csv"
df.to_csv(cleaned_marks, index=False)

# Download the cleaned csv
files.download(cleaned_marks)


Unnamed: 0,StudentID,StudentName,Physics,Geography,Math,Semester
0,100100,Alpha,65,54.0,78,2025S1
1,100102,Beta,70,62.0,80,2025S1
2,100103,Gamma,32,71.0,96,2025S1
3,100104,Delta,60,,-10,2025-S1


Missing: StudentID      0
StudentName    0
Physics        0
Geography      1
Math           0
Semester       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Unnamed: 0,StudentID,StudentName,Physics,Geography,Math,Semester,Total,Average
0,100100,Alpha,65,54.0,78,2025S1,197.0,65.67
1,100102,Beta,70,62.0,80,2025S1,212.0,70.67
2,100103,Gamma,32,71.0,96,2025S1,199.0,66.33
3,100104,Delta,60,62.0,0,2025S1,122.0,40.67


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>