In [42]:
import pandas as pd
import numpy as np

# Student data cleaning
# ---------------------
# The cleaning steps are wrapped in a function that works on a copy of the raw
# data, so re-running the cell always starts from the untouched input and the
# steps can be exercised on any DataFrame with the same columns.


def count_infinite(frame):
    """Return the number of +inf / -inf entries in each column of ``frame``.

    ``np.isinf`` raises ``TypeError`` on text columns, so only the numeric
    columns are inspected; every other column is reported as 0 so the result
    still has one entry per column.
    """
    numeric = frame.select_dtypes(include=[np.number])
    return np.isinf(numeric).sum().reindex(frame.columns, fill_value=0)


def clean_student_data(raw_df):
    """Return a cleaned copy of ``raw_df`` and print a short cleaning report.

    Steps, in order:
      1. Missing values: NaNs in ``Math_Score`` are filled with the mean of the
         finite, non-negative scores; NaNs in ``Attendance_Percentage`` are
         filled with the median of the values that are <= 100.
      2. Infinite values: +inf / -inf in ``Science_Score`` are replaced by the
         largest / smallest finite score.
      3. Negative ``Math_Score`` values are replaced by the same mean used in
         step 1.
      4. ``Attendance_Percentage`` values above 100 are replaced by the mean of
         the remaining values.
      5. Exact duplicate rows are dropped.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain the columns ``Math_Score``, ``Science_Score`` and
        ``Attendance_Percentage``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned data (original index labels are kept).
    """
    df = raw_df.copy()

    # Handling Missing Values (NaN)
    # -----------------------------

    print("\nMissing values in each Column")
    print(df.isnull().sum())

    # The fill statistics are computed from valid values only. Taking the mean
    # over every row would let the negative scores (replaced further down)
    # drag the fill value, and every later mean, away from the real average.
    math = df['Math_Score']
    valid_math_mean = math[(math >= 0) & np.isfinite(math)].mean()
    df['Math_Score'] = math.fillna(valid_math_mean)

    attendance = df['Attendance_Percentage']
    valid_attendance_median = attendance[attendance <= 100].median()
    df['Attendance_Percentage'] = attendance.fillna(valid_attendance_median)

    print("\nAfter cleaning.... Missing values in each column")
    print(df.isnull().sum())

    # Handling Infinite Values (inf, -inf)
    # -------------------------------------

    print("\nCount of infinite values in each column before cleaning:")
    print(count_infinite(df))

    # Replace +inf with max and -inf with min in Science_Score.
    # The infinities are masked out first so max/min are the finite extremes.
    science = df['Science_Score']
    finite_science = science.replace([np.inf, -np.inf], np.nan)
    science = science.replace(np.inf, finite_science.max())
    science = science.replace(-np.inf, finite_science.min())
    df['Science_Score'] = science

    print("\nCount of infinite values in each column after cleaning:")
    print(count_infinite(df))

    # Handling Invalid Negative Values
    # ---------------------------------

    negative_mask = df['Math_Score'] < 0
    print("\nNumber of negative values in Math_Score: ", negative_mask.sum())

    # Negative scores get the same valid-score mean that filled the NaNs.
    df.loc[negative_mask, 'Math_Score'] = valid_math_mean

    print("\nAfter replacing negative values:")
    print((df['Math_Score'] < 0).sum())

    # Detect outliers in Attendance_Percentage (>100)
    # -----------------------------------------------

    outlier_mask = df['Attendance_Percentage'] > 100
    print(f"\nNumber of outliers in Attendance_Percentage: {outlier_mask.sum()}")

    # Replace outliers with the mean of the non-outlier values.
    mean_attendance = df.loc[~outlier_mask, 'Attendance_Percentage'].mean()
    df.loc[outlier_mask, 'Attendance_Percentage'] = mean_attendance

    # Handling Duplicate Rows
    # ----------------------
    # NOTE(review): duplicates are removed last, so a duplicated row is counted
    # twice in the statistics above - consider de-duplicating first if that
    # matters for this dataset.

    print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")
    return df.drop_duplicates()


# In a notebook (or when run as a script) __name__ is "__main__", so this part
# executes normally; the guard only skips the file I/O when the code is imported.
if __name__ == "__main__":
    # Load data
    raw_df = pd.read_csv("student_data.csv")
    print("Data Cleaning Started ..........")

    df = clean_student_data(raw_df)

    # Final Cleaned Dataset
    # index=False: without it the row index (which has gaps after dropping
    # duplicates) is written as an extra unnamed first column.
    df.to_csv("cleaned_student_data.csv", index=False)
    print("\nData Cleaning is Done !")
    print("Cleaned Data is Saved as 'cleaned_student_data.csv' ")



Data Cleaning Started ..........

Missing values in each Column
Student_ID               0
Math_Score               1
Science_Score            0
Attendance_Percentage    1
dtype: int64

After cleaning.... Missing values in each column
Student_ID               0
Math_Score               0
Science_Score            0
Attendance_Percentage    0
dtype: int64

Count of infinite values in each column before cleaning:
Student_ID               0
Math_Score               0
Science_Score            2
Attendance_Percentage    0
dtype: int64

Count of infinite values in each column after cleaning:
Student_ID               0
Math_Score               0
Science_Score            0
Attendance_Percentage    0
dtype: int64

Number of negative values in Math_Score:  1

After replacing negative values:
0

Number of outliers in Attendance_Percentage: 1

Number of duplicate rows: 1

Data Cleaning is Done !
Cleaned Data is Saved as 'cleaned_student_data.csv' 
