In [81]:
import os
import pandas as pd
from scipy import stats



In [87]:
# Load the raw student-performance table into the notebook-wide `data` frame.
# NOTE(review): the path is absolute (looks like a Colab `/content` upload) —
# confirm the file is there before running on a fresh kernel.
data = pd.read_csv(filepath_or_buffer='/content/StudentsPerformance.csv')



In [88]:

# Per-column count of missing cells in the raw frame, shown before any cleaning.
print("Missing values before cleaning:\n", data.isna().sum())



Missing values before cleaning:
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64


In [98]:

# Impute missing exam scores with the mean of the respective column.
#
# The filled Series is assigned back to the column on purpose:
# `data[col].fillna(..., inplace=True)` operates on an intermediate Series,
# which triggers a chained-assignment FutureWarning on pandas 2.x and leaves
# `data` unmodified when Copy-on-Write is active.
score_columns = ['math score', 'reading score', 'writing score']
for column in score_columns:
    data[column] = data[column].fillna(data[column].mean())

print("\nMissing values after filling:")
print(data[score_columns].isnull().sum())


print("\nFirst few rows after filling missing values:")
print(data[score_columns].head())


Missing values after filling:
math score       0
reading score    0
writing score    0
dtype: int64

First few rows after filling missing values:
   math score  reading score  writing score
0          72             72             74
1          69             90             88
2          90             95             93
3          47             57             44
4          76             78             75


In [99]:

# Impute missing categorical values with the most frequent value of each column.
categorical_columns = ['gender', 'lunch']

print("Missing values before filling:")
print(data[categorical_columns].isnull().sum())


# Assign the filled Series back instead of `data[col].fillna(..., inplace=True)`:
# the in-place form acts on an intermediate Series, raises a chained-assignment
# FutureWarning on pandas 2.x and does not update `data` under Copy-on-Write.
for column in categorical_columns:
    # mode() returns a Series (ties are possible); [0] takes its first entry.
    data[column] = data[column].fillna(data[column].mode()[0])

print("\nMissing values after filling:")
print(data[categorical_columns].isnull().sum())


print("\nFirst few rows after filling missing categorical values:")
print(data[categorical_columns].head())


Missing values before filling:
gender    0
lunch     0
dtype: int64

Missing values after filling:
gender    0
lunch     0
dtype: int64

First few rows after filling missing categorical values:
   gender         lunch
0  female      standard
1  female      standard
2  female      standard
3    male  free/reduced
4    male      standard


In [91]:

# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates()



In [92]:

# Force the three score columns to a numeric dtype.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
for score_column in ('math score', 'reading score', 'writing score'):
    data[score_column] = pd.to_numeric(data[score_column], errors='coerce')



In [93]:

# Folder, relative to the current working directory, that receives the cleaned CSV.
output_dir = 'cleaned_data/'

# exist_ok=True keeps the cell re-runnable: no error if the folder already exists.
os.makedirs(name=output_dir, exist_ok=True)



In [94]:

# Destination file for the cleaned table, inside `output_dir`.
cleaned_data_path = os.path.join(
    output_dir,
    'cleaned_students_performance_data.csv',
)

# index=False: do not write the DataFrame's row index as an extra CSV column.
data.to_csv(path_or_buf=cleaned_data_path, index=False)



In [95]:

# Final report: structure of the cleaned frame, numeric summary statistics,
# and the location of the saved CSV.
print("Data cleaning completed. Summary:\n")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line after the report.
data.info()
print(data.describe())

print(f"Cleaned data saved to: {cleaned_data_path}")

Data cleaning completed. Summary:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
       math score  reading score  writing score
count  1000.00000    1000.000000    1000.000000
mean     66.08900      69.169000      68.054000
std      15.16308      14.600192      15.195657
min       0.00000      17.000000      10.0