In [1]:
import pandas as pd

In [2]:
# sample dataset
data = {
    'Name': ['Ali','Sara','John',None,'Marry'],
    'Age': [25, None, 30, 22, None],
    'City':['KL', 'Penang', None, 'Johor', 'KL']
}

In [3]:
df=pd.DataFrame(data)
print('Original DataFrame:')
print(df)

Original DataFrame:
    Name   Age    City
0    Ali  25.0      KL
1   Sara   NaN  Penang
2   John  30.0    None
3   None  22.0   Johor
4  Marry   NaN      KL


# Check the missing values Row by Row

In [4]:
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details={row.isnull().to_dict()}")

Row 0: Missing = False, Details={'Name': False, 'Age': False, 'City': False}
Row 1: Missing = True, Details={'Name': False, 'Age': True, 'City': False}
Row 2: Missing = True, Details={'Name': False, 'Age': False, 'City': True}
Row 3: Missing = True, Details={'Name': True, 'Age': False, 'City': False}
Row 4: Missing = True, Details={'Name': False, 'Age': True, 'City': False}


In [5]:
# show only rows with missing data
missing_rows=df[df.isnull().any(axis=1)]
print('Rows with missing data:')
print(missing_rows)

Rows with missing data:
    Name   Age    City
1   Sara   NaN  Penang
2   John  30.0    None
3   None  22.0   Johor
4  Marry   NaN      KL


In [6]:
# Simulate rows that would be dropped using dropna()
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna():')
print(to_drop)

These rows would be dropped using dropna():
    Name   Age    City
1   Sara   NaN  Penang
2   John  30.0    None
3   None  22.0   Johor
4  Marry   NaN      KL


In [7]:
df_cleaned = df.dropna()
print('\nAfter dropna():')
print(df_cleaned)


After dropna():
  Name   Age City
0  Ali  25.0   KL


In [8]:
print('Before fillna():')
print(df)

Before fillna():
    Name   Age    City
0    Ali  25.0      KL
1   Sara   NaN  Penang
2   John  30.0    None
3   None  22.0   Johor
4  Marry   NaN      KL


In [9]:
df_filled = df.fillna({
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'City': 'Not Available'
})

In [10]:
print('\nAfter fillna():')
print(df_filled)


After fillna():
      Name        Age           City
0      Ali  25.000000             KL
1     Sara  25.666667         Penang
2     John  30.000000  Not Available
3  Unknown  22.000000          Johor
4    Marry  25.666667             KL


In [11]:
df_sfill = df.fillna({
    'Name': 'Unknown',
    'Age': round(df['Age'].mean(),2),
    'City': 'Not Available'
})

In [12]:
print('\nAfter fillna(): age upto to decimal digit')
print(df_sfill)


After fillna(): age upto to decimal digit
      Name    Age           City
0      Ali  25.00             KL
1     Sara  25.67         Penang
2     John  30.00  Not Available
3  Unknown  22.00          Johor
4    Marry  25.67             KL


In [13]:
import math

In [14]:
df_new = df.fillna({
    'Name': 'Unknown',
    'Age': math.floor((df['Age'].mean()).astype(int)),
    'City': 'Not Available'
})

In [15]:
print('\nAfter fillna(): floor without decimal')
print(df_new)


After fillna(): floor without decimal
      Name   Age           City
0      Ali  25.0             KL
1     Sara  25.0         Penang
2     John  30.0  Not Available
3  Unknown  22.0          Johor
4    Marry  25.0             KL


In [16]:
df_new = df.fillna({
    'Name': 'Unknown',
    'Age': math.ceil((df['Age'].mean()).astype(int)),
    'City': 'Not Available'
})

In [17]:
print('\nAfter fillna(): floor without decimal')
print(df_new)


After fillna(): floor without decimal
      Name   Age           City
0      Ali  25.0             KL
1     Sara  25.0         Penang
2     John  30.0  Not Available
3  Unknown  22.0          Johor
4    Marry  25.0             KL


## Data Cleaning Exercise
### Step 1:Use Day-15 students_performance_dirty.csv , You can download from GitHub (if you miss out Day-15)
### Step 2: Get basic information about dataset
### Step 3: print('Missing values per column:')
### Step 4: Check missing data line by line
### Step 5: Drop missing rows (if any)
### Step 6: Compare before and after
### Step 7: Fill missing values
### Step 8: Compare before and after

In [34]:
#step 1 "C:\Users\salma\OneDrive\Desktop\aimltraining\Day-15\students_performance_dirty.csv"
df = pd.read_csv('C:/Users/sabri/OneDrive/Desktop/aimltraining/Day-15/students_performance_dirty.csv')
print('Dataset loaded successfully!')
print(df.head())

Dataset loaded successfully!
   gender  study_hours  attendance_pct  math_score  reading_score  final_score
0    male          1.7            83.6        62.0           91.0         38.9
1     NaN          2.6            91.2        82.0           60.0         37.5
2  female          2.9            97.5        69.0           57.0         35.0
3  female          4.8            85.7        78.0           62.0         36.5
4    male          3.9           -10.0        64.0           95.0         30.9


In [35]:
# Step 2: Get basic information about dataset
print('--- Dataset Info ---')
print(df.info())

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          52 non-null     object 
 1   study_hours     52 non-null     float64
 2   attendance_pct  60 non-null     float64
 3   math_score      60 non-null     float64
 4   reading_score   60 non-null     float64
 5   final_score     60 non-null     float64
dtypes: float64(5), object(1)
memory usage: 2.9+ KB
None


In [36]:
#Step 3 print('Missing values per column:')
print('Missing values per column:')
print(df.isnull().sum())

Missing values per column:
gender            8
study_hours       8
attendance_pct    0
math_score        0
reading_score     0
final_score       0
dtype: int64


In [37]:
#step 4:Check missing Data Line by Line
for index, row in df.iterrows():
    if row.isnull().any():
        print(f'Row {index}: Missing Data -> {row.isnull().to_dict()}')

Row 1: Missing Data -> {'gender': True, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 6: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 7: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 11: Missing Data -> {'gender': True, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 12: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 13: Missing Data -> {'gender': False, 'study_hours': True, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 20: Missing Data -> {'gender': False, 'study_hours': True, 'attendanc

In [38]:
# Step 5: Drop missing rows (if any)
df_dropped = df.dropna()

In [39]:
#Step 6:Comparing before and after
print('Shape before dropna:', df.shape)
print('Shape after dropna:', df_dropped.shape)

Shape before dropna: (60, 6)
Shape after dropna: (45, 6)


In [40]:
df

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [41]:
#Step 7 fillna()
df_filled = df.fillna({
    'gender': 'Unknown',
    'study_hours': df['study_hours'].mean()
   })

In [32]:
df_filled

Unnamed: 0,Name,Age,City
0,Ali,25.0,KL
1,Sara,25.666667,Penang
2,John,30.0,Not Available
3,Unknown,22.0,Johor
4,Marry,25.666667,KL


In [42]:
# Step 8 Compare before and after
# Compare each row before and after fillna()
for i in range(len(df)):
    print(f"\nRow {i} Before: {df.iloc[i].to_dict()}")
    print(f"Row {i} After : {df_filled.iloc[i].to_dict()}")


Row 0 Before: {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}
Row 0 After : {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}

Row 1 Before: {'gender': nan, 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}
Row 1 After : {'gender': 'Unknown', 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}

Row 2 Before: {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}
Row 2 After : {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}

Row 3 Before: {'gender': 'female', 'study_hours': 4.8, 'attendance_pct': 85.7, 'math_score': 78.0, 'reading_score': 62.0, 'final_score': 36.5}
Row