In [18]:
import pandas as pd
import numpy as np
import random

# Seed both the NumPy and stdlib RNGs so the generated scores are identical
# on every Restart & Run All.
np.random.seed(42)
random.seed(42)

# Create a sample dataset: ten students, three randomly drawn integer scores
# in [50, 100) each, plus fixed demographic columns.
data = {
    'Student_ID': ['S001', 'S002', 'S003', 'S004', 'S005', 'S006', 'S007', 'S008', 'S009', 'S010'],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Helen', 'Ivy', 'Jack'],
    'Math_Score': np.random.randint(50, 100, 10),
    'Science_Score': np.random.randint(50, 100, 10),
    'History_Score': np.random.randint(50, 100, 10),
    'Gender': ['F', 'M', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'M'],
    'Age': [18, 19, 20, 19, 18, 20, 19, 18, 21, 20],
    'Class': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']
}

df = pd.DataFrame(data)

# The two columns that are about to receive NaN are integer-typed, and an
# integer column cannot hold NaN. Cast them to float explicitly instead of
# relying on pandas' implicit upcast-on-assignment, which is deprecated
# (FutureWarning since pandas 2.1). The resulting values and dtypes are the
# same as before.
df = df.astype({'Math_Score': 'float64', 'History_Score': 'float64'})

# Introduce missing values and inconsistencies
df.loc[2, 'Math_Score'] = np.nan      # missing score
df.loc[5, 'History_Score'] = np.nan   # missing score
df.loc[7, 'Gender'] = 'X'             # category outside {'F', 'M'}
df.loc[9, 'Age'] = -1                 # impossible (negative) age

print("Sample Dataset:")
print(df)


Sample Dataset:
  Student_ID     Name  Math_Score  Science_Score  History_Score Gender  Age  \
0       S001    Alice        88.0             60           79.0      F   18   
1       S002      Bob        78.0             73           87.0      M   19   
2       S003  Charlie         NaN             85           51.0      M   20   
3       S004    David        92.0             89           70.0      F   19   
4       S005      Eve        57.0             73           82.0      F   18   
5       S006    Frank        70.0             52            NaN      M   20   
6       S007    Grace        88.0             71           71.0      F   19   
7       S008    Helen        68.0             51           93.0      X   18   
8       S009      Ivy        72.0             73           74.0      M   21   
9       S010     Jack        60.0             93           98.0      M   -1   

  Class  
0     A  
1     B  
2     A  
3     B  
4     A  
5     B  
6     A  
7     B  
8     A  
9     B  


In [19]:
# Tally the null entries found in each column.
null_counts = df.isna().sum()
print("\nMissing Values:")
print(null_counts)

# List the distinct values of the columns that have a known valid domain so
# that out-of-domain entries stand out.
gender_values = df['Gender'].unique()
age_values = df['Age'].unique()
print("\nInconsistencies:")
print("Gender:", gender_values)
print("Age:", age_values)





Missing Values:
Student_ID       0
Name             0
Math_Score       1
Science_Score    0
History_Score    1
Gender           0
Age              0
Class            0
dtype: int64

Inconsistencies:
Gender: ['F' 'M' 'X']
Age: [18 19 20 21 -1]


In [20]:
# Handling missing values and inconsistencies
# Fill the gaps in each score column that has missing entries with that
# column's own mean.
for score_col in ('Math_Score', 'History_Score'):
    column_mean = df[score_col].mean()
    df[score_col] = df[score_col].fillna(column_mean)

# Keep only the recognised gender codes; anything else becomes NaN.
has_valid_gender = df['Gender'].isin(['F', 'M'])
df['Gender'] = df['Gender'].where(has_valid_gender)

# Keep only non-negative ages; negative values become NaN.
has_valid_age = df['Age'] >= 0
df['Age'] = df['Age'].where(has_valid_age)

print("\nDataset after handling missing values and inconsistencies:")
print(df)


Dataset after handling missing values and inconsistencies:
  Student_ID     Name  Math_Score  Science_Score  History_Score Gender   Age  \
0       S001    Alice   88.000000             60      79.000000      F  18.0   
1       S002      Bob   78.000000             73      87.000000      M  19.0   
2       S003  Charlie   74.777778             85      51.000000      M  20.0   
3       S004    David   92.000000             89      70.000000      F  19.0   
4       S005      Eve   57.000000             73      82.000000      F  18.0   
5       S006    Frank   70.000000             52      78.333333      M  20.0   
6       S007    Grace   88.000000             71      71.000000      F  19.0   
7       S008    Helen   68.000000             51      93.000000    NaN  18.0   
8       S009      Ivy   72.000000             73      74.000000      M  21.0   
9       S010     Jack   60.000000             93      98.000000      M   NaN   

  Class  
0     A  
1     B  
2     A  
3     B  
4     A  

In [21]:
# Check for outliers using z-score
from scipy.stats import zscore

numeric_cols = ['Math_Score', 'Science_Score', 'History_Score', 'Age']

# 'Age' can contain NaN at this point (invalid ages were blanked earlier).
# With scipy's default nan_policy='propagate', a single NaN turns every
# z-score of that column into NaN, and NaN > 3 is False, so the whole column
# would silently be skipped. nan_policy='omit' computes mean/std from the
# non-missing values instead; the missing cells themselves still yield NaN
# and therefore are never flagged.
z_scores = np.abs(
    zscore(df[numeric_cols].to_numpy(dtype=float), nan_policy='omit')
)

# A row is an outlier if any of its numeric columns lies more than 3 standard
# deviations from that column's mean.
# NOTE: with the population standard deviation used by zscore (ddof=0), |z|
# is bounded by (n - 1) / sqrt(n), which is about 2.85 for n = 10. On a
# sample this small the threshold of 3 can never be exceeded.
outliers = (z_scores > 3).any(axis=1)

print("\nOutliers:")
print(df[outliers])

# Replace outliers with NaN (all numeric columns of each flagged row).
df.loc[outliers, numeric_cols] = np.nan

print("\nDataset after handling outliers:")
print(df)



Outliers:
Empty DataFrame
Columns: [Student_ID, Name, Math_Score, Science_Score, History_Score, Gender, Age, Class]
Index: []

Dataset after handling outliers:
  Student_ID     Name  Math_Score  Science_Score  History_Score Gender   Age  \
0       S001    Alice   88.000000           60.0      79.000000      F  18.0   
1       S002      Bob   78.000000           73.0      87.000000      M  19.0   
2       S003  Charlie   74.777778           85.0      51.000000      M  20.0   
3       S004    David   92.000000           89.0      70.000000      F  19.0   
4       S005      Eve   57.000000           73.0      82.000000      F  18.0   
5       S006    Frank   70.000000           52.0      78.333333      M  20.0   
6       S007    Grace   88.000000           71.0      71.000000      F  19.0   
7       S008    Helen   68.000000           51.0      93.000000    NaN  18.0   
8       S009      Ivy   72.000000           73.0      74.000000      M  21.0   
9       S010     Jack   60.000000      

In [24]:
# Apply min-max scaling to the 'Math_Score' column.
# This is a linear rescale onto MinMaxScaler's default range [0, 1]
# (smallest score -> 0, largest -> 1); it is not a log transformation.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# The scaler works on 2-D (n_samples, n_features) input, so pass the column
# as a one-column array and flatten the (n, 1) result back to 1-D before
# storing it in the frame.
df["Math_Score"] = scaler.fit_transform(df[["Math_Score"]].to_numpy()).ravel()

In [25]:
# Show the full frame so the rescaled 'Math_Score' values can be inspected.
print(df)

  Student_ID     Name  Math_Score  Science_Score  History_Score Gender   Age  \
0       S001    Alice    0.906889           60.0      79.000000      F  18.0   
1       S002      Bob    0.654454           73.0      87.000000      M  19.0   
2       S003  Charlie    0.566257           85.0      51.000000      M  20.0   
3       S004    David    1.000000           89.0      70.000000      F  19.0   
4       S005      Eve    0.000000           73.0      82.000000      F  18.0   
5       S006    Frank    0.428326           52.0      78.333333      M  20.0   
6       S007    Grace    0.906889           71.0      71.000000      F  19.0   
7       S008    Helen    0.367809           51.0      93.000000    NaN  18.0   
8       S009      Ivy    0.487161           73.0      74.000000      M  21.0   
9       S010     Jack    0.106810           93.0      98.000000      M   NaN   

  Class  
0     A  
1     B  
2     A  
3     B  
4     A  
5     B  
6     A  
7     B  
8     A  
9     B  
