In [106]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings


In [107]:
# Creating the dataset with intentional outliers
data = {
    'Roll_no': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Age': [19, 20, 21, 22, 20, 19, 21, 23, 18, 20],
    'Attendance (%)': [85, 90, 78, 95, 88, 70, np.nan, 60, 96, 20],  # Outlier at 20
    'Assignment Score': [78, 85, 65, 92, 43, 50, 89, 40, 150, np.nan],  # Outlier at 150
    'Midterm Score': [72, 88, 70, 91, 82, 55, 85, np.nan, 98, 78],  
    'Final Score': [80, 84, 74, np.nan, 85, 60, 90, 50, 97, 10],  # Outlier at 10
    'Overall Grade': [8.0, 8.5, 7.0, 9.2, 8.2, 6.0, 9.0, 5.0, 9.8, 7.8]
}

dataset = pd.DataFrame(data)

dataset

Unnamed: 0,Roll_no,Age,Attendance (%),Assignment Score,Midterm Score,Final Score,Overall Grade
0,1,19,85.0,78.0,72.0,80.0,8.0
1,2,20,90.0,85.0,88.0,84.0,8.5
2,3,21,78.0,65.0,70.0,74.0,7.0
3,4,22,95.0,92.0,91.0,,9.2
4,5,20,88.0,43.0,82.0,85.0,8.2
5,6,19,70.0,50.0,55.0,60.0,6.0
6,7,21,,89.0,85.0,90.0,9.0
7,8,23,60.0,40.0,,50.0,5.0
8,9,18,96.0,150.0,98.0,97.0,9.8
9,10,20,20.0,,78.0,10.0,7.8


In [108]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Roll_no           10 non-null     int64  
 1   Age               10 non-null     int64  
 2   Attendance (%)    9 non-null      float64
 3   Assignment Score  9 non-null      float64
 4   Midterm Score     9 non-null      float64
 5   Final Score       9 non-null      float64
 6   Overall Grade     10 non-null     float64
dtypes: float64(5), int64(2)
memory usage: 692.0 bytes


In [109]:
#Finding Missing Values
dataset.isnull().sum()

Roll_no             0
Age                 0
Attendance (%)      1
Assignment Score    1
Midterm Score       1
Final Score         1
Overall Grade       0
dtype: int64

In [110]:
# Filling missing values with the mean of each column
dataset.fillna(dataset.mean(numeric_only=True), inplace=True)

dataset

Unnamed: 0,Roll_no,Age,Attendance (%),Assignment Score,Midterm Score,Final Score,Overall Grade
0,1,19,85.0,78.0,72.0,80.0,8.0
1,2,20,90.0,85.0,88.0,84.0,8.5
2,3,21,78.0,65.0,70.0,74.0,7.0
3,4,22,95.0,92.0,91.0,70.0,9.2
4,5,20,88.0,43.0,82.0,85.0,8.2
5,6,19,70.0,50.0,55.0,60.0,6.0
6,7,21,75.777778,89.0,85.0,90.0,9.0
7,8,23,60.0,40.0,79.888889,50.0,5.0
8,9,18,96.0,150.0,98.0,97.0,9.8
9,10,20,20.0,76.888889,78.0,10.0,7.8


In [111]:
dataset.isnull().sum()

Roll_no             0
Age                 0
Attendance (%)      0
Assignment Score    0
Midterm Score       0
Final Score         0
Overall Grade       0
dtype: int64

In [112]:
#Outliers

# Calculate Q1, Q3, and IQR only for numeric columns
Q1 = dataset.select_dtypes(include=[np.number]).quantile(0.25)
Q3 = dataset.select_dtypes(include=[np.number]).quantile(0.75)
IQR = Q3 - Q1

# Finding outliers
outliers = ((dataset.select_dtypes(include=[np.number]) < (Q1 - 1.5 * IQR)) | 
            (dataset.select_dtypes(include=[np.number]) > (Q3 + 1.5 * IQR)))

# Display rows that contain outliers
outlier_rows = dataset[outliers.any(axis=1)]
outlier_rows

Unnamed: 0,Roll_no,Age,Attendance (%),Assignment Score,Midterm Score,Final Score,Overall Grade
8,9,18,96.0,150.0,98.0,97.0,9.8
9,10,20,20.0,76.888889,78.0,10.0,7.8


In [113]:
# Replace outliers with the median of respective columns
for col in dataset.select_dtypes(include=[np.number]).columns:
    median_value = dataset[col].median()
    dataset.loc[outliers[col], col] = median_value

dataset

Unnamed: 0,Roll_no,Age,Attendance (%),Assignment Score,Midterm Score,Final Score,Overall Grade
0,1.0,19,85.0,78.0,72.0,80.0,8.0
1,2.0,20,90.0,85.0,88.0,84.0,8.5
2,3.0,21,78.0,65.0,70.0,74.0,7.0
3,4.0,22,95.0,92.0,91.0,70.0,9.2
4,5.0,20,88.0,43.0,82.0,85.0,8.2
5,6.0,19,70.0,50.0,55.0,60.0,6.0
6,7.0,21,75.777778,89.0,85.0,90.0,9.0
7,8.0,23,60.0,40.0,79.888889,50.0,5.0
8,9.0,18,96.0,77.444444,98.0,97.0,9.8
9,10.0,20,81.5,76.888889,78.0,77.0,7.8


In [114]:
#Data transformation

#We will apply a log transformation to the Assignment Score variable to reduce skewness and convert the distribution into a more normal shape.
#Log transformation helps compress large values, reducing the impact of outliers.

# Applying Log Transformation (Adding 1 to avoid log(0))
dataset['Assignment Score (Log)'] = np.log1p(dataset['Assignment Score'])

# Display transformed dataset
dataset

Unnamed: 0,Roll_no,Age,Attendance (%),Assignment Score,Midterm Score,Final Score,Overall Grade,Assignment Score (Log)
0,1.0,19,85.0,78.0,72.0,80.0,8.0,4.369448
1,2.0,20,90.0,85.0,88.0,84.0,8.5,4.454347
2,3.0,21,78.0,65.0,70.0,74.0,7.0,4.189655
3,4.0,22,95.0,92.0,91.0,70.0,9.2,4.532599
4,5.0,20,88.0,43.0,82.0,85.0,8.2,3.78419
5,6.0,19,70.0,50.0,55.0,60.0,6.0,3.931826
6,7.0,21,75.777778,89.0,85.0,90.0,9.0,4.49981
7,8.0,23,60.0,40.0,79.888889,50.0,5.0,3.713572
8,9.0,18,96.0,77.444444,98.0,97.0,9.8,4.362391
9,10.0,20,81.5,76.888889,78.0,77.0,7.8,4.355283


In [3]:
#Explanation
'''
1) Log transformation
Log Transformation is a technique used to transform skewed data into a more normal (Gaussian) distribution. It involves replacing each value x in your dataset with log(x).


2) Skewed data 
It means the data is not evenly spread—it's lopsided.

Types of Skew:
Right-skewed (Positive Skew):
Tail is on the right (big values pull the mean right).
Example: Income, where few people earn a lot.

Left-skewed (Negative Skew):
Tail is on the left (small values pull the mean left).
Example: Age at retirement—most people retire around the same age, few retire very early.
'''


"\n1) Log transformation\nLog Transformation is a technique used to transform skewed data into a more normal (Gaussian) distribution. It involves replacing each value x in your dataset with log(x).\n\n\n2) Skewed data \nIt means the data is not evenly spread—it's lopsided.\n\nTypes of Skew:\nRight-skewed (Positive Skew):\nTail is on the right (big values pull the mean right).\nExample: Income, where few people earn a lot.\n\nLeft-skewed (Negative Skew):\nTail is on the left (small values pull the mean left).\nExample: Age at retirement—most people retire around the same age, few retire very early.\n"