In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load your dataset
df = pd.read_csv('attendance.csv')

In [5]:
# Handling Missing Values

# Option 1: Remove rows with missing values
df_cleaned = df.dropna()

In [7]:
# Option 2: Fill missing values with the mean (for numerical columns)
df_filled = df.fillna(df.select_dtypes(include=np.number).mean()) # Select only numeric columns for calculating the mean

In [9]:
# Option 3: Fill missing values with the median (for numerical columns)
df_filled_median = df.fillna(df.select_dtypes(include=np.number).median()) # Select numeric columns for median calculation

In [10]:
# Option 4: Fill missing values with the mode (for categorical columns)
df_filled_mode = df.apply(lambda x: x.fillna(x.mode()[0]) if x.dtype == 'O' else x)

In [12]:
# Handling Outliers

# Using the Interquartile Range (IQR) method
Q1 = df.select_dtypes(include=np.number).quantile(0.25) # Select numeric columns for quantile calculations
Q3 = df.select_dtypes(include=np.number).quantile(0.75) # Select numeric columns for quantile calculations
IQR = Q3 - Q1

In [13]:
# Define outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [15]:
# Identify outliers
outliers = ((df.select_dtypes(include=np.number) < lower_bound) | (df.select_dtypes(include=np.number) > upper_bound))

In [17]:
# Option 1: Remove outliers
df_no_outliers = df[~((df.select_dtypes(include=np.number) < lower_bound) | (df.select_dtypes(include=np.number) > upper_bound)).any(axis=1)]

In [21]:
# Option 2: Cap outliers to the lower and upper bounds
df_capped = df.copy()
numeric_df = df.select_dtypes(include=np.number) # Select numeric columns for comparison

for col in numeric_df.columns:
    df_capped.loc[numeric_df[col] < lower_bound[col], col] = lower_bound[col]
    df_capped.loc[numeric_df[col] > upper_bound[col], col] = upper_bound[col]

  df_capped.loc[numeric_df[col] < lower_bound[col], col] = lower_bound[col]


In [23]:
# Option 3: Impute outliers with mean/median
df_imputed_outliers = df.copy()
df_imputed_outliers[outliers] = np.nan

# Calculate the mean only for numeric columns
numeric_df = df.select_dtypes(include=np.number)
df_imputed_outliers = df_imputed_outliers.fillna(numeric_df.mean())

In [24]:
print("Original DataFrame:\n", df)
print("DataFrame after handling missing values and outliers:\n", df_imputed_outliers)

Original DataFrame:
       ROLL ID    SAP ID            NAME  Mobile No.  Jan  Feb March   Apr  \
0  R211235456  50002354   Pranav Sharma  2195275467  90%  75%   75%  100%   
1  R215665563  50003546    Ranjan Kumar  3458681667  85%  55%   65%   45%   
2  R218999654  50007894  Riya Srivastav  6355230652  80%  76%   89%   55%   
3  R212256653  50003156     Aditi Tamta  5199011174  75%  89%   97%   89%   
4  R218656723  50007235   Mehak Kandpal  9400440148  95%  88%  100%   89%   
5  R212544582  50002324     Sajal Suyal  5480911851  58%  90%   77%   66%   
6  R218946821  50008943  Lokesh Kapkoti  5322590673  80%  69%   95%   45%   
7  R219756894  50003013   Diwakar Bisht  3239190309  70%  50%   80%   89%   
8  R218656595  50002364  Devank Rathore  4069562366  69%  89%   60%   99%   

      May  
0     79%  
1     67%  
2     90%  
3     67%  
4    100%  
5     55%  
6  89.50%  
7     56%  
8     89%  
DataFrame after handling missing values and outliers:
       ROLL ID    SAP ID          