In [2]:
# ==============================================
# DATA ANALYSIS: STUDENT EXAM SCORES (MATH, READING, WRITING)
# ==============================================

# Importing required libraries with clear purpose explanations
import warnings  # For suppressing non-critical warnings during execution
warnings.filterwarnings("ignore")  # Ignore warnings to keep output clean

# Data manipulation and analysis
import numpy as np  # Fundamental package for numerical computations
import pandas as pd_s_data  # Primary tool for data structures and analysis

# Read the CSV dataset into a DataFrame
# Note: the path is relative, so the file must be reachable from the working directory
df_s_data = pd_s_data.read_csv(filepath_or_buffer='TM351 -Fall 25-26- KSADATASET.csv')

print("\n=== SAMPLE DATA ===")
df_s_data.head(n=20)  # Last expression of the cell: renders the first 20 rows as its output


=== SAMPLE DATA ===


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [3]:
# Initial data exploration: structural summary of the loaded DataFrame
print("\n=== DATA STRUCTURE ===")
df_s_data.info()  # Prints the index range, each column's non-null count and dtype, and memory usage


=== DATA STRUCTURE ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [4]:
# ======================
# DATA QUALITY CHECKS
# ======================

# Count the missing entries per column once and reuse the result for both reports
print("\n=== MISSING VALUE ANALYSIS ===")
missing_data = df_s_data.isna().sum()
print(missing_data)  # Full per-column tally
print(missing_data.loc[missing_data.gt(0)])  # Restricted to columns that have at least one gap



=== MISSING VALUE ANALYSIS ===
gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Series([], dtype: int64)


In [5]:

# Identify duplicate records
# With pandas' default settings, duplicated() flags a row only when every column
# matches an earlier row, so the first occurrence of a repeated row is not counted.
print("\n=== DUPLICATE RECORDS ===")
print(f"Total duplicates: {df_s_data.duplicated().sum()}")  # Sum of the boolean flags = number of repeated rows




=== DUPLICATE RECORDS ===
Total duplicates: 0


In [6]:
# ======================
# OUTLIER DETECTION
# ======================

# Restrict the analysis to the numeric columns
numeric_cols = df_s_data.select_dtypes(include=[np.number])

# A value counts as an outlier when it lies more than OUTLIER_STD_THRESHOLD
# standard deviations away from its column mean.
# NOTE(review): 10 is far wider than the customary 3-sigma cutoff; by Chebyshev's
# inequality only about 1% of a column can ever fall outside it, so this check
# will rarely flag anything — confirm the threshold is intended.
OUTLIER_STD_THRESHOLD = 10

# Compute each statistic once and derive both limits from the same values
col_means = numeric_cols.mean()
col_stds = numeric_cols.std()
limit_offset = OUTLIER_STD_THRESHOLD * col_stds
stats = {
    'mean': col_means,
    'std': col_stds,
    'upper_limit': col_means + limit_offset,
    'lower_limit': col_means - limit_offset,
}

# Report the number of out-of-range rows per numeric column and keep a running sum.
# The sum adds up per-column counts, so a row that is out of range in several
# columns contributes once for each of them.
total_outliers = 0
for col in numeric_cols:
    col_values = df_s_data[col]
    above_upper = col_values.gt(stats['upper_limit'][col])
    below_lower = col_values.lt(stats['lower_limit'][col])
    col_outliers = df_s_data.loc[above_upper | below_lower]
    print(f"\nOutliers in {col}: {len(col_outliers)}")
    total_outliers += len(col_outliers)

print(f"\nTOTAL OUTLIERS IDENTIFIED: {total_outliers}")


Outliers in math score: 0

Outliers in reading score: 0

Outliers in writing score: 0

TOTAL OUTLIERS IDENTIFIED: 0
