## Exploratory Data Analysis


In [0]:
import pandas as pd
import numpy as np
import os
# importing required libraries


In [0]:
# Data location is defined once; every dataset path below is derived from it,
# so moving the data only requires editing BASE_PATH.
BASE_PATH = "/Volumes/student_risk_data/default/data"

# Logical dataset name -> full path of its CSV file under BASE_PATH.
files = {
    dataset: os.path.join(BASE_PATH, filename)
    for dataset, filename in {
        "demographics": "student_demographics.csv",
        "academics": "student_academic_performance.csv",
        "attendance": "student_attendance.csv",
        "retention": "student_retention_history.csv",
    }.items()
}


In [0]:
def load_csv(file_path):
    """Read a CSV file into a DataFrame, reporting the outcome on stdout.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame when reading and reporting both succeed; ``None``
        when any exception is raised (a message describing it is printed
        instead of the exception propagating).
    """
    result = None
    try:
        parsed = pd.read_csv(file_path)
        print(f"Loaded {os.path.basename(file_path)} successfully")
        # Only publish the frame once the whole happy path has completed.
        result = parsed
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except pd.errors.EmptyDataError:
        print(f"File is empty: {file_path}")
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
    return result


## Student Demographics EDA

In [0]:
# Load the student demographics dataset into a DataFrame.
# files[...] already holds an absolute path, so it is passed directly:
# os.path.join ignores earlier components when a later one is absolute,
# which made joining it onto BASE_PATH a no-op.
df_demographics = load_csv(files["demographics"])
# load_csv returns None on failure; stop here with a clear message instead of
# failing in a later cell with an AttributeError on None.
assert df_demographics is not None, "student demographics dataset failed to load"

Loaded student_demographics.csv successfully


In [0]:
# Preview the first rows of the demographics frame to inspect its columns
# and sample values.
df_demographics.head()

Unnamed: 0,student_id,student_name,gender,date_of_birth,academic_year,disability_flag,annual_family_income,parental_education,first_generation_student,urban_rural,admission_type,institution_code
0,S001-24,Aarav Sharma,MALE,21-11-2003,2023-2024,0.0,259805.0,none,0,URBAN,RESERVATION,INST01
1,S002-25,Aarav Verma,female,14-05-2001,2024-2025,0.0,1899189.0,graduate,1,URBAN,merit,INST02
2,S003-24,Aarav Patel,FEMALE,12-08-2002,2023-2024,1.0,3903935.0,primary,0,Rural,merit,INST02
3,S004-24,Aarav Iyer,Female,12-08-2002,2023-2024,0.0,3155303.0,PRIMARY,1,Rural,RESERVATION,INST02
4,S005-24,Aarav Reddy,female,12-08-2002,2023-2024,1.0,1971394.0,primary,1,URBAN,merit,INST02


In [0]:
# Number of rows whose student_id repeats an earlier row; zero means every
# student appears exactly once in the demographics table.
df_demographics.duplicated(subset=["student_id"]).sum()

np.int64(0)

In [0]:
# Dimensions (rows, columns) followed by the column labels, one per line.
print(df_demographics.shape, df_demographics.columns, sep="\n")

(1000, 12)
Index(['student_id', 'student_name', 'gender', 'date_of_birth',
       'academic_year', 'disability_flag', 'annual_family_income',
       'parental_education', 'first_generation_student', 'urban_rural',
       'admission_type', 'institution_code'],
      dtype='object')


In [0]:
# Column dtypes and non-null counts. student_id is fully populated
# (1000 non-null) and, per the duplicate check on student_id, has no repeats.
df_demographics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   student_id                1000 non-null   object 
 1   student_name              1000 non-null   object 
 2   gender                    976 non-null    object 
 3   date_of_birth             1000 non-null   object 
 4   academic_year             1000 non-null   object 
 5   disability_flag           920 non-null    float64
 6   annual_family_income      912 non-null    float64
 7   parental_education        936 non-null    object 
 8   first_generation_student  1000 non-null   int64  
 9   urban_rural               963 non-null    object 
 10  admission_type            927 non-null    object 
 11  institution_code          1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [0]:
# Categorical columns: look for imbalanced categories and formatting issues
# (e.g. mixed casing). NaN is counted as its own category via dropna=False.
category_reports = [
    ("Gender Counts: \n", "gender"),
    ("Region Counts: \n", "urban_rural"),
    ("Academic Year Counts: \n", "academic_year"),
    ("Admission Type Counts:\n", "admission_type"),
]
for heading, column in category_reports:
    print(heading, df_demographics[column].value_counts(dropna=False))


Gender Counts: 
 gender
male      214
female    200
MALE      161
Female    138
FEMALE    134
Male      129
NaN        24
Name: count, dtype: int64
Region Counts: 
 urban_rural
urban    200
rural    187
URBAN    152
RURAL    147
Urban    143
Rural    134
NaN       37
Name: count, dtype: int64
Academic Year Counts: 
 academic_year
2024-2025    502
2023-2024    498
Name: count, dtype: int64
Admission Type Counts:
 admission_type
management     127
merit          126
reservation    119
RESERVATION    111
MANAGEMENT      98
MERIT           97
Merit           92
Management      82
Reservation     75
NaN             73
Name: count, dtype: int64


In [0]:
# Missing values per column, most affected columns first.
missing_per_column = df_demographics.isna().sum()
missing_per_column.sort_values(ascending=False)

annual_family_income        88
disability_flag             80
admission_type              73
parental_education          64
urban_rural                 37
gender                      24
student_id                   0
student_name                 0
date_of_birth                0
academic_year                0
first_generation_student     0
institution_code             0
dtype: int64

In [0]:
# Numerical variable: spread, minimum and maximum of annual family income.
income_summary = df_demographics["annual_family_income"].describe()
print(income_summary)


count    9.120000e+02
mean     2.528974e+06
std      1.415167e+06
min      5.296600e+04
25%      1.260898e+06
50%      2.562014e+06
75%      3.730788e+06
max      4.993542e+06
Name: annual_family_income, dtype: float64


In [0]:
# Filter students whose annual family income is below 150000.
# Rows with a missing income compare as False and are therefore excluded.
is_low_income = df_demographics["annual_family_income"].lt(150000)
low_income_students = df_demographics.loc[is_low_income]
low_income_students.shape

(24, 12)

## Attendance EDA


In [0]:
# Load the student attendance dataset into a DataFrame.
# files[...] already holds an absolute path, so it is passed directly:
# os.path.join ignores earlier components when a later one is absolute,
# which made joining it onto BASE_PATH a no-op.
df_attendance = load_csv(files["attendance"])
# load_csv returns None on failure; stop here with a clear message instead of
# failing in a later cell with an AttributeError on None.
assert df_attendance is not None, "student attendance dataset failed to load"

Loaded student_attendance.csv successfully


In [0]:
# Preview the first rows of the attendance frame.
df_attendance.head()

Unnamed: 0,student_id,academic_year,subject_code,attendance_percentage,participation_score
0,S001-24,2023-2024,SUB04,69.0,7.0
1,S002-25,2024-2025,SUB04,71.0,7.0
2,S002-25,2024-2025,SUB05,63.0,7.0
3,S004-24,2023-2024,SUB01,76.0,7.0
4,S177-24,2023-2024,SUB02,95.0,7.0


In [0]:
# Column dtypes and non-null counts for the attendance frame.
df_attendance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5025 entries, 0 to 5024
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   student_id             5025 non-null   object 
 1   academic_year          5025 non-null   object 
 2   subject_code           5025 non-null   object 
 3   attendance_percentage  4777 non-null   float64
 4   participation_score    4788 non-null   float64
dtypes: float64(2), object(3)
memory usage: 196.4+ KB


In [0]:
# (rows, columns) of the attendance frame.
df_attendance.shape


(5025, 5)

In [0]:
# Column labels of the attendance frame.
df_attendance.columns

Index(['student_id', 'academic_year', 'subject_code', 'attendance_percentage',
       'participation_score'],
      dtype='object')

In [0]:
# Attendance percentage distribution (summary statistics over non-null values).
# NOTE(review): the recorded output shows a maximum of 185, which is above
# 100 - such values look invalid for a percentage and likely need cleaning;
# confirm with the data owner.
df_attendance["attendance_percentage"].describe()


count    4777.000000
mean       66.225455
std        15.681625
min        30.000000
25%        53.000000
50%        67.000000
75%        79.000000
max       185.000000
Name: attendance_percentage, dtype: float64

In [0]:
# Attendance as a dropout risk factor: size of the subset of records with
# attendance below 60 (grain: one record per student per subject).
below_threshold = df_attendance["attendance_percentage"].lt(60)
df_attendance.loc[below_threshold].shape

(1722, 5)

In [0]:
# Number of missing values in each attendance column.
df_attendance.isna().sum() # checks missing values


student_id                 0
academic_year              0
subject_code               0
attendance_percentage    248
participation_score      237
dtype: int64

## Academics EDA


In [0]:
# Load the student academic performance dataset into a DataFrame.
# files[...] already holds an absolute path, so it is passed directly:
# os.path.join ignores earlier components when a later one is absolute,
# which made joining it onto BASE_PATH a no-op.
df_academics = load_csv(files["academics"])
# load_csv returns None on failure; stop here with a clear message instead of
# failing in a later cell with an AttributeError on None.
assert df_academics is not None, "student academic performance dataset failed to load"

Loaded student_academic_performance.csv successfully


In [0]:
# Preview the first rows of the academics frame.
# NOTE(review): the recorded preview shows internal_marks of 134.0 and 133.0
# next to values of 35.0 - possible outliers; confirm the valid mark range.
df_academics.head()

Unnamed: 0,student_id,academic_year,subject_code,subject_name,internal_marks,external_mark
0,S719-25,2024-2025,SUB04,Social Studies,134.0,25.0
1,S636-25,2024-2025,SUB05,Computer Science,133.0,34.0
2,S001-24,2023-2024,SUB03,English,35.0,44.0
3,S671-24,2023-2024,SUB04,Social Studies,35.0,50.0
4,S811-25,2024-2025,SUB04,Social Studies,35.0,50.0


In [0]:
# Column dtypes and non-null counts for the academics frame.
df_academics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5025 entries, 0 to 5024
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   student_id      5025 non-null   object 
 1   academic_year   5025 non-null   object 
 2   subject_code    5025 non-null   object 
 3   subject_name    5025 non-null   object 
 4   internal_marks  4778 non-null   float64
 5   external_mark   4765 non-null   float64
dtypes: float64(2), object(4)
memory usage: 235.7+ KB


In [0]:
# Dimensions (rows, columns) followed by the column labels, one per line.
print(df_academics.shape, df_academics.columns, sep="\n")

(5025, 6)
Index(['student_id', 'academic_year', 'subject_code', 'subject_name',
       'internal_marks', 'external_mark'],
      dtype='object')


In [0]:
# Summary statistics for external_mark (computed over non-null values).
df_academics["external_mark"].describe()


count    4765.000000
mean       31.718153
std        12.883891
min         5.000000
25%        21.000000
50%        32.000000
75%        43.000000
max        55.000000
Name: external_mark, dtype: float64

In [0]:
# Count of records with a missing external_mark.
df_academics["external_mark"].isna().sum() 

np.int64(260)

In [0]:
# Pandas filtering demonstration: records with weak academic performance,
# i.e. internal + external marks below 40. A record with either mark missing
# has a NaN sum and is therefore not selected.
combined_marks = df_academics["internal_marks"].add(df_academics["external_mark"])
low_marks = df_academics.loc[combined_marks.lt(40)]
low_marks

Unnamed: 0,student_id,academic_year,subject_code,subject_name,internal_marks,external_mark
1058,S009-24,2023-2024,SUB01,Mathematics,28.0,10.0
1059,S014-24,2023-2024,SUB02,Science,28.0,10.0
1060,S024-25,2024-2025,SUB01,Mathematics,28.0,10.0
1061,S028-25,2024-2025,SUB02,Science,28.0,10.0
1127,S773-24,2023-2024,SUB04,Social Studies,28.0,11.0
...,...,...,...,...,...,...
4773,S192-24,2023-2024,SUB03,English,0.0,29.0
4774,S655-25,2024-2025,SUB02,Science,0.0,12.0
4775,S860-25,2024-2025,SUB02,Science,0.0,14.0
4776,S450-25,2024-2025,SUB05,Computer Science,0.0,21.0


## Retention Data EDA


In [0]:
# Load the student retention history dataset into a DataFrame.
# files[...] already holds an absolute path, so it is passed directly:
# os.path.join ignores earlier components when a later one is absolute,
# which made joining it onto BASE_PATH a no-op.
df_retention = load_csv(files["retention"])
# load_csv returns None on failure; stop here with a clear message instead of
# failing in a later cell with an AttributeError on None.
assert df_retention is not None, "student retention history dataset failed to load"

Loaded student_retention_history.csv successfully


In [0]:
# Preview the first rows of the retention history frame.
df_retention.head()

Unnamed: 0,student_id,academic_year,dropout_flag,dropout_date,dropout_stage,dropout_reason,overall_attendance_percentage,family_income_band,academic_score,institution_code,age_at_enrollment,gender
0,H325-23,2022-2023,1,24-03-2022,Early,Academic,85.0,Low,55.0,INST01,24.0,female
1,H002-21,2020-2021,0,,,,33.0,Low,62.0,INST01,25.0,Male
2,H003-23,2022-2023,0,,,,64.0,Low,58.0,INST02,24.0,male
3,H1822-22,2021-2022,1,09-11-2021,Late,Academic,55.0,Low,55.0,INST01,21.0,male
4,H005-23,2022-2023,0,,,,61.0,Low,81.0,INST02,20.0,male


In [0]:
# (rows, columns) of the retention history frame.
df_retention.shape

(2000, 12)

In [0]:
# Column labels of the retention history frame.
df_retention.columns

Index(['student_id', 'academic_year', 'dropout_flag', 'dropout_date',
       'dropout_stage', 'dropout_reason', 'overall_attendance_percentage',
       'family_income_band', 'academic_score', 'institution_code',
       'age_at_enrollment', 'gender'],
      dtype='object')

In [0]:
# Retention distribution: number of records for each dropout_flag value.
df_retention["dropout_flag"].value_counts()

dropout_flag
1    1005
0     995
Name: count, dtype: int64

In [0]:
# Share of each dropout_flag value, expressed as a percentage.
(
    df_retention["dropout_flag"]
    .value_counts(normalize=True)
    .mul(100)
)


dropout_flag
1    50.25
0    49.75
Name: proportion, dtype: float64

## Grouping & Aggregation using Pandas

In [0]:
# Bucket annual family income into three bands. pd.cut uses right-closed
# intervals by default: (0, 150000] -> Low, (150000, 300000] -> Medium,
# (300000, 5000000] -> High. Missing incomes get a missing band.
df_demographics["income_band"] = pd.cut(
    df_demographics["annual_family_income"],
    bins=[0, 150000, 300000, 5000000],
    labels=["Low", "Medium", "High"]
)

# Socio-economic impact on attendance: mean attendance percentage per income band.
attendance_income = (
    df_attendance
    # student_id is unique in the demographics table, so this is a many-to-one
    # join; validate makes the merge fail loudly if that ever stops being true
    # (duplicate keys would silently multiply attendance rows).
    .merge(
        df_demographics[["student_id", "income_band"]],
        on="student_id",
        validate="many_to_one",
    )
    # income_band is categorical: pass observed explicitly to keep the current
    # results and avoid the pandas FutureWarning about the changing default.
    .groupby("income_band", observed=False)["attendance_percentage"]
    .mean()
    .reset_index()
)
attendance_income


  .groupby("income_band")["attendance_percentage"]


Unnamed: 0,income_band,attendance_percentage
0,Low,47.256637
1,Medium,65.561983
2,High,66.679845


In [0]:
# Total marks per record = internal + external (NaN when either part is missing).
df_academics["total_marks"] = df_academics["internal_marks"].add(
    df_academics["external_mark"]
)

# Identify difficult subjects: mean total marks per subject, lowest first.
mean_total_by_subject = df_academics.groupby("subject_name")["total_marks"].mean()
subject_performance = mean_total_by_subject.reset_index().sort_values("total_marks")
subject_performance


Unnamed: 0,subject_name,total_marks
0,Computer Science,49.830752
4,Social Studies,51.321389
1,English,51.802198
3,Science,51.947426
2,Mathematics,52.031694
