In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

<h1>Data Cleaning</h1>

In [None]:
# Load the raw dataset from the project's data folder.
df = pd.read_csv("../data/StudentPerformanceFactors.csv")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would also emit a stray "None".
df.info()

# Fill the gaps in the listed columns with each column's most frequent value.
missing_cols = ["Teacher_Quality", "Parental_Education_Level", "Distance_from_Home"]
for col in missing_cols:
    modes = df[col].mode()
    # mode() is empty when a column holds no values at all; there is nothing
    # to fill with in that case, so the column is left as it is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Remaining null count per column after the fill.
print(df.isnull().sum())

<h1>Feature Encoding</h1>

<h1>Ordinal and Binary Encoding (Kaustuv)</h1>

In [None]:
# Ordinal encoding for Parental_Involvement, Motivation_Level and
# Teacher_Quality: Low=1, Medium=2, High=3.
ordinal_priority_map = {"Low": 1, "Medium": 2, "High": 3}
low_medium_high_cols = ["Parental_Involvement", "Motivation_Level", "Teacher_Quality"]

for col in low_medium_high_cols:
    # Show the labels present in the column before it is touched.
    print(df[col].unique())
    # Series.map returns a numeric column directly, whereas dict-based
    # replace() depends on a downcast that pandas has deprecated. Labels that
    # are not in the mapping become NaN instead of passing through unnoticed.
    # Columns that are already numeric are skipped so that re-running this
    # cell does not wipe the encoded values.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(ordinal_priority_map)

# Only the final expression of a cell is rendered, so all three columns are
# sampled together in a single frame.
df[low_medium_high_cols].sample(3)

In [None]:
# Ordinal encoding:
#   Access_to_Resources, Family_Income -> Low=1, Medium=2, High=3
#   Parental_Education_Level           -> High School=1, College=2, Postgraduate=3
# ordinal_priority_map is the Low/Medium/High mapping created in an earlier cell.
resource_income_education_maps = {
    "Access_to_Resources": ordinal_priority_map,
    "Family_Income": ordinal_priority_map,
    "Parental_Education_Level": {"High School": 1, "College": 2, "Postgraduate": 3},
}

for col, mapping in resource_income_education_maps.items():
    # Show the labels present in the column before it is touched.
    print(df[col].unique())
    # Series.map yields a numeric column without relying on replace()'s
    # deprecated downcasting; unmapped labels become NaN. Already-numeric
    # columns are skipped so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

In [None]:
# Binary encoding for Extracurricular_Activities, Internet_Access and
# Learning_Disabilities: Yes=1, No=0.
ys = {"Yes": 1, "No": 0}
yes_no_cols = ["Extracurricular_Activities", "Internet_Access", "Learning_Disabilities"]

for col in yes_no_cols:
    # Show the labels present in the column before it is touched
    # (Learning_Disabilities is now inspected like the other two).
    print(df[col].unique())
    # Series.map yields a numeric column without relying on replace()'s
    # deprecated downcasting; unmapped labels become NaN. Already-numeric
    # columns are skipped so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(ys)

In [None]:
# Ordinal encoding:
#   Peer_Influence     -> Negative=1, Neutral=2, Positive=3
#   Distance_from_Home -> Near=1, Moderate=2, Far=3
peer_distance_maps = {
    "Peer_Influence": {"Negative": 1, "Neutral": 2, "Positive": 3},
    "Distance_from_Home": {"Near": 1, "Moderate": 2, "Far": 3},
}

for col, mapping in peer_distance_maps.items():
    # Show the labels present in the column before it is touched.
    print(df[col].unique())
    # Series.map yields a numeric column without relying on replace()'s
    # deprecated downcasting; unmapped labels become NaN. Already-numeric
    # columns are skipped so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

# Spot-check a few encoded rows.
df[["Peer_Influence", "Distance_from_Home"]].sample(3)

<h1>School Type and Gender Encoding (Anjal)</h1>

In [None]:
# Anjal's code: list the distinct labels found in School_Type and Gender.
for column_name in ("School_Type", "Gender"):
    print(df[column_name].unique())

In [None]:
# Binary encoding:
#   School_Type -> Public=0, Private=1
#   Gender      -> Female=0, Male=1
school_gender_maps = {
    "School_Type": {"Public": 0, "Private": 1},
    "Gender": {"Male": 1, "Female": 0},
}

for col, mapping in school_gender_maps.items():
    # Series.map yields a numeric column without relying on replace()'s
    # deprecated downcasting; unmapped labels become NaN. Already-numeric
    # columns are skipped so the cell can be re-run safely.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(mapping)

In [None]:
# Spot-check the School_Type and Gender columns, then preview the whole frame.
# Only a cell's final expression is rendered automatically, so the sample is
# printed explicitly and the preview is left as the last expression to get
# the notebook's table rendering.
print(df[["School_Type", "Gender"]].sample(3))
df.head()

<h2>Duplicate Removal</h2>

In [None]:
# Flag rows that repeat an earlier row in full, report how many there are,
# then drop them from df in place.
is_duplicate_row = df.duplicated()
duplicate_count = is_duplicate_row.sum()
print(f"I found {duplicate_count} duplicate rows.")
df.drop_duplicates(inplace=True)

<h2>Missing Value Audit</h2>

In [None]:
# Missing-value audit: null count per column, followed by an overall verdict.
missing_data = df.isnull().sum()
print("--- Missing Values Per Column ---", missing_data, sep="\n")

# Any non-zero count means at least one column still has gaps.
has_missing_values = bool(missing_data.any())
if has_missing_values:
    print("\n⚠️ Warning: Missing values detected. We need to fill or drop them.")
else:
    print("\n✅ Audit Complete: No missing values found. Data is safe for ML.")

<h1>Data Visualization</h1>

In [None]:
# Columns that were label-encoded earlier in the notebook.
encoded_cols = [
    "Parental_Involvement",
    "Motivation_Level",
    "Teacher_Quality",
    "Access_to_Resources",
    "Family_Income",
    "Parental_Education_Level",
    "Extracurricular_Activities",
    "Internet_Access",
    "Learning_Disabilities",
    "Peer_Influence",
    "Distance_from_Home",
    "School_Type",
    "Gender",
]

# Force the encoded columns to a numeric dtype. errors="coerce" turns every
# value that is not a number into NaN, so the number of values lost that way
# is reported instead of letting them disappear silently.
nulls_before = df[encoded_cols].isna().sum()
df[encoded_cols] = df[encoded_cols].apply(pd.to_numeric, errors="coerce")
newly_coerced = df[encoded_cols].isna().sum() - nulls_before
if newly_coerced.any():
    print("Warning: non-numeric values were coerced to NaN in these columns:")
    print(newly_coerced[newly_coerced > 0])
print(df[encoded_cols].dtypes)

sns.set_theme(style="whitegrid")

# Correlation heatmap over every numeric column. include="number" selects all
# numeric dtypes rather than only int64/float64.
numeric_df = df.select_dtypes(include="number")
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Student Factors")
plt.show()

# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would add a stray "None" line).
df.info()