In [829]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder

In [830]:
# Single random seed reused by the train/validation split and the Ridge model.
seed = 42

# Data preparation

In [831]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Cleaning hook applied right after loading a CSV.

    Currently a pass-through: the very same frame object is handed back
    without any modification.
    """
    cleaned = df
    return cleaned


def prep_features(df: pd.DataFrame):
    """Turn a raw student frame into a numeric/boolean feature frame.

    Steps, in order:
      1. Remove ``ID`` and ``Exam_Score`` when present, then ``Gender``.
      2. Fill gaps in three categorical columns with that column's most
         frequent value within ``df`` itself.
      3. Encode the ordered categoricals as 1, 2, 3 (lowest to highest level).
      4. One-hot encode every remaining string column, dropping the first
         level of each, and discard the original string columns.

    The input frame is left untouched; a new frame is returned.
    """
    features = df.drop(columns=["ID", "Exam_Score"], errors="ignore")

    # Treated as irrelevant for the model. Unlike the drop above, this one
    # raises if the column is absent.
    features = features.drop(columns=["Gender"])

    # Impute each of these columns with its own most frequent value.
    impute_cols = ["Distance_from_Home", "Parental_Education_Level", "Teacher_Quality"]
    features = features.assign(
        **{
            name: features[name].fillna(value=features[name].mode().values[0])
            for name in impute_cols
        }
    )

    # Ordered levels per column, lowest first. Dict order fixes the column
    # order handed to the encoder.
    low_to_high = ["Low", "Medium", "High"]
    level_order = {
        "Distance_from_Home": ["Near", "Moderate", "Far"],
        "Parental_Education_Level": ["High School", "College", "Postgraduate"],
        "Peer_Influence": ["Negative", "Neutral", "Positive"],
        "Teacher_Quality": low_to_high,
        "Family_Income": low_to_high,
        "Motivation_Level": low_to_high,
        "Access_to_Resources": low_to_high,
        "Parental_Involvement": low_to_high,
    }
    ordinal_cols = list(level_order)

    encoder = OrdinalEncoder(categories=list(level_order.values()))
    # The encoder emits 0-based codes; shift them so the lowest level is 1.
    features[ordinal_cols] = encoder.fit_transform(features[ordinal_cols]) + 1

    # Whatever is still a string column gets indicator columns, prefixed with
    # the source column name.
    text_cols = features.select_dtypes(include="object").columns
    indicators = pd.get_dummies(features[text_cols], prefix=text_cols, drop_first=True)

    # Append the indicators, then drop the original string columns.
    return pd.concat([features, indicators], axis=1).select_dtypes(exclude="object")

In [832]:
# Read the raw training data and pass it through the cleaning hook.
df = clean_df(pd.read_csv("train_data.csv"))

# Model-ready feature frame built from the training rows.
df_train = prep_features(df)

In [833]:
# Preview the first rows of the prepared training features.
df_train.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Sleep_Hours,Previous_Scores,Motivation_Level,Tutoring_Sessions,Family_Income,Teacher_Quality,Peer_Influence,Physical_Activity,Parental_Education_Level,Distance_from_Home,Extracurricular_Activities_Yes,Internet_Access_Yes,School_Type_Public,Learning_Disabilities_Yes
0,27,79,1.0,3.0,8,63,3.0,2,1.0,2.0,1.0,5,2.0,2.0,True,True,True,False
1,16,86,3.0,2.0,7,94,2.0,2,1.0,3.0,2.0,3,1.0,2.0,True,True,True,False
2,22,87,1.0,2.0,8,83,1.0,1,1.0,2.0,2.0,1,2.0,3.0,False,True,True,False
3,18,100,3.0,2.0,10,86,2.0,1,2.0,2.0,2.0,3,1.0,1.0,True,True,True,False
4,35,78,3.0,1.0,10,99,2.0,1,1.0,2.0,3.0,2,1.0,1.0,True,True,False,False


In [834]:
# Spot-check one of the ordinally encoded columns.
df_train["Parental_Involvement"]

0       1.0
1       3.0
2       1.0
3       3.0
4       3.0
       ... 
5280    2.0
5281    2.0
5282    3.0
5283    3.0
5284    2.0
Name: Parental_Involvement, Length: 5285, dtype: float64

# Exploratory data analysis

In [835]:
# Target column, taken from the raw training frame.
label = df["Exam_Score"]

# Correlation of every prepared feature with the target, largest value first.
df_train.corrwith(label).sort_values(ascending=False)

Attendance                        0.579000
Hours_Studied                     0.436510
Access_to_Resources               0.174277
Previous_Scores                   0.172939
Tutoring_Sessions                 0.156122
Parental_Involvement              0.154670
Parental_Education_Level          0.107924
Peer_Influence                    0.096389
Family_Income                     0.095601
Teacher_Quality                   0.077356
Extracurricular_Activities_Yes    0.076524
Motivation_Level                  0.075882
Internet_Access_Yes               0.050502
Physical_Activity                 0.035173
School_Type_Public               -0.016727
Sleep_Hours                      -0.023253
Distance_from_Home               -0.088037
Learning_Disabilities_Yes        -0.091616
dtype: float64

In [836]:
# Total number of missing values remaining across all prepared feature columns.
df_train.isna().sum().sum()

0

# Models

In [837]:
# Hold out 20% of the training rows for validation; the split is seeded so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_train, df["Exam_Score"], test_size=0.2, random_state=seed
)

In [838]:
def evaluate(model):
    """Score ``model`` with 3-fold cross-validation and on the hold-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Returns a ``(cv, mae)`` tuple:
      * ``cv``  - mean cross-validated MAE plus one standard deviation of the
        fold scores (a pessimistic estimate; lower is better).
      * ``mae`` - MAE on the validation split after fitting on ``X_train``.

    Side effect: ``model`` is fitted on ``X_train`` / ``y_train`` in place.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    # Fold scores are negated MAEs, so subtracting the spread moves the
    # estimate towards the worse side.
    pessimistic_score = fold_scores.mean() - fold_scores.std()

    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    holdout_mae = mean_absolute_error(y_val, val_predictions)

    # Flip the sign back so both returned numbers are plain MAEs.
    return -pessimistic_score, holdout_mae

In [839]:
# Ridge regression baseline (alpha=0.1), seeded with the notebook-wide seed.
lr = Ridge(alpha=0.1, random_state=seed)

# Displays the (cross-validation score, validation MAE) tuple returned by evaluate().
evaluate(lr)

(0.5425846171054398, 0.5068719343379974)

In [840]:
# Use the Ridge baseline as the model that produces the submitted predictions.
model = lr

# Fit on the training split only (X_train / y_train); the validation rows are not used here.
# NOTE(review): consider refitting on all of df_train before predicting the test set - confirm intent.
model.fit(X_train, y_train)

# Submission

In [841]:
# Read the raw test data and pass it through the cleaning hook.
df_test = clean_df(pd.read_csv("test_data.csv"))

# Run the test rows through the same feature preparation as the training rows.
features = prep_features(df_test)

In [842]:
# Subtask 1: absolute difference between each test student's study hours and
# the mean study hours of the training set.
mean_hours = df["Hours_Studied"].mean()
subtask1 = (df_test["Hours_Studied"] - mean_hours).abs()

# Subtask 2: "True"/"False" strings flagging test students with fewer than 7
# hours of sleep.
subtask2 = (df_test["Sleep_Hours"] < 7).map({True: "True", False: "False"})

# Subtask 3: for each test student, the number of training students whose
# previous score is greater than or equal to theirs. Sorting the training
# scores once and binary-searching each test score avoids rescanning the whole
# training column for every test row.
train_scores_sorted = np.sort(df["Previous_Scores"].dropna().to_numpy())
n_at_or_above = len(train_scores_sorted) - np.searchsorted(
    train_scores_sorted, df_test["Previous_Scores"].to_numpy(), side="left"
)
subtask3 = pd.Series(n_at_or_above, index=df_test.index, name="Previous_Scores")

# Subtask 4: number of training students sharing each test student's
# motivation level. A level that never occurs in the training data has a count
# of 0 instead of raising a KeyError.
motiv = df["Motivation_Level"].value_counts()
subtask4 = df_test["Motivation_Level"].map(motiv).fillna(0).astype("int64").to_numpy()

# Subtask 5: exam-score predictions from the fitted model.
subtask5 = model.predict(features)

In [843]:
def build_subtask(subtask_id, answer):
    """Build the submission rows for a single subtask.

    Args:
        subtask_id: Identifier written to every row of the ``subtaskID`` column.
        answer: One answer per test row, in the same order as ``df_test``.

    Returns:
        A DataFrame with the columns ``subtaskID``, ``datapointID`` (taken from
        the notebook-level ``df_test["ID"]``) and ``answer``.
    """
    columns = {
        "subtaskID": subtask_id,
        "datapointID": df_test["ID"],
        "answer": answer,
    }
    return pd.DataFrame(columns)


# Pair every answer collection with its 1-based subtask number.
subtasks = list(
    zip((subtask1, subtask2, subtask3, subtask4, subtask5), range(1, 6))
)

# Stack the per-subtask frames into one long table with a fresh 0..n-1 index.
submission = pd.concat(
    [build_subtask(subtask_id=sid, answer=ans) for ans, sid in subtasks],
    ignore_index=True,
)

submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,5286,0.032923
1,1,5287,2.032923
2,1,5288,1.032923
3,1,5289,7.967077
4,1,5290,1.032923


In [844]:
# Write the combined answers to disk without the DataFrame index column.
submission.to_csv("submission.csv", index=False)