In [240]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

In [241]:
# Random seed handed to train_test_split (random_state) so the train/test
# partition is the same on every run.
seed = 42

# Data prep

In [242]:
def clean_df(df):
    """Return ``df`` unchanged.

    Placeholder hook for data cleaning: the notebook calls it on both the
    training and the test frame right after loading, but it currently performs
    no cleaning and hands back the very same object it received.
    """
    return df

def prep_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the numeric feature table that is fed to the model.

    Steps:
      1. Remove the identifier (``User_ID``) and the target (``Calories``)
         when present; frames lacking either column are accepted as well.
      2. Dummy-encode ``Gender`` with ``drop_first=True``. The raw column is
         replaced by its indicator column(s) (``Gender_male`` for the training
         data), which are appended after the remaining columns.
      3. Discard any object-dtype columns that are still left.

    The input frame is not modified; a new frame with the same index is
    returned.

    Note: the indicator columns are derived from the categories present in
    ``df`` itself, so a frame holding fewer than two distinct ``Gender`` values
    yields no indicator column at all.

    Raises:
        KeyError: if ``df`` has no ``Gender`` column.
    """
    df = df.drop(columns=["User_ID", "Calories"], errors="ignore")

    # dummy encoding
    dummy_cols = ["Gender"]
    # Passing `columns=` makes get_dummies swap each source column for its
    # indicators, so the raw text column is removed explicitly instead of
    # relying on it having "object" dtype (pandas' dedicated string dtype,
    # for instance, is not "object" and would otherwise survive the filter).
    df = pd.get_dummies(df, columns=dummy_cols, drop_first=True)

    return df.select_dtypes(exclude='object')

In [243]:
# Load the raw training table and pass it through the cleaning hook.
df = clean_df(pd.read_csv("train_data.csv"))

# Model inputs only; the target is still read from df["Calories"] later on.
df_train = prep_features(df)

In [244]:
# Preview the first rows of the prepared feature table.
df_train.head()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Gender_male
0,79,165,73,13,79,40.0,True
1,28,148,48,6,91,39.4,False
2,27,187,82,25,101,40.8,True
3,40,173,71,16,93,40.4,True
4,40,183,90,5,80,39.0,True


# EDA

In [245]:
# Correlation of every prepared feature with the target (Calories), listed from
# the strongest positive correlation downwards.
df_train.corrwith(df["Calories"]).sort_values(ascending=False)

Duration       0.955035
Heart_Rate     0.896451
Body_Temp      0.823176
Age            0.158377
Weight         0.039494
Height         0.020819
Gender_male    0.020050
dtype: float64

# Model selection

In [246]:
# Hold out 33% of the rows for testing; random_state=seed keeps the partition
# reproducible. Features come from df_train, the target from df["Calories"].
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df["Calories"], test_size=0.33, random_state=seed
)

In [247]:
# Expand the features with polynomial terms up to degree 3.
poly = PolynomialFeatures(3)
# X_train / X_test are overwritten with the expanded matrices below. Reading
# them back in as the input would make this cell non-idempotent: a second run
# would expand the already expanded matrix. The unexpanded splits are therefore
# rebuilt from df_train through the row labels kept on y_train / y_test, which
# selects exactly the rows produced by the split.
X_train = poly.fit_transform(df_train.loc[y_train.index])
X_test = poly.transform(df_train.loc[y_test.index])

In [248]:
def evaluate(model):
    """Score ``model`` by cross-validation and on the held-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Side effect: ``model`` is fitted in place on the whole training split, so
    the object passed in can be used for prediction afterwards.

    Returns a tuple ``(cv_mae, test_mae)``:
      * ``cv_mae`` -- mean 3-fold cross-validated MAE plus one standard
        deviation across the folds, i.e. a pessimistic estimate;
      * ``test_mae`` -- MAE of the fitted model on the test split.
    """
    neg_mae_per_fold = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='neg_mean_absolute_error',
        cv=3,
        n_jobs=-1,
    )
    # The scores are negated MAE (higher is better): subtracting the spread
    # gives the pessimistic end, and the sign is flipped back on return.
    pessimistic_neg_mae = neg_mae_per_fold.mean() - neg_mae_per_fold.std()

    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, test_predictions)

    return -pessimistic_neg_mae, test_mae

In [249]:
# Ridge regression (regularisation strength alpha=10) on the polynomial features.
lr = Ridge(alpha=10)

# Displays (pessimistic CV MAE, test MAE); the call also leaves lr fitted on
# the training split.
evaluate(lr)

(0.2688595660645371, 0.2573999419192032)

In [250]:
# Model used for the submission predictions. It was already fitted on the
# training split by evaluate() and is not refitted on the full data here.
model = lr

# Submission

In [251]:
# Load the test table and pass it through the same cleaning hook as the
# training data.
df_test = clean_df(pd.read_csv("test_data.csv"))

# prepare for subtask 5: keep its rows, drop the Subtask column, then apply the
# same feature preparation and polynomial expansion as for training.
df_test_5 = df_test[df_test["Subtask"] == 5].drop(columns=["Subtask"])
features_5 = poly.transform(prep_features(df_test_5))

# prepare for subtask 6: same pipeline, except that Gender_male is forced to
# True for every row before the polynomial expansion.
# NOTE(review): confirm that treating all subtask-6 rows as male is what the
# task statement asks for.
df_test_6 = df_test[df_test["Subtask"] == 6].drop(columns=["Subtask"])
features_6 = poly.transform(prep_features(df_test_6).assign(Gender_male=True))

In [252]:
# subtask 1: number of rows in the prepared training table
subtask1 = len(df_train)

# subtask 2: number of training rows flagged Gender_male
subtask2 = int(df_train["Gender_male"].sum())

# subtask 3: mean of the Duration column
subtask3 = df_train["Duration"].mean()

# subtask 4: number of training rows with an Age of 75 or more
subtask4 = int((df_train["Age"] >= 75).sum())

# subtask 5: model predictions for the subtask-5 test rows
subtask5 = model.predict(features_5)

# subtask 6: model predictions for the subtask-6 test rows
subtask6 = model.predict(features_6)

In [253]:
# Subtasks 1-4 are single scalar answers; each is reported under datapointID 1.
df1234 = pd.DataFrame(
    {
        "subtaskID": [1, 2, 3, 4],
        "datapointID": 1,
        "answer": [subtask1, subtask2, subtask3, subtask4],
    }
)

# Subtasks 5 and 6 hold one prediction per test row, listed next to that row's
# User_ID (the prediction arrays are in the same row order as the test frames).
df5, df6 = (
    pd.DataFrame({"subtaskID": task_id, "datapointID": rows["User_ID"], "answer": preds})
    for task_id, rows, preds in ((5, df_test_5, subtask5), (6, df_test_6, subtask6))
)

# Stack the three parts under a fresh 0..n-1 index and preview the result.
submission = pd.concat([df1234, df5, df6], ignore_index=True)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,9000.0
1,2,1,4443.0
2,3,1,15.510667
3,4,1,412.0
4,5,12618012,113.061036


In [254]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv("submission.csv", index=False)