In [179]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

In [180]:
seed = 42

# Data prep

In [181]:
def clean_df(df):
    return df

def prep_features(df: pd.DataFrame):
    df = df.drop(["ID", "deliver_time"], axis=1, errors='ignore')

    # dummy encoding
    dummy_cols = ['Weather']
    dummies = pd.get_dummies(df[dummy_cols], dummy_cols, drop_first=True)
    df = pd.concat([df, dummies], axis=1)

    df = df.select_dtypes(exclude='object')
    return df

In [182]:
df = pd.read_csv("train_data.csv")
df = clean_df(df)

df_train = prep_features(df)

In [183]:
df_train.head()

Unnamed: 0,Distance,Time of Day,Traffic,Road Quality,Driver Experience,Weather_Fog,Weather_Rain,Weather_Snow
0,352,452,154.014691,370,30,True,False,False
1,519,1386,949.697532,701,2,False,False,False
2,457,91,387.019309,45,26,True,False,False
3,447,1120,130.544017,643,6,False,False,False
4,201,1096,619.557737,375,20,False,False,False


# EDA

In [184]:
df_train.corrwith(df["deliver_time"]).sort_values(ascending=False)

Distance             0.996613
Traffic              0.064308
Time of Day          0.024611
Weather_Snow         0.013303
Weather_Fog          0.010947
Weather_Rain         0.004844
Road Quality        -0.020194
Driver Experience   -0.045516
dtype: float64

# Model selection

In [185]:
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df["deliver_time"], test_size=0.33, random_state=seed
)

In [186]:
def evaluate(model):
    scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
    cv = scores.mean() - scores.std()

    model.fit(X_train, y_train)
    score = mean_absolute_error(y_test, model.predict(X_test))

    return -cv, score

In [187]:
lr = LinearRegression()

evaluate(lr)

(1.6660701635019008, 1.629248707934777)

In [188]:
model = lr

# Submission

In [189]:
df_test = pd.read_csv("test_data.csv")
df_test = clean_df(df_test)

features = prep_features(df_test)

In [190]:
# subtask 1
subtask1 = len(df_test[(df_test["City A"] == "Barlad") & (df_test["Weather"] == "Fog")]["Distance"])

# subtask 2
subtask2 = model.predict(features)

In [191]:
df1 = pd.DataFrame({
    "subtaskID": 1, "datapointID": 1, "answer": subtask1
}, index=[0])

df2 = pd.DataFrame({"subtaskID": 2, "datapointID": df_test["ID"], "answer": subtask2})

submission = pd.concat([df1, df2], ignore_index=True)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,15.0
1,2,10001,386.828929
2,2,10002,243.00493
3,2,10003,480.903382
4,2,10004,248.494537


In [192]:
submission.to_csv("submission.csv", index=False)