In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Shared random seed: passed as random_state to the train/validation split
# and to both classifiers so reruns give the same results.
seed = 42

# Data preparation

In [3]:
def clean_df(df):
    """Return ``df`` unchanged.

    No cleaning is applied at the moment; the same object that was passed in
    is returned (not a copy).
    """
    return df

def prep_features(df):
    """Return a copy of ``df`` without the ``on_time`` and ``id`` columns.

    Either column may be absent from ``df``; missing names are skipped
    instead of raising. The input frame itself is not modified.
    """
    non_feature_columns = ["on_time", "id"]
    return df.drop(columns=non_feature_columns, errors="ignore")

In [4]:
# Load the training data and run it through the cleaning step.
df = clean_df(pd.read_csv("train_data.csv"))

# Feature frame: the cleaned data without the "on_time" / "id" columns.
df_train = prep_features(df)

In [5]:
# Preview the first rows of the feature frame.
df_train.head()

Unnamed: 0,distance_km,package_weight_kg,traffic_level
0,407.92,5.62,2
1,216.54,56.35,1
2,597.06,102.39,13
3,535.19,141.95,9
4,589.68,107.75,12


# Exploratory data analysis

In [6]:
# Summary statistics (count, mean, std, quartiles) for each feature column.
df_train.describe()

Unnamed: 0,distance_km,package_weight_kg,traffic_level
count,100.0,100.0,100.0
mean,416.2869,82.3575,7.99
std,213.156124,45.250488,4.013852
min,11.27,1.19,1.0
25%,219.615,33.28,4.0
50%,489.225,96.725,9.0
75%,588.1875,117.1475,11.0
max,698.91,149.09,14.0


# Models

In [7]:
# Hold out 20% of the rows for validation; the "on_time" column of the
# cleaned frame is the target. The split is seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    df["on_time"],
    test_size=0.2,
    random_state=seed,
)

In [8]:
def evaluate(clf, X=None, y=None, cv=3):
    """Cross-validate ``clf`` and return a pessimistic summary score.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``cross_val_score``.
    X, y : optional
        Features and target to cross-validate on. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used, so
        ``evaluate(clf)`` behaves exactly as before.
    cv : int, default 3
        Forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns
    -------
    float
        Mean of the per-fold scores minus their standard deviation, so a
        model with unstable fold scores is ranked below an equally accurate
        but more consistent one.
    """
    # Resolve the defaults at call time so the current split is always used.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    scores = cross_val_score(clf, X, y, cv=cv)
    return np.mean(scores) - np.std(scores)

In [9]:
# Fit a logistic regression on the training split (fit returns the estimator).
lr = LogisticRegression(random_state=seed).fit(X_train, y_train)

# Score on the held-out validation split.
print(lr.score(X_val, y_val))

# Cross-validated summary on the training split (mean minus std of fold scores).
evaluate(lr)

1.0


1.0

In [10]:
# Fit a random forest on the training split (fit returns the estimator).
rf = RandomForestClassifier(random_state=seed).fit(X_train, y_train)

# Score on the held-out validation split.
print(rf.score(X_val, y_val))

# Cross-validated summary on the training split (mean minus std of fold scores).
evaluate(rf)

1.0


1.0

In [11]:
# Model used for the submission predictions below: the fitted random forest.
clf = rf

# Submission

In [12]:
# Load the test data and apply the same cleaning and feature selection
# that was used for the training data.
test_df = clean_df(pd.read_csv("test_data.csv"))
features = prep_features(test_df)

In [13]:
# Attach the model's predictions and a copy of the id column under the
# name "datapointID" to the test frame.
test_df = test_df.assign(
    answer=clf.predict(features),
    datapointID=test_df["id"],
)

In [14]:
# Check column dtypes and non-null counts of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 50 non-null     int64  
 1   distance_km        50 non-null     float64
 2   package_weight_kg  50 non-null     float64
 3   traffic_level      50 non-null     int64  
 4   answer             50 non-null     int64  
 5   datapointID        50 non-null     int64  
dtypes: float64(2), int64(4)
memory usage: 2.5 KB


In [15]:
# Mean and sample standard deviation (ddof=1) of the test-set traffic level,
# each rounded to two decimals.
mean_traffic_level = round(test_df["traffic_level"].mean(), 2)
std_traffic_level = round(test_df["traffic_level"].std(ddof=1), 2)
mean_traffic_level, std_traffic_level

(8.02, 3.56)

In [16]:
# Subtasks 1 and 2: one row each, holding the traffic-level mean and
# standard deviation; both rows use datapointID 1.
subtasks12 = pd.DataFrame(
    {
        "subtaskID": [1, 2],
        "datapointID": [1, 1],
        "answer": [mean_traffic_level, std_traffic_level],
    }
)

# Subtask 3: one row per test datapoint with the model's prediction.
subtask3 = pd.DataFrame(
    {
        "datapointID": test_df["id"],
        "answer": clf.predict(features),
    }
)
subtask3.insert(0, "subtaskID", 3)

# Stack the two pieces row-wise; each piece keeps its own index labels.
submission = pd.concat([subtasks12, subtask3], axis=0)
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1,8.02
1,2,1,3.56
0,3,4,1.0
1,3,18,1.0
2,3,39,1.0


In [17]:
# Write the submission file; index=False leaves the row index out of the CSV.
submission.to_csv("submission.csv", index=False)