In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Read the three input CSV files in one pass: the training data, the test data,
# and the submission template that later cells fill in and write back out.
train_df, test_df, submission_df = map(
    pd.read_csv, ("train.csv", "test.csv", "submission.csv")
)

# Show the first rows of the training data as the cell output.
train_df.head()

Unnamed: 0,id,hour,temperature,precipitation,windspeed,humidity,visibility,ozone,pm10,pm2.5,count
0,3,20,16.3,1.0,1.5,89.0,576.0,0.027,76.0,33.0,49
1,6,13,20.1,0.0,1.4,48.0,916.0,0.042,73.0,40.0,159
2,7,6,13.9,0.0,0.7,79.0,1382.0,0.033,32.0,19.0,26
3,8,23,8.1,0.0,2.7,54.0,946.0,0.04,75.0,64.0,57
4,9,18,29.5,0.0,4.8,7.0,2000.0,0.057,27.0,11.0,431


In [2]:
# Summary statistics for every column except the first one (the row `id`).
train_df[train_df.columns[1:]].describe()

Unnamed: 0,hour,temperature,precipitation,windspeed,humidity,visibility,ozone,pm10,pm2.5,count
count,1459.0,1457.0,1457.0,1450.0,1457.0,1457.0,1383.0,1369.0,1342.0,1459.0
mean,11.493489,16.717433,0.031572,2.479034,52.231297,1405.216884,0.039149,57.168736,30.327124,108.5634
std,6.92279,5.23915,0.174917,1.378265,20.370387,583.131708,0.019509,31.771019,14.713252,82.631733
min,0.0,3.1,0.0,0.0,7.0,78.0,0.003,9.0,8.0,1.0
25%,5.5,12.8,0.0,1.4,36.0,879.0,0.0255,36.0,20.0,37.0
50%,11.0,16.6,0.0,2.3,51.0,1577.0,0.039,51.0,26.0,96.0
75%,17.5,20.1,0.0,3.4,69.0,1994.0,0.052,69.0,37.0,150.0
max,23.0,30.0,1.0,8.0,99.0,2000.0,0.125,269.0,90.0,431.0


In [3]:
# Percentage of missing values per column, rounded to two decimals.
train_df.isna().sum().div(len(train_df)).mul(100).round(2)

id               0.00
hour             0.00
temperature      0.14
precipitation    0.14
windspeed        0.62
humidity         0.14
visibility       0.14
ozone            5.21
pm10             6.17
pm2.5            8.02
count            0.00
dtype: float64

## Mean filling

In [4]:
# Mean-imputation baseline: fill missing values with per-column means, fit a
# random forest on the training rows, and write a submission file.
#
# The fill values come from the training set only and are reused for the test
# set, so both frames go through identical preprocessing and the test rows do
# not influence their own imputation.
train_means = train_df.mean()
train_df_mean_filling = train_df.fillna(train_means)
# fillna with a Series aligns on column labels; entries without a matching
# test column are ignored (assumes test.csv has the training columns minus the
# target — TODO confirm).
test_df_mean_filling = test_df.fillna(train_means)

# Column 0 is the row id and the last training column is the target (`count`).
X = train_df_mean_filling.iloc[:, 1:-1]
y = train_df_mean_filling.iloc[:, -1]

# Fixed seed so the fit, and therefore the submission, is reproducible.
model = RandomForestRegressor(random_state=42)

model.fit(X, y)
pred = model.predict(test_df_mean_filling.iloc[:, 1:])

# test RMSE: 45.63263
# (scored with test-set means and an unseeded forest; re-score after re-running)
submission_df['count'] = pred.round(2)
submission_df.to_csv('mean_filling.csv', index=False)

## Median filling

In [17]:
# Median-imputation baseline: fill missing values with per-column medians, fit
# a random forest on the training rows, and write a submission file.
#
# The fill values come from the training set only and are reused for the test
# set, so both frames go through identical preprocessing.
train_medians = train_df.median()
train_df_median_filling = train_df.fillna(train_medians)
# fillna with a Series aligns on column labels; entries without a matching
# test column are ignored (assumes test.csv has the training columns minus the
# target — TODO confirm).
test_df_median_filling = test_df.fillna(train_medians)

# Column 0 is the row id and the last training column is the target (`count`).
X = train_df_median_filling.iloc[:, 1:-1]
y = train_df_median_filling.iloc[:, -1]

# Fixed seed so the fit, and therefore the submission, is reproducible.
model = RandomForestRegressor(random_state=42)

model.fit(X, y)
# Predictions are used as-is. An unexplained `pred *= 0.8` rescaling used to be
# applied here and in no other section, which made this strategy's score
# incomparable with the mean / KNN / interpolation runs.
pred = model.predict(test_df_median_filling.iloc[:, 1:])

# test RMSE: 44.8944
# (scored with test-set medians, an unseeded forest, and possibly the 0.8
# rescaling; re-score after re-running)
submission_df['count'] = pred.round(2)
submission_df.to_csv('median_filling.csv', index=False)

## KNN imputation

In [6]:
from sklearn.impute import KNNImputer

# KNN-imputation baseline: each missing value is replaced by the uniform
# average of that feature over the 5 nearest training rows.
# NOTE(review): the imputer measures distances on the raw feature values, so
# large-magnitude columns (e.g. visibility, up to 2000) dominate small ones
# (e.g. ozone, up to 0.125) — consider scaling before imputing.
imputer = KNNImputer(n_neighbors=5, weights="uniform")

# Fit the imputer on the training features only (column 0 is the row id, the
# last column is the target); the same fitted imputer is reused for the test set.
X = imputer.fit_transform(train_df.iloc[:, 1:-1])
y = train_df.iloc[:, -1]

# Fixed seed so the fit, and therefore the submission, is reproducible.
model = RandomForestRegressor(random_state=42)

model.fit(X, y)
pred = model.predict(imputer.transform(test_df.iloc[:, 1:]))

# test RMSE: 45.16413
# (scored with an unseeded forest; re-score after re-running)
submission_df['count'] = pred.round(2)
submission_df.to_csv('KNNImputer_filling.csv', index=False)

## Interpolation

In [7]:
# Linear-interpolation baseline: fill each missing value from the neighbouring
# rows of the same column, fit a random forest, and write a submission file.
# NOTE(review): interpolation runs along row order; the rows do not look like a
# time series (hours are not consecutive), so confirm that neighbouring rows
# are a meaningful source for the fill values.
#
# limit_direction='both' also fills gaps at the very start of a column, which a
# forward-only pass would leave as NaN. DataFrame.interpolate returns a new
# frame, so train_df / test_df stay untouched.
interpolate_options = dict(method='linear', limit_direction='both', axis=0)
train_df_interpolate = train_df.interpolate(**interpolate_options)
test_df_interpolate = test_df.interpolate(**interpolate_options)

# Column 0 is the row id and the last training column is the target (`count`).
X = train_df_interpolate.iloc[:, 1:-1]
y = train_df_interpolate.iloc[:, -1]

# Guard: no missing values may reach the model after interpolation.
assert not X.isna().any().any(), "training features still contain NaN"
assert not test_df_interpolate.iloc[:, 1:].isna().any().any(), "test features still contain NaN"

# Fixed seed so the fit, and therefore the submission, is reproducible.
model = RandomForestRegressor(random_state=42)

model.fit(X, y)

pred = model.predict(test_df_interpolate.iloc[:, 1:])

# test RMSE: 46.2823
# (scored with forward-only interpolation and an unseeded forest; re-score
# after re-running)
submission_df['count'] = pred.round(2)
submission_df.to_csv('interpolate_filling.csv', index=False)