In [None]:
# Download the competition files with the Kaggle CLI.
# NOTE(review): load_data() reads ./bike-sharing-demand/train.csv and test.csv, so the
# download presumably has to be extracted into that folder first — confirm.
!kaggle competitions download -c bike-sharing-demand

In [None]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torch.utils.data as data_utils


import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split

import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import seaborn as sns
from IPython.display import clear_output, display

# Print NumPy floats with two digits of precision.
np.set_printoptions(precision=2)
# Default size for matplotlib figures created without an explicit figsize.
plt.rcParams['figure.figsize'] = (14.0, 10.0)


In [None]:
def load_data(preprocess = True):
    """Load the bike-sharing train/test CSVs and derive calendar features.

    Reads ``./bike-sharing-demand/train.csv`` and ``./bike-sharing-demand/test.csv``,
    parses the ``datetime`` column and adds ``year``, ``day``, ``month``, ``hour``
    and ``dayofweek`` columns to both frames.

    Parameters
    ----------
    preprocess : bool, default True
        When True, additionally drop the ``registered``/``casual`` columns, and,
        for a frame that has a ``count`` column, remove rows whose ``count`` lies
        3 or more standard deviations from the mean and replace ``count`` by
        ``log(count + 1)``.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
    """
    train_df = pd.read_csv('./bike-sharing-demand/train.csv')
    test_df = pd.read_csv('./bike-sharing-demand/test.csv')

    def _add_datetime_cols(df):
        """Parse ``datetime`` and add the derived calendar columns (in place)."""
        df['datetime'] = pd.to_datetime(df['datetime'])
        stamps = df['datetime'].dt
        df['year'] = stamps.year
        df['day'] = stamps.day
        df['month'] = stamps.month
        df['hour'] = stamps.hour
        df['dayofweek'] = stamps.dayofweek

        # 'datetime' deliberately stays a regular column: the previous
        # `df.set_index('datetime')` discarded its result (a no-op), and the
        # submission cell reads test_df['datetime'].
        return df

    def _preprocess(df):
        """Drop train-only columns, remove count outliers and log-transform count."""
        # drop columns which don't exist in test-set
        df = df.drop(columns=['registered', 'casual'], errors='ignore')

        if 'count' in df.columns:
            print('Removing outliers')
            cutoff_factor = 3
            print('Before removal', df.shape, df['count'].mean(), df['count'].std(), df['count'].max(), df['count'].min())
            deviation = (df['count'] - df['count'].mean()).abs()
            keep = deviation < cutoff_factor * df['count'].std()
            # .loc + reset_index yields a fresh frame, so the assignment below
            # does not write into a slice of the original.
            df = df.loc[keep].reset_index(drop=True)
            print('After removal', df.shape, df['count'].mean(), df['count'].std(), df['count'].max(), df['count'].min())

            print('Log transform applied to count column')
            # Use the literal column name: the global `target_col` is only
            # defined in a later cell, so relying on it made this function
            # fail on a fresh kernel unless that cell had already run.
            df['count'] = np.log1p(df['count'])

        return df

    train_df = _add_datetime_cols(train_df)
    test_df = _add_datetime_cols(test_df)

    if preprocess:
        train_df = _preprocess(train_df)
        test_df = _preprocess(test_df)

    return train_df, test_df

# Load the frames without preprocessing (no column drops, outlier removal or
# log transform) so the exploration cells below see the original values.
train_df, test_df = load_data(preprocess=False)
print(train_df.shape, test_df.shape)

In [None]:
# Missing values per column, training frame first, then the test frame.
for frame in (train_df, test_df):
    print(frame.isnull().sum(axis = 0))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Summary statistics of the training frame, rounded to two decimals.
display(train_df.describe().round(2))

# Disabled per-season breakdown (kept for reference):
# for s in range(1, 5):
#     display(round(df[train_df['season'] == s].describe(), 2))

### Histograms for each column, correlation map, pairwise plots

In [None]:
# Histogram of every column. DataFrame.hist creates its own figure, so no
# plt.figure() call precedes it (that only produced an extra empty figure).
train_df.hist(bins=15, color='magenta', edgecolor='black', grid=False)

# Correlation heatmap. Restrict to numeric columns explicitly so the parsed
# 'datetime' column never reaches corr(); older pandas dropped it silently,
# newer versions no longer do that by default.
plt.figure()
corr = train_df.select_dtypes(include='number').corr()
sns.heatmap(corr.round(2), annot=True)

# Pairwise relationships between the target and a few selected features.
sns.pairplot(train_df[['count', 'temp', 'atemp', 'hour', 'year']])

### Explore target variable

- outliers
- skewed distribution
- log transformed distribution

In [None]:
# Histogram and box plot of the raw target, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
train_df['count'].plot.hist(bins=100, ax=ax[0])
train_df['count'].plot.box(ax=ax[1])

# Density of the raw target ...
plt.figure()
sns.distplot(train_df['count'])

# ... and of the target after the log(count + 1) transform.
plt.figure()
sns.distplot(np.log(train_df['count'] + 1))

###  Windspeed

- windspeed has a suspicious amount of zeros and no values between 0 and 6
- smoothing the distribution with a rolling average removes some of the zeros

In [None]:

# Distribution of the raw windspeed values.
plt.figure()
sns.distplot(train_df['windspeed'])

# Bar chart of windspeed for 6-9 January 2011 to inspect the zeros up close.
plt.figure()
early_january_2011 = (
    (train_df['month'] == 1)
    & (train_df['year'] == 2011)
    & (train_df['day'] > 5)
    & (train_df['day'] < 10)
)
train_df.loc[early_january_2011, 'windspeed'].plot(kind='bar')

# Number of rows per distinct windspeed value.
display(train_df.groupby('windspeed')['count'].count())

# Trailing 3-row rolling mean of windspeed. The previous version added the raw
# windspeed to this mean, which roughly doubled the scale instead of smoothing.
train_df['rolling_windspeed'] = train_df['windspeed'].rolling(3, center=False, min_periods=1).mean()
plt.figure()
sns.distplot(train_df['rolling_windspeed'])

### Month == season 

In [None]:
# Mean count per (x, hue) combination, one figure per pairing.
# x/y are passed by keyword: recent seaborn releases reject them positionally,
# while keywords work on old and new versions alike.
for x_col, hue_col in [('year', 'season'), ('year', 'month'), ('workingday', 'hour')]:
    plt.figure()
    sns.barplot(x=x_col, y='count', hue=hue_col, data=train_df, ci=None)

### Check high correlation between temp and atemp

In [None]:
# jointplot builds its own figure, so no plt.figure() call is needed before it
# (the earlier call only left an extra empty figure in the output).
sns.jointplot(x='temp', y='atemp', data=train_df)

In [None]:
# Box plot of `count` grouped by month.
sns.boxplot(x='month', y='count', data=train_df)

In [None]:
# Violin plot of `count` grouped by month.
sns.violinplot(x='month', y='count', data=train_df)

## Training

In [None]:
# Batch size of the training DataLoader.
bs = 256
# Batch size of the validation DataLoader.
# NOTE(review): the name looks like a typo for `valid_bs` — confirm before renaming.
valid_vs = 4096
# Fraction of the training rows held out for validation by train_test_split.
test_size = 0.2

# Column roles: regression target, continuous features (z-scored by normalize)
# and categorical features (expanded by one_hot_encode).
target_col = ['count']
cont_cols = ['windspeed', 'humidity', 'temp']
cat_cols = ['holiday', 'workingday', 'year', 'weather', 'month', 'dayofweek', 'hour']

# Reload with preprocessing enabled (load_data's default); this replaces the
# un-preprocessed frames used by the exploration cells.
train_df, test_df = load_data()

def one_hot_encode(df, cat_cols):
    """Replace each categorical column by its one-hot (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cat_cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    (encoded_df, cat_cols_one_hot)
        ``encoded_df`` has every column of ``cat_cols`` removed and its dummy
        columns (named ``<col>_<value>``) appended, in ``cat_cols`` order.
        ``cat_cols_one_hot`` is a list with one list of dummy-column names per
        encoded column.
    """
    cat_cols_one_hot = []
    for col in cat_cols:
        # Pin an integer dtype: newer pandas emits bool dummies by default, and
        # a frame mixing float and bool columns yields an object array from
        # `.values`, which torch.tensor cannot convert.
        one_hot = pd.get_dummies(df[col], prefix=col, dtype=np.uint8)
        cat_cols_one_hot.append(one_hot.columns.tolist())
        df = df.drop(columns=col).join(one_hot)

    return df, cat_cols_one_hot

def normalize(df, cont_cols, normalization_params = None):
    """Z-score the continuous columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; a normalized copy is returned.
    cont_cols : list of str
        Columns to normalize.
    normalization_params : dict, optional
        ``{"mean": ..., "std": ...}`` as returned by an earlier call. When
        given (and non-empty) these statistics are used instead of the ones
        computed from ``df``, e.g. to apply training statistics to test data.

    Returns
    -------
    (normalized_df, params)
        ``params`` is ``{"mean": mean, "std": std}`` with the statistics that
        were applied.
    """
    if normalization_params:
        mean = normalization_params['mean']
        std = normalization_params['std']
    else:
        mean = df[cont_cols].mean()
        std = df[cont_cols].std()

    # Work on a copy so the caller's frame is not silently rewritten.
    normalized = df.copy()
    normalized[cont_cols] = (df[cont_cols] - mean) / std

    return normalized, { "mean": mean, "std": std }

def delog(x):
    """Invert the ``log(count + 1)`` target transform.

    Parameters
    ----------
    x : torch.Tensor or array-like
        Values on the log scale. Tensors are detached and moved to the CPU
        first, so tensors that require grad or live on another device work too.

    Returns
    -------
    numpy.ndarray
        ``exp(x) - 1`` with singleton dimensions squeezed out and negative
        results clipped to 0.
    """
    if isinstance(x, torch.Tensor):
        x = x.detach().cpu().numpy()
    values = np.asarray(x).squeeze()
    # expm1 is the exact inverse of log1p and stays accurate close to zero.
    return np.clip(np.expm1(values), 0, None)

def select_features(df, cont_cols, cat_cols_one_hot):
    """Return the model input columns of ``df``.

    The result holds the continuous columns first, followed by the one-hot
    columns of every group in ``cat_cols_one_hot`` in the order given.
    """
    one_hot_names = []
    for group in cat_cols_one_hot:
        one_hot_names.extend(group)

    return df[cont_cols + one_hot_names]

# Fixed seed so the train/validation split, the loader shuffling and any later
# weight initialisation are reproducible across runs.
SEED = 42
torch.manual_seed(SEED)

# One-hot encode both frames. The column groups of the TRAINING frame define the
# feature layout; the test-set groups are discarded so they cannot overwrite it.
train_df, cat_cols_one_hot = one_hot_encode(train_df, cat_cols)
test_df = one_hot_encode(test_df, cat_cols)[0]

# A category level that only occurs in the training data leaves the test frame
# without that dummy column; add it as all zeros so both frames share the same
# feature columns. Levels that only occur in the test data are dropped by
# select_features below.
for one_hot_col in (name for group in cat_cols_one_hot for name in group):
    if one_hot_col not in test_df.columns:
        test_df[one_hot_col] = 0

# Normalise with training statistics and reuse them for the test frame.
train_df, normalization_params = normalize(train_df, cont_cols)
test_df = normalize(test_df, cont_cols, normalization_params)[0]

train_features_df = select_features(train_df, cont_cols, cat_cols_one_hot)
train_target_df = train_df[target_col]

test_features_df = select_features(test_df, cont_cols, cat_cols_one_hot)

x_train, x_valid, y_train, y_valid = train_test_split(
    train_features_df, train_target_df, test_size=test_size, random_state=SEED
)
x_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
x_valid_tensor = torch.tensor(x_valid.values, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32)

x_test_tensor = torch.tensor(test_features_df.values, dtype=torch.float32)

train_data = data_utils.TensorDataset(x_train_tensor, y_train_tensor)
train_loader = data_utils.DataLoader(train_data, batch_size=bs, shuffle=True)

# Validation batches are only evaluated, so their order does not need shuffling.
valid_data = data_utils.TensorDataset(x_valid_tensor, y_valid_tensor)
valid_loader = data_utils.DataLoader(valid_data, batch_size=valid_vs, shuffle=False)

print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape, x_test_tensor.shape)

In [None]:
# Summary statistics of the target frame, followed by a preview of the
# feature frame that is fed to the network.
display(train_target_df.describe())
train_features_df.head()

In [None]:
import math

class Model(nn.Module):
    """Feed-forward regressor with one linear "embedding" per categorical feature.

    Expected input layout (the one produced by ``select_features``): the
    continuous columns first, followed by the one-hot columns of every
    categorical group in order. Each one-hot group is projected by its own
    ``nn.Linear``; the projections and the continuous columns are concatenated
    and passed through the head. The output is ``8 * sigmoid(...)``, i.e. it is
    bounded to the open interval (0, 8).

    Parameters
    ----------
    one_hot_groups : list of list of str, optional
        One list of one-hot column names per categorical feature. Defaults to
        the notebook-level ``cat_cols_one_hot``.
    continuous_cols : list of str, optional
        Names of the continuous columns. Defaults to the notebook-level
        ``cont_cols``.
    """

    def __init__(self, one_hot_groups=None, continuous_cols=None):
        super().__init__()
        # Fall back to the notebook globals so that plain `Model()` keeps working.
        if one_hot_groups is None:
            one_hot_groups = cat_cols_one_hot
        if continuous_cols is None:
            continuous_cols = cont_cols

        self.emb_config = []
        self.cat_cols_one_hot = one_hot_groups
        self.cont_cols = continuous_cols

        self.emb_sizes = []

        for c in one_hot_groups:
            # Column names look like "<feature>_<value>"; the part before the
            # first underscore names the sub-module.
            name = c[0].split('_')[0]
            in_features = len(c)
            out_features = math.ceil(len(c)/2)+1

            self.emb_sizes.append(out_features)
            self.emb_config.append({
                "name": name,
                "out_features": out_features,
                "in_features": in_features,
            })

            setattr(self, name, nn.Sequential(
                nn.Linear(in_features, out_features)
            ))

        head_input_features = sum(self.emb_sizes) + len(continuous_cols)

        self.head = nn.Sequential(
            nn.BatchNorm1d(head_input_features),
            nn.Linear(head_input_features, 64),
            nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

        # 1-D parameters get a constant, everything else Kaiming-uniform.
        # NOTE(review): the 1-D branch also covers the BatchNorm1d weight, so the
        # batch-norm scale starts at 0.01 — confirm this is intended.
        for n, l in self.named_parameters():
            if l.dim() == 1:
                nn.init.constant_(l, 0.01)
            else:
                nn.init.kaiming_uniform_(l, mode='fan_out', nonlinearity='relu')

    def forward(self, x):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` predictions."""
        # The continuous columns occupy the FIRST len(cont_cols) positions, so
        # the one-hot groups start right after them. Starting the slices at 0
        # fed continuous values into the first embedding and one-hot columns
        # into the continuous slot.
        n_cont = len(self.cont_cols)
        offset = n_cont
        pieces = []

        for config in self.emb_config:
            emb_layer = getattr(self, config['name'])
            width = config['in_features']
            pieces.append(emb_layer(x[:, offset:offset + width]))
            offset += width

        # Same concatenation order as before: embeddings first, continuous last.
        pieces.append(x[:, :n_cont])
        x = torch.cat(pieces, dim=1)
        x = self.head(x)

        return x * 8 # limit the network output

# Instantiate the network and the loss: mean squared error, averaged over the batch.
model = Model()
loss_fn = nn.MSELoss(reduction='mean')

In [None]:
# Set to False to keep the statistics of earlier runs of this cell and append to them.
RESET_TRAINING_STATS = True
epochs = 25
lr = 1e-4  # only the optimizer's starting value; the schedule overrides it at epoch 0

# Step schedule: the epochs are split evenly across these learning rates.
lrs = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-2)

stat_columns = ["epoch", "lr", "train_loss", "valid_loss", "r2"]
if RESET_TRAINING_STATS:
    training_stats = pd.DataFrame(columns=stat_columns)

epoch_records = []
prev_lr_idx = -1
for epoch in range(epochs):
    lr_idx = int(epoch/epochs*len(lrs))

    if lr_idx != prev_lr_idx:
        lr = lrs[lr_idx]
        prev_lr_idx = lr_idx
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    # Make sure dropout/batch-norm are in training mode even if an evaluation
    # cell left the model in eval mode before this cell was (re-)run.
    model.train()
    running_loss, seen = 0.0, 0
    for xt, yt in train_loader:
        yt_hat = model(xt)
        loss = loss_fn(yt_hat, yt)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a sample-weighted mean instead of keeping only the loss of
        # the last mini-batch.
        running_loss += loss.item() * len(xt)
        seen += len(xt)
    train_loss = running_loss / max(seen, 1)

    # Evaluate on the whole validation set, not just on its last batch.
    model.eval()
    with torch.no_grad():
        valid_preds, valid_targets = [], []
        for xv, yv in valid_loader:
            valid_preds.append(model(xv))
            valid_targets.append(yv)
        yv_hat = torch.cat(valid_preds)
        yv = torch.cat(valid_targets)
        valid_loss = loss_fn(yv_hat, yv).item()
        valid_r2 = sklearn.metrics.r2_score(yv.numpy(), yv_hat.numpy())
    model.train()

    epoch_records.append({
        "epoch": epoch,
        "lr": lr,
        "train_loss": train_loss,
        "valid_loss": valid_loss,
        "r2": valid_r2,
    })

# Build the statistics frame once: DataFrame.append no longer exists in current
# pandas, and growing a frame row by row is quadratic anyway.
new_stats = pd.DataFrame(epoch_records, columns=stat_columns)
if training_stats.empty:
    training_stats = new_stats
else:
    training_stats = pd.concat([training_stats, new_stats], ignore_index=True, sort=False)

display(training_stats)

if len(training_stats) > 1:
    fig, ax = plt.subplots()
    training_stats.plot(kind='line', x='epoch', y=['train_loss', 'valid_loss'], ax=ax)

In [None]:
# Score the complete validation set (all batches, not only the last one).
model.eval()
with torch.no_grad():
    batch_preds, batch_targets = [], []
    for xv, yv in valid_loader:
        batch_preds.append(model(xv))
        batch_targets.append(yv)

yv_hat = torch.cat(batch_preds)
yv = torch.cat(batch_targets)

loss = loss_fn(yv_hat, yv)
print('valid_loss: ', loss.item())
print('r2: ', sklearn.metrics.r2_score(yv.numpy(), yv_hat.numpy()))

yv_hat = yv_hat.numpy()
yv = yv.numpy()

print(yv.shape, yv_hat.shape)

# Per-row comparison on the (log-transformed) target scale.
df = pd.DataFrame(
    data=np.concatenate([yv_hat, yv, (yv_hat - yv)**2], axis = 1),
    columns=['predicted', 'actual', 'squared_error'],
)

# Show a preview rather than the entire frame.
df.head(20)

### Submission (test set)

In [None]:
# Predict on the test features without tracking gradients and map the
# predictions back from the log scale via delog.
model.eval()
with torch.no_grad():
    y_hat = delog(model(x_test_tensor))

# One row per test timestamp, written in the datetime/count layout.
df = pd.DataFrame(data = {"datetime": test_df['datetime'], "count": y_hat })
df.to_csv('./submission.csv', index=False)

In [None]:
# Upload submission.csv to the competition with the Kaggle CLI; -m is the submission message.
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "log transfomed target, trained with valid, fixed inverse log transform"