In [None]:
# 必要なモジュールをインポートする
import numpy as np
import pandas as pd
import warnings
import time
warnings.simplefilter('ignore')
import math
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# Pick the compute device once: the first CUDA GPU if PyTorch can see one, else the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print('使用デバイス：', device)

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2022/train.csv',
        '../input/tabular-playground-series-mar-2022/test.csv',
    )
)

In [None]:
# Quick look at the training table: (rows, columns) first, then the first ten rows.
train_shape = train_df.shape
print(train_shape)
train_df.head(n=10)

In [None]:
# Same quick look for the test table: shape, then the first five rows.
test_shape = test_df.shape
print(test_shape)
test_df.head(n=5)

In [None]:
# Use row_id as the index of both tables.
train_df = train_df.set_index('row_id')
test_df = test_df.set_index('row_id')
train_df.head()

In [None]:
# Move the target column out of the feature table into its own one-column frame.
train_df_y = train_df.pop('congestion').to_frame()
train_df_y.head()

In [None]:
# Stack train on top of test so feature engineering is applied to both at once;
# ntrain remembers where to cut them apart again.
ntrain = len(train_df)
all_data = pd.concat([train_df, test_df])
print(all_data.shape)
all_data.head()

In [None]:
# Inspect the combined table: column names, dtypes and non-null counts
all_data.info()

In [None]:
# Parse the time column into datetime values, then confirm the new dtype.
all_data = all_data.assign(time=pd.to_datetime(all_data['time']))
all_data.info()

In [None]:
# Helper used to add a weekday column
def get_weekday_jp(dt):
    """Return the English weekday name ('Monday' ... 'Sunday') for ``dt``.

    ``dt`` must provide a ``weekday()`` method returning 0 for Monday
    through 6 for Sunday (e.g. ``datetime.date`` or ``pandas.Timestamp``).
    """
    day_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    day_index = dt.weekday()
    return day_names[day_index]
# Derive calendar features from the timestamp: weekday name, hour and minute.
timestamps = all_data["time"]
all_data["weekday"] = timestamps.map(get_weekday_jp)
all_data['hour'] = timestamps.dt.hour
all_data['minutes'] = timestamps.dt.minute
all_data.head(n=10)

In [None]:
# One-hot encode the weekday name and append the indicator columns to the table.
week_day = pd.get_dummies(all_data['weekday'])
all_data = pd.concat((all_data, week_day), axis='columns')
all_data.head()

In [None]:
# Build one identifier per road: "<x>_<y>_<direction>".
x_text = all_data["x"].astype(str)
y_text = all_data["y"].astype(str)
all_data["key"] = x_text + "_" + y_text + "_" + all_data["direction"]
print(all_data['key'].unique())
all_data.head()

In [None]:
# Re-check columns and dtypes after the feature columns were added
all_data.info()

In [None]:
# Lag feature: each road's congestion at its own previous observation.
# The table holds many roads, so the shift is done within each road ("key");
# a plain shift() over the whole table would take the previous ROW, which
# belongs to a different road.
train_keys = all_data.loc[train_df_y.index, 'key']
con_yestarday = (
    train_df_y['congestion']
    .groupby(train_keys)
    .shift()
    .rename('congestion_pre')
    .to_frame()
)
all_data = pd.concat([all_data, con_yestarday], axis=1)
congestion_pre_mean = all_data['congestion_pre'].mean()
print(congestion_pre_mean)
# The first observation of every road has no predecessor: fill those training
# rows with the overall mean. Test rows have no label and stay NaN.
rows_without_lag = con_yestarday.index[con_yestarday['congestion_pre'].isna()]
all_data.loc[rows_without_lag, 'congestion_pre'] = congestion_pre_mean
all_data.head()

In [None]:
# Check missing values: percentage of nulls per column, keeping only columns
# that have any, worst first (at most 30 columns).
null_pct = all_data.isnull().sum() / len(all_data) * 100
all_data_na = null_pct[null_pct != 0].sort_values(ascending=False).iloc[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
missing_data.head(22)

In [None]:
# Cut the combined table back into its train and test parts (first ntrain rows are train).
train_df = all_data.iloc[:ntrain]
test_df = all_data.iloc[ntrain:]
print(train_df.shape, test_df.shape)
train_df.head()

In [None]:
# Plot the first 210 observations of every (x, y, direction) combination,
# one panel per combination on an 18 x 4 grid.
x_num = 3
y_num = 4
directions = ['EB', 'NB', 'SB', 'WB', 'NE', 'SW']
combos = [(i, j, d) for i in range(3) for j in range(4) for d in directions]

fig = plt.figure(figsize=(18, 60))
axes = []
for panel_no, (i, j, d) in enumerate(combos, start=1):
    ax = fig.add_subplot(x_num * 6, y_num, panel_no)
    axes.append(ax)
    on_road = (train_df['x'] == i) & (train_df['y'] == j) & (train_df['direction'] == d)
    idx = train_df[on_road].index.tolist()
    first_rows = idx[:210]
    ax.plot(train_df['time'][first_rows], train_df_y['congestion'][first_rows])
    ax.set_title(str(i) + '_' + str(j) + '_' + d)
graphs_count = len(axes) + 1
fig.subplots_adjust(wspace=0.3, hspace=0.2)
fig.autofmt_xdate(rotation=45)
plt.show()

In [None]:
# Split the training data into a part for fitting and a part for validation.
# The split is by row order: first 80% for fitting, last 20% for validation.
length = len(train_df)
train_size = int(length * 0.8)
test_size = length - train_size
X_train, X_test = train_df.iloc[:train_size], train_df.iloc[train_size:]
y_train, y_test = train_df_y.iloc[:train_size], train_df_y.iloc[train_size:]
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)
X_train.head()

In [None]:
from torch.utils.data.sampler import SubsetRandomSampler
class MyDataset(torch.utils.data.Dataset):
    """Array-backed dataset whose items are selected by an array of row indices.

    Args:
        X: indexable feature container (e.g. a 2-D numpy array); ``X[idx]``
            is returned as the item's data.
        sequence_num: window length; stored but not used by this class.
        y: optional labels. Required when ``mode == 'train'``.
        mode: ``'train'`` returns ``(data, label)``; any other value returns
            only ``data``.
    """

    def __init__(self, X, sequence_num, y=None, mode='train'):
        self.data = X
        self.teacher = y
        self.sequence_num = sequence_num
        self.mode = mode

    def __len__(self):
        # Label-free datasets (inference) have no teacher to measure, so
        # fall back to the feature container instead of raising TypeError.
        source = self.data if self.teacher is None else self.teacher
        return len(source)

    def __getitem__(self, idx):
        """Return the rows ``idx`` and, in train mode, the label that follows them.

        ``idx`` is a sequence of row indices; the label is taken from the
        row right after the last index (``idx[-1] + 1``).
        """
        out_data = self.data[idx]
        if self.mode != 'train':
            return out_data
        if self.teacher is None:
            raise ValueError("mode='train' requires labels: pass y=... to MyDataset")
        out_label = self.teacher[idx[-1] + 1]
        return out_data, out_label
def create_dataset(dataset, dataset_num, sequence_num, input_size, batch_size):
    """Wrap ``dataset`` in a DataLoader that yields batches of sliding windows.

    One window of ``sequence_num`` consecutive row indices is created for
    every start position in ``range(dataset_num - sequence_num)``. The
    windows are handed to the DataLoader as its sampler, in order (no
    shuffling). ``input_size`` is accepted but not used.
    """
    window_starts = np.arange(dataset_num - sequence_num)
    offsets = np.arange(sequence_num)
    # Row k of the sampler is [k, k+1, ..., k+sequence_num-1].
    sampler = window_starts[:, None] + offsets[None, :]
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)

# ---------- Sanity check: pull one batch of windows from random data ----------
sequence_num = 10
X = np.random.rand(1000, 5)
y = np.random.rand(1000, 1)

dataset = MyDataset(X, sequence_num, y=y)
dataloader = create_dataset(dataset, X.shape[0], sequence_num, X.shape[1], 32)
# Only the first batch is needed to confirm the tensor shapes.
tup = next(iter(dataloader))
print('---------')
print(tup[0].shape, tup[1].shape)
print(X[-2], y[-1])
# ------------------------------------------------------------------------------

In [None]:
class LSTM(nn.Module):
    """Single-layer LSTM regressor with scikit-learn style ``fit`` / ``predict``.

    The network reads windows shaped (batch, sequence, feature) and maps the
    final hidden state through a linear layer; ``forward`` returns the first
    output unit, i.e. one value per window.

    Attributes:
        hidden_cell: the (h, c) state tuple passed to ``nn.LSTM``. ``fit`` and
            ``predict`` reset it to zeros before every forward pass.
        logs_train: ``[epochs, losses]``; ``losses`` starts with ``inf`` so
            the first epoch always counts as an improvement.
        logs_valid: same layout as ``logs_train``; not written by this class.
        stdsc / stdsc_y: scalers fitted in ``fit`` for features / targets.
    """

    def __init__(self, input_size=5, hidden_layer_size=100,
                 output_size=1, batch_size = 32):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.batch_size = batch_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size, batch_first=True)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = self._zero_state(self.batch_size, torch.device('cpu'))
        self.logs_train = [[], [np.inf]]
        self.logs_valid = [[], [np.inf]]
        self.stdsc = StandardScaler()
        self.stdsc_y = StandardScaler()

    def _zero_state(self, batch_size, device):
        """Return a fresh all-zero (h, c) state for ``batch_size`` sequences."""
        shape = (1, batch_size, self.hidden_layer_size)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))

    def forward(self, input_seq):
        """Run the LSTM over ``input_seq`` and return one value per sequence.

        Args:
            input_seq: float tensor shaped (batch, sequence, feature).

        Returns:
            1-D tensor of length ``batch`` (first output unit of the linear head).
        """
        batch_size = input_seq.shape[0]
        h_prev = self.hidden_cell[0]
        # A stored state that does not fit this input (other batch size or
        # device) cannot be fed to nn.LSTM; start from zeros instead of crashing.
        if h_prev.shape[1] != batch_size or h_prev.device != input_seq.device:
            self.hidden_cell = self._zero_state(batch_size, input_seq.device)
        # nn.LSTM with batch_first=True expects (batch, sequence, feature).
        _, self.hidden_cell = self.lstm(input_seq, self.hidden_cell)
        predictions = self.linear(self.hidden_cell[0].view(batch_size, -1))
        return predictions[:, 0]

    def fit(self, X, y, num_epochs=50, sequence_num=10, batch_size=32):
        """Standardize ``X`` / ``y`` and train on sliding windows of ``sequence_num`` rows.

        Args:
            X: 2-D array (rows, features).
            y: 2-D array (rows, 1); the first column is the regression target.
            num_epochs: number of passes over the windows.
            sequence_num: window length.
            batch_size: windows per optimizer step.

        Side effects:
            Fits ``self.stdsc`` / ``self.stdsc_y``, moves the model to the GPU
            when available, appends to ``self.logs_train`` (mean batch loss per
            epoch) and writes the state dict to ``./models`` whenever the
            epoch loss improves on the best so far.
        """
        # Use the GPU when available
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print('使用デバイス：', device)
        # Standardize features and targets
        X = self.stdsc.fit_transform(X)
        y = self.stdsc_y.fit_transform(y)
        dataset_size, input_size = X.shape[0], X.shape[1]
        dataset = MyDataset(X, y=y, sequence_num=sequence_num, mode='train')
        dataloader = create_dataset(dataset, dataset_size, sequence_num, input_size, batch_size)
        # Optimizer and loss
        optimizer = torch.optim.Adagrad(self.parameters(), lr=0.01)
        criterion = nn.MSELoss()
        self.to(device)
        self.train()
        # Let cuDNN pick the fastest kernels for the fixed input shapes
        torch.backends.cudnn.benchmark = True
        for epoch in range(num_epochs):
            t_epoch_start = time.time()
            epoch_loss = 0.0
            num_batches = 0
            print('-------------')
            print('Epoch {}/{}'.format(epoch + 1, num_epochs))
            print('-------------')
            print(' (train) ')
            for data, targets in dataloader:
                data = data.to(device, dtype=torch.float32)
                # Targets arrive as (batch, 1) while forward() returns (batch,).
                # Without flattening, MSELoss would broadcast the pair to
                # (batch, batch) and compare every prediction with every target.
                targets = targets.to(device, dtype=torch.float32)
                targets = targets.reshape(len(targets), -1)[:, 0]
                optimizer.zero_grad()
                # Every window is independent: start from a zero state.
                self.hidden_cell = self._zero_state(len(data), device)
                output = self(data)
                loss = criterion(output, targets)
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                num_batches += 1
            # criterion already averages inside a batch, so average over batches.
            mean_loss = epoch_loss / max(num_batches, 1)
            t_epoch_finish = time.time()
            print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch + 1, mean_loss))
            print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
            # Checkpoint only when this epoch is the best so far
            if mean_loss < min(self.logs_train[1]):
                print('--save model--')
                torch.save(self.state_dict(), './models')
            self.logs_train[0].append(epoch + 1)
            self.logs_train[1].append(mean_loss)

    def predict(self, X, sequence_num=10):
        """Predict step by step, feeding each prediction back into the next row.

        For every start position ``date`` the window ``X[date:date+sequence_num]``
        is fed to the network; the output is stored and also written into the
        LAST feature column of row ``date + sequence_num`` (in standardized
        units), so later windows see predicted instead of given values there.

        Args:
            X: 2-D array (rows, features) in original units; the first
                ``sequence_num`` rows act as history.
            sequence_num: window length.

        Returns:
            Array shaped (len(X) - sequence_num, 1) in original target units.
        """
        # Use the GPU when available
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print('使用デバイス：', device)
        self.to(device)
        self.eval()
        # transform() returns the scaled values; the rows overwritten below
        # belong to this local array (assumes the scaler's default copy=True).
        X = self.stdsc.transform(X)
        num_valid = len(X)
        y_pred = []
        with torch.no_grad():  # inference only: do not build autograd graphs
            for date in range(num_valid - sequence_num):
                window = np.array(X[date:date + sequence_num], dtype=np.float32)
                data = torch.from_numpy(window).unsqueeze(0).to(device)
                self.hidden_cell = self._zero_state(1, device)
                output = self(data).cpu().numpy()
                X[date + sequence_num][-1] = output[0]
                y_pred.append(output[0])

        y_pred = np.array(y_pred, dtype=float).reshape(-1, 1)
        return self.stdsc_y.inverse_transform(y_pred)

# ---------- Sanity check: push one random batch through an untrained model ----------
model = LSTM(input_size=7)
data = torch.from_numpy(np.random.rand(32, 10, 7).astype(np.float32)).clone()
# Zero state sized for this batch: (1, batch, hidden).
state_shape = (1, len(data), model.hidden_layer_size)
model.hidden_cell = (torch.zeros(state_shape), torch.zeros(state_shape))
model(data)
# ------------------------------------------------------------------------------------

In [None]:
def train():
    """Fit one LSTM per road and report its hold-out MSE.

    Reads the notebook globals ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``device``. For every distinct ``key`` in ``X_train`` a
    model is trained on that road's rows, evaluated on the road's hold-out
    rows (score printed, actual vs. predicted plotted) and stored.

    Returns:
        dict mapping each road key to its fitted ``LSTM``.
    """
    epochs = 20
    batch_size = 16
    sequence_num = 16
    categorical_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'hour', 'minutes', 'congestion_pre']
    # One model per road
    model_dict = {}
    for road in X_train['key'].unique():
        print("############################  {}  #################################".format(road))
        model_i = LSTM(input_size=len(categorical_columns))
        model_i.to(device)
        train_rows = X_train['key'] == road
        test_rows = X_test['key'] == road
        X_train_np = X_train[train_rows][categorical_columns].values
        y_train_np = y_train[train_rows].values
        # The hold-out set is preceded by the last sequence_num training rows:
        # predict() consumes the first sequence_num rows as history, so the
        # history must come BEFORE the rows being predicted, and the targets
        # after the first sequence_num entries are then the true hold-out values.
        X_test_np = np.append(X_train_np[-1*sequence_num:],
                              X_test[test_rows][categorical_columns].values, axis=0)
        y_test_np = np.append(y_train_np[-1*sequence_num:],
                              y_test[test_rows].values, axis=0)
        print(X_train_np.shape, y_train_np.shape)
        print(X_test_np.shape)
        # Train
        model_i.fit(X_train_np, y_train_np, num_epochs=epochs, sequence_num=sequence_num, batch_size=batch_size)
        # Predict the hold-out rows and score them
        pred_y = model_i.predict(X_test_np, sequence_num)
        y_true = y_test_np[sequence_num:]
        score = mean_squared_error(y_true, pred_y)
        print(score)
        plt.plot(list(range(len(y_true))), y_true)
        plt.plot(list(range(len(pred_y))), pred_y)
        plt.title(road)
        plt.show()
        model_dict[road] = model_i
    return model_dict
# Run the training; model_dict holds whatever train() returns
model_dict = train()

In [None]:
# Save every per-road model object to a file named "model_<key>" in the working directory.
for load, model in ((road_key, model_dict[road_key]) for road_key in X_train['key'].unique()):
    torch.save(model, "model_" + load)

In [None]:
# Build the predictions for the submission
sequence_num = 36
categorical_columns = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'hour', 'minutes', 'congestion_pre']
# One slot per test row, addressed by the test index rather than by an assumed
# row order; rows whose road has no model keep the initial 0.
pred_series = pd.Series(0.0, index=test_df.index)
for load in X_train['key'].unique():
    # Load the model saved for this road.
    # NOTE: torch.load unpickles a whole module object; only load files this
    # notebook wrote itself.
    model_path = 'model_'+load
    model_i = torch.load(model_path)
    train_rows = train_df['key'] == load
    test_rows = test_df['key'] == load
    # np.array(...) makes a private, writable float copy of the road's test rows.
    X_submit = np.array(test_df.loc[test_rows, categorical_columns], dtype=float)
    print(X_submit.shape)
    # Seed the first test row's lag with THIS road's last observed congestion
    # (not the last row of the whole training table, which is another road).
    X_submit[0][-1] = train_df_y.loc[train_rows, 'congestion'].iloc[-1]
    # Prepend the road's last sequence_num training rows as history for predict().
    history = np.array(train_df.loc[train_rows, categorical_columns], dtype=float)[-1*sequence_num:]
    X_submit = np.append(history, X_submit, axis=0)
    print(X_submit.shape)
    # Predict
    pred_y_i = model_i.predict(X_submit, sequence_num)
    print(pred_y_i.shape)
    # Write the road's predictions back to exactly the rows they belong to;
    # a length mismatch raises instead of silently misplacing values.
    pred_series.loc[test_rows] = pred_y_i.ravel()
# Keep the (rows, 1) layout of the prediction array.
pred_y = pred_series.to_numpy().reshape(-1, 1)
print(pred_y)

In [None]:
# Read the competition's sample submission; the predictions are filled into it below.
sample_submission_path = '../input/tabular-playground-series-mar-2022/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path)
submission_df.head()

In [None]:
# Put the predictions into the congestion column (aligned on the default 0..n-1 index).
congestion_frame = pd.DataFrame(pred_y)
submission_df['congestion'] = congestion_frame
submission_df.head()

In [None]:
# Write the submission file: header row included, DataFrame index omitted
submission_df.to_csv("./submit.csv", index=False, header=True)