In [None]:
import numpy as np 
import pandas as pd 
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

In [None]:
# pd.set_option('display.max_columns',None)

In [None]:
%%time
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
df = df.astype(np.float16)
df.drop(['row_id','time_id'],axis=1,inplace=True)
print(df.shape)
df.head()

In [None]:
# %%time
# submission = pd.read_csv('../input/ubiquant-market-prediction/example_sample_submission.csv')
# submission

In [None]:
# df.describe()

In [None]:
# temp = df.groupby('investment_id')['row_id'].count().reset_index()
# sns.histplot(x=temp['investment_id'],bins=50)

In [None]:
# # target correlation
# corr_data = df[df.keys()]
# cmap = plt.cm.PuBu
# cols = corr_data.corr().nlargest(50,'target')['target'].index
# cm = np.corrcoef(df[cols].values.T)
# f,ax = plt.subplots(figsize=(25,15))
# heatmap = sns.heatmap(cm,vmax=1,linewidths=0.1,square=True,annot=True,cmap=cmap,linecolor='white',
#                      xticklabels=cols.values,yticklabels=cols.values)

In [None]:
# Detach the id and target columns from `df` (pop removes them from the frame),
# then preview the first rows of each series.
investment_id, target = df.pop('investment_id'), df.pop('target')
print(investment_id.head(), target.head(), sep='\n')

In [None]:
# Force a garbage-collection pass; the integer it returns is shown as the cell output.
gc.collect()

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torch.utils.data import SubsetRandomSampler
# from torchsummary import summary

In [None]:
class CustomDataset(Dataset):
    """Index-addressable wrapper that serves rows as float32 tensors.

    In ``mode='train'`` each item is a ``(features, label)`` pair; in any
    other mode only the feature tensor is returned and no label is stored.
    """

    def __init__(self,data,label=None,mode='train'):
        self.mode = mode
        self.data = data
        # Labels are only kept when training; other modes never read them.
        if mode == 'train':
            self.label = label

    def __len__(self):
        """Number of rows in the wrapped data."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return row ``idx`` as float32 tensor(s), depending on the mode."""
        features = torch.tensor(self.data[idx],dtype=torch.float32)
        if self.mode != 'train':
            return features
        target_value = torch.tensor(self.label[idx],dtype=torch.float32)
        return features,target_value

In [None]:
# Wrap the remaining columns of `df` and the target series as one training dataset.
train_set = CustomDataset(
    data=df.to_numpy(),
    label=target.to_numpy(),
)
# valid_set = CustomDataset(data=x_valid.values,investment_id,label=y_valid.values)
# test_set = CustomDataset(data=X_test,label=None,mode='test')

In [None]:
# del train,target
# Force a garbage-collection pass; the integer it returns is shown as the cell output.
gc.collect()

In [None]:
class DNN(nn.Module):
    """Fully connected regression network.

    Nine ``Linear -> BatchNorm1d -> Hardswish`` stages
    (1024, 1024, 1024, 512, 512, 256, 128, 128, 64 units) followed by a
    single-unit linear head. Dropout is applied after the odd-numbered
    stages (1, 3, 5, 7, 9).

    Args:
        in_features: Width of the input feature vector. Defaults to 300,
            the value that used to be hard-coded.
        dropout: Drop probability shared by every dropout application.
            Defaults to 0.25.

    Layer attribute names (``fc1``..``fc9``, ``bn1``..``bn9``, ``output``)
    are kept stable so existing checkpoints still match.
    """

    def __init__(self,in_features=300,dropout=0.25):
        super(DNN,self).__init__()
        self.fc1 = nn.Linear(in_features,1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024,1024)
        self.bn2 = nn.BatchNorm1d(1024)
        self.fc3 = nn.Linear(1024,1024)
        self.bn3 = nn.BatchNorm1d(1024)
        self.fc4 = nn.Linear(1024,512)
        self.bn4 = nn.BatchNorm1d(512)
        self.fc5 = nn.Linear(512,512)
        self.bn5 = nn.BatchNorm1d(512)
        self.fc6 = nn.Linear(512,256)
        self.bn6 = nn.BatchNorm1d(256)
        self.fc7 = nn.Linear(256,128)
        self.bn7 = nn.BatchNorm1d(128)
        self.fc8 = nn.Linear(128,128)
        self.bn8 = nn.BatchNorm1d(128)
        self.fc9 = nn.Linear(128,64)
        self.bn9 = nn.BatchNorm1d(64)
        self.output = nn.Linear(64,1)

        # Parameter-free modules shared by every stage. The attribute is called
        # `swish` for historical reasons but the activation is Hardswish.
        self.swish = nn.Hardswish()
        self.dropout = nn.Dropout(dropout)

    def forward(self,input):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` predictions."""
        x = self.dropout(self.swish(self.bn1(self.fc1(input))))
        x = self.swish(self.bn2(self.fc2(x)))
        x = self.dropout(self.swish(self.bn3(self.fc3(x))))
        x = self.swish(self.bn4(self.fc4(x)))
        x = self.dropout(self.swish(self.bn5(self.fc5(x))))
        x = self.swish(self.bn6(self.fc6(x)))
        x = self.dropout(self.swish(self.bn7(self.fc7(x))))
        x = self.swish(self.bn8(self.fc8(x)))
        x = self.dropout(self.swish(self.bn9(self.fc9(x))))
        return self.output(x)

In [None]:
# del train_set,valid_set
# Force a garbage-collection pass; the integer it returns is shown as the cell output.
gc.collect()

In [None]:
# Loader over the complete training set with no sampler, in batches of 1024.
# NOTE(review): the cross-validation cell rebinds `train_loader` for every fold,
# so this instance looks unused by the training itself; confirm before relying on it.
train_loader = DataLoader(
    dataset=train_set,
    batch_size=1024,
    num_workers=0,
)

In [None]:
# 5-fold cross-validated training with early stopping.
# After the cell finishes, `train_loss` / `valid_loss` hold the per-epoch history
# of the last fold trained, which is what the plotting cell reads.
epochs = 50
device = ('cuda' if torch.cuda.is_available() else 'cpu')
# Mixed precision only applies on GPU; with enabled=False both autocast and
# GradScaler turn into no-ops, so the same code path also runs on CPU.
use_amp = device == 'cuda'
# Fixed seed so the fold assignment is reproducible across kernel restarts.
kf = KFold(n_splits=5,shuffle=True,random_state=42)
criterion = nn.MSELoss()
for fold, (train_idx,valid_idx) in enumerate(kf.split(train_set)):
    print(f'Fold:{fold+1}')
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    train_loader = DataLoader(train_set,batch_size=1024,sampler=train_sampler,num_workers=0)
    valid_loader = DataLoader(train_set,batch_size=1024,sampler=valid_sampler,num_workers=0)

    # One fresh model / optimizer / scaler per fold (not per epoch).
    model = DNN()
    model.to(device)
    optimizer = torch.optim.Adamax(model.parameters(),lr=3e-4)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    train_loss, valid_loss = [], []
    valid_min_loss = np.inf
    # The counter restarts at 1 on every improvement and training stops once it
    # reaches 6, i.e. after five consecutive epochs without improvement.
    patience = 1
    for e in range(epochs):
        # ---- training pass ----
        model.train()
        running_train = 0.0
        for data,label in tqdm(train_loader):
            data = data.to(device)
            label = label.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(data)
                # The dataset yields scalar labels, so a batch is (B,) while the
                # model emits (B, 1). Align them explicitly, otherwise MSELoss
                # broadcasts the pair to (B, B) and optimises the wrong quantity.
                loss = torch.sqrt(criterion(logits,label.reshape(logits.shape)))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            running_train += loss.item()
        # Epoch metric is the mean of the per-batch RMSE values.
        train_loss.append(running_train / len(train_loader))

        # ---- validation pass ----
        model.eval()
        running_valid = 0.0
        with torch.no_grad():
            for data,label in tqdm(valid_loader):
                data = data.to(device)
                label = label.to(device)
                logits = model(data)
                loss = torch.sqrt(criterion(logits,label.reshape(logits.shape)))
                running_valid += loss.item()
        valid_loss.append(running_valid / len(valid_loader))

        print('Epochs:{}\tTrain Loss:{:.3f}\tValidation Loss:{:.3f}'.format(
            e+1,train_loss[-1],valid_loss[-1]))

        if valid_loss[-1] < valid_min_loss:
            print('Validation Loss is decreased {:.3f} ---> {:.3f}'.format(
                valid_min_loss,valid_loss[-1]))
            valid_min_loss = valid_loss[-1]
            # Checkpoint format is unchanged: the whole module is pickled.
            torch.save(model,f'CNN2RNN_{fold}.pt')
            patience = 1
        else:
            patience += 1
            if patience >= 6:
                print('model train Meet earlystopping... So End Training\tBest RMSE:{:.3f}'.format(
                    valid_min_loss))
                break

In [None]:
# Plot the loss curves left behind by the training cell. The explicit Axes
# interface is used so that title and axis labels are attached to this figure.
fig, ax = plt.subplots(figsize=(15,10))
ax.plot(train_loss,label='Train Loss')
ax.plot(valid_loss,label='Validation Loss')
# The training cell reports sqrt(MSE), hence the RMSE label on the y axis.
ax.set(title='Train vs. validation loss',xlabel='Epoch',ylabel='RMSE')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Competition inference API: build the environment and obtain the test iterator.
# `iter_test` is consumed by the inference loop, which unpacks each item as a
# (test_df, sample_prediction_df) pair and submits through `env.predict`.
# NOTE(review): presumably the environment can only be created once per kernel
# session - confirm against the competition API docs before re-running this cell.
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()

In [None]:
def ensemble(test_loader, n_folds=5):
    """Average the predictions of the per-fold checkpoints over ``test_loader``.

    Args:
        test_loader: Iterable yielding feature batches (tensors) only, e.g. a
            DataLoader over a ``CustomDataset`` built with ``mode='test'``.
        n_folds: Number of checkpoints to combine; files are read from
            ``CNN2RNN_{i}.pt`` for ``i`` in ``range(n_folds)``.

    Returns:
        1-D numpy array with one averaged prediction per input row, in the
        order the loader yields them.

    Raises:
        Whatever ``torch.load`` raises when a checkpoint file is missing or
        unreadable; ``ValueError`` from ``np.concatenate`` if the loader
        yields no batches.
    """
    device = ('cuda' if torch.cuda.is_available() else 'cpu')
    fold_predictions = []
    for i in range(n_folds):
        # NOTE(review): on PyTorch releases where torch.load defaults to
        # weights_only=True, loading a fully pickled module needs
        # weights_only=False - confirm against the installed version.
        checkpoint = torch.load(f'CNN2RNN_{i}.pt',map_location=device)
        if isinstance(checkpoint,nn.Module):
            # Checkpoint written with torch.save(model, ...): already a module.
            model = checkpoint
        else:
            # Otherwise treat the file as a state_dict for a fresh DNN.
            model = DNN()
            model.load_state_dict(checkpoint)
        model.to(device)
        # Inference mode: BatchNorm uses running statistics, Dropout is disabled.
        model.eval()

        batch_outputs = []
        with torch.no_grad():
            for data in test_loader:
                data = data.to(device)
                batch_outputs.append(model(data).cpu().numpy())
        # Stitch the batches back into one flat vector per fold so the mean
        # below runs across folds, never across batches.
        fold_predictions.append(np.concatenate(batch_outputs,axis=0).reshape(-1))
    return np.mean(fold_predictions,axis=0)

In [None]:
# Walk the competition iterator: predict each served frame and submit it.
for test_df, sample_prediction_df in iter_test:
    # Remove the identifier columns in place (presumably leaving only the
    # feature columns the model was trained on - verify against the API schema).
    test_df.drop(columns=['investment_id','row_id'],inplace=True)
    test_x = CustomDataset(data=test_df.values,mode='test')
    # A single batch spanning the whole served frame.
    test_loader = DataLoader(dataset=test_x,num_workers=0,batch_size=len(test_x))
    pred = ensemble(test_loader)
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df)